{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 23.380115495462068, "learning_rate": 5.208333333333333e-09, "logps/chosen": -37.8603515625, "logps/rejected": -41.08392333984375, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 1.6284235715866089, "losses/total": 0.6931471824645996, "ref_logps/chosen": -37.8603515625, "ref_logps/rejected": -41.08392333984375, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 27.45295406748287, "learning_rate": 1.0416666666666666e-08, "logps/chosen": -37.44581604003906, "logps/rejected": -47.522891998291016, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 0.9501174688339233, "losses/total": 0.6931471824645996, "ref_logps/chosen": -37.44581604003906, "ref_logps/rejected": -47.522891998291016, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0, "grad_norm": 26.06686264505646, "learning_rate": 1.5625e-08, "logps/chosen": -27.914955139160156, "logps/rejected": -52.444091796875, "loss": 0.6938, "losses/dpo": 0.675484299659729, "losses/sft": 2.231576919555664, "losses/total": 0.675484299659729, "ref_logps/chosen": -27.925832748413086, "ref_logps/rejected": -52.46782302856445, "rewards/accuracies": 0.3125, "rewards/chosen": 0.0010878322646021843, "rewards/margins": -0.0012856184039264917, "rewards/rejected": 0.0023734511341899633, "step": 3 }, { "epoch": 0.0, "grad_norm": 19.84270002766211, "learning_rate": 2.083333333333333e-08, "logps/chosen": -26.83194351196289, "logps/rejected": -31.101865768432617, "loss": 0.6942, "losses/dpo": 0.7056854963302612, "losses/sft": 1.9789139032363892, "losses/total": 0.7056854963302612, "ref_logps/chosen": -26.82935905456543, "ref_logps/rejected": -31.11972427368164, "rewards/accuracies": 0.375, "rewards/chosen": -0.0002585112815722823, "rewards/margins": -0.0020447582937777042, "rewards/rejected": 0.0017862468957901, "step": 4 }, { "epoch": 0.0, "grad_norm": 22.01383435793805, "learning_rate": 2.6041666666666667e-08, "logps/chosen": -24.065032958984375, "logps/rejected": -32.82991027832031, "loss": 0.6918, "losses/dpo": 0.6893840432167053, "losses/sft": 1.388756275177002, "losses/total": 0.6893840432167053, "ref_logps/chosen": -24.093971252441406, "ref_logps/rejected": -32.83185577392578, "rewards/accuracies": 0.75, "rewards/chosen": 0.002893678843975067, "rewards/margins": 0.0026987239252775908, "rewards/rejected": 0.00019495480228215456, "step": 5 }, { "epoch": 0.01, "grad_norm": 23.528684257327573, "learning_rate": 3.125e-08, "logps/chosen": -30.710527420043945, "logps/rejected": -35.3096923828125, "loss": 0.6942, "losses/dpo": 0.7112069129943848, "losses/sft": 0.7559832334518433, "losses/total": 0.7112069129943848, "ref_logps/chosen": -30.688087463378906, "ref_logps/rejected": -35.307559967041016, "rewards/accuracies": 0.4375, "rewards/chosen": -0.002243923954665661, "rewards/margins": -0.0020307153463363647, "rewards/rejected": -0.00021320884115993977, "step": 6 }, { "epoch": 0.01, "grad_norm": 20.86406687018874, "learning_rate": 3.645833333333334e-08, "logps/chosen": -20.95810317993164, "logps/rejected": -36.69044876098633, "loss": 0.6972, "losses/dpo": 0.6964845657348633, "losses/sft": 0.4605206549167633, "losses/total": 0.6964845657348633, "ref_logps/chosen": -20.925945281982422, "ref_logps/rejected": -36.737403869628906, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0032158582471311092, "rewards/margins": -0.007911184802651405, "rewards/rejected": 0.004695326089859009, "step": 7 }, { "epoch": 0.01, "grad_norm": 20.471817420435062, "learning_rate": 4.166666666666666e-08, "logps/chosen": -22.625606536865234, "logps/rejected": -29.56753921508789, "loss": 0.694, "losses/dpo": 0.6923890113830566, "losses/sft": 0.6883086562156677, "losses/total": 0.6923890113830566, "ref_logps/chosen": -22.623611450195312, "ref_logps/rejected": -29.580848693847656, "rewards/accuracies": 0.5, "rewards/chosen": -0.000199583126232028, "rewards/margins": -0.0015303820837289095, "rewards/rejected": 0.0013307984918355942, "step": 8 }, { "epoch": 0.01, "grad_norm": 21.45583114734525, "learning_rate": 4.6875e-08, "logps/chosen": -21.797367095947266, "logps/rejected": -36.66053009033203, "loss": 0.691, "losses/dpo": 0.6878825426101685, "losses/sft": 0.40481385588645935, "losses/total": 0.6878825426101685, "ref_logps/chosen": -21.821674346923828, "ref_logps/rejected": -36.640533447265625, "rewards/accuracies": 0.625, "rewards/chosen": 0.0024306923151016235, "rewards/margins": 0.004430398344993591, "rewards/rejected": -0.0019997060298919678, "step": 9 }, { "epoch": 0.01, "grad_norm": 26.26927228298958, "learning_rate": 5.208333333333333e-08, "logps/chosen": -39.44868469238281, "logps/rejected": -43.324798583984375, "loss": 0.6996, "losses/dpo": 0.6989561915397644, "losses/sft": 1.1736102104187012, "losses/total": 0.6989561915397644, "ref_logps/chosen": -39.35295486450195, "ref_logps/rejected": -43.357051849365234, "rewards/accuracies": 0.25, "rewards/chosen": -0.009573065675795078, "rewards/margins": -0.012798302806913853, "rewards/rejected": 0.0032252368982881308, "step": 10 }, { "epoch": 0.01, "grad_norm": 22.62321092921081, "learning_rate": 5.729166666666666e-08, "logps/chosen": -29.348243713378906, "logps/rejected": -35.06862258911133, "loss": 0.6966, "losses/dpo": 0.6949277520179749, "losses/sft": 0.9592458605766296, "losses/total": 0.6949277520179749, "ref_logps/chosen": -29.330493927001953, "ref_logps/rejected": -35.12003707885742, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0017749338876456022, "rewards/margins": -0.006915953941643238, "rewards/rejected": 0.005141019821166992, "step": 11 }, { "epoch": 0.01, "grad_norm": 22.17908429030323, "learning_rate": 6.25e-08, "logps/chosen": -30.346847534179688, "logps/rejected": -35.53053283691406, "loss": 0.6965, "losses/dpo": 0.7032960653305054, "losses/sft": 1.0836734771728516, "losses/total": 0.7032960653305054, "ref_logps/chosen": -30.302871704101562, "ref_logps/rejected": -35.551719665527344, "rewards/accuracies": 0.1875, "rewards/chosen": -0.004397737793624401, "rewards/margins": -0.0065163373947143555, "rewards/rejected": 0.0021185993682593107, "step": 12 }, { "epoch": 0.01, "grad_norm": 24.24431125452364, "learning_rate": 6.770833333333333e-08, "logps/chosen": -30.664535522460938, "logps/rejected": -36.8706169128418, "loss": 0.6941, "losses/dpo": 0.6892952919006348, "losses/sft": 0.13877521455287933, "losses/total": 0.6892952919006348, "ref_logps/chosen": -30.701936721801758, "ref_logps/rejected": -36.926727294921875, "rewards/accuracies": 0.4375, "rewards/chosen": 0.003740100422874093, "rewards/margins": -0.0018708035349845886, "rewards/rejected": 0.005610904190689325, "step": 13 }, { "epoch": 0.01, "grad_norm": 22.41012804114792, "learning_rate": 7.291666666666667e-08, "logps/chosen": -31.56171989440918, "logps/rejected": -34.388004302978516, "loss": 0.6914, "losses/dpo": 0.6925297975540161, "losses/sft": 1.3797324895858765, "losses/total": 0.6925297975540161, "ref_logps/chosen": -31.546314239501953, "ref_logps/rejected": -34.33654022216797, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0015408038161695004, "rewards/margins": 0.0036055869422852993, "rewards/rejected": -0.005146390292793512, "step": 14 }, { "epoch": 0.01, "grad_norm": 20.56475289624765, "learning_rate": 7.812499999999999e-08, "logps/chosen": -30.485029220581055, "logps/rejected": -25.507259368896484, "loss": 0.6945, "losses/dpo": 0.6913262009620667, "losses/sft": 0.6149562001228333, "losses/total": 0.6913262009620667, "ref_logps/chosen": -30.471023559570312, "ref_logps/rejected": -25.51897430419922, "rewards/accuracies": 0.5, "rewards/chosen": -0.0014005035627633333, "rewards/margins": -0.002572178840637207, "rewards/rejected": 0.0011716753942891955, "step": 15 }, { "epoch": 0.02, "grad_norm": 18.104228197391837, "learning_rate": 8.333333333333333e-08, "logps/chosen": -23.100540161132812, "logps/rejected": -25.51854705810547, "loss": 0.694, "losses/dpo": 0.6977440118789673, "losses/sft": 0.6464166045188904, "losses/total": 0.6977440118789673, "ref_logps/chosen": -23.137847900390625, "ref_logps/rejected": -25.572269439697266, "rewards/accuracies": 0.4375, "rewards/chosen": 0.003730631433427334, "rewards/margins": -0.0016416488215327263, "rewards/rejected": 0.00537228025496006, "step": 16 }, { "epoch": 0.02, "grad_norm": 21.03873125050733, "learning_rate": 8.854166666666667e-08, "logps/chosen": -23.835796356201172, "logps/rejected": -35.94401550292969, "loss": 0.6919, "losses/dpo": 0.6974456310272217, "losses/sft": 0.663185715675354, "losses/total": 0.6974456310272217, "ref_logps/chosen": -23.864200592041016, "ref_logps/rejected": -35.94670104980469, "rewards/accuracies": 0.625, "rewards/chosen": 0.002840301487594843, "rewards/margins": 0.002572184894233942, "rewards/rejected": 0.0002681163605302572, "step": 17 }, { "epoch": 0.02, "grad_norm": 21.973945469386937, "learning_rate": 9.375e-08, "logps/chosen": -21.951805114746094, "logps/rejected": -42.93829345703125, "loss": 0.6952, "losses/dpo": 0.7005785703659058, "losses/sft": 0.3013179898262024, "losses/total": 0.7005785703659058, "ref_logps/chosen": -21.959224700927734, "ref_logps/rejected": -42.9852294921875, "rewards/accuracies": 0.5, "rewards/chosen": 0.000741815660148859, "rewards/margins": -0.003951340913772583, "rewards/rejected": 0.004693156573921442, "step": 18 }, { "epoch": 0.02, "grad_norm": 23.671292515356566, "learning_rate": 9.895833333333332e-08, "logps/chosen": -35.36517333984375, "logps/rejected": -38.200439453125, "loss": 0.6897, "losses/dpo": 0.6989502310752869, "losses/sft": 1.1800127029418945, "losses/total": 0.6989502310752869, "ref_logps/chosen": -35.40092468261719, "ref_logps/rejected": -38.16667175292969, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0035754560958594084, "rewards/margins": 0.0069526201114058495, "rewards/rejected": -0.0033771635498851538, "step": 19 }, { "epoch": 0.02, "grad_norm": 20.932943691339855, "learning_rate": 1.0416666666666667e-07, "logps/chosen": -25.999528884887695, "logps/rejected": -31.32129669189453, "loss": 0.6908, "losses/dpo": 0.7052661776542664, "losses/sft": 0.7907893061637878, "losses/total": 0.7052661776542664, "ref_logps/chosen": -26.019527435302734, "ref_logps/rejected": -31.293842315673828, "rewards/accuracies": 0.625, "rewards/chosen": 0.0019999893847852945, "rewards/margins": 0.004745745565742254, "rewards/rejected": -0.0027457564137876034, "step": 20 }, { "epoch": 0.02, "grad_norm": 21.621287712006918, "learning_rate": 1.09375e-07, "logps/chosen": -33.18739318847656, "logps/rejected": -31.041976928710938, "loss": 0.697, "losses/dpo": 0.6994649171829224, "losses/sft": 0.6084625124931335, "losses/total": 0.6994649171829224, "ref_logps/chosen": -33.1328125, "ref_logps/rejected": -31.063539505004883, "rewards/accuracies": 0.375, "rewards/chosen": -0.005458048544824123, "rewards/margins": -0.007614323403686285, "rewards/rejected": 0.002156275324523449, "step": 21 }, { "epoch": 0.02, "grad_norm": 20.60435271894607, "learning_rate": 1.1458333333333332e-07, "logps/chosen": -27.308177947998047, "logps/rejected": -26.209165573120117, "loss": 0.6959, "losses/dpo": 0.6876956820487976, "losses/sft": 1.1862397193908691, "losses/total": 0.6876956820487976, "ref_logps/chosen": -27.26185417175293, "ref_logps/rejected": -26.216665267944336, "rewards/accuracies": 0.25, "rewards/chosen": -0.0046323807910084724, "rewards/margins": -0.00538243493065238, "rewards/rejected": 0.0007500543724745512, "step": 22 }, { "epoch": 0.02, "grad_norm": 21.063706035935898, "learning_rate": 1.1979166666666668e-07, "logps/chosen": -28.472787857055664, "logps/rejected": -37.83734130859375, "loss": 0.6908, "losses/dpo": 0.6939562559127808, "losses/sft": 1.0666980743408203, "losses/total": 0.6939562559127808, "ref_logps/chosen": -28.535715103149414, "ref_logps/rejected": -37.85205841064453, "rewards/accuracies": 0.625, "rewards/chosen": 0.006292751524597406, "rewards/margins": 0.0048211547546088696, "rewards/rejected": 0.001471596653573215, "step": 23 }, { "epoch": 0.02, "grad_norm": 21.009913078437286, "learning_rate": 1.25e-07, "logps/chosen": -25.080204010009766, "logps/rejected": -31.56366539001465, "loss": 0.6946, "losses/dpo": 0.6904217600822449, "losses/sft": 0.5501120686531067, "losses/total": 0.6904217600822449, "ref_logps/chosen": -25.072494506835938, "ref_logps/rejected": -31.584144592285156, "rewards/accuracies": 0.375, "rewards/chosen": -0.0007710267091169953, "rewards/margins": -0.0028189257718622684, "rewards/rejected": 0.0020478994119912386, "step": 24 }, { "epoch": 0.02, "grad_norm": 19.563544762938708, "learning_rate": 1.3020833333333334e-07, "logps/chosen": -22.73567771911621, "logps/rejected": -24.115489959716797, "loss": 0.6957, "losses/dpo": 0.7030764222145081, "losses/sft": 0.5415139198303223, "losses/total": 0.7030764222145081, "ref_logps/chosen": -22.697032928466797, "ref_logps/rejected": -24.12625503540039, "rewards/accuracies": 0.4375, "rewards/chosen": -0.003864619182422757, "rewards/margins": -0.004941216204315424, "rewards/rejected": 0.0010765971383079886, "step": 25 }, { "epoch": 0.02, "grad_norm": 22.139459857503677, "learning_rate": 1.3541666666666666e-07, "logps/chosen": -21.14828872680664, "logps/rejected": -33.75491714477539, "loss": 0.6928, "losses/dpo": 0.6926923990249634, "losses/sft": 0.2455293983221054, "losses/total": 0.6926923990249634, "ref_logps/chosen": -21.158390045166016, "ref_logps/rejected": -33.75730514526367, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0010100125800818205, "rewards/margins": 0.0007711591897532344, "rewards/rejected": 0.00023885362315922976, "step": 26 }, { "epoch": 0.03, "grad_norm": 25.31527802581097, "learning_rate": 1.40625e-07, "logps/chosen": -41.530242919921875, "logps/rejected": -36.08148193359375, "loss": 0.695, "losses/dpo": 0.6947487592697144, "losses/sft": 0.7929549813270569, "losses/total": 0.6947487592697144, "ref_logps/chosen": -41.536277770996094, "ref_logps/rejected": -36.12469482421875, "rewards/accuracies": 0.375, "rewards/chosen": 0.0006035745609551668, "rewards/margins": -0.0037178248167037964, "rewards/rejected": 0.004321399610489607, "step": 27 }, { "epoch": 0.03, "grad_norm": 20.04318629344745, "learning_rate": 1.4583333333333335e-07, "logps/chosen": -27.272668838500977, "logps/rejected": -30.088224411010742, "loss": 0.6923, "losses/dpo": 0.6979986429214478, "losses/sft": 1.2255476713180542, "losses/total": 0.6979986429214478, "ref_logps/chosen": -27.287776947021484, "ref_logps/rejected": -30.08507537841797, "rewards/accuracies": 0.3125, "rewards/chosen": 0.0015108406078070402, "rewards/margins": 0.0018257619813084602, "rewards/rejected": -0.00031492122798226774, "step": 28 }, { "epoch": 0.03, "grad_norm": 20.23954645910381, "learning_rate": 1.5104166666666664e-07, "logps/chosen": -19.39305877685547, "logps/rejected": -38.6036262512207, "loss": 0.6961, "losses/dpo": 0.6927523612976074, "losses/sft": 1.0540591478347778, "losses/total": 0.6927523612976074, "ref_logps/chosen": -19.42737579345703, "ref_logps/rejected": -38.696163177490234, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0034317909739911556, "rewards/margins": -0.005822015460580587, "rewards/rejected": 0.009253805503249168, "step": 29 }, { "epoch": 0.03, "grad_norm": 19.89472179156743, "learning_rate": 1.5624999999999999e-07, "logps/chosen": -25.03938865661621, "logps/rejected": -33.34719467163086, "loss": 0.6915, "losses/dpo": 0.6893761157989502, "losses/sft": 0.9870947003364563, "losses/total": 0.6893761157989502, "ref_logps/chosen": -25.07794761657715, "ref_logps/rejected": -33.3521614074707, "rewards/accuracies": 0.5, "rewards/chosen": 0.003855949966236949, "rewards/margins": 0.0033593415282666683, "rewards/rejected": 0.0004966079723089933, "step": 30 }, { "epoch": 0.03, "grad_norm": 22.313005684979302, "learning_rate": 1.6145833333333333e-07, "logps/chosen": -36.30854034423828, "logps/rejected": -32.15274429321289, "loss": 0.6903, "losses/dpo": 0.6928702592849731, "losses/sft": 0.9198456406593323, "losses/total": 0.6928702592849731, "ref_logps/chosen": -36.33011245727539, "ref_logps/rejected": -32.11732864379883, "rewards/accuracies": 0.75, "rewards/chosen": 0.002157121431082487, "rewards/margins": 0.005698660388588905, "rewards/rejected": -0.003541538491845131, "step": 31 }, { "epoch": 0.03, "grad_norm": 24.035877334921015, "learning_rate": 1.6666666666666665e-07, "logps/chosen": -31.135778427124023, "logps/rejected": -36.15631103515625, "loss": 0.6914, "losses/dpo": 0.6919117569923401, "losses/sft": 1.109789490699768, "losses/total": 0.6919117569923401, "ref_logps/chosen": -31.169631958007812, "ref_logps/rejected": -36.1555290222168, "rewards/accuracies": 0.625, "rewards/chosen": 0.00338550191372633, "rewards/margins": 0.0034638612996786833, "rewards/rejected": -7.835938595235348e-05, "step": 32 }, { "epoch": 0.03, "grad_norm": 24.417087284544994, "learning_rate": 1.71875e-07, "logps/chosen": -36.1037483215332, "logps/rejected": -39.72431945800781, "loss": 0.6883, "losses/dpo": 0.6955977082252502, "losses/sft": 1.0960596799850464, "losses/total": 0.6955977082252502, "ref_logps/chosen": -36.13005828857422, "ref_logps/rejected": -39.65172576904297, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0026312456466257572, "rewards/margins": 0.009890196844935417, "rewards/rejected": -0.007258951663970947, "step": 33 }, { "epoch": 0.03, "grad_norm": 20.936963240832412, "learning_rate": 1.7708333333333334e-07, "logps/chosen": -25.560340881347656, "logps/rejected": -33.930076599121094, "loss": 0.6918, "losses/dpo": 0.6960599422454834, "losses/sft": 0.6988459229469299, "losses/total": 0.6960599422454834, "ref_logps/chosen": -25.573150634765625, "ref_logps/rejected": -33.91607666015625, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0012808383908122778, "rewards/margins": 0.002681151032447815, "rewards/rejected": -0.0014003126416355371, "step": 34 }, { "epoch": 0.03, "grad_norm": 20.1006275022137, "learning_rate": 1.8229166666666666e-07, "logps/chosen": -23.989791870117188, "logps/rejected": -27.746070861816406, "loss": 0.6951, "losses/dpo": 0.6980480551719666, "losses/sft": 0.9145336747169495, "losses/total": 0.6980480551719666, "ref_logps/chosen": -23.94711685180664, "ref_logps/rejected": -27.740924835205078, "rewards/accuracies": 0.4375, "rewards/chosen": -0.004267798736691475, "rewards/margins": -0.003753183875232935, "rewards/rejected": -0.0005146145704202354, "step": 35 }, { "epoch": 0.03, "grad_norm": 21.632816885378297, "learning_rate": 1.875e-07, "logps/chosen": -25.89436149597168, "logps/rejected": -32.60966873168945, "loss": 0.6948, "losses/dpo": 0.7038102149963379, "losses/sft": 1.046172022819519, "losses/total": 0.7038102149963379, "ref_logps/chosen": -25.869474411010742, "ref_logps/rejected": -32.616905212402344, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0024886757601052523, "rewards/margins": -0.0032119713723659515, "rewards/rejected": 0.0007232949137687683, "step": 36 }, { "epoch": 0.03, "grad_norm": 19.81154750569189, "learning_rate": 1.9270833333333332e-07, "logps/chosen": -23.234350204467773, "logps/rejected": -31.311748504638672, "loss": 0.6951, "losses/dpo": 0.7048007845878601, "losses/sft": 0.827793538570404, "losses/total": 0.7048007845878601, "ref_logps/chosen": -23.21068572998047, "ref_logps/rejected": -31.32668113708496, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0023666266351938248, "rewards/margins": -0.003860021010041237, "rewards/rejected": 0.001493394491262734, "step": 37 }, { "epoch": 0.04, "grad_norm": 24.569039904446434, "learning_rate": 1.9791666666666664e-07, "logps/chosen": -32.101905822753906, "logps/rejected": -45.594200134277344, "loss": 0.6921, "losses/dpo": 0.7030043005943298, "losses/sft": 1.0690467357635498, "losses/total": 0.7030043005943298, "ref_logps/chosen": -32.129920959472656, "ref_logps/rejected": -45.60040283203125, "rewards/accuracies": 0.625, "rewards/chosen": 0.0028018115554004908, "rewards/margins": 0.002181771444156766, "rewards/rejected": 0.0006200404604896903, "step": 38 }, { "epoch": 0.04, "grad_norm": 20.239404088343235, "learning_rate": 2.03125e-07, "logps/chosen": -32.28240966796875, "logps/rejected": -28.389739990234375, "loss": 0.6922, "losses/dpo": 0.6925745010375977, "losses/sft": 1.1388819217681885, "losses/total": 0.6925745010375977, "ref_logps/chosen": -32.2904052734375, "ref_logps/rejected": -28.37813949584961, "rewards/accuracies": 0.5, "rewards/chosen": 0.0007996616768650711, "rewards/margins": 0.0019594968762248755, "rewards/rejected": -0.0011598349083214998, "step": 39 }, { "epoch": 0.04, "grad_norm": 26.765574366123598, "learning_rate": 2.0833333333333333e-07, "logps/chosen": -37.947776794433594, "logps/rejected": -49.38038635253906, "loss": 0.6955, "losses/dpo": 0.698715329170227, "losses/sft": 1.2773995399475098, "losses/total": 0.698715329170227, "ref_logps/chosen": -37.958038330078125, "ref_logps/rejected": -49.435298919677734, "rewards/accuracies": 0.3125, "rewards/chosen": 0.001026207348331809, "rewards/margins": -0.004464631900191307, "rewards/rejected": 0.00549083948135376, "step": 40 }, { "epoch": 0.04, "grad_norm": 19.935398669279387, "learning_rate": 2.1354166666666665e-07, "logps/chosen": -26.876983642578125, "logps/rejected": -29.59823226928711, "loss": 0.692, "losses/dpo": 0.6999084949493408, "losses/sft": 0.5188363790512085, "losses/total": 0.6999084949493408, "ref_logps/chosen": -26.88933753967285, "ref_logps/rejected": -29.58661460876465, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0012354031205177307, "rewards/margins": 0.002397097647190094, "rewards/rejected": -0.0011616945266723633, "step": 41 }, { "epoch": 0.04, "grad_norm": 20.600815065743937, "learning_rate": 2.1875e-07, "logps/chosen": -27.48914909362793, "logps/rejected": -32.98211669921875, "loss": 0.6923, "losses/dpo": 0.6885395050048828, "losses/sft": 0.14800411462783813, "losses/total": 0.6885395050048828, "ref_logps/chosen": -27.503482818603516, "ref_logps/rejected": -32.97887420654297, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0014333680737763643, "rewards/margins": 0.0017580153653398156, "rewards/rejected": -0.00032464717514812946, "step": 42 }, { "epoch": 0.04, "grad_norm": 24.35743753923453, "learning_rate": 2.2395833333333334e-07, "logps/chosen": -29.630659103393555, "logps/rejected": -42.02675247192383, "loss": 0.6935, "losses/dpo": 0.7144672870635986, "losses/sft": 1.4024723768234253, "losses/total": 0.7144672870635986, "ref_logps/chosen": -29.64403533935547, "ref_logps/rejected": -42.04615783691406, "rewards/accuracies": 0.5, "rewards/chosen": 0.0013376949355006218, "rewards/margins": -0.0006030122749507427, "rewards/rejected": 0.0019407074432820082, "step": 43 }, { "epoch": 0.04, "grad_norm": 23.85028537991585, "learning_rate": 2.2916666666666663e-07, "logps/chosen": -32.61023712158203, "logps/rejected": -47.918006896972656, "loss": 0.6913, "losses/dpo": 0.6938121318817139, "losses/sft": 1.1862221956253052, "losses/total": 0.6938121318817139, "ref_logps/chosen": -32.63817596435547, "ref_logps/rejected": -47.907958984375, "rewards/accuracies": 0.5, "rewards/chosen": 0.002793522085994482, "rewards/margins": 0.0037976279854774475, "rewards/rejected": -0.0010041058994829655, "step": 44 }, { "epoch": 0.04, "grad_norm": 21.44323973180879, "learning_rate": 2.3437499999999998e-07, "logps/chosen": -29.731849670410156, "logps/rejected": -38.44366455078125, "loss": 0.6924, "losses/dpo": 0.6725023984909058, "losses/sft": 0.6522619724273682, "losses/total": 0.6725023984909058, "ref_logps/chosen": -29.760921478271484, "ref_logps/rejected": -38.45610809326172, "rewards/accuracies": 0.625, "rewards/chosen": 0.0029071478638798, "rewards/margins": 0.0016626778524369001, "rewards/rejected": 0.0012444702442735434, "step": 45 }, { "epoch": 0.04, "grad_norm": 21.408801746661776, "learning_rate": 2.3958333333333335e-07, "logps/chosen": -26.612689971923828, "logps/rejected": -38.33985137939453, "loss": 0.6908, "losses/dpo": 0.6903026103973389, "losses/sft": 1.2368303537368774, "losses/total": 0.6903026103973389, "ref_logps/chosen": -26.635051727294922, "ref_logps/rejected": -38.31396484375, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0022363741882145405, "rewards/margins": 0.004825451411306858, "rewards/rejected": -0.0025890772230923176, "step": 46 }, { "epoch": 0.04, "grad_norm": 19.97976860652416, "learning_rate": 2.4479166666666667e-07, "logps/chosen": -21.154605865478516, "logps/rejected": -31.210342407226562, "loss": 0.6906, "losses/dpo": 0.6810884475708008, "losses/sft": 1.118998646736145, "losses/total": 0.6810884475708008, "ref_logps/chosen": -21.17608070373535, "ref_logps/rejected": -31.179595947265625, "rewards/accuracies": 0.625, "rewards/chosen": 0.0021474212408065796, "rewards/margins": 0.00522198062390089, "rewards/rejected": -0.0030745593830943108, "step": 47 }, { "epoch": 0.05, "grad_norm": 21.70317429486433, "learning_rate": 2.5e-07, "logps/chosen": -24.168516159057617, "logps/rejected": -31.860130310058594, "loss": 0.6918, "losses/dpo": 0.6952889561653137, "losses/sft": 0.902199923992157, "losses/total": 0.6952889561653137, "ref_logps/chosen": -24.193498611450195, "ref_logps/rejected": -31.856504440307617, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0024983256589621305, "rewards/margins": 0.0028607307467609644, "rewards/rejected": -0.0003624052042141557, "step": 48 }, { "epoch": 0.05, "grad_norm": 20.60557255564455, "learning_rate": 2.552083333333333e-07, "logps/chosen": -23.041872024536133, "logps/rejected": -32.55037307739258, "loss": 0.692, "losses/dpo": 0.6873000264167786, "losses/sft": 0.9268174767494202, "losses/total": 0.6873000264167786, "ref_logps/chosen": -23.05540657043457, "ref_logps/rejected": -32.540382385253906, "rewards/accuracies": 0.625, "rewards/chosen": 0.0013532951707020402, "rewards/margins": 0.0023525201249867678, "rewards/rejected": -0.0009992250707000494, "step": 49 }, { "epoch": 0.05, "grad_norm": 19.68547883352091, "learning_rate": 2.604166666666667e-07, "logps/chosen": -27.975990295410156, "logps/rejected": -30.2783145904541, "loss": 0.6966, "losses/dpo": 0.6876497268676758, "losses/sft": 1.0162992477416992, "losses/total": 0.6876497268676758, "ref_logps/chosen": -27.952674865722656, "ref_logps/rejected": -30.324085235595703, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0023314119316637516, "rewards/margins": -0.006908267270773649, "rewards/rejected": 0.004576855804771185, "step": 50 }, { "epoch": 0.05, "grad_norm": 22.844878575920646, "learning_rate": 2.65625e-07, "logps/chosen": -29.02272605895996, "logps/rejected": -37.96780014038086, "loss": 0.6964, "losses/dpo": 0.6980652213096619, "losses/sft": 0.2979593575000763, "losses/total": 0.6980652213096619, "ref_logps/chosen": -28.9434814453125, "ref_logps/rejected": -37.954017639160156, "rewards/accuracies": 0.3125, "rewards/chosen": -0.00792459025979042, "rewards/margins": -0.006545967422425747, "rewards/rejected": -0.00137862260453403, "step": 51 }, { "epoch": 0.05, "grad_norm": 21.06299012718472, "learning_rate": 2.708333333333333e-07, "logps/chosen": -18.937931060791016, "logps/rejected": -42.26324462890625, "loss": 0.6944, "losses/dpo": 0.7152536511421204, "losses/sft": 0.8200209140777588, "losses/total": 0.7152536511421204, "ref_logps/chosen": -18.918128967285156, "ref_logps/rejected": -42.26679992675781, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0019803172908723354, "rewards/margins": -0.0023354454897344112, "rewards/rejected": 0.00035512796603143215, "step": 52 }, { "epoch": 0.05, "grad_norm": 22.80289009591428, "learning_rate": 2.760416666666667e-07, "logps/chosen": -27.84115219116211, "logps/rejected": -41.79947280883789, "loss": 0.6957, "losses/dpo": 0.7053032517433167, "losses/sft": 0.9196627736091614, "losses/total": 0.7053032517433167, "ref_logps/chosen": -27.790102005004883, "ref_logps/rejected": -41.79937744140625, "rewards/accuracies": 0.375, "rewards/chosen": -0.005104833748191595, "rewards/margins": -0.005095202475786209, "rewards/rejected": -9.631738066673279e-06, "step": 53 }, { "epoch": 0.05, "grad_norm": 21.160712934443634, "learning_rate": 2.8125e-07, "logps/chosen": -28.089000701904297, "logps/rejected": -28.065242767333984, "loss": 0.6942, "losses/dpo": 0.6884220838546753, "losses/sft": 1.3696531057357788, "losses/total": 0.6884220838546753, "ref_logps/chosen": -28.043071746826172, "ref_logps/rejected": -28.038833618164062, "rewards/accuracies": 0.5, "rewards/chosen": -0.004592927172780037, "rewards/margins": -0.0019520954228937626, "rewards/rejected": -0.0026408315170556307, "step": 54 }, { "epoch": 0.05, "grad_norm": 24.02056559065956, "learning_rate": 2.864583333333333e-07, "logps/chosen": -30.13437271118164, "logps/rejected": -46.82486343383789, "loss": 0.6938, "losses/dpo": 0.6960442066192627, "losses/sft": 0.1095627173781395, "losses/total": 0.6960442066192627, "ref_logps/chosen": -30.08250617980957, "ref_logps/rejected": -46.78582763671875, "rewards/accuracies": 0.25, "rewards/chosen": -0.005186424124985933, "rewards/margins": -0.0012824534205719829, "rewards/rejected": -0.0039039698895066977, "step": 55 }, { "epoch": 0.05, "grad_norm": 21.383654367260945, "learning_rate": 2.916666666666667e-07, "logps/chosen": -28.980976104736328, "logps/rejected": -28.924907684326172, "loss": 0.6888, "losses/dpo": 0.694461464881897, "losses/sft": 0.42424333095550537, "losses/total": 0.694461464881897, "ref_logps/chosen": -29.029512405395508, "ref_logps/rejected": -28.88519859313965, "rewards/accuracies": 0.5, "rewards/chosen": 0.004853768739849329, "rewards/margins": 0.008824486285448074, "rewards/rejected": -0.00397071847692132, "step": 56 }, { "epoch": 0.05, "grad_norm": 22.03439882520339, "learning_rate": 2.9687499999999996e-07, "logps/chosen": -28.862022399902344, "logps/rejected": -30.32440948486328, "loss": 0.6883, "losses/dpo": 0.6976296305656433, "losses/sft": 0.5318347215652466, "losses/total": 0.6976296305656433, "ref_logps/chosen": -28.89822006225586, "ref_logps/rejected": -30.26236343383789, "rewards/accuracies": 0.625, "rewards/chosen": 0.003619836177676916, "rewards/margins": 0.00982445664703846, "rewards/rejected": -0.006204620003700256, "step": 57 }, { "epoch": 0.05, "grad_norm": 21.804638220483643, "learning_rate": 3.020833333333333e-07, "logps/chosen": -27.42025375366211, "logps/rejected": -30.69321632385254, "loss": 0.6935, "losses/dpo": 0.6926889419555664, "losses/sft": 1.0505743026733398, "losses/total": 0.6926889419555664, "ref_logps/chosen": -27.4237117767334, "ref_logps/rejected": -30.702587127685547, "rewards/accuracies": 0.5, "rewards/chosen": 0.0003457337152212858, "rewards/margins": -0.0005913510103709996, "rewards/rejected": 0.0009370847255922854, "step": 58 }, { "epoch": 0.06, "grad_norm": 20.56186912514367, "learning_rate": 3.0729166666666665e-07, "logps/chosen": -27.021804809570312, "logps/rejected": -26.62579345703125, "loss": 0.6933, "losses/dpo": 0.6717853546142578, "losses/sft": 0.9432034492492676, "losses/total": 0.6717853546142578, "ref_logps/chosen": -27.021015167236328, "ref_logps/rejected": -26.62818145751953, "rewards/accuracies": 0.4375, "rewards/chosen": -7.897312752902508e-05, "rewards/margins": -0.0003177463077008724, "rewards/rejected": 0.00023877323837950826, "step": 59 }, { "epoch": 0.06, "grad_norm": 25.0389454664293, "learning_rate": 3.1249999999999997e-07, "logps/chosen": -27.093734741210938, "logps/rejected": -39.77492904663086, "loss": 0.6906, "losses/dpo": 0.7066395282745361, "losses/sft": 1.9447426795959473, "losses/total": 0.7066395282745361, "ref_logps/chosen": -27.111194610595703, "ref_logps/rejected": -39.73943328857422, "rewards/accuracies": 0.5, "rewards/chosen": 0.0017459139926359057, "rewards/margins": 0.005295580718666315, "rewards/rejected": -0.003549666842445731, "step": 60 }, { "epoch": 0.06, "grad_norm": 21.76110894694096, "learning_rate": 3.177083333333333e-07, "logps/chosen": -33.15117645263672, "logps/rejected": -25.45164680480957, "loss": 0.6922, "losses/dpo": 0.7042762041091919, "losses/sft": 1.0182559490203857, "losses/total": 0.7042762041091919, "ref_logps/chosen": -33.14457702636719, "ref_logps/rejected": -25.42586326599121, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0006596535095013678, "rewards/margins": 0.0019187422003597021, "rewards/rejected": -0.0025783954188227654, "step": 61 }, { "epoch": 0.06, "grad_norm": 21.371435558778423, "learning_rate": 3.2291666666666666e-07, "logps/chosen": -33.7812614440918, "logps/rejected": -29.490591049194336, "loss": 0.6907, "losses/dpo": 0.6935842037200928, "losses/sft": 0.8471775054931641, "losses/total": 0.6935842037200928, "ref_logps/chosen": -33.8017578125, "ref_logps/rejected": -29.46078872680664, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0020498037338256836, "rewards/margins": 0.005029791966080666, "rewards/rejected": -0.0029799879994243383, "step": 62 }, { "epoch": 0.06, "grad_norm": 20.3273857833281, "learning_rate": 3.28125e-07, "logps/chosen": -24.859054565429688, "logps/rejected": -32.70137023925781, "loss": 0.6928, "losses/dpo": 0.6908287405967712, "losses/sft": 0.7349932193756104, "losses/total": 0.6908287405967712, "ref_logps/chosen": -24.900182723999023, "ref_logps/rejected": -32.735755920410156, "rewards/accuracies": 0.4375, "rewards/chosen": 0.004112543538212776, "rewards/margins": 0.0006740251556038857, "rewards/rejected": 0.0034385179169476032, "step": 63 }, { "epoch": 0.06, "grad_norm": 20.258556638911248, "learning_rate": 3.333333333333333e-07, "logps/chosen": -26.764896392822266, "logps/rejected": -26.48898696899414, "loss": 0.6919, "losses/dpo": 0.6961848139762878, "losses/sft": 0.4403434693813324, "losses/total": 0.6961848139762878, "ref_logps/chosen": -26.772743225097656, "ref_logps/rejected": -26.471506118774414, "rewards/accuracies": 0.375, "rewards/chosen": 0.000784526695497334, "rewards/margins": 0.002532592974603176, "rewards/rejected": -0.001748066395521164, "step": 64 }, { "epoch": 0.06, "grad_norm": 25.82679182140741, "learning_rate": 3.3854166666666667e-07, "logps/chosen": -29.27450942993164, "logps/rejected": -37.51946258544922, "loss": 0.6896, "losses/dpo": 0.694163978099823, "losses/sft": 1.11147141456604, "losses/total": 0.694163978099823, "ref_logps/chosen": -29.34178352355957, "ref_logps/rejected": -37.51383972167969, "rewards/accuracies": 0.4375, "rewards/chosen": 0.006727633997797966, "rewards/margins": 0.007289610803127289, "rewards/rejected": -0.0005619763396680355, "step": 65 }, { "epoch": 0.06, "grad_norm": 21.147010038204403, "learning_rate": 3.4375e-07, "logps/chosen": -22.91146469116211, "logps/rejected": -37.825531005859375, "loss": 0.6895, "losses/dpo": 0.6817489266395569, "losses/sft": 1.4511889219284058, "losses/total": 0.6817489266395569, "ref_logps/chosen": -22.976428985595703, "ref_logps/rejected": -37.817138671875, "rewards/accuracies": 0.5625, "rewards/chosen": 0.006496277172118425, "rewards/margins": 0.007335904985666275, "rewards/rejected": -0.0008396283956244588, "step": 66 }, { "epoch": 0.06, "grad_norm": 19.53064217390535, "learning_rate": 3.489583333333333e-07, "logps/chosen": -22.38909149169922, "logps/rejected": -23.74490737915039, "loss": 0.6923, "losses/dpo": 0.7068639993667603, "losses/sft": 1.5899856090545654, "losses/total": 0.7068639993667603, "ref_logps/chosen": -22.384462356567383, "ref_logps/rejected": -23.722909927368164, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0004629164468497038, "rewards/margins": 0.0017368880799040198, "rewards/rejected": -0.002199804875999689, "step": 67 }, { "epoch": 0.06, "grad_norm": 24.4303703886048, "learning_rate": 3.541666666666667e-07, "logps/chosen": -38.69633102416992, "logps/rejected": -34.19725036621094, "loss": 0.6906, "losses/dpo": 0.6786676645278931, "losses/sft": 0.6386687159538269, "losses/total": 0.6786676645278931, "ref_logps/chosen": -38.74794387817383, "ref_logps/rejected": -34.19642639160156, "rewards/accuracies": 0.5, "rewards/chosen": 0.0051612406969070435, "rewards/margins": 0.005243733990937471, "rewards/rejected": -8.249306119978428e-05, "step": 68 }, { "epoch": 0.07, "grad_norm": 21.17922479190775, "learning_rate": 3.59375e-07, "logps/chosen": -23.671390533447266, "logps/rejected": -28.10793685913086, "loss": 0.6909, "losses/dpo": 0.6927350759506226, "losses/sft": 0.6384695172309875, "losses/total": 0.6927350759506226, "ref_logps/chosen": -23.70224380493164, "ref_logps/rejected": -28.092395782470703, "rewards/accuracies": 0.5, "rewards/chosen": 0.0030855119694024324, "rewards/margins": 0.004639526829123497, "rewards/rejected": -0.0015540153253823519, "step": 69 }, { "epoch": 0.07, "grad_norm": 21.196772514372576, "learning_rate": 3.645833333333333e-07, "logps/chosen": -26.465023040771484, "logps/rejected": -38.226341247558594, "loss": 0.6937, "losses/dpo": 0.7043315768241882, "losses/sft": 1.1022038459777832, "losses/total": 0.7043315768241882, "ref_logps/chosen": -26.421083450317383, "ref_logps/rejected": -38.192073822021484, "rewards/accuracies": 0.5625, "rewards/chosen": -0.004393884912133217, "rewards/margins": -0.0009669661521911621, "rewards/rejected": -0.0034269182942807674, "step": 70 }, { "epoch": 0.07, "grad_norm": 20.134831081941645, "learning_rate": 3.697916666666667e-07, "logps/chosen": -27.295299530029297, "logps/rejected": -34.197265625, "loss": 0.6904, "losses/dpo": 0.6948862671852112, "losses/sft": 1.2709161043167114, "losses/total": 0.6948862671852112, "ref_logps/chosen": -27.36055564880371, "ref_logps/rejected": -34.206634521484375, "rewards/accuracies": 0.75, "rewards/chosen": 0.006525379605591297, "rewards/margins": 0.005588829517364502, "rewards/rejected": 0.0009365499718114734, "step": 71 }, { "epoch": 0.07, "grad_norm": 19.917981976179334, "learning_rate": 3.75e-07, "logps/chosen": -26.745792388916016, "logps/rejected": -33.669090270996094, "loss": 0.6892, "losses/dpo": 0.6700929403305054, "losses/sft": 1.6777012348175049, "losses/total": 0.6700929403305054, "ref_logps/chosen": -26.780105590820312, "ref_logps/rejected": -33.623069763183594, "rewards/accuracies": 0.625, "rewards/chosen": 0.0034315751399844885, "rewards/margins": 0.0080337505787611, "rewards/rejected": -0.004602176137268543, "step": 72 }, { "epoch": 0.07, "grad_norm": 20.70447461421529, "learning_rate": 3.802083333333333e-07, "logps/chosen": -23.440563201904297, "logps/rejected": -29.687862396240234, "loss": 0.695, "losses/dpo": 0.6920562386512756, "losses/sft": 0.1864195317029953, "losses/total": 0.6920562386512756, "ref_logps/chosen": -23.445938110351562, "ref_logps/rejected": -29.72901153564453, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0005375788896344602, "rewards/margins": -0.003577335039153695, "rewards/rejected": 0.00411491421982646, "step": 73 }, { "epoch": 0.07, "grad_norm": 24.27594233210316, "learning_rate": 3.8541666666666665e-07, "logps/chosen": -33.069480895996094, "logps/rejected": -33.0693359375, "loss": 0.696, "losses/dpo": 0.7000867128372192, "losses/sft": 1.0499125719070435, "losses/total": 0.7000867128372192, "ref_logps/chosen": -33.02191162109375, "ref_logps/rejected": -33.07781219482422, "rewards/accuracies": 0.25, "rewards/chosen": -0.004757130518555641, "rewards/margins": -0.0056047262623906136, "rewards/rejected": 0.0008475955110043287, "step": 74 }, { "epoch": 0.07, "grad_norm": 20.07057316532505, "learning_rate": 3.9062499999999997e-07, "logps/chosen": -23.422992706298828, "logps/rejected": -28.330509185791016, "loss": 0.6915, "losses/dpo": 0.6866110563278198, "losses/sft": 0.9139311909675598, "losses/total": 0.6866110563278198, "ref_logps/chosen": -23.4095458984375, "ref_logps/rejected": -28.284446716308594, "rewards/accuracies": 0.625, "rewards/chosen": -0.0013446283992379904, "rewards/margins": 0.003261800389736891, "rewards/rejected": -0.0046064285561442375, "step": 75 }, { "epoch": 0.07, "grad_norm": 22.14161959651125, "learning_rate": 3.958333333333333e-07, "logps/chosen": -28.291383743286133, "logps/rejected": -37.55588912963867, "loss": 0.6901, "losses/dpo": 0.6994426846504211, "losses/sft": 1.0744826793670654, "losses/total": 0.6994426846504211, "ref_logps/chosen": -28.27648162841797, "ref_logps/rejected": -37.47980499267578, "rewards/accuracies": 0.5, "rewards/chosen": -0.0014902979601174593, "rewards/margins": 0.0061180321499705315, "rewards/rejected": -0.007608330342918634, "step": 76 }, { "epoch": 0.07, "grad_norm": 21.33676189120627, "learning_rate": 4.0104166666666666e-07, "logps/chosen": -28.996421813964844, "logps/rejected": -36.39888000488281, "loss": 0.6877, "losses/dpo": 0.6785542964935303, "losses/sft": 1.2260427474975586, "losses/total": 0.6785542964935303, "ref_logps/chosen": -29.033872604370117, "ref_logps/rejected": -36.325439453125, "rewards/accuracies": 0.6875, "rewards/chosen": 0.003745168447494507, "rewards/margins": 0.011089339852333069, "rewards/rejected": -0.007344171404838562, "step": 77 }, { "epoch": 0.07, "grad_norm": 23.348390583228614, "learning_rate": 4.0625e-07, "logps/chosen": -27.91647720336914, "logps/rejected": -37.29054260253906, "loss": 0.6917, "losses/dpo": 0.7018224596977234, "losses/sft": 1.5613633394241333, "losses/total": 0.7018224596977234, "ref_logps/chosen": -27.96649932861328, "ref_logps/rejected": -37.311492919921875, "rewards/accuracies": 0.5, "rewards/chosen": 0.005002221092581749, "rewards/margins": 0.002906885463744402, "rewards/rejected": 0.002095335628837347, "step": 78 }, { "epoch": 0.07, "grad_norm": 20.188299735337125, "learning_rate": 4.114583333333333e-07, "logps/chosen": -24.666900634765625, "logps/rejected": -28.775285720825195, "loss": 0.6907, "losses/dpo": 0.6943562030792236, "losses/sft": 1.0879288911819458, "losses/total": 0.6943562030792236, "ref_logps/chosen": -24.684425354003906, "ref_logps/rejected": -28.74350357055664, "rewards/accuracies": 0.5, "rewards/chosen": 0.0017525495495647192, "rewards/margins": 0.00493090134114027, "rewards/rejected": -0.0031783522572368383, "step": 79 }, { "epoch": 0.08, "grad_norm": 22.926581279630373, "learning_rate": 4.1666666666666667e-07, "logps/chosen": -25.96597671508789, "logps/rejected": -37.82392883300781, "loss": 0.6928, "losses/dpo": 0.6898135542869568, "losses/sft": 1.5201321840286255, "losses/total": 0.6898135542869568, "ref_logps/chosen": -25.958084106445312, "ref_logps/rejected": -37.80912399291992, "rewards/accuracies": 0.5, "rewards/chosen": -0.0007892727735452354, "rewards/margins": 0.0006913664983585477, "rewards/rejected": -0.0014806390972808003, "step": 80 }, { "epoch": 0.08, "grad_norm": 21.09853636803559, "learning_rate": 4.21875e-07, "logps/chosen": -31.512479782104492, "logps/rejected": -32.86335754394531, "loss": 0.6924, "losses/dpo": 0.6945921182632446, "losses/sft": 1.5190742015838623, "losses/total": 0.6945921182632446, "ref_logps/chosen": -31.500568389892578, "ref_logps/rejected": -32.83515548706055, "rewards/accuracies": 0.5, "rewards/chosen": -0.001191210700199008, "rewards/margins": 0.001629391685128212, "rewards/rejected": -0.0028206021524965763, "step": 81 }, { "epoch": 0.08, "grad_norm": 22.654691834938212, "learning_rate": 4.270833333333333e-07, "logps/chosen": -31.021615982055664, "logps/rejected": -33.78495788574219, "loss": 0.691, "losses/dpo": 0.7019869685173035, "losses/sft": 1.5752376317977905, "losses/total": 0.7019869685173035, "ref_logps/chosen": -31.040332794189453, "ref_logps/rejected": -33.758262634277344, "rewards/accuracies": 0.5, "rewards/chosen": 0.0018716990016400814, "rewards/margins": 0.004541614558547735, "rewards/rejected": -0.00266991532407701, "step": 82 }, { "epoch": 0.08, "grad_norm": 20.794186872443237, "learning_rate": 4.322916666666667e-07, "logps/chosen": -26.19452667236328, "logps/rejected": -32.379966735839844, "loss": 0.6866, "losses/dpo": 0.6665722131729126, "losses/sft": 0.9994277954101562, "losses/total": 0.6665722131729126, "ref_logps/chosen": -26.210554122924805, "ref_logps/rejected": -32.2633171081543, "rewards/accuracies": 0.875, "rewards/chosen": 0.0016029835678637028, "rewards/margins": 0.013267986476421356, "rewards/rejected": -0.01166500337421894, "step": 83 }, { "epoch": 0.08, "grad_norm": 20.69510685726942, "learning_rate": 4.375e-07, "logps/chosen": -26.955123901367188, "logps/rejected": -29.765426635742188, "loss": 0.6911, "losses/dpo": 0.7009487152099609, "losses/sft": 0.7629775404930115, "losses/total": 0.7009487152099609, "ref_logps/chosen": -26.94285774230957, "ref_logps/rejected": -29.712011337280273, "rewards/accuracies": 0.75, "rewards/chosen": -0.0012268096907064319, "rewards/margins": 0.0041146669536828995, "rewards/rejected": -0.005341476295143366, "step": 84 }, { "epoch": 0.08, "grad_norm": 17.48493984826689, "learning_rate": 4.427083333333333e-07, "logps/chosen": -17.182580947875977, "logps/rejected": -26.22580337524414, "loss": 0.6857, "losses/dpo": 0.6875573992729187, "losses/sft": 0.9220892190933228, "losses/total": 0.6875573992729187, "ref_logps/chosen": -17.26123046875, "ref_logps/rejected": -26.15496063232422, "rewards/accuracies": 0.75, "rewards/chosen": 0.00786479189991951, "rewards/margins": 0.014949031174182892, "rewards/rejected": -0.007084238342940807, "step": 85 }, { "epoch": 0.08, "grad_norm": 18.60763057387995, "learning_rate": 4.479166666666667e-07, "logps/chosen": -25.18168830871582, "logps/rejected": -25.691646575927734, "loss": 0.6961, "losses/dpo": 0.6977482438087463, "losses/sft": 0.9755790829658508, "losses/total": 0.6977482438087463, "ref_logps/chosen": -25.154155731201172, "ref_logps/rejected": -25.722854614257812, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0027535497210919857, "rewards/margins": -0.005874503403902054, "rewards/rejected": 0.0031209527514874935, "step": 86 }, { "epoch": 0.08, "grad_norm": 21.75076097946544, "learning_rate": 4.53125e-07, "logps/chosen": -24.93636131286621, "logps/rejected": -37.811622619628906, "loss": 0.6932, "losses/dpo": 0.7045615911483765, "losses/sft": 1.015774130821228, "losses/total": 0.7045615911483765, "ref_logps/chosen": -24.942893981933594, "ref_logps/rejected": -37.81858444213867, "rewards/accuracies": 0.5, "rewards/chosen": 0.0006532191764563322, "rewards/margins": -4.297844134271145e-05, "rewards/rejected": 0.0006961976177990437, "step": 87 }, { "epoch": 0.08, "grad_norm": 20.422772730050387, "learning_rate": 4.5833333333333327e-07, "logps/chosen": -23.439058303833008, "logps/rejected": -37.17435836791992, "loss": 0.6917, "losses/dpo": 0.6886216998100281, "losses/sft": 0.5607869625091553, "losses/total": 0.6886216998100281, "ref_logps/chosen": -23.448896408081055, "ref_logps/rejected": -37.15543746948242, "rewards/accuracies": 0.5625, "rewards/chosen": 0.000983782229013741, "rewards/margins": 0.0028758554253727198, "rewards/rejected": -0.0018920735456049442, "step": 88 }, { "epoch": 0.08, "grad_norm": 22.39156055932735, "learning_rate": 4.6354166666666664e-07, "logps/chosen": -36.2021598815918, "logps/rejected": -36.3503303527832, "loss": 0.6923, "losses/dpo": 0.7027943134307861, "losses/sft": 0.6118872165679932, "losses/total": 0.7027943134307861, "ref_logps/chosen": -36.17906951904297, "ref_logps/rejected": -36.30857849121094, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0023091763723641634, "rewards/margins": 0.0018658516928553581, "rewards/rejected": -0.0041750287637114525, "step": 89 }, { "epoch": 0.08, "grad_norm": 22.869344116182983, "learning_rate": 4.6874999999999996e-07, "logps/chosen": -33.93260192871094, "logps/rejected": -38.55732727050781, "loss": 0.6903, "losses/dpo": 0.6775533556938171, "losses/sft": 1.1477431058883667, "losses/total": 0.6775533556938171, "ref_logps/chosen": -33.938621520996094, "ref_logps/rejected": -38.505531311035156, "rewards/accuracies": 0.625, "rewards/chosen": 0.0006020220462232828, "rewards/margins": 0.005781931336969137, "rewards/rejected": -0.005179909057915211, "step": 90 }, { "epoch": 0.09, "grad_norm": 23.328362618839012, "learning_rate": 4.739583333333333e-07, "logps/chosen": -31.802949905395508, "logps/rejected": -32.727447509765625, "loss": 0.6855, "losses/dpo": 0.690068244934082, "losses/sft": 0.8196256160736084, "losses/total": 0.690068244934082, "ref_logps/chosen": -31.850675582885742, "ref_logps/rejected": -32.619651794433594, "rewards/accuracies": 0.75, "rewards/chosen": 0.004772371146827936, "rewards/margins": 0.015552191063761711, "rewards/rejected": -0.010779820382595062, "step": 91 }, { "epoch": 0.09, "grad_norm": 19.427431230425892, "learning_rate": 4.791666666666667e-07, "logps/chosen": -26.557769775390625, "logps/rejected": -30.052669525146484, "loss": 0.6893, "losses/dpo": 0.695798933506012, "losses/sft": 0.3323949873447418, "losses/total": 0.695798933506012, "ref_logps/chosen": -26.551851272583008, "ref_logps/rejected": -29.968990325927734, "rewards/accuracies": 0.75, "rewards/chosen": -0.0005917757516726851, "rewards/margins": 0.00777572114020586, "rewards/rejected": -0.008367497473955154, "step": 92 }, { "epoch": 0.09, "grad_norm": 27.48585072273388, "learning_rate": 4.84375e-07, "logps/chosen": -43.8309326171875, "logps/rejected": -52.750526428222656, "loss": 0.691, "losses/dpo": 0.716570258140564, "losses/sft": 0.910555899143219, "losses/total": 0.716570258140564, "ref_logps/chosen": -43.77696228027344, "ref_logps/rejected": -52.65165710449219, "rewards/accuracies": 0.625, "rewards/chosen": -0.0053969742730259895, "rewards/margins": 0.0044899312779307365, "rewards/rejected": -0.009886905550956726, "step": 93 }, { "epoch": 0.09, "grad_norm": 18.38930312028714, "learning_rate": 4.895833333333333e-07, "logps/chosen": -23.797378540039062, "logps/rejected": -25.51896095275879, "loss": 0.6927, "losses/dpo": 0.6715975403785706, "losses/sft": 0.9723811745643616, "losses/total": 0.6715975403785706, "ref_logps/chosen": -23.770660400390625, "ref_logps/rejected": -25.48215675354004, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0026718880981206894, "rewards/margins": 0.0010084419045597315, "rewards/rejected": -0.003680330701172352, "step": 94 }, { "epoch": 0.09, "grad_norm": 20.4677372143771, "learning_rate": 4.947916666666667e-07, "logps/chosen": -20.596759796142578, "logps/rejected": -31.419239044189453, "loss": 0.692, "losses/dpo": 0.6924352049827576, "losses/sft": 0.6185705065727234, "losses/total": 0.6924352049827576, "ref_logps/chosen": -20.572059631347656, "ref_logps/rejected": -31.370990753173828, "rewards/accuracies": 0.625, "rewards/chosen": -0.0024699214845895767, "rewards/margins": 0.0023549674078822136, "rewards/rejected": -0.00482488889247179, "step": 95 }, { "epoch": 0.09, "grad_norm": 21.33577703932663, "learning_rate": 5e-07, "logps/chosen": -25.32129669189453, "logps/rejected": -38.093345642089844, "loss": 0.6901, "losses/dpo": 0.6812100410461426, "losses/sft": 0.7792777419090271, "losses/total": 0.6812100410461426, "ref_logps/chosen": -25.36585807800293, "ref_logps/rejected": -38.0745735168457, "rewards/accuracies": 0.4375, "rewards/chosen": 0.004456114489585161, "rewards/margins": 0.006333080120384693, "rewards/rejected": -0.0018769652815535665, "step": 96 }, { "epoch": 0.09, "grad_norm": 22.852423573592052, "learning_rate": 4.999998702877531e-07, "logps/chosen": -29.43635368347168, "logps/rejected": -34.47437286376953, "loss": 0.6889, "losses/dpo": 0.6730969548225403, "losses/sft": 0.582313060760498, "losses/total": 0.6730969548225403, "ref_logps/chosen": -29.483409881591797, "ref_logps/rejected": -34.43589782714844, "rewards/accuracies": 0.6875, "rewards/chosen": 0.004705715458840132, "rewards/margins": 0.008553558960556984, "rewards/rejected": -0.0038478432688862085, "step": 97 }, { "epoch": 0.09, "grad_norm": 23.254374465605643, "learning_rate": 4.999994811511473e-07, "logps/chosen": -34.40141296386719, "logps/rejected": -37.260948181152344, "loss": 0.6903, "losses/dpo": 0.6937819123268127, "losses/sft": 1.0218812227249146, "losses/total": 0.6937819123268127, "ref_logps/chosen": -34.45296096801758, "ref_logps/rejected": -37.25326156616211, "rewards/accuracies": 0.4375, "rewards/chosen": 0.005155056715011597, "rewards/margins": 0.005923757795244455, "rewards/rejected": -0.0007687003817409277, "step": 98 }, { "epoch": 0.09, "grad_norm": 21.750275955055315, "learning_rate": 4.999988325905862e-07, "logps/chosen": -28.49581527709961, "logps/rejected": -41.22924041748047, "loss": 0.6874, "losses/dpo": 0.6906458139419556, "losses/sft": 1.199034571647644, "losses/total": 0.6906458139419556, "ref_logps/chosen": -28.503170013427734, "ref_logps/rejected": -41.118621826171875, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0007355841808021069, "rewards/margins": 0.011797288432717323, "rewards/rejected": -0.011061704717576504, "step": 99 }, { "epoch": 0.09, "grad_norm": 20.671592024368653, "learning_rate": 4.999979246067428e-07, "logps/chosen": -27.191505432128906, "logps/rejected": -31.120943069458008, "loss": 0.6897, "losses/dpo": 0.6873868703842163, "losses/sft": 1.1597222089767456, "losses/total": 0.6873868703842163, "ref_logps/chosen": -27.263648986816406, "ref_logps/rejected": -31.124114990234375, "rewards/accuracies": 0.625, "rewards/chosen": 0.0072141485288739204, "rewards/margins": 0.00689701596274972, "rewards/rejected": 0.0003171325661242008, "step": 100 }, { "epoch": 0.1, "grad_norm": 20.420344606860237, "learning_rate": 4.999967572005595e-07, "logps/chosen": -25.946109771728516, "logps/rejected": -34.69294357299805, "loss": 0.6838, "losses/dpo": 0.6927950978279114, "losses/sft": 0.5914534330368042, "losses/total": 0.6927950978279114, "ref_logps/chosen": -26.01517105102539, "ref_logps/rejected": -34.57215118408203, "rewards/accuracies": 0.75, "rewards/chosen": 0.006906193681061268, "rewards/margins": 0.01898546889424324, "rewards/rejected": -0.012079274281859398, "step": 101 }, { "epoch": 0.1, "grad_norm": 17.229767879205532, "learning_rate": 4.999953303732474e-07, "logps/chosen": -20.483642578125, "logps/rejected": -25.029516220092773, "loss": 0.6871, "losses/dpo": 0.6817685961723328, "losses/sft": 0.5613736510276794, "losses/total": 0.6817685961723328, "ref_logps/chosen": -20.532299041748047, "ref_logps/rejected": -24.95586395263672, "rewards/accuracies": 0.625, "rewards/chosen": 0.004865664057433605, "rewards/margins": 0.0122307650744915, "rewards/rejected": -0.007365101482719183, "step": 102 }, { "epoch": 0.1, "grad_norm": 24.03885592812614, "learning_rate": 4.999936441262874e-07, "logps/chosen": -35.246620178222656, "logps/rejected": -41.73207092285156, "loss": 0.684, "losses/dpo": 0.6913340091705322, "losses/sft": 0.9193404912948608, "losses/total": 0.6913340091705322, "ref_logps/chosen": -35.221920013427734, "ref_logps/rejected": -41.52159881591797, "rewards/accuracies": 0.875, "rewards/chosen": -0.002469867467880249, "rewards/margins": 0.01857711747288704, "rewards/rejected": -0.021046984940767288, "step": 103 }, { "epoch": 0.1, "grad_norm": 20.486731052831566, "learning_rate": 4.999916984614293e-07, "logps/chosen": -25.989418029785156, "logps/rejected": -28.926048278808594, "loss": 0.692, "losses/dpo": 0.6893778443336487, "losses/sft": 0.8910327553749084, "losses/total": 0.6893778443336487, "ref_logps/chosen": -25.98788833618164, "ref_logps/rejected": -28.898807525634766, "rewards/accuracies": 0.5, "rewards/chosen": -0.00015291559975594282, "rewards/margins": 0.0025710647460073233, "rewards/rejected": -0.002723980462178588, "step": 104 }, { "epoch": 0.1, "grad_norm": 22.327266177322493, "learning_rate": 4.99989493380692e-07, "logps/chosen": -32.630523681640625, "logps/rejected": -39.266746520996094, "loss": 0.6872, "losses/dpo": 0.6708762049674988, "losses/sft": 0.5591299533843994, "losses/total": 0.6708762049674988, "ref_logps/chosen": -32.70920944213867, "ref_logps/rejected": -39.22475814819336, "rewards/accuracies": 0.75, "rewards/chosen": 0.007868415676057339, "rewards/margins": 0.012067139148712158, "rewards/rejected": -0.004198723938316107, "step": 105 }, { "epoch": 0.1, "grad_norm": 22.20977164064144, "learning_rate": 4.999870288863637e-07, "logps/chosen": -32.667030334472656, "logps/rejected": -31.87518310546875, "loss": 0.6867, "losses/dpo": 0.6934016942977905, "losses/sft": 0.7733942866325378, "losses/total": 0.6934016942977905, "ref_logps/chosen": -32.714847564697266, "ref_logps/rejected": -31.791175842285156, "rewards/accuracies": 0.6875, "rewards/chosen": 0.004781687166541815, "rewards/margins": 0.013182107359170914, "rewards/rejected": -0.008400418795645237, "step": 106 }, { "epoch": 0.1, "grad_norm": 20.98221428823255, "learning_rate": 4.999843049810019e-07, "logps/chosen": -25.586666107177734, "logps/rejected": -30.228605270385742, "loss": 0.6976, "losses/dpo": 0.6982261538505554, "losses/sft": 0.4232485592365265, "losses/total": 0.6982261538505554, "ref_logps/chosen": -25.56687355041504, "ref_logps/rejected": -30.296142578125, "rewards/accuracies": 0.5, "rewards/chosen": -0.0019792423117905855, "rewards/margins": -0.008733030408620834, "rewards/rejected": 0.0067537883296608925, "step": 107 }, { "epoch": 0.1, "grad_norm": 21.440637090330434, "learning_rate": 4.999813216674331e-07, "logps/chosen": -29.429649353027344, "logps/rejected": -32.80973815917969, "loss": 0.686, "losses/dpo": 0.6549234986305237, "losses/sft": 0.5979283452033997, "losses/total": 0.6549234986305237, "ref_logps/chosen": -29.450054168701172, "ref_logps/rejected": -32.68550109863281, "rewards/accuracies": 0.625, "rewards/chosen": 0.002040332183241844, "rewards/margins": 0.014464367181062698, "rewards/rejected": -0.012424034997820854, "step": 108 }, { "epoch": 0.1, "grad_norm": 20.066974842133, "learning_rate": 4.999780789487531e-07, "logps/chosen": -25.631641387939453, "logps/rejected": -27.972856521606445, "loss": 0.6912, "losses/dpo": 0.6805813908576965, "losses/sft": 0.49039217829704285, "losses/total": 0.6805813908576965, "ref_logps/chosen": -25.595388412475586, "ref_logps/rejected": -27.895366668701172, "rewards/accuracies": 0.5, "rewards/chosen": -0.0036251870915293694, "rewards/margins": 0.0041238488629460335, "rewards/rejected": -0.007749035954475403, "step": 109 }, { "epoch": 0.1, "grad_norm": 22.809776292597125, "learning_rate": 4.999745768283269e-07, "logps/chosen": -26.815513610839844, "logps/rejected": -36.660072326660156, "loss": 0.682, "losses/dpo": 0.693722128868103, "losses/sft": 0.42690446972846985, "losses/total": 0.693722128868103, "ref_logps/chosen": -26.840848922729492, "ref_logps/rejected": -36.45867156982422, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0025335969403386116, "rewards/margins": 0.02267368882894516, "rewards/rejected": -0.020140090957283974, "step": 110 }, { "epoch": 0.1, "grad_norm": 20.489092741167095, "learning_rate": 4.999708153097887e-07, "logps/chosen": -23.561927795410156, "logps/rejected": -31.1812686920166, "loss": 0.6895, "losses/dpo": 0.6863742470741272, "losses/sft": 0.775502622127533, "losses/total": 0.6863742470741272, "ref_logps/chosen": -23.584800720214844, "ref_logps/rejected": -31.12869644165039, "rewards/accuracies": 0.5, "rewards/chosen": 0.0022873487323522568, "rewards/margins": 0.007544323801994324, "rewards/rejected": -0.00525697460398078, "step": 111 }, { "epoch": 0.11, "grad_norm": 23.856604534271533, "learning_rate": 4.999667943970416e-07, "logps/chosen": -32.92667007446289, "logps/rejected": -40.99705505371094, "loss": 0.6922, "losses/dpo": 0.6704139113426208, "losses/sft": 1.4108445644378662, "losses/total": 0.6704139113426208, "ref_logps/chosen": -32.82093811035156, "ref_logps/rejected": -40.86976623535156, "rewards/accuracies": 0.5, "rewards/chosen": -0.010573083534836769, "rewards/margins": 0.0021560993045568466, "rewards/rejected": -0.012729182839393616, "step": 112 }, { "epoch": 0.11, "grad_norm": 20.862040307436533, "learning_rate": 4.999625140942584e-07, "logps/chosen": -25.666879653930664, "logps/rejected": -32.04814147949219, "loss": 0.6902, "losses/dpo": 0.6934311389923096, "losses/sft": 0.18616443872451782, "losses/total": 0.6934311389923096, "ref_logps/chosen": -25.625770568847656, "ref_logps/rejected": -31.945009231567383, "rewards/accuracies": 0.5, "rewards/chosen": -0.004110786132514477, "rewards/margins": 0.006202241871505976, "rewards/rejected": -0.01031302846968174, "step": 113 }, { "epoch": 0.11, "grad_norm": 25.22333492328748, "learning_rate": 4.999579744058804e-07, "logps/chosen": -37.457515716552734, "logps/rejected": -43.811187744140625, "loss": 0.6799, "losses/dpo": 0.6734267473220825, "losses/sft": 0.7400234341621399, "losses/total": 0.6734267473220825, "ref_logps/chosen": -37.603271484375, "ref_logps/rejected": -43.68743133544922, "rewards/accuracies": 0.8125, "rewards/chosen": 0.014575528912246227, "rewards/margins": 0.026950985193252563, "rewards/rejected": -0.012375456281006336, "step": 114 }, { "epoch": 0.11, "grad_norm": 20.561264327257852, "learning_rate": 4.999531753366188e-07, "logps/chosen": -32.993812561035156, "logps/rejected": -32.07012176513672, "loss": 0.6879, "losses/dpo": 0.6837591528892517, "losses/sft": 0.744716227054596, "losses/total": 0.6837591528892517, "ref_logps/chosen": -32.991912841796875, "ref_logps/rejected": -31.959362030029297, "rewards/accuracies": 0.6875, "rewards/chosen": -0.00019017420709133148, "rewards/margins": 0.010885760188102722, "rewards/rejected": -0.011075935326516628, "step": 115 }, { "epoch": 0.11, "grad_norm": 20.870285579914395, "learning_rate": 4.999481168914533e-07, "logps/chosen": -25.634740829467773, "logps/rejected": -30.15637969970703, "loss": 0.6889, "losses/dpo": 0.6722331643104553, "losses/sft": 0.8871920704841614, "losses/total": 0.6722331643104553, "ref_logps/chosen": -25.597373962402344, "ref_logps/rejected": -30.03225326538086, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0037366272881627083, "rewards/margins": 0.008676244877278805, "rewards/rejected": -0.012412873096764088, "step": 116 }, { "epoch": 0.11, "grad_norm": 19.666225236194958, "learning_rate": 4.999427990756332e-07, "logps/chosen": -23.291461944580078, "logps/rejected": -29.592756271362305, "loss": 0.6889, "losses/dpo": 0.6796263456344604, "losses/sft": 1.0201696157455444, "losses/total": 0.6796263456344604, "ref_logps/chosen": -23.338411331176758, "ref_logps/rejected": -29.553611755371094, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004694690927863121, "rewards/margins": 0.008609052747488022, "rewards/rejected": -0.003914361819624901, "step": 117 }, { "epoch": 0.11, "grad_norm": 20.762693309126398, "learning_rate": 4.999372218946766e-07, "logps/chosen": -20.96467399597168, "logps/rejected": -30.98967742919922, "loss": 0.6822, "losses/dpo": 0.696890115737915, "losses/sft": 0.6864515542984009, "losses/total": 0.696890115737915, "ref_logps/chosen": -21.009117126464844, "ref_logps/rejected": -30.811019897460938, "rewards/accuracies": 0.6875, "rewards/chosen": 0.004444369580596685, "rewards/margins": 0.02231009490787983, "rewards/rejected": -0.017865724861621857, "step": 118 }, { "epoch": 0.11, "grad_norm": 24.008873682901935, "learning_rate": 4.999313853543712e-07, "logps/chosen": -33.51298522949219, "logps/rejected": -45.79591369628906, "loss": 0.6858, "losses/dpo": 0.697155237197876, "losses/sft": 0.1524510234594345, "losses/total": 0.697155237197876, "ref_logps/chosen": -33.517311096191406, "ref_logps/rejected": -45.64968490600586, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0004326032940298319, "rewards/margins": 0.015055842697620392, "rewards/rejected": -0.014623240567743778, "step": 119 }, { "epoch": 0.11, "grad_norm": 20.46964098325378, "learning_rate": 4.999252894607734e-07, "logps/chosen": -25.34711456298828, "logps/rejected": -33.37767028808594, "loss": 0.687, "losses/dpo": 0.6679772734642029, "losses/sft": 0.2991921901702881, "losses/total": 0.6679772734642029, "ref_logps/chosen": -25.366085052490234, "ref_logps/rejected": -33.272342681884766, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0018970775417983532, "rewards/margins": 0.012430232018232346, "rewards/rejected": -0.010533155873417854, "step": 120 }, { "epoch": 0.11, "grad_norm": 23.066321098133894, "learning_rate": 4.999189342202089e-07, "logps/chosen": -28.963043212890625, "logps/rejected": -38.825096130371094, "loss": 0.6887, "losses/dpo": 0.6969168782234192, "losses/sft": 1.2233314514160156, "losses/total": 0.6969168782234192, "ref_logps/chosen": -28.961322784423828, "ref_logps/rejected": -38.73084259033203, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0001719773281365633, "rewards/margins": 0.009253591299057007, "rewards/rejected": -0.009425569325685501, "step": 121 }, { "epoch": 0.12, "grad_norm": 21.122212455069725, "learning_rate": 4.999123196392726e-07, "logps/chosen": -28.73985481262207, "logps/rejected": -31.710105895996094, "loss": 0.6795, "losses/dpo": 0.6830872297286987, "losses/sft": 0.3674885630607605, "losses/total": 0.6830872297286987, "ref_logps/chosen": -28.767837524414062, "ref_logps/rejected": -31.461021423339844, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0027983279433101416, "rewards/margins": 0.02770676463842392, "rewards/rejected": -0.024908434599637985, "step": 122 }, { "epoch": 0.12, "grad_norm": 23.765295494836575, "learning_rate": 4.999054457248282e-07, "logps/chosen": -28.91507911682129, "logps/rejected": -41.87816619873047, "loss": 0.6809, "losses/dpo": 0.6787292957305908, "losses/sft": 0.2907832860946655, "losses/total": 0.6787292957305908, "ref_logps/chosen": -28.96450424194336, "ref_logps/rejected": -41.677215576171875, "rewards/accuracies": 0.75, "rewards/chosen": 0.004942661616951227, "rewards/margins": 0.025037318468093872, "rewards/rejected": -0.020094657316803932, "step": 123 }, { "epoch": 0.12, "grad_norm": 22.363089007316916, "learning_rate": 4.998983124840091e-07, "logps/chosen": -34.01361083984375, "logps/rejected": -35.58916091918945, "loss": 0.684, "losses/dpo": 0.685874879360199, "losses/sft": 0.7306395769119263, "losses/total": 0.685874879360199, "ref_logps/chosen": -34.04533767700195, "ref_logps/rejected": -35.43464660644531, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0031724870204925537, "rewards/margins": 0.01862395368516445, "rewards/rejected": -0.015451466664671898, "step": 124 }, { "epoch": 0.12, "grad_norm": 20.365814521193027, "learning_rate": 4.998909199242173e-07, "logps/chosen": -34.247901916503906, "logps/rejected": -31.70351791381836, "loss": 0.6961, "losses/dpo": 0.694584846496582, "losses/sft": 0.9475712180137634, "losses/total": 0.694584846496582, "ref_logps/chosen": -34.13238525390625, "ref_logps/rejected": -31.64427947998047, "rewards/accuracies": 0.5, "rewards/chosen": -0.011551795527338982, "rewards/margins": -0.005627912003546953, "rewards/rejected": -0.005923882592469454, "step": 125 }, { "epoch": 0.12, "grad_norm": 18.48448570242175, "learning_rate": 4.998832680531239e-07, "logps/chosen": -23.587120056152344, "logps/rejected": -25.784130096435547, "loss": 0.6845, "losses/dpo": 0.6584799885749817, "losses/sft": 1.2754439115524292, "losses/total": 0.6584799885749817, "ref_logps/chosen": -23.63869285583496, "ref_logps/rejected": -25.659820556640625, "rewards/accuracies": 0.6875, "rewards/chosen": 0.00515754334628582, "rewards/margins": 0.017588313668966293, "rewards/rejected": -0.012430768460035324, "step": 126 }, { "epoch": 0.12, "grad_norm": 18.256253348176237, "learning_rate": 4.998753568786695e-07, "logps/chosen": -20.16432762145996, "logps/rejected": -29.218341827392578, "loss": 0.6815, "losses/dpo": 0.6689563989639282, "losses/sft": 0.717616617679596, "losses/total": 0.6689563989639282, "ref_logps/chosen": -20.14303970336914, "ref_logps/rejected": -28.95680046081543, "rewards/accuracies": 0.625, "rewards/chosen": -0.0021289722062647343, "rewards/margins": 0.02402513101696968, "rewards/rejected": -0.026154104620218277, "step": 127 }, { "epoch": 0.12, "grad_norm": 22.770416703563395, "learning_rate": 4.998671864090632e-07, "logps/chosen": -39.74087905883789, "logps/rejected": -36.345985412597656, "loss": 0.6822, "losses/dpo": 0.6851697564125061, "losses/sft": 0.9269623160362244, "losses/total": 0.6851697564125061, "ref_logps/chosen": -39.86006546020508, "ref_logps/rejected": -36.24360275268555, "rewards/accuracies": 0.8125, "rewards/chosen": 0.011918460950255394, "rewards/margins": 0.022156817838549614, "rewards/rejected": -0.01023835502564907, "step": 128 }, { "epoch": 0.12, "grad_norm": 21.61177100932311, "learning_rate": 4.998587566527837e-07, "logps/chosen": -23.498046875, "logps/rejected": -29.816272735595703, "loss": 0.6913, "losses/dpo": 0.6664048433303833, "losses/sft": 0.5861063003540039, "losses/total": 0.6664048433303833, "ref_logps/chosen": -23.44232940673828, "ref_logps/rejected": -29.71890640258789, "rewards/accuracies": 0.5, "rewards/chosen": -0.005571586079895496, "rewards/margins": 0.004165046848356724, "rewards/rejected": -0.00973663292825222, "step": 129 }, { "epoch": 0.12, "grad_norm": 25.17144934593941, "learning_rate": 4.998500676185785e-07, "logps/chosen": -31.092041015625, "logps/rejected": -44.05819320678711, "loss": 0.6844, "losses/dpo": 0.6817198991775513, "losses/sft": 0.3368386924266815, "losses/total": 0.6817198991775513, "ref_logps/chosen": -30.94867706298828, "ref_logps/rejected": -43.73463439941406, "rewards/accuracies": 0.625, "rewards/chosen": -0.01433630846440792, "rewards/margins": 0.018019583076238632, "rewards/rejected": -0.032355889678001404, "step": 130 }, { "epoch": 0.12, "grad_norm": 20.345831184559426, "learning_rate": 4.998411193154641e-07, "logps/chosen": -28.297542572021484, "logps/rejected": -33.67168426513672, "loss": 0.6863, "losses/dpo": 0.7248511910438538, "losses/sft": 1.2524502277374268, "losses/total": 0.7248511910438538, "ref_logps/chosen": -28.367219924926758, "ref_logps/rejected": -33.60215377807617, "rewards/accuracies": 0.625, "rewards/chosen": 0.006967585068196058, "rewards/margins": 0.013920925557613373, "rewards/rejected": -0.006953340955078602, "step": 131 }, { "epoch": 0.12, "grad_norm": 21.331387267647216, "learning_rate": 4.998319117527263e-07, "logps/chosen": -21.164527893066406, "logps/rejected": -37.49475860595703, "loss": 0.6863, "losses/dpo": 0.689494252204895, "losses/sft": 1.1927286386489868, "losses/total": 0.689494252204895, "ref_logps/chosen": -21.25104522705078, "ref_logps/rejected": -37.4396858215332, "rewards/accuracies": 0.5625, "rewards/chosen": 0.008651982992887497, "rewards/margins": 0.014159216545522213, "rewards/rejected": -0.005507233552634716, "step": 132 }, { "epoch": 0.13, "grad_norm": 23.34650279760413, "learning_rate": 4.998224449399197e-07, "logps/chosen": -24.757186889648438, "logps/rejected": -36.14613342285156, "loss": 0.6806, "losses/dpo": 0.6288632154464722, "losses/sft": 0.1612386256456375, "losses/total": 0.6288632154464722, "ref_logps/chosen": -24.705570220947266, "ref_logps/rejected": -35.8382568359375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0051619200967252254, "rewards/margins": 0.025625787675380707, "rewards/rejected": -0.03078770823776722, "step": 133 }, { "epoch": 0.13, "grad_norm": 22.299976036871644, "learning_rate": 4.998127188868679e-07, "logps/chosen": -34.75825119018555, "logps/rejected": -41.32115936279297, "loss": 0.6808, "losses/dpo": 0.6628859043121338, "losses/sft": 1.1234036684036255, "losses/total": 0.6628859043121338, "ref_logps/chosen": -34.79224395751953, "ref_logps/rejected": -41.097599029541016, "rewards/accuracies": 0.625, "rewards/chosen": 0.003399414010345936, "rewards/margins": 0.025755366310477257, "rewards/rejected": -0.022355949506163597, "step": 134 }, { "epoch": 0.13, "grad_norm": 21.900712524376058, "learning_rate": 4.998027336036638e-07, "logps/chosen": -28.54541778564453, "logps/rejected": -34.65163803100586, "loss": 0.6766, "losses/dpo": 0.6629456281661987, "losses/sft": 0.3073120415210724, "losses/total": 0.6629456281661987, "ref_logps/chosen": -28.74681282043457, "ref_logps/rejected": -34.51539611816406, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02013937383890152, "rewards/margins": 0.03376343473792076, "rewards/rejected": -0.013624059967696667, "step": 135 }, { "epoch": 0.13, "grad_norm": 24.585091206031155, "learning_rate": 4.997924891006688e-07, "logps/chosen": -41.613975524902344, "logps/rejected": -32.555782318115234, "loss": 0.6894, "losses/dpo": 0.7081424593925476, "losses/sft": 0.812528669834137, "losses/total": 0.7081424593925476, "ref_logps/chosen": -41.60591125488281, "ref_logps/rejected": -32.46968078613281, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0008061749394983053, "rewards/margins": 0.00780405942350626, "rewards/rejected": -0.008610233664512634, "step": 136 }, { "epoch": 0.13, "grad_norm": 19.27789037058589, "learning_rate": 4.997819853885139e-07, "logps/chosen": -22.993146896362305, "logps/rejected": -24.609840393066406, "loss": 0.6943, "losses/dpo": 0.682862401008606, "losses/sft": 0.39416855573654175, "losses/total": 0.682862401008606, "ref_logps/chosen": -22.835372924804688, "ref_logps/rejected": -24.46989631652832, "rewards/accuracies": 0.5, "rewards/chosen": -0.015777431428432465, "rewards/margins": -0.001783137209713459, "rewards/rejected": -0.013994294218719006, "step": 137 }, { "epoch": 0.13, "grad_norm": 24.173839190145724, "learning_rate": 4.997712224780987e-07, "logps/chosen": -27.96249008178711, "logps/rejected": -39.21653747558594, "loss": 0.6953, "losses/dpo": 0.6771550178527832, "losses/sft": 0.39638999104499817, "losses/total": 0.6771550178527832, "ref_logps/chosen": -27.974885940551758, "ref_logps/rejected": -39.26532745361328, "rewards/accuracies": 0.5625, "rewards/chosen": 0.001239472534507513, "rewards/margins": -0.0036397725343704224, "rewards/rejected": 0.004879244603216648, "step": 138 }, { "epoch": 0.13, "grad_norm": 24.61838662787576, "learning_rate": 4.997602003805917e-07, "logps/chosen": -31.940263748168945, "logps/rejected": -42.09412384033203, "loss": 0.687, "losses/dpo": 0.7090741395950317, "losses/sft": 0.5408652424812317, "losses/total": 0.7090741395950317, "ref_logps/chosen": -31.955120086669922, "ref_logps/rejected": -41.98014450073242, "rewards/accuracies": 0.625, "rewards/chosen": 0.0014856541529297829, "rewards/margins": 0.012883678078651428, "rewards/rejected": -0.01139802299439907, "step": 139 }, { "epoch": 0.13, "grad_norm": 18.920680216723447, "learning_rate": 4.997489191074307e-07, "logps/chosen": -20.299442291259766, "logps/rejected": -26.662384033203125, "loss": 0.6809, "losses/dpo": 0.6960943937301636, "losses/sft": 0.8790920376777649, "losses/total": 0.6960943937301636, "ref_logps/chosen": -20.31946563720703, "ref_logps/rejected": -26.431804656982422, "rewards/accuracies": 0.75, "rewards/chosen": 0.0020022972021251917, "rewards/margins": 0.025060197338461876, "rewards/rejected": -0.023057900369167328, "step": 140 }, { "epoch": 0.13, "grad_norm": 19.760380575190002, "learning_rate": 4.997373786703222e-07, "logps/chosen": -24.979206085205078, "logps/rejected": -23.630733489990234, "loss": 0.689, "losses/dpo": 0.6786045432090759, "losses/sft": 0.36919867992401123, "losses/total": 0.6786045432090759, "ref_logps/chosen": -25.033031463623047, "ref_logps/rejected": -23.599830627441406, "rewards/accuracies": 0.625, "rewards/chosen": 0.0053822691552340984, "rewards/margins": 0.008472495712339878, "rewards/rejected": -0.0030902267899364233, "step": 141 }, { "epoch": 0.13, "grad_norm": 21.305943548855776, "learning_rate": 4.997255790812417e-07, "logps/chosen": -27.817081451416016, "logps/rejected": -34.71155548095703, "loss": 0.6838, "losses/dpo": 0.7176374793052673, "losses/sft": 1.3707911968231201, "losses/total": 0.7176374793052673, "ref_logps/chosen": -27.762062072753906, "ref_logps/rejected": -34.466102600097656, "rewards/accuracies": 0.8125, "rewards/chosen": -0.005501880310475826, "rewards/margins": 0.01904306747019291, "rewards/rejected": -0.02454494684934616, "step": 142 }, { "epoch": 0.13, "grad_norm": 21.787443006281723, "learning_rate": 4.997135203524336e-07, "logps/chosen": -32.77314376831055, "logps/rejected": -39.74238586425781, "loss": 0.6729, "losses/dpo": 0.6702504754066467, "losses/sft": 0.5548468232154846, "losses/total": 0.6702504754066467, "ref_logps/chosen": -32.85594940185547, "ref_logps/rejected": -39.409461975097656, "rewards/accuracies": 0.8125, "rewards/chosen": 0.008280606009066105, "rewards/margins": 0.04157261550426483, "rewards/rejected": -0.033292006701231, "step": 143 }, { "epoch": 0.14, "grad_norm": 21.56519382758421, "learning_rate": 4.99701202496411e-07, "logps/chosen": -24.503585815429688, "logps/rejected": -38.217857360839844, "loss": 0.6843, "losses/dpo": 0.6581798791885376, "losses/sft": 1.4265743494033813, "losses/total": 0.6581798791885376, "ref_logps/chosen": -24.550642013549805, "ref_logps/rejected": -38.08131408691406, "rewards/accuracies": 0.6875, "rewards/chosen": 0.004705802537500858, "rewards/margins": 0.01836022175848484, "rewards/rejected": -0.013654418289661407, "step": 144 }, { "epoch": 0.14, "grad_norm": 21.353957463943537, "learning_rate": 4.996886255259565e-07, "logps/chosen": -25.863052368164062, "logps/rejected": -27.350875854492188, "loss": 0.694, "losses/dpo": 0.6572393178939819, "losses/sft": 1.3114207983016968, "losses/total": 0.6572393178939819, "ref_logps/chosen": -25.715190887451172, "ref_logps/rejected": -27.21734619140625, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01478629931807518, "rewards/margins": -0.0014332006685435772, "rewards/rejected": -0.013353098183870316, "step": 145 }, { "epoch": 0.14, "grad_norm": 20.73422194252406, "learning_rate": 4.99675789454121e-07, "logps/chosen": -27.771286010742188, "logps/rejected": -37.215049743652344, "loss": 0.6713, "losses/dpo": 0.720003068447113, "losses/sft": 1.1716922521591187, "losses/total": 0.720003068447113, "ref_logps/chosen": -27.913175582885742, "ref_logps/rejected": -36.90971374511719, "rewards/accuracies": 0.9375, "rewards/chosen": 0.014188999310135841, "rewards/margins": 0.04472271725535393, "rewards/rejected": -0.030533719807863235, "step": 146 }, { "epoch": 0.14, "grad_norm": 20.665259736524135, "learning_rate": 4.996626942942244e-07, "logps/chosen": -30.750080108642578, "logps/rejected": -35.30836868286133, "loss": 0.6814, "losses/dpo": 0.6620415449142456, "losses/sft": 0.3231738805770874, "losses/total": 0.6620415449142456, "ref_logps/chosen": -30.918067932128906, "ref_logps/rejected": -35.23432922363281, "rewards/accuracies": 0.6875, "rewards/chosen": 0.016799012199044228, "rewards/margins": 0.024202410131692886, "rewards/rejected": -0.007403397932648659, "step": 147 }, { "epoch": 0.14, "grad_norm": 20.298208160348057, "learning_rate": 4.996493400598555e-07, "logps/chosen": -27.16568374633789, "logps/rejected": -21.871212005615234, "loss": 0.682, "losses/dpo": 0.6925111413002014, "losses/sft": 0.9111620783805847, "losses/total": 0.6925111413002014, "ref_logps/chosen": -27.174259185791016, "ref_logps/rejected": -21.650930404663086, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0008573969826102257, "rewards/margins": 0.022885404527187347, "rewards/rejected": -0.022028004750609398, "step": 148 }, { "epoch": 0.14, "grad_norm": 23.40129090116223, "learning_rate": 4.996357267648722e-07, "logps/chosen": -27.476791381835938, "logps/rejected": -48.022560119628906, "loss": 0.6755, "losses/dpo": 0.6875530481338501, "losses/sft": 1.1465109586715698, "losses/total": 0.6875530481338501, "ref_logps/chosen": -27.454944610595703, "ref_logps/rejected": -47.63130569458008, "rewards/accuracies": 0.8125, "rewards/chosen": -0.002184668555855751, "rewards/margins": 0.03694087266921997, "rewards/rejected": -0.03912554308772087, "step": 149 }, { "epoch": 0.14, "grad_norm": 24.93266853678787, "learning_rate": 4.996218544234008e-07, "logps/chosen": -32.52482986450195, "logps/rejected": -43.240379333496094, "loss": 0.6912, "losses/dpo": 0.6739779710769653, "losses/sft": 1.042344093322754, "losses/total": 0.6739779710769653, "ref_logps/chosen": -32.40483856201172, "ref_logps/rejected": -43.07245635986328, "rewards/accuracies": 0.5625, "rewards/chosen": -0.011999325826764107, "rewards/margins": 0.004793066531419754, "rewards/rejected": -0.01679239422082901, "step": 150 }, { "epoch": 0.14, "grad_norm": 22.248238865313596, "learning_rate": 4.996077230498367e-07, "logps/chosen": -33.20498275756836, "logps/rejected": -38.19388198852539, "loss": 0.6831, "losses/dpo": 0.7018842697143555, "losses/sft": 1.369569182395935, "losses/total": 0.7018842697143555, "ref_logps/chosen": -33.27879333496094, "ref_logps/rejected": -38.06022644042969, "rewards/accuracies": 0.625, "rewards/chosen": 0.007381319534033537, "rewards/margins": 0.02074676752090454, "rewards/rejected": -0.013365447521209717, "step": 151 }, { "epoch": 0.14, "grad_norm": 17.779130101091518, "learning_rate": 4.995933326588438e-07, "logps/chosen": -20.130386352539062, "logps/rejected": -27.628307342529297, "loss": 0.6941, "losses/dpo": 0.7354535460472107, "losses/sft": 1.440975308418274, "losses/total": 0.7354535460472107, "ref_logps/chosen": -20.008010864257812, "ref_logps/rejected": -27.511932373046875, "rewards/accuracies": 0.625, "rewards/chosen": -0.012237709946930408, "rewards/margins": -0.0006000369321554899, "rewards/rejected": -0.011637670919299126, "step": 152 }, { "epoch": 0.14, "grad_norm": 22.464694954224953, "learning_rate": 4.995786832653553e-07, "logps/chosen": -30.358869552612305, "logps/rejected": -31.338321685791016, "loss": 0.6858, "losses/dpo": 0.6548117995262146, "losses/sft": 0.9112861752510071, "losses/total": 0.6548117995262146, "ref_logps/chosen": -30.317829132080078, "ref_logps/rejected": -31.1434383392334, "rewards/accuracies": 0.5625, "rewards/chosen": -0.004104122053831816, "rewards/margins": 0.01538429968059063, "rewards/rejected": -0.019488422200083733, "step": 153 }, { "epoch": 0.15, "grad_norm": 21.371278262795087, "learning_rate": 4.995637748845725e-07, "logps/chosen": -27.422985076904297, "logps/rejected": -30.07447052001953, "loss": 0.6825, "losses/dpo": 0.6819406151771545, "losses/sft": 0.4666486978530884, "losses/total": 0.6819406151771545, "ref_logps/chosen": -27.480810165405273, "ref_logps/rejected": -29.914627075195312, "rewards/accuracies": 0.75, "rewards/chosen": 0.005782646127045155, "rewards/margins": 0.02176680602133274, "rewards/rejected": -0.01598415896296501, "step": 154 }, { "epoch": 0.15, "grad_norm": 22.541150528670038, "learning_rate": 4.995486075319661e-07, "logps/chosen": -26.510425567626953, "logps/rejected": -32.58055877685547, "loss": 0.6987, "losses/dpo": 0.724975049495697, "losses/sft": 1.1669894456863403, "losses/total": 0.724975049495697, "ref_logps/chosen": -26.365386962890625, "ref_logps/rejected": -32.54110336303711, "rewards/accuracies": 0.5, "rewards/chosen": -0.01450380403548479, "rewards/margins": -0.01055806316435337, "rewards/rejected": -0.00394574087113142, "step": 155 }, { "epoch": 0.15, "grad_norm": 20.34967004064837, "learning_rate": 4.995331812232752e-07, "logps/chosen": -21.957096099853516, "logps/rejected": -29.475318908691406, "loss": 0.6821, "losses/dpo": 0.6922837495803833, "losses/sft": 1.071643352508545, "losses/total": 0.6922837495803833, "ref_logps/chosen": -22.02566146850586, "ref_logps/rejected": -29.316980361938477, "rewards/accuracies": 0.875, "rewards/chosen": 0.006856388412415981, "rewards/margins": 0.02269042283296585, "rewards/rejected": -0.015834033489227295, "step": 156 }, { "epoch": 0.15, "grad_norm": 22.511120343942785, "learning_rate": 4.995174959745074e-07, "logps/chosen": -27.679122924804688, "logps/rejected": -38.31925582885742, "loss": 0.6833, "losses/dpo": 0.6806364059448242, "losses/sft": 0.3416358232498169, "losses/total": 0.6806364059448242, "ref_logps/chosen": -27.587318420410156, "ref_logps/rejected": -38.02167510986328, "rewards/accuracies": 0.6875, "rewards/chosen": -0.009180471301078796, "rewards/margins": 0.020577585324645042, "rewards/rejected": -0.029758058488368988, "step": 157 }, { "epoch": 0.15, "grad_norm": 19.11998010364547, "learning_rate": 4.995015518019393e-07, "logps/chosen": -28.526649475097656, "logps/rejected": -27.332332611083984, "loss": 0.6842, "losses/dpo": 0.702682614326477, "losses/sft": 1.5870875120162964, "losses/total": 0.702682614326477, "ref_logps/chosen": -28.532180786132812, "ref_logps/rejected": -27.152135848999023, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0005532951327040792, "rewards/margins": 0.018573040142655373, "rewards/rejected": -0.018019743263721466, "step": 158 }, { "epoch": 0.15, "grad_norm": 19.526824367963396, "learning_rate": 4.994853487221164e-07, "logps/chosen": -25.036169052124023, "logps/rejected": -26.29005241394043, "loss": 0.6856, "losses/dpo": 0.7191353440284729, "losses/sft": 1.220201849937439, "losses/total": 0.7191353440284729, "ref_logps/chosen": -25.028663635253906, "ref_logps/rejected": -26.124507904052734, "rewards/accuracies": 0.625, "rewards/chosen": -0.0007506522815674543, "rewards/margins": 0.0158039852976799, "rewards/rejected": -0.016554638743400574, "step": 159 }, { "epoch": 0.15, "grad_norm": 23.112796454663258, "learning_rate": 4.994688867518523e-07, "logps/chosen": -41.75770568847656, "logps/rejected": -37.6129150390625, "loss": 0.6804, "losses/dpo": 0.67547607421875, "losses/sft": 1.6840301752090454, "losses/total": 0.67547607421875, "ref_logps/chosen": -41.59837341308594, "ref_logps/rejected": -37.17631530761719, "rewards/accuracies": 0.625, "rewards/chosen": -0.01593298837542534, "rewards/margins": 0.027727220207452774, "rewards/rejected": -0.04366020858287811, "step": 160 }, { "epoch": 0.15, "grad_norm": 22.072156964718378, "learning_rate": 4.994521659082298e-07, "logps/chosen": -34.35651779174805, "logps/rejected": -34.497169494628906, "loss": 0.6744, "losses/dpo": 0.7191218137741089, "losses/sft": 0.8763597011566162, "losses/total": 0.7191218137741089, "ref_logps/chosen": -34.52425003051758, "ref_logps/rejected": -34.28130340576172, "rewards/accuracies": 0.8125, "rewards/chosen": 0.01677338406443596, "rewards/margins": 0.03835989907383919, "rewards/rejected": -0.02158651500940323, "step": 161 }, { "epoch": 0.15, "grad_norm": 22.906312420783685, "learning_rate": 4.994351862086e-07, "logps/chosen": -35.69791030883789, "logps/rejected": -37.633113861083984, "loss": 0.6824, "losses/dpo": 0.6612864136695862, "losses/sft": 1.3902051448822021, "losses/total": 0.6612864136695862, "ref_logps/chosen": -35.69746398925781, "ref_logps/rejected": -37.408348083496094, "rewards/accuracies": 0.625, "rewards/chosen": -4.5105814933776855e-05, "rewards/margins": 0.022431466728448868, "rewards/rejected": -0.022476572543382645, "step": 162 }, { "epoch": 0.15, "grad_norm": 18.79848616336298, "learning_rate": 4.994179476705826e-07, "logps/chosen": -24.415294647216797, "logps/rejected": -29.234479904174805, "loss": 0.6811, "losses/dpo": 0.7026026248931885, "losses/sft": 0.8848031163215637, "losses/total": 0.7026026248931885, "ref_logps/chosen": -24.448223114013672, "ref_logps/rejected": -29.017642974853516, "rewards/accuracies": 0.625, "rewards/chosen": 0.0032930118031799793, "rewards/margins": 0.024976937100291252, "rewards/rejected": -0.02168392762541771, "step": 163 }, { "epoch": 0.15, "grad_norm": 21.34666454827528, "learning_rate": 4.99400450312066e-07, "logps/chosen": -27.265140533447266, "logps/rejected": -33.41569137573242, "loss": 0.685, "losses/dpo": 0.6830543875694275, "losses/sft": 1.309188961982727, "losses/total": 0.6830543875694275, "ref_logps/chosen": -27.295724868774414, "ref_logps/rejected": -33.27271270751953, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0030583133921027184, "rewards/margins": 0.017356114462018013, "rewards/rejected": -0.01429780200123787, "step": 164 }, { "epoch": 0.16, "grad_norm": 20.808834625877235, "learning_rate": 4.993826941512073e-07, "logps/chosen": -21.991701126098633, "logps/rejected": -34.47837829589844, "loss": 0.6719, "losses/dpo": 0.6361923813819885, "losses/sft": 0.45504996180534363, "losses/total": 0.6361923813819885, "ref_logps/chosen": -22.019254684448242, "ref_logps/rejected": -34.0550651550293, "rewards/accuracies": 0.75, "rewards/chosen": 0.0027553141117095947, "rewards/margins": 0.045086611062288284, "rewards/rejected": -0.04233130067586899, "step": 165 }, { "epoch": 0.16, "grad_norm": 21.32868570023296, "learning_rate": 4.99364679206432e-07, "logps/chosen": -24.12898063659668, "logps/rejected": -33.96471405029297, "loss": 0.6799, "losses/dpo": 0.6186413764953613, "losses/sft": 1.0190367698669434, "losses/total": 0.6186413764953613, "ref_logps/chosen": -24.035282135009766, "ref_logps/rejected": -33.595890045166016, "rewards/accuracies": 0.625, "rewards/chosen": -0.009369767270982265, "rewards/margins": 0.027512384578585625, "rewards/rejected": -0.036882150918245316, "step": 166 }, { "epoch": 0.16, "grad_norm": 18.987764357112876, "learning_rate": 4.993464054964342e-07, "logps/chosen": -23.318145751953125, "logps/rejected": -32.37856674194336, "loss": 0.6794, "losses/dpo": 0.6724587678909302, "losses/sft": 1.268723726272583, "losses/total": 0.6724587678909302, "ref_logps/chosen": -23.258689880371094, "ref_logps/rejected": -32.03080749511719, "rewards/accuracies": 0.5, "rewards/chosen": -0.0059456853196024895, "rewards/margins": 0.028830615803599358, "rewards/rejected": -0.03477630019187927, "step": 167 }, { "epoch": 0.16, "grad_norm": 22.631664167222134, "learning_rate": 4.993278730401763e-07, "logps/chosen": -30.53357696533203, "logps/rejected": -39.00453567504883, "loss": 0.6849, "losses/dpo": 0.6853042244911194, "losses/sft": 1.3687257766723633, "losses/total": 0.6853042244911194, "ref_logps/chosen": -30.610408782958984, "ref_logps/rejected": -38.904930114746094, "rewards/accuracies": 0.625, "rewards/chosen": 0.007683169562369585, "rewards/margins": 0.01764351688325405, "rewards/rejected": -0.009960346855223179, "step": 168 }, { "epoch": 0.16, "grad_norm": 23.851987577963015, "learning_rate": 4.993090818568897e-07, "logps/chosen": -32.347755432128906, "logps/rejected": -47.2177848815918, "loss": 0.6787, "losses/dpo": 0.6515413522720337, "losses/sft": 0.8513352870941162, "losses/total": 0.6515413522720337, "ref_logps/chosen": -32.291282653808594, "ref_logps/rejected": -46.8640251159668, "rewards/accuracies": 0.75, "rewards/chosen": -0.005647024139761925, "rewards/margins": 0.029728669673204422, "rewards/rejected": -0.035375699400901794, "step": 169 }, { "epoch": 0.16, "grad_norm": 21.86678267971038, "learning_rate": 4.992900319660737e-07, "logps/chosen": -34.787940979003906, "logps/rejected": -32.97093963623047, "loss": 0.6802, "losses/dpo": 0.7152419090270996, "losses/sft": 0.6269170641899109, "losses/total": 0.7152419090270996, "ref_logps/chosen": -34.641815185546875, "ref_logps/rejected": -32.552825927734375, "rewards/accuracies": 0.5, "rewards/chosen": -0.014612860977649689, "rewards/margins": 0.02719813585281372, "rewards/rejected": -0.04181099683046341, "step": 170 }, { "epoch": 0.16, "grad_norm": 20.024947849683276, "learning_rate": 4.992707233874964e-07, "logps/chosen": -26.98607635498047, "logps/rejected": -29.02410888671875, "loss": 0.6777, "losses/dpo": 0.6740058660507202, "losses/sft": 1.0474852323532104, "losses/total": 0.6740058660507202, "ref_logps/chosen": -27.096900939941406, "ref_logps/rejected": -28.820735931396484, "rewards/accuracies": 0.8125, "rewards/chosen": 0.011082596145570278, "rewards/margins": 0.031420059502124786, "rewards/rejected": -0.020337462425231934, "step": 171 }, { "epoch": 0.16, "grad_norm": 22.24791586034801, "learning_rate": 4.992511561411944e-07, "logps/chosen": -35.139530181884766, "logps/rejected": -32.255741119384766, "loss": 0.6862, "losses/dpo": 0.6659494638442993, "losses/sft": 0.5083767175674438, "losses/total": 0.6659494638442993, "ref_logps/chosen": -35.05296325683594, "ref_logps/rejected": -32.01114273071289, "rewards/accuracies": 0.5, "rewards/chosen": -0.008656510151922703, "rewards/margins": 0.01580357924103737, "rewards/rejected": -0.024460088461637497, "step": 172 }, { "epoch": 0.16, "grad_norm": 19.017727351332915, "learning_rate": 4.992313302474726e-07, "logps/chosen": -22.979782104492188, "logps/rejected": -31.904804229736328, "loss": 0.6743, "losses/dpo": 0.6713506579399109, "losses/sft": 0.18452970683574677, "losses/total": 0.6713506579399109, "ref_logps/chosen": -23.039989471435547, "ref_logps/rejected": -31.575439453125, "rewards/accuracies": 0.75, "rewards/chosen": 0.006020767614245415, "rewards/margins": 0.03895729407668114, "rewards/rejected": -0.03293652832508087, "step": 173 }, { "epoch": 0.16, "grad_norm": 20.535634801207994, "learning_rate": 4.99211245726904e-07, "logps/chosen": -19.388904571533203, "logps/rejected": -38.83685302734375, "loss": 0.657, "losses/dpo": 0.6551394462585449, "losses/sft": 0.9087169766426086, "losses/total": 0.6551394462585449, "ref_logps/chosen": -19.495561599731445, "ref_logps/rejected": -38.19712448120117, "rewards/accuracies": 0.9375, "rewards/chosen": 0.010665598325431347, "rewards/margins": 0.07463853806257248, "rewards/rejected": -0.06397293508052826, "step": 174 }, { "epoch": 0.17, "grad_norm": 19.945100466971585, "learning_rate": 4.991909026003305e-07, "logps/chosen": -24.1237735748291, "logps/rejected": -32.704498291015625, "loss": 0.6793, "losses/dpo": 0.6722328662872314, "losses/sft": 0.5968920588493347, "losses/total": 0.6722328662872314, "ref_logps/chosen": -24.17896842956543, "ref_logps/rejected": -32.471893310546875, "rewards/accuracies": 0.625, "rewards/chosen": 0.005519455764442682, "rewards/margins": 0.028780337423086166, "rewards/rejected": -0.02326088398694992, "step": 175 }, { "epoch": 0.17, "grad_norm": 21.563039877000207, "learning_rate": 4.991703008888622e-07, "logps/chosen": -29.655975341796875, "logps/rejected": -36.24789810180664, "loss": 0.6702, "losses/dpo": 0.6832626461982727, "losses/sft": 0.5809920430183411, "losses/total": 0.6832626461982727, "ref_logps/chosen": -29.775802612304688, "ref_logps/rejected": -35.89815902709961, "rewards/accuracies": 0.875, "rewards/chosen": 0.011982385069131851, "rewards/margins": 0.04695636034011841, "rewards/rejected": -0.03497397527098656, "step": 176 }, { "epoch": 0.17, "grad_norm": 21.77822959665617, "learning_rate": 4.991494406138772e-07, "logps/chosen": -32.10219192504883, "logps/rejected": -40.830814361572266, "loss": 0.6619, "losses/dpo": 0.6938949823379517, "losses/sft": 1.2952247858047485, "losses/total": 0.6938949823379517, "ref_logps/chosen": -32.22834396362305, "ref_logps/rejected": -40.31315994262695, "rewards/accuracies": 0.875, "rewards/chosen": 0.01261545903980732, "rewards/margins": 0.06438097357749939, "rewards/rejected": -0.05176551640033722, "step": 177 }, { "epoch": 0.17, "grad_norm": 21.081852292993517, "learning_rate": 4.991283217970223e-07, "logps/chosen": -21.930885314941406, "logps/rejected": -38.04200744628906, "loss": 0.6809, "losses/dpo": 0.6799581050872803, "losses/sft": 0.7161468267440796, "losses/total": 0.6799581050872803, "ref_logps/chosen": -21.839237213134766, "ref_logps/rejected": -37.69557189941406, "rewards/accuracies": 0.75, "rewards/chosen": -0.009164568036794662, "rewards/margins": 0.025479262694716454, "rewards/rejected": -0.03464382886886597, "step": 178 }, { "epoch": 0.17, "grad_norm": 22.20299369215953, "learning_rate": 4.991069444602125e-07, "logps/chosen": -25.53278350830078, "logps/rejected": -28.059001922607422, "loss": 0.7058, "losses/dpo": 0.7114830613136292, "losses/sft": 1.0692613124847412, "losses/total": 0.7114830613136292, "ref_logps/chosen": -25.262819290161133, "ref_logps/rejected": -28.02458953857422, "rewards/accuracies": 0.375, "rewards/chosen": -0.026996592059731483, "rewards/margins": -0.023555336520075798, "rewards/rejected": -0.00344125647097826, "step": 179 }, { "epoch": 0.17, "grad_norm": 22.33550594359507, "learning_rate": 4.990853086256309e-07, "logps/chosen": -30.468542098999023, "logps/rejected": -42.26133728027344, "loss": 0.6654, "losses/dpo": 0.6221312880516052, "losses/sft": 1.356953740119934, "losses/total": 0.6221312880516052, "ref_logps/chosen": -30.43911361694336, "ref_logps/rejected": -41.65385437011719, "rewards/accuracies": 0.6875, "rewards/chosen": -0.002942759543657303, "rewards/margins": 0.05780564248561859, "rewards/rejected": -0.060748398303985596, "step": 180 }, { "epoch": 0.17, "grad_norm": 23.4635934142345, "learning_rate": 4.990634143157292e-07, "logps/chosen": -27.643136978149414, "logps/rejected": -43.44340133666992, "loss": 0.6813, "losses/dpo": 0.6595314145088196, "losses/sft": 1.1128854751586914, "losses/total": 0.6595314145088196, "ref_logps/chosen": -27.556968688964844, "ref_logps/rejected": -43.10759353637695, "rewards/accuracies": 0.6875, "rewards/chosen": -0.008616894483566284, "rewards/margins": 0.02496396191418171, "rewards/rejected": -0.03358085826039314, "step": 181 }, { "epoch": 0.17, "grad_norm": 18.038612100826953, "learning_rate": 4.990412615532268e-07, "logps/chosen": -22.567588806152344, "logps/rejected": -30.07050323486328, "loss": 0.6733, "losses/dpo": 0.6629981994628906, "losses/sft": 0.9351394176483154, "losses/total": 0.6629981994628906, "ref_logps/chosen": -22.52756118774414, "ref_logps/rejected": -29.61446762084961, "rewards/accuracies": 0.625, "rewards/chosen": -0.00400280486792326, "rewards/margins": 0.0416005402803421, "rewards/rejected": -0.045603346079587936, "step": 182 }, { "epoch": 0.17, "grad_norm": 21.792350359161162, "learning_rate": 4.990188503611117e-07, "logps/chosen": -30.307695388793945, "logps/rejected": -41.88825607299805, "loss": 0.6754, "losses/dpo": 0.6916003823280334, "losses/sft": 1.351412057876587, "losses/total": 0.6916003823280334, "ref_logps/chosen": -30.30023193359375, "ref_logps/rejected": -41.50617218017578, "rewards/accuracies": 0.625, "rewards/chosen": -0.0007461607456207275, "rewards/margins": 0.037462059408426285, "rewards/rejected": -0.03820822015404701, "step": 183 }, { "epoch": 0.17, "grad_norm": 23.138816992444706, "learning_rate": 4.989961807626399e-07, "logps/chosen": -30.143207550048828, "logps/rejected": -35.53678894042969, "loss": 0.6742, "losses/dpo": 0.7134324312210083, "losses/sft": 1.0802582502365112, "losses/total": 0.7134324312210083, "ref_logps/chosen": -30.19788932800293, "ref_logps/rejected": -35.19401550292969, "rewards/accuracies": 0.6875, "rewards/chosen": 0.005468010902404785, "rewards/margins": 0.03974561393260956, "rewards/rejected": -0.03427760303020477, "step": 184 }, { "epoch": 0.17, "grad_norm": 26.14504412342929, "learning_rate": 4.989732527813358e-07, "logps/chosen": -27.548690795898438, "logps/rejected": -47.195831298828125, "loss": 0.6805, "losses/dpo": 0.7254155278205872, "losses/sft": 0.6882319450378418, "losses/total": 0.7254155278205872, "ref_logps/chosen": -27.488506317138672, "ref_logps/rejected": -46.873497009277344, "rewards/accuracies": 0.5625, "rewards/chosen": -0.006018537096679211, "rewards/margins": 0.026215016841888428, "rewards/rejected": -0.03223355486989021, "step": 185 }, { "epoch": 0.18, "grad_norm": 23.226090772338633, "learning_rate": 4.989500664409913e-07, "logps/chosen": -33.54684066772461, "logps/rejected": -33.43348693847656, "loss": 0.6798, "losses/dpo": 0.6376914381980896, "losses/sft": 1.1081444025039673, "losses/total": 0.6376914381980896, "ref_logps/chosen": -33.53228759765625, "ref_logps/rejected": -33.12754821777344, "rewards/accuracies": 0.625, "rewards/chosen": -0.0014555458910763264, "rewards/margins": 0.029137950390577316, "rewards/rejected": -0.030593495815992355, "step": 186 }, { "epoch": 0.18, "grad_norm": 21.929719385087065, "learning_rate": 4.989266217656673e-07, "logps/chosen": -27.380088806152344, "logps/rejected": -36.39398956298828, "loss": 0.6774, "losses/dpo": 0.6615445613861084, "losses/sft": 0.5858849883079529, "losses/total": 0.6615445613861084, "ref_logps/chosen": -27.427635192871094, "ref_logps/rejected": -36.114013671875, "rewards/accuracies": 0.625, "rewards/chosen": 0.004754327237606049, "rewards/margins": 0.032752275466918945, "rewards/rejected": -0.027997946366667747, "step": 187 }, { "epoch": 0.18, "grad_norm": 20.760421783585517, "learning_rate": 4.989029187796919e-07, "logps/chosen": -25.650367736816406, "logps/rejected": -29.558181762695312, "loss": 0.6867, "losses/dpo": 0.6353164315223694, "losses/sft": 0.7503383755683899, "losses/total": 0.6353164315223694, "ref_logps/chosen": -25.545969009399414, "ref_logps/rejected": -29.303133010864258, "rewards/accuracies": 0.5625, "rewards/chosen": -0.010439908131957054, "rewards/margins": 0.015064802020788193, "rewards/rejected": -0.025504708290100098, "step": 188 }, { "epoch": 0.18, "grad_norm": 21.273445725770134, "learning_rate": 4.988789575076619e-07, "logps/chosen": -24.850753784179688, "logps/rejected": -36.73593521118164, "loss": 0.6782, "losses/dpo": 0.6467155814170837, "losses/sft": 0.8376176953315735, "losses/total": 0.6467155814170837, "ref_logps/chosen": -24.88440704345703, "ref_logps/rejected": -36.446537017822266, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0033651511184871197, "rewards/margins": 0.0323048010468483, "rewards/rejected": -0.02893964760005474, "step": 189 }, { "epoch": 0.18, "grad_norm": 21.5909922918631, "learning_rate": 4.988547379744418e-07, "logps/chosen": -29.276939392089844, "logps/rejected": -38.76862335205078, "loss": 0.6707, "losses/dpo": 0.6829964518547058, "losses/sft": 0.8938606977462769, "losses/total": 0.6829964518547058, "ref_logps/chosen": -29.390256881713867, "ref_logps/rejected": -38.407623291015625, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01133186835795641, "rewards/margins": 0.04743148013949394, "rewards/rejected": -0.03609961271286011, "step": 190 }, { "epoch": 0.18, "grad_norm": 21.673139835440995, "learning_rate": 4.98830260205164e-07, "logps/chosen": -26.418012619018555, "logps/rejected": -36.81540298461914, "loss": 0.6698, "losses/dpo": 0.70854252576828, "losses/sft": 1.4489798545837402, "losses/total": 0.70854252576828, "ref_logps/chosen": -26.351116180419922, "ref_logps/rejected": -36.256927490234375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.006689528468996286, "rewards/margins": 0.04915802180767059, "rewards/rejected": -0.05584754794836044, "step": 191 }, { "epoch": 0.18, "grad_norm": 21.01712608244742, "learning_rate": 4.988055242252294e-07, "logps/chosen": -32.433921813964844, "logps/rejected": -36.89479064941406, "loss": 0.6812, "losses/dpo": 0.7515528798103333, "losses/sft": 1.3945636749267578, "losses/total": 0.7515528798103333, "ref_logps/chosen": -32.41492462158203, "ref_logps/rejected": -36.61641311645508, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0018997937440872192, "rewards/margins": 0.02593797817826271, "rewards/rejected": -0.02783777192234993, "step": 192 }, { "epoch": 0.18, "grad_norm": 21.90037656879654, "learning_rate": 4.98780530060306e-07, "logps/chosen": -27.70879364013672, "logps/rejected": -34.08728790283203, "loss": 0.6585, "losses/dpo": 0.7202637195587158, "losses/sft": 0.6450088024139404, "losses/total": 0.7202637195587158, "ref_logps/chosen": -27.653257369995117, "ref_logps/rejected": -33.304229736328125, "rewards/accuracies": 0.75, "rewards/chosen": -0.005553704686462879, "rewards/margins": 0.07275208830833435, "rewards/rejected": -0.07830579578876495, "step": 193 }, { "epoch": 0.18, "grad_norm": 16.79003761196266, "learning_rate": 4.987552777363306e-07, "logps/chosen": -18.826889038085938, "logps/rejected": -24.45302963256836, "loss": 0.6677, "losses/dpo": 0.6967752575874329, "losses/sft": 0.9169226288795471, "losses/total": 0.6967752575874329, "ref_logps/chosen": -18.901365280151367, "ref_logps/rejected": -23.991409301757812, "rewards/accuracies": 0.75, "rewards/chosen": 0.0074476804584264755, "rewards/margins": 0.05360975116491318, "rewards/rejected": -0.04616206884384155, "step": 194 }, { "epoch": 0.18, "grad_norm": 24.450230263642815, "learning_rate": 4.987297672795072e-07, "logps/chosen": -42.8287353515625, "logps/rejected": -46.86622619628906, "loss": 0.6839, "losses/dpo": 0.6129754781723022, "losses/sft": 1.648181676864624, "losses/total": 0.6129754781723022, "ref_logps/chosen": -42.4691162109375, "ref_logps/rejected": -46.289039611816406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03596171364188194, "rewards/margins": 0.02175729349255562, "rewards/rejected": -0.05771900713443756, "step": 195 }, { "epoch": 0.18, "grad_norm": 20.418697753643286, "learning_rate": 4.987039987163081e-07, "logps/chosen": -26.303417205810547, "logps/rejected": -41.301719665527344, "loss": 0.663, "losses/dpo": 0.7225056886672974, "losses/sft": 1.0883967876434326, "losses/total": 0.7225056886672974, "ref_logps/chosen": -26.494760513305664, "ref_logps/rejected": -40.8677864074707, "rewards/accuracies": 0.75, "rewards/chosen": 0.019134540110826492, "rewards/margins": 0.06252799183130264, "rewards/rejected": -0.04339344799518585, "step": 196 }, { "epoch": 0.19, "grad_norm": 22.632785738893613, "learning_rate": 4.986779720734732e-07, "logps/chosen": -36.014488220214844, "logps/rejected": -42.78166580200195, "loss": 0.667, "losses/dpo": 0.7356557250022888, "losses/sft": 0.5987066030502319, "losses/total": 0.7356557250022888, "ref_logps/chosen": -36.11430358886719, "ref_logps/rejected": -42.312625885009766, "rewards/accuracies": 0.6875, "rewards/chosen": 0.009981203824281693, "rewards/margins": 0.05688544362783432, "rewards/rejected": -0.04690423607826233, "step": 197 }, { "epoch": 0.19, "grad_norm": 21.566851569718843, "learning_rate": 4.986516873780104e-07, "logps/chosen": -21.434274673461914, "logps/rejected": -38.96238327026367, "loss": 0.6578, "losses/dpo": 0.6826919317245483, "losses/sft": 0.6254227757453918, "losses/total": 0.6826919317245483, "ref_logps/chosen": -21.3748779296875, "ref_logps/rejected": -38.157615661621094, "rewards/accuracies": 0.8125, "rewards/chosen": -0.005939514376223087, "rewards/margins": 0.07453717291355133, "rewards/rejected": -0.08047669380903244, "step": 198 }, { "epoch": 0.19, "grad_norm": 18.284506963581617, "learning_rate": 4.986251446571953e-07, "logps/chosen": -23.15096092224121, "logps/rejected": -25.006628036499023, "loss": 0.6789, "losses/dpo": 0.6778327822685242, "losses/sft": 0.9095815420150757, "losses/total": 0.6778327822685242, "ref_logps/chosen": -23.113887786865234, "ref_logps/rejected": -24.6751708984375, "rewards/accuracies": 0.75, "rewards/chosen": -0.003707354888319969, "rewards/margins": 0.02943824604153633, "rewards/rejected": -0.03314560279250145, "step": 199 }, { "epoch": 0.19, "grad_norm": 19.74206817678887, "learning_rate": 4.985983439385712e-07, "logps/chosen": -23.86321258544922, "logps/rejected": -35.6258544921875, "loss": 0.6545, "losses/dpo": 0.685820460319519, "losses/sft": 0.5706751942634583, "losses/total": 0.685820460319519, "ref_logps/chosen": -23.717018127441406, "ref_logps/rejected": -34.64622116088867, "rewards/accuracies": 0.6875, "rewards/chosen": -0.014619345776736736, "rewards/margins": 0.08334380388259888, "rewards/rejected": -0.09796314686536789, "step": 200 }, { "epoch": 0.19, "grad_norm": 20.646719818688226, "learning_rate": 4.985712852499489e-07, "logps/chosen": -25.58361053466797, "logps/rejected": -31.448596954345703, "loss": 0.6913, "losses/dpo": 0.6592358350753784, "losses/sft": 0.6248819828033447, "losses/total": 0.6592358350753784, "ref_logps/chosen": -25.320276260375977, "ref_logps/rejected": -31.135337829589844, "rewards/accuracies": 0.5, "rewards/chosen": -0.026333630084991455, "rewards/margins": 0.004992317873984575, "rewards/rejected": -0.03132595121860504, "step": 201 }, { "epoch": 0.19, "grad_norm": 23.001876098996828, "learning_rate": 4.985439686194075e-07, "logps/chosen": -38.620330810546875, "logps/rejected": -32.472042083740234, "loss": 0.6718, "losses/dpo": 0.634304404258728, "losses/sft": 0.20243875682353973, "losses/total": 0.634304404258728, "ref_logps/chosen": -38.59175109863281, "ref_logps/rejected": -31.990230560302734, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0028584194369614124, "rewards/margins": 0.04532270133495331, "rewards/rejected": -0.04818112030625343, "step": 202 }, { "epoch": 0.19, "grad_norm": 20.21181391502155, "learning_rate": 4.985163940752931e-07, "logps/chosen": -27.035537719726562, "logps/rejected": -24.2683162689209, "loss": 0.6895, "losses/dpo": 0.6463289260864258, "losses/sft": 1.0029401779174805, "losses/total": 0.6463289260864258, "ref_logps/chosen": -26.886198043823242, "ref_logps/rejected": -24.012264251708984, "rewards/accuracies": 0.5, "rewards/chosen": -0.014933954924345016, "rewards/margins": 0.010671265423297882, "rewards/rejected": -0.02560521848499775, "step": 203 }, { "epoch": 0.19, "grad_norm": 20.990311287965557, "learning_rate": 4.984885616462202e-07, "logps/chosen": -28.548248291015625, "logps/rejected": -32.22708511352539, "loss": 0.6757, "losses/dpo": 0.6698517799377441, "losses/sft": 0.6522434949874878, "losses/total": 0.6698517799377441, "ref_logps/chosen": -28.516965866088867, "ref_logps/rejected": -31.821884155273438, "rewards/accuracies": 0.625, "rewards/chosen": -0.003128057811409235, "rewards/margins": 0.03739186003804207, "rewards/rejected": -0.04051991552114487, "step": 204 }, { "epoch": 0.19, "grad_norm": 21.466795476584142, "learning_rate": 4.984604713610699e-07, "logps/chosen": -26.114761352539062, "logps/rejected": -35.27678298950195, "loss": 0.6819, "losses/dpo": 0.7380991578102112, "losses/sft": 1.1415503025054932, "losses/total": 0.7380991578102112, "ref_logps/chosen": -26.05685043334961, "ref_logps/rejected": -34.96989440917969, "rewards/accuracies": 0.4375, "rewards/chosen": -0.005791217554360628, "rewards/margins": 0.024897897616028786, "rewards/rejected": -0.030689116567373276, "step": 205 }, { "epoch": 0.19, "grad_norm": 19.473220239038504, "learning_rate": 4.984321232489917e-07, "logps/chosen": -22.808326721191406, "logps/rejected": -28.102230072021484, "loss": 0.6679, "losses/dpo": 0.6646828651428223, "losses/sft": 0.7671761512756348, "losses/total": 0.6646828651428223, "ref_logps/chosen": -22.876300811767578, "ref_logps/rejected": -27.64159393310547, "rewards/accuracies": 0.6875, "rewards/chosen": 0.006797256879508495, "rewards/margins": 0.05286087095737457, "rewards/rejected": -0.0460636131465435, "step": 206 }, { "epoch": 0.2, "grad_norm": 21.987127097167136, "learning_rate": 4.984035173394025e-07, "logps/chosen": -32.35721969604492, "logps/rejected": -34.438758850097656, "loss": 0.6838, "losses/dpo": 0.6393654942512512, "losses/sft": 1.2986185550689697, "losses/total": 0.6393654942512512, "ref_logps/chosen": -32.350067138671875, "ref_logps/rejected": -34.20571517944336, "rewards/accuracies": 0.625, "rewards/chosen": -0.0007153302431106567, "rewards/margins": 0.02258940041065216, "rewards/rejected": -0.023304728791117668, "step": 207 }, { "epoch": 0.2, "grad_norm": 19.998481080711812, "learning_rate": 4.983746536619863e-07, "logps/chosen": -28.60678482055664, "logps/rejected": -30.546842575073242, "loss": 0.6821, "losses/dpo": 0.6860745549201965, "losses/sft": 0.09759398549795151, "losses/total": 0.6860745549201965, "ref_logps/chosen": -28.53179168701172, "ref_logps/rejected": -30.231895446777344, "rewards/accuracies": 0.5, "rewards/chosen": -0.007499369326978922, "rewards/margins": 0.023995401337742805, "rewards/rejected": -0.03149477019906044, "step": 208 }, { "epoch": 0.2, "grad_norm": 21.44318175627243, "learning_rate": 4.983455322466953e-07, "logps/chosen": -26.66656494140625, "logps/rejected": -37.683589935302734, "loss": 0.6322, "losses/dpo": 0.6406020522117615, "losses/sft": 0.3883194923400879, "losses/total": 0.6406020522117615, "ref_logps/chosen": -26.937335968017578, "ref_logps/rejected": -36.66865921020508, "rewards/accuracies": 0.875, "rewards/chosen": 0.027076851576566696, "rewards/margins": 0.12857002019882202, "rewards/rejected": -0.10149317979812622, "step": 209 }, { "epoch": 0.2, "grad_norm": 22.004169249587402, "learning_rate": 4.983161531237483e-07, "logps/chosen": -26.311809539794922, "logps/rejected": -36.108848571777344, "loss": 0.6681, "losses/dpo": 0.6696144342422485, "losses/sft": 0.4522739052772522, "losses/total": 0.6696144342422485, "ref_logps/chosen": -26.304378509521484, "ref_logps/rejected": -35.582672119140625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0007432580459862947, "rewards/margins": 0.05187436193227768, "rewards/rejected": -0.052617620676755905, "step": 210 }, { "epoch": 0.2, "grad_norm": 21.81612666034674, "learning_rate": 4.982865163236323e-07, "logps/chosen": -25.598190307617188, "logps/rejected": -32.80715560913086, "loss": 0.6754, "losses/dpo": 0.7470968961715698, "losses/sft": 1.3498557806015015, "losses/total": 0.7470968961715698, "ref_logps/chosen": -25.623186111450195, "ref_logps/rejected": -32.43326950073242, "rewards/accuracies": 0.625, "rewards/chosen": 0.002499549649655819, "rewards/margins": 0.03988812863826752, "rewards/rejected": -0.03738858178257942, "step": 211 }, { "epoch": 0.2, "grad_norm": 24.817938104176065, "learning_rate": 4.982566218771011e-07, "logps/chosen": -39.99713897705078, "logps/rejected": -30.823486328125, "loss": 0.6795, "losses/dpo": 0.7043887972831726, "losses/sft": 1.2054787874221802, "losses/total": 0.7043887972831726, "ref_logps/chosen": -39.64111328125, "ref_logps/rejected": -30.170848846435547, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03560296446084976, "rewards/margins": 0.02966069057583809, "rewards/rejected": -0.06526365131139755, "step": 212 }, { "epoch": 0.2, "grad_norm": 22.037605332222007, "learning_rate": 4.982264698151762e-07, "logps/chosen": -29.19732666015625, "logps/rejected": -38.954559326171875, "loss": 0.6567, "losses/dpo": 0.6672189235687256, "losses/sft": 0.5596910715103149, "losses/total": 0.6672189235687256, "ref_logps/chosen": -29.384235382080078, "ref_logps/rejected": -38.35938262939453, "rewards/accuracies": 0.8125, "rewards/chosen": 0.018690848723053932, "rewards/margins": 0.07820872962474823, "rewards/rejected": -0.05951787531375885, "step": 213 }, { "epoch": 0.2, "grad_norm": 23.154362394603726, "learning_rate": 4.981960601691462e-07, "logps/chosen": -34.40770721435547, "logps/rejected": -36.338497161865234, "loss": 0.6537, "losses/dpo": 0.7116807103157043, "losses/sft": 1.0986615419387817, "losses/total": 0.7116807103157043, "ref_logps/chosen": -34.23246765136719, "ref_logps/rejected": -35.325862884521484, "rewards/accuracies": 0.75, "rewards/chosen": -0.01752389222383499, "rewards/margins": 0.08373979479074478, "rewards/rejected": -0.10126368701457977, "step": 214 }, { "epoch": 0.2, "grad_norm": 23.74665691902844, "learning_rate": 4.981653929705674e-07, "logps/chosen": -28.08710479736328, "logps/rejected": -46.94313430786133, "loss": 0.6514, "losses/dpo": 0.7262231707572937, "losses/sft": 0.9197044372558594, "losses/total": 0.7262231707572937, "ref_logps/chosen": -27.90131378173828, "ref_logps/rejected": -45.85188293457031, "rewards/accuracies": 0.8125, "rewards/chosen": -0.018578967079520226, "rewards/margins": 0.09054629504680634, "rewards/rejected": -0.10912525653839111, "step": 215 }, { "epoch": 0.2, "grad_norm": 21.82775750848168, "learning_rate": 4.981344682512629e-07, "logps/chosen": -32.49782943725586, "logps/rejected": -32.71067810058594, "loss": 0.6776, "losses/dpo": 0.7098404169082642, "losses/sft": 0.7547588348388672, "losses/total": 0.7098404169082642, "ref_logps/chosen": -32.6079216003418, "ref_logps/rejected": -32.48939895629883, "rewards/accuracies": 0.6875, "rewards/chosen": 0.011008922941982746, "rewards/margins": 0.03313687816262245, "rewards/rejected": -0.02212795801460743, "step": 216 }, { "epoch": 0.2, "grad_norm": 19.565403491237447, "learning_rate": 4.981032860433233e-07, "logps/chosen": -29.506376266479492, "logps/rejected": -29.66910171508789, "loss": 0.6675, "losses/dpo": 0.6236523985862732, "losses/sft": 0.9565038681030273, "losses/total": 0.6236523985862732, "ref_logps/chosen": -29.3478946685791, "ref_logps/rejected": -28.95464324951172, "rewards/accuracies": 0.8125, "rewards/chosen": -0.015848258510231972, "rewards/margins": 0.055597610771656036, "rewards/rejected": -0.07144585996866226, "step": 217 }, { "epoch": 0.21, "grad_norm": 21.175751194302023, "learning_rate": 4.980718463791063e-07, "logps/chosen": -32.315582275390625, "logps/rejected": -30.631380081176758, "loss": 0.6935, "losses/dpo": 0.6788669228553772, "losses/sft": 1.66483473777771, "losses/total": 0.6788669228553772, "ref_logps/chosen": -31.876745223999023, "ref_logps/rejected": -30.163908004760742, "rewards/accuracies": 0.5, "rewards/chosen": -0.04388337954878807, "rewards/margins": 0.002863747999072075, "rewards/rejected": -0.046747125685214996, "step": 218 }, { "epoch": 0.21, "grad_norm": 20.791341963969632, "learning_rate": 4.980401492912366e-07, "logps/chosen": -30.085552215576172, "logps/rejected": -31.170787811279297, "loss": 0.6762, "losses/dpo": 0.6692009568214417, "losses/sft": 0.31103435158729553, "losses/total": 0.6692009568214417, "ref_logps/chosen": -29.947689056396484, "ref_logps/rejected": -30.65897560119629, "rewards/accuracies": 0.6875, "rewards/chosen": -0.01378624141216278, "rewards/margins": 0.037394970655441284, "rewards/rejected": -0.051181212067604065, "step": 219 }, { "epoch": 0.21, "grad_norm": 22.09478160492232, "learning_rate": 4.980081948126066e-07, "logps/chosen": -32.37030029296875, "logps/rejected": -36.06874084472656, "loss": 0.6837, "losses/dpo": 0.7157841324806213, "losses/sft": 1.5893782377243042, "losses/total": 0.7157841324806213, "ref_logps/chosen": -32.09016418457031, "ref_logps/rejected": -35.579490661621094, "rewards/accuracies": 0.625, "rewards/chosen": -0.02801400050520897, "rewards/margins": 0.020910969004034996, "rewards/rejected": -0.04892497509717941, "step": 220 }, { "epoch": 0.21, "grad_norm": 19.87016973141811, "learning_rate": 4.979759829763749e-07, "logps/chosen": -27.520435333251953, "logps/rejected": -33.67799377441406, "loss": 0.6688, "losses/dpo": 0.641739547252655, "losses/sft": 1.3383820056915283, "losses/total": 0.641739547252655, "ref_logps/chosen": -27.745025634765625, "ref_logps/rejected": -33.397464752197266, "rewards/accuracies": 0.8125, "rewards/chosen": 0.022458981722593307, "rewards/margins": 0.050511691719293594, "rewards/rejected": -0.02805270440876484, "step": 221 }, { "epoch": 0.21, "grad_norm": 20.606864653921114, "learning_rate": 4.979435138159681e-07, "logps/chosen": -17.088638305664062, "logps/rejected": -35.714698791503906, "loss": 0.6562, "losses/dpo": 0.6673023104667664, "losses/sft": 0.6810814142227173, "losses/total": 0.6673023104667664, "ref_logps/chosen": -17.341659545898438, "ref_logps/rejected": -35.18836975097656, "rewards/accuracies": 0.75, "rewards/chosen": 0.02530200406908989, "rewards/margins": 0.07793492078781128, "rewards/rejected": -0.05263291299343109, "step": 222 }, { "epoch": 0.21, "grad_norm": 22.09398094331322, "learning_rate": 4.979107873650791e-07, "logps/chosen": -27.69796371459961, "logps/rejected": -36.69651412963867, "loss": 0.6847, "losses/dpo": 0.6890752911567688, "losses/sft": 1.324795126914978, "losses/total": 0.6890752911567688, "ref_logps/chosen": -27.500099182128906, "ref_logps/rejected": -36.282989501953125, "rewards/accuracies": 0.625, "rewards/chosen": -0.019786635413765907, "rewards/margins": 0.021565861999988556, "rewards/rejected": -0.041352495551109314, "step": 223 }, { "epoch": 0.21, "grad_norm": 18.979789715043356, "learning_rate": 4.978778036576682e-07, "logps/chosen": -21.40099334716797, "logps/rejected": -31.345703125, "loss": 0.68, "losses/dpo": 0.6707872748374939, "losses/sft": 0.42168521881103516, "losses/total": 0.6707872748374939, "ref_logps/chosen": -21.211631774902344, "ref_logps/rejected": -30.875591278076172, "rewards/accuracies": 0.625, "rewards/chosen": -0.01893613114953041, "rewards/margins": 0.028075171634554863, "rewards/rejected": -0.047011300921440125, "step": 224 }, { "epoch": 0.21, "grad_norm": 19.516087906749704, "learning_rate": 4.978445627279625e-07, "logps/chosen": -19.613021850585938, "logps/rejected": -35.236907958984375, "loss": 0.6704, "losses/dpo": 0.5647019743919373, "losses/sft": 0.9573656916618347, "losses/total": 0.5647019743919373, "ref_logps/chosen": -19.74871826171875, "ref_logps/rejected": -34.87858200073242, "rewards/accuracies": 0.625, "rewards/chosen": 0.01356971263885498, "rewards/margins": 0.0494026243686676, "rewards/rejected": -0.03583291172981262, "step": 225 }, { "epoch": 0.21, "grad_norm": 22.40127644298977, "learning_rate": 4.97811064610456e-07, "logps/chosen": -31.794795989990234, "logps/rejected": -31.670190811157227, "loss": 0.6726, "losses/dpo": 0.6742733716964722, "losses/sft": 0.3760080635547638, "losses/total": 0.6742733716964722, "ref_logps/chosen": -31.623340606689453, "ref_logps/rejected": -31.039581298828125, "rewards/accuracies": 0.625, "rewards/chosen": -0.01714545302093029, "rewards/margins": 0.04591544717550278, "rewards/rejected": -0.06306090950965881, "step": 226 }, { "epoch": 0.21, "grad_norm": 21.850406859604945, "learning_rate": 4.977773093399096e-07, "logps/chosen": -28.55599594116211, "logps/rejected": -31.19952392578125, "loss": 0.6588, "losses/dpo": 0.6388475298881531, "losses/sft": 1.0582984685897827, "losses/total": 0.6388475298881531, "ref_logps/chosen": -28.493873596191406, "ref_logps/rejected": -30.414838790893555, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0062121059745550156, "rewards/margins": 0.07225653529167175, "rewards/rejected": -0.07846865057945251, "step": 227 }, { "epoch": 0.22, "grad_norm": 18.73428838205373, "learning_rate": 4.977432969513514e-07, "logps/chosen": -20.246570587158203, "logps/rejected": -26.177352905273438, "loss": 0.7016, "losses/dpo": 0.7169329524040222, "losses/sft": 0.7626660466194153, "losses/total": 0.7169329524040222, "ref_logps/chosen": -19.99535369873047, "ref_logps/rejected": -26.07242202758789, "rewards/accuracies": 0.4375, "rewards/chosen": -0.025121519342064857, "rewards/margins": -0.014628229662775993, "rewards/rejected": -0.010493289679288864, "step": 228 }, { "epoch": 0.22, "grad_norm": 22.188852056542228, "learning_rate": 4.977090274800756e-07, "logps/chosen": -27.821226119995117, "logps/rejected": -34.41599655151367, "loss": 0.6734, "losses/dpo": 0.6980329155921936, "losses/sft": 0.45930957794189453, "losses/total": 0.6980329155921936, "ref_logps/chosen": -27.946788787841797, "ref_logps/rejected": -34.10529327392578, "rewards/accuracies": 0.5, "rewards/chosen": 0.012556455098092556, "rewards/margins": 0.04362662509083748, "rewards/rejected": -0.031070170924067497, "step": 229 }, { "epoch": 0.22, "grad_norm": 21.526759301082315, "learning_rate": 4.976745009616435e-07, "logps/chosen": -28.315242767333984, "logps/rejected": -41.11632537841797, "loss": 0.6572, "losses/dpo": 0.7555768489837646, "losses/sft": 1.4731535911560059, "losses/total": 0.7555768489837646, "ref_logps/chosen": -28.035192489624023, "ref_logps/rejected": -40.06474685668945, "rewards/accuracies": 0.75, "rewards/chosen": -0.028004787862300873, "rewards/margins": 0.0771530345082283, "rewards/rejected": -0.10515781491994858, "step": 230 }, { "epoch": 0.22, "grad_norm": 22.252776099530386, "learning_rate": 4.976397174318837e-07, "logps/chosen": -32.95344543457031, "logps/rejected": -38.76870346069336, "loss": 0.6859, "losses/dpo": 0.7317342162132263, "losses/sft": 1.1057343482971191, "losses/total": 0.7317342162132263, "ref_logps/chosen": -32.83092498779297, "ref_logps/rejected": -38.46167755126953, "rewards/accuracies": 0.625, "rewards/chosen": -0.012252074666321278, "rewards/margins": 0.018450606614351273, "rewards/rejected": -0.030702680349349976, "step": 231 }, { "epoch": 0.22, "grad_norm": 22.02176209110998, "learning_rate": 4.976046769268905e-07, "logps/chosen": -37.47603988647461, "logps/rejected": -39.71766662597656, "loss": 0.6576, "losses/dpo": 0.6191545128822327, "losses/sft": 0.7892920970916748, "losses/total": 0.6191545128822327, "ref_logps/chosen": -37.06852722167969, "ref_logps/rejected": -38.55116271972656, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04075150191783905, "rewards/margins": 0.07589869946241379, "rewards/rejected": -0.11665019392967224, "step": 232 }, { "epoch": 0.22, "grad_norm": 21.041727773838854, "learning_rate": 4.975693794830256e-07, "logps/chosen": -28.052141189575195, "logps/rejected": -33.400630950927734, "loss": 0.6499, "losses/dpo": 0.7296112179756165, "losses/sft": 1.1901943683624268, "losses/total": 0.7296112179756165, "ref_logps/chosen": -28.07520294189453, "ref_logps/rejected": -32.46827697753906, "rewards/accuracies": 0.75, "rewards/chosen": 0.002306369598954916, "rewards/margins": 0.09554171562194824, "rewards/rejected": -0.09323534369468689, "step": 233 }, { "epoch": 0.22, "grad_norm": 21.479177057354395, "learning_rate": 4.97533825136917e-07, "logps/chosen": -35.07136535644531, "logps/rejected": -31.216585159301758, "loss": 0.6788, "losses/dpo": 0.6255566477775574, "losses/sft": 1.0410970449447632, "losses/total": 0.6255566477775574, "ref_logps/chosen": -35.036888122558594, "ref_logps/rejected": -30.87218475341797, "rewards/accuracies": 0.625, "rewards/chosen": -0.003447728231549263, "rewards/margins": 0.030992338433861732, "rewards/rejected": -0.034440066665410995, "step": 234 }, { "epoch": 0.22, "grad_norm": 18.575463004536505, "learning_rate": 4.974980139254594e-07, "logps/chosen": -22.40657615661621, "logps/rejected": -28.605506896972656, "loss": 0.6777, "losses/dpo": 0.6433977484703064, "losses/sft": 0.7091078162193298, "losses/total": 0.6433977484703064, "ref_logps/chosen": -22.57191276550293, "ref_logps/rejected": -28.449092864990234, "rewards/accuracies": 0.625, "rewards/chosen": 0.016533710062503815, "rewards/margins": 0.03217483311891556, "rewards/rejected": -0.015641119331121445, "step": 235 }, { "epoch": 0.22, "grad_norm": 20.315641296262754, "learning_rate": 4.974619458858141e-07, "logps/chosen": -25.047941207885742, "logps/rejected": -35.383365631103516, "loss": 0.6659, "losses/dpo": 0.5952655076980591, "losses/sft": 0.5534670352935791, "losses/total": 0.5952655076980591, "ref_logps/chosen": -24.92584228515625, "ref_logps/rejected": -34.677467346191406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.012209921143949032, "rewards/margins": 0.058379877358675, "rewards/rejected": -0.07058979570865631, "step": 236 }, { "epoch": 0.22, "grad_norm": 18.540575142832658, "learning_rate": 4.974256210554087e-07, "logps/chosen": -22.364486694335938, "logps/rejected": -28.65050506591797, "loss": 0.688, "losses/dpo": 0.7570123672485352, "losses/sft": 0.9213727712631226, "losses/total": 0.7570123672485352, "ref_logps/chosen": -22.264787673950195, "ref_logps/rejected": -28.41490364074707, "rewards/accuracies": 0.4375, "rewards/chosen": -0.009969914332032204, "rewards/margins": 0.013590252958238125, "rewards/rejected": -0.023560166358947754, "step": 237 }, { "epoch": 0.22, "grad_norm": 22.09589023856155, "learning_rate": 4.973890394719375e-07, "logps/chosen": -29.948272705078125, "logps/rejected": -43.6513671875, "loss": 0.6568, "losses/dpo": 0.6619794964790344, "losses/sft": 0.9422954320907593, "losses/total": 0.6619794964790344, "ref_logps/chosen": -29.8604736328125, "ref_logps/rejected": -42.809783935546875, "rewards/accuracies": 0.75, "rewards/chosen": -0.00877995602786541, "rewards/margins": 0.07537825405597687, "rewards/rejected": -0.08415821194648743, "step": 238 }, { "epoch": 0.23, "grad_norm": 22.873360739941383, "learning_rate": 4.973522011733611e-07, "logps/chosen": -25.546092987060547, "logps/rejected": -39.68781280517578, "loss": 0.6751, "losses/dpo": 0.6811619997024536, "losses/sft": 1.352763295173645, "losses/total": 0.6811619997024536, "ref_logps/chosen": -25.488483428955078, "ref_logps/rejected": -39.22767639160156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.005760695785284042, "rewards/margins": 0.04025306552648544, "rewards/rejected": -0.04601375758647919, "step": 239 }, { "epoch": 0.23, "grad_norm": 21.726221875670905, "learning_rate": 4.973151061979065e-07, "logps/chosen": -26.689245223999023, "logps/rejected": -37.12638854980469, "loss": 0.6563, "losses/dpo": 0.6607230305671692, "losses/sft": 0.609571635723114, "losses/total": 0.6607230305671692, "ref_logps/chosen": -26.45098114013672, "ref_logps/rejected": -36.07799530029297, "rewards/accuracies": 0.8125, "rewards/chosen": -0.023826386779546738, "rewards/margins": 0.08101311326026917, "rewards/rejected": -0.1048395037651062, "step": 240 }, { "epoch": 0.23, "grad_norm": 18.49437285161885, "learning_rate": 4.972777545840671e-07, "logps/chosen": -18.150959014892578, "logps/rejected": -29.840621948242188, "loss": 0.6699, "losses/dpo": 0.6002298593521118, "losses/sft": 0.9133957624435425, "losses/total": 0.6002298593521118, "ref_logps/chosen": -18.074695587158203, "ref_logps/rejected": -29.261573791503906, "rewards/accuracies": 0.625, "rewards/chosen": -0.007626420818269253, "rewards/margins": 0.0502782016992569, "rewards/rejected": -0.057904623448848724, "step": 241 }, { "epoch": 0.23, "grad_norm": 19.741761272772898, "learning_rate": 4.972401463706027e-07, "logps/chosen": -23.770071029663086, "logps/rejected": -39.04508972167969, "loss": 0.6768, "losses/dpo": 0.744041383266449, "losses/sft": 0.2992403507232666, "losses/total": 0.744041383266449, "ref_logps/chosen": -23.906475067138672, "ref_logps/rejected": -38.82221984863281, "rewards/accuracies": 0.5625, "rewards/chosen": 0.013640335761010647, "rewards/margins": 0.03592720627784729, "rewards/rejected": -0.02228686586022377, "step": 242 }, { "epoch": 0.23, "grad_norm": 22.799838377589413, "learning_rate": 4.972022815965389e-07, "logps/chosen": -34.01243591308594, "logps/rejected": -48.328834533691406, "loss": 0.6497, "losses/dpo": 0.6089914441108704, "losses/sft": 1.706616759300232, "losses/total": 0.6089914441108704, "ref_logps/chosen": -33.839351654052734, "ref_logps/rejected": -47.17365646362305, "rewards/accuracies": 0.75, "rewards/chosen": -0.01730809547007084, "rewards/margins": 0.0982096940279007, "rewards/rejected": -0.11551778018474579, "step": 243 }, { "epoch": 0.23, "grad_norm": 22.23356258614663, "learning_rate": 4.971641603011685e-07, "logps/chosen": -26.478517532348633, "logps/rejected": -38.04374313354492, "loss": 0.695, "losses/dpo": 0.689846396446228, "losses/sft": 0.32593318819999695, "losses/total": 0.689846396446228, "ref_logps/chosen": -26.050556182861328, "ref_logps/rejected": -37.6168212890625, "rewards/accuracies": 0.4375, "rewards/chosen": -0.042796194553375244, "rewards/margins": -0.00010424759238958359, "rewards/rejected": -0.042691949754953384, "step": 244 }, { "epoch": 0.23, "grad_norm": 20.667773441843774, "learning_rate": 4.971257825240493e-07, "logps/chosen": -25.63518714904785, "logps/rejected": -38.189762115478516, "loss": 0.6591, "losses/dpo": 0.7137342095375061, "losses/sft": 1.5458799600601196, "losses/total": 0.7137342095375061, "ref_logps/chosen": -25.530189514160156, "ref_logps/rejected": -37.35130310058594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.010499740019440651, "rewards/margins": 0.07334621250629425, "rewards/rejected": -0.08384595811367035, "step": 245 }, { "epoch": 0.23, "grad_norm": 22.187184701335564, "learning_rate": 4.970871483050063e-07, "logps/chosen": -26.04515838623047, "logps/rejected": -38.14044952392578, "loss": 0.669, "losses/dpo": 0.6672460436820984, "losses/sft": 0.9335076808929443, "losses/total": 0.6672460436820984, "ref_logps/chosen": -25.975826263427734, "ref_logps/rejected": -37.53854751586914, "rewards/accuracies": 0.6875, "rewards/chosen": -0.006933196447789669, "rewards/margins": 0.05325692892074585, "rewards/rejected": -0.060190122574567795, "step": 246 }, { "epoch": 0.23, "grad_norm": 22.179708747987206, "learning_rate": 4.970482576841299e-07, "logps/chosen": -25.69795036315918, "logps/rejected": -38.24945068359375, "loss": 0.6458, "losses/dpo": 0.6059558391571045, "losses/sft": 0.5890608429908752, "losses/total": 0.6059558391571045, "ref_logps/chosen": -25.61048698425293, "ref_logps/rejected": -37.14259719848633, "rewards/accuracies": 0.75, "rewards/chosen": -0.008746426552534103, "rewards/margins": 0.10193879157304764, "rewards/rejected": -0.11068521440029144, "step": 247 }, { "epoch": 0.23, "grad_norm": 18.677959036952945, "learning_rate": 4.970091107017767e-07, "logps/chosen": -21.700847625732422, "logps/rejected": -33.009857177734375, "loss": 0.6647, "losses/dpo": 0.5726900696754456, "losses/sft": 0.5228408575057983, "losses/total": 0.5726900696754456, "ref_logps/chosen": -21.702970504760742, "ref_logps/rejected": -32.372989654541016, "rewards/accuracies": 0.625, "rewards/chosen": 0.00021238811314105988, "rewards/margins": 0.06389913707971573, "rewards/rejected": -0.06368675082921982, "step": 248 }, { "epoch": 0.23, "grad_norm": 23.44698918955101, "learning_rate": 4.969697073985698e-07, "logps/chosen": -33.26472091674805, "logps/rejected": -47.55925750732422, "loss": 0.6518, "losses/dpo": 0.666020929813385, "losses/sft": 0.21695157885551453, "losses/total": 0.666020929813385, "ref_logps/chosen": -33.448020935058594, "ref_logps/rejected": -46.77513122558594, "rewards/accuracies": 0.75, "rewards/chosen": 0.018329858779907227, "rewards/margins": 0.09674245119094849, "rewards/rejected": -0.07841259241104126, "step": 249 }, { "epoch": 0.24, "grad_norm": 22.282729302068095, "learning_rate": 4.969300478153978e-07, "logps/chosen": -26.144290924072266, "logps/rejected": -43.01355743408203, "loss": 0.6612, "losses/dpo": 0.588470458984375, "losses/sft": 0.48244133591651917, "losses/total": 0.588470458984375, "ref_logps/chosen": -25.48859977722168, "ref_logps/rejected": -41.62657928466797, "rewards/accuracies": 0.625, "rewards/chosen": -0.0655689686536789, "rewards/margins": 0.07312900573015213, "rewards/rejected": -0.13869798183441162, "step": 250 }, { "epoch": 0.24, "grad_norm": 20.89016116804969, "learning_rate": 4.968901319934151e-07, "logps/chosen": -26.574417114257812, "logps/rejected": -26.68726348876953, "loss": 0.6692, "losses/dpo": 0.7065987586975098, "losses/sft": 1.187434434890747, "losses/total": 0.7065987586975098, "ref_logps/chosen": -26.450822830200195, "ref_logps/rejected": -26.05083465576172, "rewards/accuracies": 0.6875, "rewards/chosen": -0.012359315529465675, "rewards/margins": 0.05128353834152222, "rewards/rejected": -0.06364285945892334, "step": 251 }, { "epoch": 0.24, "grad_norm": 18.484476913861364, "learning_rate": 4.968499599740426e-07, "logps/chosen": -23.267362594604492, "logps/rejected": -28.23869514465332, "loss": 0.6505, "losses/dpo": 0.6452504396438599, "losses/sft": 0.4090399742126465, "losses/total": 0.6452504396438599, "ref_logps/chosen": -23.461334228515625, "ref_logps/rejected": -27.523670196533203, "rewards/accuracies": 0.75, "rewards/chosen": 0.01939711719751358, "rewards/margins": 0.09089973568916321, "rewards/rejected": -0.07150261849164963, "step": 252 }, { "epoch": 0.24, "grad_norm": 22.181489855723793, "learning_rate": 4.968095317989666e-07, "logps/chosen": -25.862003326416016, "logps/rejected": -50.98854064941406, "loss": 0.6194, "losses/dpo": 0.6529636383056641, "losses/sft": 0.48927849531173706, "losses/total": 0.6529636383056641, "ref_logps/chosen": -25.64590835571289, "ref_logps/rejected": -49.200225830078125, "rewards/accuracies": 0.9375, "rewards/chosen": -0.021609598770737648, "rewards/margins": 0.1572219580411911, "rewards/rejected": -0.178831547498703, "step": 253 }, { "epoch": 0.24, "grad_norm": 20.56615255640051, "learning_rate": 4.967688475101393e-07, "logps/chosen": -22.6092529296875, "logps/rejected": -35.359310150146484, "loss": 0.6197, "losses/dpo": 0.6695813536643982, "losses/sft": 0.5001195669174194, "losses/total": 0.6695813536643982, "ref_logps/chosen": -22.77074432373047, "ref_logps/rejected": -33.94105911254883, "rewards/accuracies": 0.9375, "rewards/chosen": 0.016148947179317474, "rewards/margins": 0.1579740345478058, "rewards/rejected": -0.1418250948190689, "step": 254 }, { "epoch": 0.24, "grad_norm": 22.095411033845355, "learning_rate": 4.967279071497788e-07, "logps/chosen": -39.29261779785156, "logps/rejected": -31.991044998168945, "loss": 0.6608, "losses/dpo": 0.6838697791099548, "losses/sft": 2.0639092922210693, "losses/total": 0.6838697791099548, "ref_logps/chosen": -39.01885986328125, "ref_logps/rejected": -31.004858016967773, "rewards/accuracies": 0.625, "rewards/chosen": -0.027376368641853333, "rewards/margins": 0.07124221324920654, "rewards/rejected": -0.09861858934164047, "step": 255 }, { "epoch": 0.24, "grad_norm": 21.80106876355902, "learning_rate": 4.966867107603687e-07, "logps/chosen": -28.97195816040039, "logps/rejected": -35.91027069091797, "loss": 0.6739, "losses/dpo": 0.5913442969322205, "losses/sft": 0.10599804669618607, "losses/total": 0.5913442969322205, "ref_logps/chosen": -28.70940589904785, "ref_logps/rejected": -35.189056396484375, "rewards/accuracies": 0.625, "rewards/chosen": -0.02625531703233719, "rewards/margins": 0.04586618393659592, "rewards/rejected": -0.0721215009689331, "step": 256 }, { "epoch": 0.24, "grad_norm": 16.175100373244604, "learning_rate": 4.966452583846585e-07, "logps/chosen": -19.02623748779297, "logps/rejected": -24.098758697509766, "loss": 0.6559, "losses/dpo": 0.710719645023346, "losses/sft": 0.4632294774055481, "losses/total": 0.710719645023346, "ref_logps/chosen": -18.998687744140625, "ref_logps/rejected": -23.249202728271484, "rewards/accuracies": 0.625, "rewards/chosen": -0.002754833083599806, "rewards/margins": 0.08220073580741882, "rewards/rejected": -0.08495557308197021, "step": 257 }, { "epoch": 0.24, "grad_norm": 18.370597510318365, "learning_rate": 4.966035500656633e-07, "logps/chosen": -21.506826400756836, "logps/rejected": -28.64692497253418, "loss": 0.6842, "losses/dpo": 0.6746664643287659, "losses/sft": 0.11224710196256638, "losses/total": 0.6746664643287659, "ref_logps/chosen": -21.377105712890625, "ref_logps/rejected": -28.292400360107422, "rewards/accuracies": 0.6875, "rewards/chosen": -0.012972228229045868, "rewards/margins": 0.022480417042970657, "rewards/rejected": -0.03545264154672623, "step": 258 }, { "epoch": 0.24, "grad_norm": 22.427926772012363, "learning_rate": 4.965615858466636e-07, "logps/chosen": -25.545761108398438, "logps/rejected": -34.259830474853516, "loss": 0.6589, "losses/dpo": 0.625557005405426, "losses/sft": 1.0453689098358154, "losses/total": 0.625557005405426, "ref_logps/chosen": -25.537282943725586, "ref_logps/rejected": -33.520057678222656, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0008477999363094568, "rewards/margins": 0.07312911748886108, "rewards/rejected": -0.07397691905498505, "step": 259 }, { "epoch": 0.25, "grad_norm": 20.97762456132204, "learning_rate": 4.965193657712056e-07, "logps/chosen": -30.289043426513672, "logps/rejected": -38.298622131347656, "loss": 0.6346, "losses/dpo": 0.5894919633865356, "losses/sft": 0.8299669027328491, "losses/total": 0.5894919633865356, "ref_logps/chosen": -30.213411331176758, "ref_logps/rejected": -36.90840148925781, "rewards/accuracies": 0.75, "rewards/chosen": -0.007563076913356781, "rewards/margins": 0.1314588040113449, "rewards/rejected": -0.1390218734741211, "step": 260 }, { "epoch": 0.25, "grad_norm": 22.848180384463458, "learning_rate": 4.964768898831011e-07, "logps/chosen": -27.189964294433594, "logps/rejected": -39.061729431152344, "loss": 0.6423, "losses/dpo": 0.772132158279419, "losses/sft": 1.6264736652374268, "losses/total": 0.772132158279419, "ref_logps/chosen": -27.162860870361328, "ref_logps/rejected": -37.89704132080078, "rewards/accuracies": 0.75, "rewards/chosen": -0.0027103694155812263, "rewards/margins": 0.11375884711742401, "rewards/rejected": -0.11646922677755356, "step": 261 }, { "epoch": 0.25, "grad_norm": 20.937647713653753, "learning_rate": 4.964341582264272e-07, "logps/chosen": -26.458309173583984, "logps/rejected": -31.56889533996582, "loss": 0.6255, "losses/dpo": 0.5938650369644165, "losses/sft": 1.0711146593093872, "losses/total": 0.5938650369644165, "ref_logps/chosen": -26.763593673706055, "ref_logps/rejected": -30.42709732055664, "rewards/accuracies": 0.9375, "rewards/chosen": 0.030528610572218895, "rewards/margins": 0.14470842480659485, "rewards/rejected": -0.11417980492115021, "step": 262 }, { "epoch": 0.25, "grad_norm": 21.463632542851624, "learning_rate": 4.963911708455264e-07, "logps/chosen": -26.471607208251953, "logps/rejected": -33.03654479980469, "loss": 0.6724, "losses/dpo": 0.6366347670555115, "losses/sft": 0.3200298249721527, "losses/total": 0.6366347670555115, "ref_logps/chosen": -25.924964904785156, "ref_logps/rejected": -32.02159881591797, "rewards/accuracies": 0.625, "rewards/chosen": -0.054664067924022675, "rewards/margins": 0.04683064669370651, "rewards/rejected": -0.10149472206830978, "step": 263 }, { "epoch": 0.25, "grad_norm": 20.530078335371453, "learning_rate": 4.963479277850066e-07, "logps/chosen": -24.671192169189453, "logps/rejected": -33.287872314453125, "loss": 0.6397, "losses/dpo": 0.6740328073501587, "losses/sft": 0.47118276357650757, "losses/total": 0.6740328073501587, "ref_logps/chosen": -24.240673065185547, "ref_logps/rejected": -31.659595489501953, "rewards/accuracies": 0.8125, "rewards/chosen": -0.043051980435848236, "rewards/margins": 0.11977577209472656, "rewards/rejected": -0.1628277450799942, "step": 264 }, { "epoch": 0.25, "grad_norm": 21.031971885083948, "learning_rate": 4.963044290897412e-07, "logps/chosen": -16.75653076171875, "logps/rejected": -34.18169403076172, "loss": 0.6332, "losses/dpo": 0.6971394419670105, "losses/sft": 0.41879573464393616, "losses/total": 0.6971394419670105, "ref_logps/chosen": -16.948543548583984, "ref_logps/rejected": -33.096763610839844, "rewards/accuracies": 0.875, "rewards/chosen": 0.019201386719942093, "rewards/margins": 0.12769421935081482, "rewards/rejected": -0.10849282145500183, "step": 265 }, { "epoch": 0.25, "grad_norm": 24.622521777070183, "learning_rate": 4.962606748048686e-07, "logps/chosen": -32.9697380065918, "logps/rejected": -44.40463638305664, "loss": 0.6361, "losses/dpo": 0.6247701644897461, "losses/sft": 0.5361149311065674, "losses/total": 0.6247701644897461, "ref_logps/chosen": -32.653770446777344, "ref_logps/rejected": -42.839683532714844, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03159667178988457, "rewards/margins": 0.12489880621433258, "rewards/rejected": -0.15649548172950745, "step": 266 }, { "epoch": 0.25, "grad_norm": 22.270207343145564, "learning_rate": 4.962166649757924e-07, "logps/chosen": -32.88003158569336, "logps/rejected": -35.530696868896484, "loss": 0.6574, "losses/dpo": 0.6451241970062256, "losses/sft": 1.1021939516067505, "losses/total": 0.6451241970062256, "ref_logps/chosen": -32.18479919433594, "ref_logps/rejected": -34.09107208251953, "rewards/accuracies": 0.75, "rewards/chosen": -0.0695231705904007, "rewards/margins": 0.07443943619728088, "rewards/rejected": -0.14396259188652039, "step": 267 }, { "epoch": 0.25, "grad_norm": 21.910752668126616, "learning_rate": 4.961723996481817e-07, "logps/chosen": -21.981388092041016, "logps/rejected": -43.53025817871094, "loss": 0.6522, "losses/dpo": 0.6604568362236023, "losses/sft": 0.7809833288192749, "losses/total": 0.6604568362236023, "ref_logps/chosen": -21.809900283813477, "ref_logps/rejected": -42.49721145629883, "rewards/accuracies": 0.6875, "rewards/chosen": -0.017148815095424652, "rewards/margins": 0.08615591377019882, "rewards/rejected": -0.10330472886562347, "step": 268 }, { "epoch": 0.25, "grad_norm": 23.07269603317784, "learning_rate": 4.961278788679705e-07, "logps/chosen": -31.556476593017578, "logps/rejected": -42.44225311279297, "loss": 0.6384, "losses/dpo": 0.6426547169685364, "losses/sft": 0.9898515343666077, "losses/total": 0.6426547169685364, "ref_logps/chosen": -31.317996978759766, "ref_logps/rejected": -41.024169921875, "rewards/accuracies": 0.75, "rewards/chosen": -0.023848142474889755, "rewards/margins": 0.11796043813228607, "rewards/rejected": -0.14180858433246613, "step": 269 }, { "epoch": 0.25, "grad_norm": 18.674208901502418, "learning_rate": 4.960831026813579e-07, "logps/chosen": -22.2509708404541, "logps/rejected": -23.246849060058594, "loss": 0.7024, "losses/dpo": 0.7311710715293884, "losses/sft": 0.9258562326431274, "losses/total": 0.7311710715293884, "ref_logps/chosen": -21.666000366210938, "ref_logps/rejected": -22.751495361328125, "rewards/accuracies": 0.5, "rewards/chosen": -0.05849699676036835, "rewards/margins": -0.0089618731290102, "rewards/rejected": -0.04953513294458389, "step": 270 }, { "epoch": 0.26, "grad_norm": 18.387604680827867, "learning_rate": 4.96038071134808e-07, "logps/chosen": -21.39550018310547, "logps/rejected": -31.720111846923828, "loss": 0.6199, "losses/dpo": 0.6716375350952148, "losses/sft": 1.075993537902832, "losses/total": 0.6716375350952148, "ref_logps/chosen": -21.690937042236328, "ref_logps/rejected": -30.43390464782715, "rewards/accuracies": 0.8125, "rewards/chosen": 0.02954355999827385, "rewards/margins": 0.158164381980896, "rewards/rejected": -0.12862080335617065, "step": 271 }, { "epoch": 0.26, "grad_norm": 18.167173095846127, "learning_rate": 4.959927842750501e-07, "logps/chosen": -17.4378719329834, "logps/rejected": -26.884313583374023, "loss": 0.6442, "losses/dpo": 0.7507590651512146, "losses/sft": 0.7733747363090515, "losses/total": 0.7507590651512146, "ref_logps/chosen": -17.2620849609375, "ref_logps/rejected": -25.615154266357422, "rewards/accuracies": 0.625, "rewards/chosen": -0.017578870058059692, "rewards/margins": 0.10933716595172882, "rewards/rejected": -0.1269160360097885, "step": 272 }, { "epoch": 0.26, "grad_norm": 20.696253455689288, "learning_rate": 4.959472421490782e-07, "logps/chosen": -27.944698333740234, "logps/rejected": -23.627422332763672, "loss": 0.6805, "losses/dpo": 0.7342029809951782, "losses/sft": 1.627620816230774, "losses/total": 0.7342029809951782, "ref_logps/chosen": -27.920677185058594, "ref_logps/rejected": -23.290048599243164, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0024019302800297737, "rewards/margins": 0.031335435807704926, "rewards/rejected": -0.033737365156412125, "step": 273 }, { "epoch": 0.26, "grad_norm": 21.269524722500766, "learning_rate": 4.959014448041511e-07, "logps/chosen": -30.50692367553711, "logps/rejected": -27.502071380615234, "loss": 0.6716, "losses/dpo": 0.7337452173233032, "losses/sft": 1.3375006914138794, "losses/total": 0.7337452173233032, "ref_logps/chosen": -29.961145401000977, "ref_logps/rejected": -26.427749633789062, "rewards/accuracies": 0.625, "rewards/chosen": -0.05457780510187149, "rewards/margins": 0.05285404250025749, "rewards/rejected": -0.10743185132741928, "step": 274 }, { "epoch": 0.26, "grad_norm": 19.28535609798192, "learning_rate": 4.95855392287793e-07, "logps/chosen": -18.255144119262695, "logps/rejected": -37.457008361816406, "loss": 0.6593, "losses/dpo": 0.7086925506591797, "losses/sft": 0.4909714460372925, "losses/total": 0.7086925506591797, "ref_logps/chosen": -18.098533630371094, "ref_logps/rejected": -36.54096221923828, "rewards/accuracies": 0.5625, "rewards/chosen": -0.015660995617508888, "rewards/margins": 0.07594318687915802, "rewards/rejected": -0.09160418063402176, "step": 275 }, { "epoch": 0.26, "grad_norm": 23.333143920350537, "learning_rate": 4.958090846477921e-07, "logps/chosen": -42.41791534423828, "logps/rejected": -35.40343475341797, "loss": 0.6648, "losses/dpo": 0.604709804058075, "losses/sft": 0.9491846561431885, "losses/total": 0.604709804058075, "ref_logps/chosen": -42.21893310546875, "ref_logps/rejected": -34.55111312866211, "rewards/accuracies": 0.75, "rewards/chosen": -0.019897721707820892, "rewards/margins": 0.0653344988822937, "rewards/rejected": -0.08523222804069519, "step": 276 }, { "epoch": 0.26, "grad_norm": 23.15208058028297, "learning_rate": 4.957625219322019e-07, "logps/chosen": -34.84403991699219, "logps/rejected": -33.80641555786133, "loss": 0.7074, "losses/dpo": 0.719702959060669, "losses/sft": 1.3457612991333008, "losses/total": 0.719702959060669, "ref_logps/chosen": -33.93678283691406, "ref_logps/rejected": -33.07502365112305, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09072577953338623, "rewards/margins": -0.01758645661175251, "rewards/rejected": -0.07313931733369827, "step": 277 }, { "epoch": 0.26, "grad_norm": 19.331003907416846, "learning_rate": 4.957157041893406e-07, "logps/chosen": -22.173728942871094, "logps/rejected": -29.800308227539062, "loss": 0.6456, "losses/dpo": 0.5787314772605896, "losses/sft": 0.1820506602525711, "losses/total": 0.5787314772605896, "ref_logps/chosen": -22.242107391357422, "ref_logps/rejected": -28.83858299255371, "rewards/accuracies": 0.75, "rewards/chosen": 0.006837689317762852, "rewards/margins": 0.10301044583320618, "rewards/rejected": -0.0961727499961853, "step": 278 }, { "epoch": 0.26, "grad_norm": 19.67705346278149, "learning_rate": 4.956686314677907e-07, "logps/chosen": -22.057605743408203, "logps/rejected": -25.56330680847168, "loss": 0.6832, "losses/dpo": 0.7696553468704224, "losses/sft": 0.9750624299049377, "losses/total": 0.7696553468704224, "ref_logps/chosen": -21.751232147216797, "ref_logps/rejected": -25.005096435546875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.030637361109256744, "rewards/margins": 0.025183560326695442, "rewards/rejected": -0.055820927023887634, "step": 279 }, { "epoch": 0.26, "grad_norm": 21.14612759951071, "learning_rate": 4.956213038163994e-07, "logps/chosen": -34.11610794067383, "logps/rejected": -42.06311798095703, "loss": 0.6563, "losses/dpo": 0.6710242629051208, "losses/sft": 1.4147988557815552, "losses/total": 0.6710242629051208, "ref_logps/chosen": -33.48563003540039, "ref_logps/rejected": -40.39112091064453, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06304755806922913, "rewards/margins": 0.10415245592594147, "rewards/rejected": -0.1671999990940094, "step": 280 }, { "epoch": 0.27, "grad_norm": 21.308271714248136, "learning_rate": 4.955737212842787e-07, "logps/chosen": -29.254058837890625, "logps/rejected": -30.72490119934082, "loss": 0.6749, "losses/dpo": 0.790844738483429, "losses/sft": 1.4809430837631226, "losses/total": 0.790844738483429, "ref_logps/chosen": -28.426836013793945, "ref_logps/rejected": -29.42933464050293, "rewards/accuracies": 0.625, "rewards/chosen": -0.08272221684455872, "rewards/margins": 0.04683442413806915, "rewards/rejected": -0.12955664098262787, "step": 281 }, { "epoch": 0.27, "grad_norm": 20.507367274988496, "learning_rate": 4.955258839208048e-07, "logps/chosen": -24.659202575683594, "logps/rejected": -35.00513458251953, "loss": 0.6499, "losses/dpo": 0.6476426124572754, "losses/sft": 0.16601797938346863, "losses/total": 0.6476426124572754, "ref_logps/chosen": -24.283626556396484, "ref_logps/rejected": -33.563323974609375, "rewards/accuracies": 0.75, "rewards/chosen": -0.037557657808065414, "rewards/margins": 0.10662347823381424, "rewards/rejected": -0.14418113231658936, "step": 282 }, { "epoch": 0.27, "grad_norm": 21.44802689675558, "learning_rate": 4.954777917756183e-07, "logps/chosen": -33.28300476074219, "logps/rejected": -41.33049774169922, "loss": 0.646, "losses/dpo": 0.7137933373451233, "losses/sft": 0.7786162495613098, "losses/total": 0.7137933373451233, "ref_logps/chosen": -32.88454818725586, "ref_logps/rejected": -39.84197998046875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0398455485701561, "rewards/margins": 0.10900593549013138, "rewards/rejected": -0.14885148406028748, "step": 283 }, { "epoch": 0.27, "grad_norm": 21.02168536529281, "learning_rate": 4.954294448986247e-07, "logps/chosen": -31.23189353942871, "logps/rejected": -29.606904983520508, "loss": 0.6836, "losses/dpo": 0.812567412853241, "losses/sft": 0.8734865784645081, "losses/total": 0.812567412853241, "ref_logps/chosen": -30.847349166870117, "ref_logps/rejected": -28.97168731689453, "rewards/accuracies": 0.625, "rewards/chosen": -0.03845435380935669, "rewards/margins": 0.02506737969815731, "rewards/rejected": -0.06352172791957855, "step": 284 }, { "epoch": 0.27, "grad_norm": 18.915008737689565, "learning_rate": 4.953808433399931e-07, "logps/chosen": -23.06317901611328, "logps/rejected": -33.64259719848633, "loss": 0.6029, "losses/dpo": 0.6858687400817871, "losses/sft": 0.9105321764945984, "losses/total": 0.6858687400817871, "ref_logps/chosen": -23.165281295776367, "ref_logps/rejected": -31.736146926879883, "rewards/accuracies": 0.875, "rewards/chosen": 0.010210230946540833, "rewards/margins": 0.20085522532463074, "rewards/rejected": -0.1906449794769287, "step": 285 }, { "epoch": 0.27, "grad_norm": 20.875505369367687, "learning_rate": 4.953319871501574e-07, "logps/chosen": -24.747629165649414, "logps/rejected": -34.57719039916992, "loss": 0.6453, "losses/dpo": 0.6184383034706116, "losses/sft": 0.5264939665794373, "losses/total": 0.6184383034706116, "ref_logps/chosen": -24.612308502197266, "ref_logps/rejected": -33.343353271484375, "rewards/accuracies": 0.625, "rewards/chosen": -0.013532137498259544, "rewards/margins": 0.10985182970762253, "rewards/rejected": -0.12338396906852722, "step": 286 }, { "epoch": 0.27, "grad_norm": 21.719933499728263, "learning_rate": 4.952828763798154e-07, "logps/chosen": -29.252988815307617, "logps/rejected": -34.05558395385742, "loss": 0.6574, "losses/dpo": 0.7114152312278748, "losses/sft": 0.8973916172981262, "losses/total": 0.7114152312278748, "ref_logps/chosen": -28.832521438598633, "ref_logps/rejected": -32.83118438720703, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04204676300287247, "rewards/margins": 0.08039311319589615, "rewards/rejected": -0.12243987619876862, "step": 287 }, { "epoch": 0.27, "grad_norm": 21.93055218574413, "learning_rate": 4.952335110799295e-07, "logps/chosen": -27.963411331176758, "logps/rejected": -33.08734130859375, "loss": 0.6119, "losses/dpo": 0.6941902041435242, "losses/sft": 0.493768572807312, "losses/total": 0.6941902041435242, "ref_logps/chosen": -27.839954376220703, "ref_logps/rejected": -31.114704132080078, "rewards/accuracies": 0.6875, "rewards/chosen": -0.012345668859779835, "rewards/margins": 0.1849180907011032, "rewards/rejected": -0.19726374745368958, "step": 288 }, { "epoch": 0.27, "grad_norm": 24.151244930838377, "learning_rate": 4.951838913017258e-07, "logps/chosen": -25.974416732788086, "logps/rejected": -44.624176025390625, "loss": 0.6555, "losses/dpo": 0.553982138633728, "losses/sft": 1.4612994194030762, "losses/total": 0.553982138633728, "ref_logps/chosen": -25.386472702026367, "ref_logps/rejected": -43.08336639404297, "rewards/accuracies": 0.625, "rewards/chosen": -0.05879439413547516, "rewards/margins": 0.09528692066669464, "rewards/rejected": -0.1540813148021698, "step": 289 }, { "epoch": 0.27, "grad_norm": 19.213125112992508, "learning_rate": 4.951340170966947e-07, "logps/chosen": -22.899932861328125, "logps/rejected": -28.47020721435547, "loss": 0.638, "losses/dpo": 0.5868801474571228, "losses/sft": 0.6373641490936279, "losses/total": 0.5868801474571228, "ref_logps/chosen": -22.802284240722656, "ref_logps/rejected": -27.195945739746094, "rewards/accuracies": 0.8125, "rewards/chosen": -0.009764824993908405, "rewards/margins": 0.11766132712364197, "rewards/rejected": -0.1274261623620987, "step": 290 }, { "epoch": 0.27, "grad_norm": 23.778550299686447, "learning_rate": 4.950838885165904e-07, "logps/chosen": -31.802501678466797, "logps/rejected": -38.90342712402344, "loss": 0.7226, "losses/dpo": 0.5495800971984863, "losses/sft": 0.9091281890869141, "losses/total": 0.5495800971984863, "ref_logps/chosen": -30.569950103759766, "ref_logps/rejected": -38.17729187011719, "rewards/accuracies": 0.25, "rewards/chosen": -0.1232551783323288, "rewards/margins": -0.05064186453819275, "rewards/rejected": -0.07261331379413605, "step": 291 }, { "epoch": 0.28, "grad_norm": 21.929861577533316, "learning_rate": 4.950335056134317e-07, "logps/chosen": -24.841520309448242, "logps/rejected": -37.68931198120117, "loss": 0.6412, "losses/dpo": 0.6615662574768066, "losses/sft": 0.978054404258728, "losses/total": 0.6615662574768066, "ref_logps/chosen": -24.592178344726562, "ref_logps/rejected": -36.30732345581055, "rewards/accuracies": 0.75, "rewards/chosen": -0.0249341931194067, "rewards/margins": 0.11326460540294647, "rewards/rejected": -0.13819879293441772, "step": 292 }, { "epoch": 0.28, "grad_norm": 20.560513714022058, "learning_rate": 4.949828684395002e-07, "logps/chosen": -28.49094581604004, "logps/rejected": -25.582704544067383, "loss": 0.6622, "losses/dpo": 0.7185788750648499, "losses/sft": 0.92091304063797, "losses/total": 0.7185788750648499, "ref_logps/chosen": -28.188945770263672, "ref_logps/rejected": -24.59274673461914, "rewards/accuracies": 0.5625, "rewards/chosen": -0.030199915170669556, "rewards/margins": 0.06879602372646332, "rewards/rejected": -0.09899593889713287, "step": 293 }, { "epoch": 0.28, "grad_norm": 20.196317972132523, "learning_rate": 4.949319770473424e-07, "logps/chosen": -22.29047203063965, "logps/rejected": -41.51643371582031, "loss": 0.6364, "losses/dpo": 0.7287880182266235, "losses/sft": 0.39411675930023193, "losses/total": 0.7287880182266235, "ref_logps/chosen": -21.89513397216797, "ref_logps/rejected": -39.90663146972656, "rewards/accuracies": 0.75, "rewards/chosen": -0.03953400254249573, "rewards/margins": 0.12144605815410614, "rewards/rejected": -0.16098007559776306, "step": 294 }, { "epoch": 0.28, "grad_norm": 24.58019230176231, "learning_rate": 4.948808314897683e-07, "logps/chosen": -35.65583419799805, "logps/rejected": -41.72895812988281, "loss": 0.6652, "losses/dpo": 0.6934332251548767, "losses/sft": 0.7231680750846863, "losses/total": 0.6934332251548767, "ref_logps/chosen": -34.97152328491211, "ref_logps/rejected": -40.36332702636719, "rewards/accuracies": 0.625, "rewards/chosen": -0.06843093782663345, "rewards/margins": 0.06813202053308487, "rewards/rejected": -0.13656297326087952, "step": 295 }, { "epoch": 0.28, "grad_norm": 23.020889634005986, "learning_rate": 4.948294318198511e-07, "logps/chosen": -29.22870445251465, "logps/rejected": -38.59664535522461, "loss": 0.6762, "losses/dpo": 0.5277621746063232, "losses/sft": 1.0057483911514282, "losses/total": 0.5277621746063232, "ref_logps/chosen": -28.57340431213379, "ref_logps/rejected": -37.46343994140625, "rewards/accuracies": 0.625, "rewards/chosen": -0.06552985310554504, "rewards/margins": 0.0477907620370388, "rewards/rejected": -0.11332061141729355, "step": 296 }, { "epoch": 0.28, "grad_norm": 27.134899872426885, "learning_rate": 4.947777780909284e-07, "logps/chosen": -32.64911651611328, "logps/rejected": -36.02116012573242, "loss": 0.7022, "losses/dpo": 1.5654640197753906, "losses/sft": 1.9745452404022217, "losses/total": 1.5654640197753906, "ref_logps/chosen": -31.670711517333984, "ref_logps/rejected": -34.868412017822266, "rewards/accuracies": 0.75, "rewards/chosen": -0.09784067422151566, "rewards/margins": 0.01743388921022415, "rewards/rejected": -0.11527456343173981, "step": 297 }, { "epoch": 0.28, "grad_norm": 20.44014422010979, "learning_rate": 4.947258703566012e-07, "logps/chosen": -23.188413619995117, "logps/rejected": -44.811153411865234, "loss": 0.6535, "losses/dpo": 0.7039923071861267, "losses/sft": 0.6895202994346619, "losses/total": 0.7039923071861267, "ref_logps/chosen": -22.54567527770996, "ref_logps/rejected": -43.21263122558594, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06427382677793503, "rewards/margins": 0.09557852149009705, "rewards/rejected": -0.15985234081745148, "step": 298 }, { "epoch": 0.28, "grad_norm": 26.353691719150493, "learning_rate": 4.94673708670734e-07, "logps/chosen": -36.778892517089844, "logps/rejected": -45.56700897216797, "loss": 0.6771, "losses/dpo": 0.5864163637161255, "losses/sft": 0.592175304889679, "losses/total": 0.5864163637161255, "ref_logps/chosen": -35.71586227416992, "ref_logps/rejected": -44.07836151123047, "rewards/accuracies": 0.625, "rewards/chosen": -0.10630284249782562, "rewards/margins": 0.042561475187540054, "rewards/rejected": -0.14886432886123657, "step": 299 }, { "epoch": 0.28, "grad_norm": 18.299106461516487, "learning_rate": 4.946212930874549e-07, "logps/chosen": -21.59851837158203, "logps/rejected": -30.120098114013672, "loss": 0.6591, "losses/dpo": 0.48632824420928955, "losses/sft": 0.8194484114646912, "losses/total": 0.48632824420928955, "ref_logps/chosen": -21.139842987060547, "ref_logps/rejected": -28.881322860717773, "rewards/accuracies": 0.625, "rewards/chosen": -0.045867715030908585, "rewards/margins": 0.07800982147455215, "rewards/rejected": -0.12387753278017044, "step": 300 }, { "epoch": 0.28, "grad_norm": 22.518995724228606, "learning_rate": 4.945686236611554e-07, "logps/chosen": -28.203144073486328, "logps/rejected": -35.11069107055664, "loss": 0.6567, "losses/dpo": 0.5163403153419495, "losses/sft": 0.9409673810005188, "losses/total": 0.5163403153419495, "ref_logps/chosen": -27.42251968383789, "ref_logps/rejected": -33.39204025268555, "rewards/accuracies": 0.625, "rewards/chosen": -0.07806266844272614, "rewards/margins": 0.0938023179769516, "rewards/rejected": -0.17186498641967773, "step": 301 }, { "epoch": 0.28, "grad_norm": 22.31376708107806, "learning_rate": 4.945157004464904e-07, "logps/chosen": -35.12454605102539, "logps/rejected": -37.700584411621094, "loss": 0.644, "losses/dpo": 0.6639184355735779, "losses/sft": 0.8836156129837036, "losses/total": 0.6639184355735779, "ref_logps/chosen": -34.40534210205078, "ref_logps/rejected": -35.803985595703125, "rewards/accuracies": 0.75, "rewards/chosen": -0.07192045450210571, "rewards/margins": 0.11773946136236191, "rewards/rejected": -0.1896599382162094, "step": 302 }, { "epoch": 0.29, "grad_norm": 23.0120669389034, "learning_rate": 4.944625234983784e-07, "logps/chosen": -34.586666107177734, "logps/rejected": -37.47782897949219, "loss": 0.6776, "losses/dpo": 0.7501705884933472, "losses/sft": 1.3654640913009644, "losses/total": 0.7501705884933472, "ref_logps/chosen": -33.764427185058594, "ref_logps/rejected": -36.16032409667969, "rewards/accuracies": 0.625, "rewards/chosen": -0.08222386986017227, "rewards/margins": 0.0495268851518631, "rewards/rejected": -0.13175076246261597, "step": 303 }, { "epoch": 0.29, "grad_norm": 22.375591021865738, "learning_rate": 4.944090928720009e-07, "logps/chosen": -39.53582763671875, "logps/rejected": -36.38225555419922, "loss": 0.6503, "losses/dpo": 0.6250157952308655, "losses/sft": 0.38218215107917786, "losses/total": 0.6250157952308655, "ref_logps/chosen": -38.929996490478516, "ref_logps/rejected": -34.771392822265625, "rewards/accuracies": 0.5, "rewards/chosen": -0.060583263635635376, "rewards/margins": 0.10050256550312042, "rewards/rejected": -0.1610858142375946, "step": 304 }, { "epoch": 0.29, "grad_norm": 19.861315777827567, "learning_rate": 4.943554086228026e-07, "logps/chosen": -23.777257919311523, "logps/rejected": -47.453369140625, "loss": 0.598, "losses/dpo": 0.574725329875946, "losses/sft": 1.1324666738510132, "losses/total": 0.574725329875946, "ref_logps/chosen": -23.488222122192383, "ref_logps/rejected": -45.053016662597656, "rewards/accuracies": 0.875, "rewards/chosen": -0.028903471305966377, "rewards/margins": 0.211131751537323, "rewards/rejected": -0.24003523588180542, "step": 305 }, { "epoch": 0.29, "grad_norm": 22.160480718355725, "learning_rate": 4.943014708064917e-07, "logps/chosen": -25.936843872070312, "logps/rejected": -40.47211456298828, "loss": 0.645, "losses/dpo": 0.6859869956970215, "losses/sft": 0.5956947207450867, "losses/total": 0.6859869956970215, "ref_logps/chosen": -25.223644256591797, "ref_logps/rejected": -38.71336364746094, "rewards/accuracies": 0.75, "rewards/chosen": -0.07131989300251007, "rewards/margins": 0.10455510020256042, "rewards/rejected": -0.1758749783039093, "step": 306 }, { "epoch": 0.29, "grad_norm": 20.617889017475537, "learning_rate": 4.942472794790395e-07, "logps/chosen": -34.36223602294922, "logps/rejected": -36.1309700012207, "loss": 0.6218, "losses/dpo": 0.5656472444534302, "losses/sft": 0.6162394285202026, "losses/total": 0.5656472444534302, "ref_logps/chosen": -33.75431442260742, "ref_logps/rejected": -33.792694091796875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.060792386531829834, "rewards/margins": 0.17303550243377686, "rewards/rejected": -0.2338278889656067, "step": 307 }, { "epoch": 0.29, "grad_norm": 19.814315565736177, "learning_rate": 4.941928346966801e-07, "logps/chosen": -26.704784393310547, "logps/rejected": -26.282907485961914, "loss": 0.642, "losses/dpo": 0.5523932576179504, "losses/sft": 0.9972934126853943, "losses/total": 0.5523932576179504, "ref_logps/chosen": -26.221298217773438, "ref_logps/rejected": -24.57244110107422, "rewards/accuracies": 0.6875, "rewards/chosen": -0.048348572105169296, "rewards/margins": 0.1226980984210968, "rewards/rejected": -0.1710466593503952, "step": 308 }, { "epoch": 0.29, "grad_norm": 22.031069561280123, "learning_rate": 4.941381365159105e-07, "logps/chosen": -31.649456024169922, "logps/rejected": -33.956703186035156, "loss": 0.6463, "losses/dpo": 0.6583370566368103, "losses/sft": 1.2220991849899292, "losses/total": 0.6583370566368103, "ref_logps/chosen": -31.585472106933594, "ref_logps/rejected": -32.85108947753906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.006398579105734825, "rewards/margins": 0.10416257381439209, "rewards/rejected": -0.11056114733219147, "step": 309 }, { "epoch": 0.29, "grad_norm": 21.37314664102493, "learning_rate": 4.940831849934912e-07, "logps/chosen": -31.13311767578125, "logps/rejected": -31.933443069458008, "loss": 0.6293, "losses/dpo": 0.6399639248847961, "losses/sft": 1.1167281866073608, "losses/total": 0.6399639248847961, "ref_logps/chosen": -30.77181053161621, "ref_logps/rejected": -30.142623901367188, "rewards/accuracies": 0.75, "rewards/chosen": -0.03613080084323883, "rewards/margins": 0.1429510861635208, "rewards/rejected": -0.17908188700675964, "step": 310 }, { "epoch": 0.29, "grad_norm": 22.206264771175878, "learning_rate": 4.940279801864453e-07, "logps/chosen": -29.482906341552734, "logps/rejected": -38.708885192871094, "loss": 0.6518, "losses/dpo": 0.7155402898788452, "losses/sft": 0.24191422760486603, "losses/total": 0.7155402898788452, "ref_logps/chosen": -29.34610366821289, "ref_logps/rejected": -37.61125564575195, "rewards/accuracies": 0.75, "rewards/chosen": -0.013680499978363514, "rewards/margins": 0.09608308225870132, "rewards/rejected": -0.10976358503103256, "step": 311 }, { "epoch": 0.29, "grad_norm": 22.483609200502432, "learning_rate": 4.939725221520585e-07, "logps/chosen": -30.208377838134766, "logps/rejected": -43.22770690917969, "loss": 0.6756, "losses/dpo": 0.6389274597167969, "losses/sft": 1.6329330205917358, "losses/total": 0.6389274597167969, "ref_logps/chosen": -29.39089584350586, "ref_logps/rejected": -42.01932144165039, "rewards/accuracies": 0.625, "rewards/chosen": -0.0817483440041542, "rewards/margins": 0.03909039497375488, "rewards/rejected": -0.12083873897790909, "step": 312 }, { "epoch": 0.3, "grad_norm": 24.344026893552485, "learning_rate": 4.939168109478798e-07, "logps/chosen": -31.779268264770508, "logps/rejected": -43.88080596923828, "loss": 0.6729, "losses/dpo": 0.578009307384491, "losses/sft": 0.7000105977058411, "losses/total": 0.578009307384491, "ref_logps/chosen": -30.842700958251953, "ref_logps/rejected": -42.42404556274414, "rewards/accuracies": 0.625, "rewards/chosen": -0.0936567634344101, "rewards/margins": 0.05201949551701546, "rewards/rejected": -0.14567625522613525, "step": 313 }, { "epoch": 0.3, "grad_norm": 19.434135610580682, "learning_rate": 4.938608466317201e-07, "logps/chosen": -28.015228271484375, "logps/rejected": -33.576271057128906, "loss": 0.6353, "losses/dpo": 0.6067282557487488, "losses/sft": 0.5886077284812927, "losses/total": 0.6067282557487488, "ref_logps/chosen": -27.428129196166992, "ref_logps/rejected": -31.645912170410156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05871008336544037, "rewards/margins": 0.13432598114013672, "rewards/rejected": -0.1930360496044159, "step": 314 }, { "epoch": 0.3, "grad_norm": 25.941308419957345, "learning_rate": 4.93804629261654e-07, "logps/chosen": -43.294071197509766, "logps/rejected": -29.23244857788086, "loss": 0.7167, "losses/dpo": 0.6054716110229492, "losses/sft": 1.0329703092575073, "losses/total": 0.6054716110229492, "ref_logps/chosen": -42.245086669921875, "ref_logps/rejected": -28.59295654296875, "rewards/accuracies": 0.5, "rewards/chosen": -0.10489820688962936, "rewards/margins": -0.04094903543591499, "rewards/rejected": -0.06394916772842407, "step": 315 }, { "epoch": 0.3, "grad_norm": 21.12485617072488, "learning_rate": 4.937481588960178e-07, "logps/chosen": -27.796337127685547, "logps/rejected": -31.675106048583984, "loss": 0.6726, "losses/dpo": 0.46666890382766724, "losses/sft": 0.8825459480285645, "losses/total": 0.46666890382766724, "ref_logps/chosen": -26.959197998046875, "ref_logps/rejected": -30.302654266357422, "rewards/accuracies": 0.625, "rewards/chosen": -0.08371385931968689, "rewards/margins": 0.0535312183201313, "rewards/rejected": -0.1372450888156891, "step": 316 }, { "epoch": 0.3, "grad_norm": 21.60368038208307, "learning_rate": 4.936914355934108e-07, "logps/chosen": -27.863588333129883, "logps/rejected": -39.111812591552734, "loss": 0.6399, "losses/dpo": 0.6330317258834839, "losses/sft": 0.5755366683006287, "losses/total": 0.6330317258834839, "ref_logps/chosen": -27.292510986328125, "ref_logps/rejected": -37.27519226074219, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0571078397333622, "rewards/margins": 0.12655451893806458, "rewards/rejected": -0.18366235494613647, "step": 317 }, { "epoch": 0.3, "grad_norm": 23.652734271588947, "learning_rate": 4.936344594126948e-07, "logps/chosen": -28.97354507446289, "logps/rejected": -26.54882049560547, "loss": 0.6719, "losses/dpo": 0.6724902987480164, "losses/sft": 0.20840932428836823, "losses/total": 0.6724902987480164, "ref_logps/chosen": -28.34224510192871, "ref_logps/rejected": -25.36941909790039, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0631301999092102, "rewards/margins": 0.05481010675430298, "rewards/rejected": -0.11794030666351318, "step": 318 }, { "epoch": 0.3, "grad_norm": 19.226647729497884, "learning_rate": 4.935772304129936e-07, "logps/chosen": -32.154205322265625, "logps/rejected": -28.818195343017578, "loss": 0.6614, "losses/dpo": 0.611913800239563, "losses/sft": 0.7640294432640076, "losses/total": 0.611913800239563, "ref_logps/chosen": -31.538658142089844, "ref_logps/rejected": -27.368732452392578, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06155489385128021, "rewards/margins": 0.0833914577960968, "rewards/rejected": -0.14494633674621582, "step": 319 }, { "epoch": 0.3, "grad_norm": 21.526140466863062, "learning_rate": 4.935197486536936e-07, "logps/chosen": -24.21915054321289, "logps/rejected": -37.06775665283203, "loss": 0.642, "losses/dpo": 0.5636464357376099, "losses/sft": 0.2847607433795929, "losses/total": 0.5636464357376099, "ref_logps/chosen": -24.08651351928711, "ref_logps/rejected": -35.81602478027344, "rewards/accuracies": 0.625, "rewards/chosen": -0.013263700529932976, "rewards/margins": 0.11190967261791229, "rewards/rejected": -0.12517337501049042, "step": 320 }, { "epoch": 0.3, "grad_norm": 20.371976627561217, "learning_rate": 4.934620141944439e-07, "logps/chosen": -23.84784698486328, "logps/rejected": -40.762420654296875, "loss": 0.6434, "losses/dpo": 0.6883155107498169, "losses/sft": 1.085560917854309, "losses/total": 0.6883155107498169, "ref_logps/chosen": -23.035884857177734, "ref_logps/rejected": -38.88675308227539, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08119606226682663, "rewards/margins": 0.10637056827545166, "rewards/rejected": -0.1875666379928589, "step": 321 }, { "epoch": 0.3, "grad_norm": 20.952937196932723, "learning_rate": 4.934040270951551e-07, "logps/chosen": -28.229188919067383, "logps/rejected": -38.7379150390625, "loss": 0.6001, "losses/dpo": 0.7090763449668884, "losses/sft": 1.7998558282852173, "losses/total": 0.7090763449668884, "ref_logps/chosen": -28.017086029052734, "ref_logps/rejected": -36.483211517333984, "rewards/accuracies": 0.9375, "rewards/chosen": -0.021210264414548874, "rewards/margins": 0.20426003634929657, "rewards/rejected": -0.22547030448913574, "step": 322 }, { "epoch": 0.3, "grad_norm": 22.888453046657215, "learning_rate": 4.933457874160003e-07, "logps/chosen": -38.59470748901367, "logps/rejected": -39.38153076171875, "loss": 0.6302, "losses/dpo": 0.612696647644043, "losses/sft": 0.9638804197311401, "losses/total": 0.612696647644043, "ref_logps/chosen": -38.12613296508789, "ref_logps/rejected": -37.460777282714844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.046857550740242004, "rewards/margins": 0.14521756768226624, "rewards/rejected": -0.19207511842250824, "step": 323 }, { "epoch": 0.31, "grad_norm": 20.15189988153512, "learning_rate": 4.932872952174147e-07, "logps/chosen": -29.113964080810547, "logps/rejected": -29.47580337524414, "loss": 0.5898, "losses/dpo": 0.5897672772407532, "losses/sft": 0.43303316831588745, "losses/total": 0.5897672772407532, "ref_logps/chosen": -28.613603591918945, "ref_logps/rejected": -26.561065673828125, "rewards/accuracies": 0.75, "rewards/chosen": -0.050036147236824036, "rewards/margins": 0.2414378970861435, "rewards/rejected": -0.29147404432296753, "step": 324 }, { "epoch": 0.31, "grad_norm": 22.005258728098354, "learning_rate": 4.932285505600958e-07, "logps/chosen": -29.978862762451172, "logps/rejected": -30.663246154785156, "loss": 0.6614, "losses/dpo": 0.45733416080474854, "losses/sft": 1.0104585886001587, "losses/total": 0.45733416080474854, "ref_logps/chosen": -29.111835479736328, "ref_logps/rejected": -28.93870735168457, "rewards/accuracies": 0.625, "rewards/chosen": -0.08670292794704437, "rewards/margins": 0.08575104176998138, "rewards/rejected": -0.17245396971702576, "step": 325 }, { "epoch": 0.31, "grad_norm": 23.87567822246727, "learning_rate": 4.931695535050025e-07, "logps/chosen": -29.891597747802734, "logps/rejected": -37.65968322753906, "loss": 0.6753, "losses/dpo": 0.8406550884246826, "losses/sft": 1.450574517250061, "losses/total": 0.8406550884246826, "ref_logps/chosen": -28.881771087646484, "ref_logps/rejected": -36.14826583862305, "rewards/accuracies": 0.625, "rewards/chosen": -0.10098274052143097, "rewards/margins": 0.05015882849693298, "rewards/rejected": -0.15114158391952515, "step": 326 }, { "epoch": 0.31, "grad_norm": 21.08325591205674, "learning_rate": 4.931103041133561e-07, "logps/chosen": -27.86251449584961, "logps/rejected": -35.936126708984375, "loss": 0.6553, "losses/dpo": 0.6810934543609619, "losses/sft": 0.33890751004219055, "losses/total": 0.6810934543609619, "ref_logps/chosen": -27.215938568115234, "ref_logps/rejected": -34.38379669189453, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06465773284435272, "rewards/margins": 0.09057535231113434, "rewards/rejected": -0.15523308515548706, "step": 327 }, { "epoch": 0.31, "grad_norm": 21.574481613868496, "learning_rate": 4.930508024466394e-07, "logps/chosen": -26.817676544189453, "logps/rejected": -30.71657371520996, "loss": 0.6385, "losses/dpo": 0.46618688106536865, "losses/sft": 1.0140101909637451, "losses/total": 0.46618688106536865, "ref_logps/chosen": -25.90378189086914, "ref_logps/rejected": -28.47583770751953, "rewards/accuracies": 0.75, "rewards/chosen": -0.09138955175876617, "rewards/margins": 0.13268378376960754, "rewards/rejected": -0.22407332062721252, "step": 328 }, { "epoch": 0.31, "grad_norm": 20.05548857008018, "learning_rate": 4.929910485665974e-07, "logps/chosen": -28.689884185791016, "logps/rejected": -33.73432922363281, "loss": 0.643, "losses/dpo": 0.5967938303947449, "losses/sft": 0.9277857542037964, "losses/total": 0.5967938303947449, "ref_logps/chosen": -28.531299591064453, "ref_logps/rejected": -32.32073974609375, "rewards/accuracies": 0.75, "rewards/chosen": -0.015858784317970276, "rewards/margins": 0.12550042569637299, "rewards/rejected": -0.14135921001434326, "step": 329 }, { "epoch": 0.31, "grad_norm": 22.892443121790055, "learning_rate": 4.929310425352364e-07, "logps/chosen": -30.159645080566406, "logps/rejected": -39.03651428222656, "loss": 0.6366, "losses/dpo": 0.7379928827285767, "losses/sft": 0.747988760471344, "losses/total": 0.7379928827285767, "ref_logps/chosen": -29.618112564086914, "ref_logps/rejected": -37.10527038574219, "rewards/accuracies": 0.5, "rewards/chosen": -0.0541532039642334, "rewards/margins": 0.13897082209587097, "rewards/rejected": -0.19312402606010437, "step": 330 }, { "epoch": 0.31, "grad_norm": 16.709135898896626, "learning_rate": 4.928707844148246e-07, "logps/chosen": -20.443862915039062, "logps/rejected": -29.970348358154297, "loss": 0.5978, "losses/dpo": 0.3900461196899414, "losses/sft": 0.8531772494316101, "losses/total": 0.3900461196899414, "ref_logps/chosen": -20.389812469482422, "ref_logps/rejected": -27.734498977661133, "rewards/accuracies": 0.8125, "rewards/chosen": -0.005405198782682419, "rewards/margins": 0.21817973256111145, "rewards/rejected": -0.22358492016792297, "step": 331 }, { "epoch": 0.31, "grad_norm": 25.312820932668977, "learning_rate": 4.928102742678917e-07, "logps/chosen": -36.106727600097656, "logps/rejected": -37.48309326171875, "loss": 0.6396, "losses/dpo": 0.6307786107063293, "losses/sft": 0.8544961810112, "losses/total": 0.6307786107063293, "ref_logps/chosen": -35.81026840209961, "ref_logps/rejected": -35.69643783569336, "rewards/accuracies": 0.75, "rewards/chosen": -0.029645886272192, "rewards/margins": 0.14902010560035706, "rewards/rejected": -0.17866599559783936, "step": 332 }, { "epoch": 0.31, "grad_norm": 26.93606784705199, "learning_rate": 4.92749512157229e-07, "logps/chosen": -48.02074432373047, "logps/rejected": -44.20256042480469, "loss": 0.6542, "losses/dpo": 0.5529283881187439, "losses/sft": 1.1212422847747803, "losses/total": 0.5529283881187439, "ref_logps/chosen": -46.361236572265625, "ref_logps/rejected": -41.513214111328125, "rewards/accuracies": 0.75, "rewards/chosen": -0.16595053672790527, "rewards/margins": 0.10298405587673187, "rewards/rejected": -0.26893457770347595, "step": 333 }, { "epoch": 0.32, "grad_norm": 20.571018704723286, "learning_rate": 4.926884981458892e-07, "logps/chosen": -28.564186096191406, "logps/rejected": -33.831459045410156, "loss": 0.6358, "losses/dpo": 0.778003454208374, "losses/sft": 1.62531316280365, "losses/total": 0.778003454208374, "ref_logps/chosen": -28.233654022216797, "ref_logps/rejected": -32.102394104003906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03305342048406601, "rewards/margins": 0.1398533284664154, "rewards/rejected": -0.17290674149990082, "step": 334 }, { "epoch": 0.32, "grad_norm": 20.006909125800608, "learning_rate": 4.926272322971864e-07, "logps/chosen": -22.725887298583984, "logps/rejected": -25.691255569458008, "loss": 0.6708, "losses/dpo": 0.6063663959503174, "losses/sft": 0.3209933042526245, "losses/total": 0.6063663959503174, "ref_logps/chosen": -22.111019134521484, "ref_logps/rejected": -24.50396728515625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.061486780643463135, "rewards/margins": 0.0572420135140419, "rewards/rejected": -0.11872879415750504, "step": 335 }, { "epoch": 0.32, "grad_norm": 22.818984445770273, "learning_rate": 4.925657146746961e-07, "logps/chosen": -34.036319732666016, "logps/rejected": -29.678232192993164, "loss": 0.6293, "losses/dpo": 0.603499174118042, "losses/sft": 0.4761703610420227, "losses/total": 0.603499174118042, "ref_logps/chosen": -33.928443908691406, "ref_logps/rejected": -28.152462005615234, "rewards/accuracies": 0.8125, "rewards/chosen": -0.010787703096866608, "rewards/margins": 0.14178921282291412, "rewards/rejected": -0.15257692337036133, "step": 336 }, { "epoch": 0.32, "grad_norm": 21.90574133677782, "learning_rate": 4.92503945342255e-07, "logps/chosen": -32.885650634765625, "logps/rejected": -37.48805618286133, "loss": 0.6223, "losses/dpo": 0.7742347717285156, "losses/sft": 0.7779593467712402, "losses/total": 0.7742347717285156, "ref_logps/chosen": -32.380706787109375, "ref_logps/rejected": -35.313682556152344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0504942461848259, "rewards/margins": 0.16694283485412598, "rewards/rejected": -0.21743707358837128, "step": 337 }, { "epoch": 0.32, "grad_norm": 20.50808868717749, "learning_rate": 4.924419243639608e-07, "logps/chosen": -29.422962188720703, "logps/rejected": -30.132827758789062, "loss": 0.6488, "losses/dpo": 0.6402976512908936, "losses/sft": 1.3116710186004639, "losses/total": 0.6402976512908936, "ref_logps/chosen": -29.11081886291504, "ref_logps/rejected": -28.82525634765625, "rewards/accuracies": 0.75, "rewards/chosen": -0.031214524060487747, "rewards/margins": 0.09954274445772171, "rewards/rejected": -0.13075727224349976, "step": 338 }, { "epoch": 0.32, "grad_norm": 24.315424097289164, "learning_rate": 4.923796518041729e-07, "logps/chosen": -36.905792236328125, "logps/rejected": -51.360023498535156, "loss": 0.6068, "losses/dpo": 0.7009338140487671, "losses/sft": 0.7586531639099121, "losses/total": 0.7009338140487671, "ref_logps/chosen": -35.44159698486328, "ref_logps/rejected": -47.79114532470703, "rewards/accuracies": 0.875, "rewards/chosen": -0.14641912281513214, "rewards/margins": 0.21046891808509827, "rewards/rejected": -0.3568880558013916, "step": 339 }, { "epoch": 0.32, "grad_norm": 18.954901986651677, "learning_rate": 4.923171277275112e-07, "logps/chosen": -23.948055267333984, "logps/rejected": -32.614933013916016, "loss": 0.6224, "losses/dpo": 0.6807696223258972, "losses/sft": 0.5366243720054626, "losses/total": 0.6807696223258972, "ref_logps/chosen": -23.62971305847168, "ref_logps/rejected": -30.60293960571289, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03183414414525032, "rewards/margins": 0.16936522722244263, "rewards/rejected": -0.20119938254356384, "step": 340 }, { "epoch": 0.32, "grad_norm": 18.756688075170885, "learning_rate": 4.922543521988568e-07, "logps/chosen": -21.259384155273438, "logps/rejected": -30.51002311706543, "loss": 0.5867, "losses/dpo": 0.5704569220542908, "losses/sft": 0.6182830333709717, "losses/total": 0.5704569220542908, "ref_logps/chosen": -20.93194580078125, "ref_logps/rejected": -27.712596893310547, "rewards/accuracies": 0.875, "rewards/chosen": -0.032743826508522034, "rewards/margins": 0.24699872732162476, "rewards/rejected": -0.2797425389289856, "step": 341 }, { "epoch": 0.32, "grad_norm": 22.36115212065729, "learning_rate": 4.921913252833518e-07, "logps/chosen": -29.793842315673828, "logps/rejected": -41.93769836425781, "loss": 0.6151, "losses/dpo": 0.7418943047523499, "losses/sft": 1.5774043798446655, "losses/total": 0.7418943047523499, "ref_logps/chosen": -28.940719604492188, "ref_logps/rejected": -39.275169372558594, "rewards/accuracies": 0.75, "rewards/chosen": -0.08531221002340317, "rewards/margins": 0.1809406280517578, "rewards/rejected": -0.2662528455257416, "step": 342 }, { "epoch": 0.32, "grad_norm": 20.69535930248303, "learning_rate": 4.921280470463991e-07, "logps/chosen": -25.85882568359375, "logps/rejected": -27.02472686767578, "loss": 0.7014, "losses/dpo": 0.9860588908195496, "losses/sft": 1.1752763986587524, "losses/total": 0.9860588908195496, "ref_logps/chosen": -24.824512481689453, "ref_logps/rejected": -26.057353973388672, "rewards/accuracies": 0.4375, "rewards/chosen": -0.10343119502067566, "rewards/margins": -0.006693903356790543, "rewards/rejected": -0.09673729538917542, "step": 343 }, { "epoch": 0.32, "grad_norm": 21.070402776682286, "learning_rate": 4.920645175536624e-07, "logps/chosen": -22.93651008605957, "logps/rejected": -39.084228515625, "loss": 0.6259, "losses/dpo": 0.6189451217651367, "losses/sft": 0.8426660895347595, "losses/total": 0.6189451217651367, "ref_logps/chosen": -22.367271423339844, "ref_logps/rejected": -36.81373977661133, "rewards/accuracies": 0.75, "rewards/chosen": -0.05692403018474579, "rewards/margins": 0.17012466490268707, "rewards/rejected": -0.22704866528511047, "step": 344 }, { "epoch": 0.33, "grad_norm": 19.527290729236462, "learning_rate": 4.920007368710661e-07, "logps/chosen": -22.875534057617188, "logps/rejected": -31.199995040893555, "loss": 0.6687, "losses/dpo": 0.7625980973243713, "losses/sft": 0.793837308883667, "losses/total": 0.7625980973243713, "ref_logps/chosen": -22.083959579467773, "ref_logps/rejected": -29.65519142150879, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07915763556957245, "rewards/margins": 0.07532265782356262, "rewards/rejected": -0.15448030829429626, "step": 345 }, { "epoch": 0.33, "grad_norm": 19.540246972682006, "learning_rate": 4.919367050647952e-07, "logps/chosen": -25.21356201171875, "logps/rejected": -35.31367492675781, "loss": 0.5971, "losses/dpo": 0.6033971905708313, "losses/sft": 0.2077522575855255, "losses/total": 0.6033971905708313, "ref_logps/chosen": -24.7724609375, "ref_logps/rejected": -32.76677322387695, "rewards/accuracies": 0.875, "rewards/chosen": -0.044110022485256195, "rewards/margins": 0.2105797976255417, "rewards/rejected": -0.2546898126602173, "step": 346 }, { "epoch": 0.33, "grad_norm": 22.382189953956374, "learning_rate": 4.918724222012955e-07, "logps/chosen": -43.42041778564453, "logps/rejected": -46.331817626953125, "loss": 0.6287, "losses/dpo": 0.6770682334899902, "losses/sft": 0.4867769777774811, "losses/total": 0.6770682334899902, "ref_logps/chosen": -42.11416244506836, "ref_logps/rejected": -43.410911560058594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13062545657157898, "rewards/margins": 0.16146522760391235, "rewards/rejected": -0.2920907139778137, "step": 347 }, { "epoch": 0.33, "grad_norm": 22.242199050651724, "learning_rate": 4.918078883472733e-07, "logps/chosen": -35.13628387451172, "logps/rejected": -32.658042907714844, "loss": 0.6621, "losses/dpo": 0.591223955154419, "losses/sft": 1.0511010885238647, "losses/total": 0.591223955154419, "ref_logps/chosen": -34.32464599609375, "ref_logps/rejected": -31.109832763671875, "rewards/accuracies": 0.625, "rewards/chosen": -0.08116337656974792, "rewards/margins": 0.07365753501653671, "rewards/rejected": -0.15482088923454285, "step": 348 }, { "epoch": 0.33, "grad_norm": 20.6054107623057, "learning_rate": 4.91743103569695e-07, "logps/chosen": -28.865970611572266, "logps/rejected": -31.758888244628906, "loss": 0.661, "losses/dpo": 0.8095197081565857, "losses/sft": 1.384610652923584, "losses/total": 0.8095197081565857, "ref_logps/chosen": -27.955198287963867, "ref_logps/rejected": -29.96941375732422, "rewards/accuracies": 0.625, "rewards/chosen": -0.09107738733291626, "rewards/margins": 0.08787043392658234, "rewards/rejected": -0.1789478063583374, "step": 349 }, { "epoch": 0.33, "grad_norm": 18.930333914550076, "learning_rate": 4.916780679357879e-07, "logps/chosen": -21.265872955322266, "logps/rejected": -29.59015655517578, "loss": 0.6196, "losses/dpo": 0.6998823285102844, "losses/sft": 0.3582296073436737, "losses/total": 0.6998823285102844, "ref_logps/chosen": -21.168773651123047, "ref_logps/rejected": -27.795654296875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.009709915146231651, "rewards/margins": 0.16974017024040222, "rewards/rejected": -0.17945007979869843, "step": 350 }, { "epoch": 0.33, "grad_norm": 21.87298611919371, "learning_rate": 4.916127815130389e-07, "logps/chosen": -33.5753059387207, "logps/rejected": -42.552001953125, "loss": 0.6034, "losses/dpo": 0.48670005798339844, "losses/sft": 0.9643361568450928, "losses/total": 0.48670005798339844, "ref_logps/chosen": -32.70741271972656, "ref_logps/rejected": -39.59663009643555, "rewards/accuracies": 0.875, "rewards/chosen": -0.08678925782442093, "rewards/margins": 0.20874816179275513, "rewards/rejected": -0.29553741216659546, "step": 351 }, { "epoch": 0.33, "grad_norm": 21.011467860377113, "learning_rate": 4.915472443691962e-07, "logps/chosen": -26.822532653808594, "logps/rejected": -38.5020866394043, "loss": 0.6152, "losses/dpo": 0.6887233853340149, "losses/sft": 0.8604673743247986, "losses/total": 0.6887233853340149, "ref_logps/chosen": -26.62289047241211, "ref_logps/rejected": -36.5163459777832, "rewards/accuracies": 0.8125, "rewards/chosen": -0.019963964819908142, "rewards/margins": 0.17861002683639526, "rewards/rejected": -0.1985739916563034, "step": 352 }, { "epoch": 0.33, "grad_norm": 21.01108404473789, "learning_rate": 4.91481456572267e-07, "logps/chosen": -28.389314651489258, "logps/rejected": -33.44404983520508, "loss": 0.6119, "losses/dpo": 0.5023601651191711, "losses/sft": 0.819004476070404, "losses/total": 0.5023601651191711, "ref_logps/chosen": -28.338178634643555, "ref_logps/rejected": -31.602689743041992, "rewards/accuracies": 0.875, "rewards/chosen": -0.005113763734698296, "rewards/margins": 0.17902211844921112, "rewards/rejected": -0.18413589894771576, "step": 353 }, { "epoch": 0.33, "grad_norm": 19.59913128015621, "learning_rate": 4.914154181905196e-07, "logps/chosen": -21.237192153930664, "logps/rejected": -44.233314514160156, "loss": 0.5399, "losses/dpo": 0.5103285908699036, "losses/sft": 0.7843578457832336, "losses/total": 0.5103285908699036, "ref_logps/chosen": -21.15270233154297, "ref_logps/rejected": -40.606903076171875, "rewards/accuracies": 0.875, "rewards/chosen": -0.00844912976026535, "rewards/margins": 0.35419172048568726, "rewards/rejected": -0.3626408576965332, "step": 354 }, { "epoch": 0.33, "grad_norm": 20.341603342766376, "learning_rate": 4.913491292924816e-07, "logps/chosen": -29.651721954345703, "logps/rejected": -36.022789001464844, "loss": 0.5934, "losses/dpo": 0.5936194062232971, "losses/sft": 0.3208937346935272, "losses/total": 0.5936194062232971, "ref_logps/chosen": -28.877164840698242, "ref_logps/rejected": -32.828731536865234, "rewards/accuracies": 0.75, "rewards/chosen": -0.07745575159788132, "rewards/margins": 0.24195009469985962, "rewards/rejected": -0.31940585374832153, "step": 355 }, { "epoch": 0.34, "grad_norm": 22.64689588250176, "learning_rate": 4.912825899469409e-07, "logps/chosen": -28.617076873779297, "logps/rejected": -34.801597595214844, "loss": 0.6337, "losses/dpo": 0.7792518734931946, "losses/sft": 0.6841630339622498, "losses/total": 0.7792518734931946, "ref_logps/chosen": -27.3543701171875, "ref_logps/rejected": -32.17980194091797, "rewards/accuracies": 0.75, "rewards/chosen": -0.12627069652080536, "rewards/margins": 0.13590875267982483, "rewards/rejected": -0.262179434299469, "step": 356 }, { "epoch": 0.34, "grad_norm": 22.356444347933074, "learning_rate": 4.912158002229454e-07, "logps/chosen": -24.01513671875, "logps/rejected": -36.379356384277344, "loss": 0.6258, "losses/dpo": 0.5545159578323364, "losses/sft": 0.4065888524055481, "losses/total": 0.5545159578323364, "ref_logps/chosen": -23.286083221435547, "ref_logps/rejected": -34.051612854003906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07290515303611755, "rewards/margins": 0.1598687767982483, "rewards/rejected": -0.23277392983436584, "step": 357 }, { "epoch": 0.34, "grad_norm": 19.84673765520466, "learning_rate": 4.911487601898025e-07, "logps/chosen": -28.224044799804688, "logps/rejected": -43.66613006591797, "loss": 0.5902, "losses/dpo": 0.6823126673698425, "losses/sft": 1.3586193323135376, "losses/total": 0.6823126673698425, "ref_logps/chosen": -27.105056762695312, "ref_logps/rejected": -39.839385986328125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11189892888069153, "rewards/margins": 0.2707752585411072, "rewards/rejected": -0.3826741874217987, "step": 358 }, { "epoch": 0.34, "grad_norm": 23.411174135238443, "learning_rate": 4.910814699170797e-07, "logps/chosen": -28.63547134399414, "logps/rejected": -33.34358596801758, "loss": 0.6534, "losses/dpo": 0.5545798540115356, "losses/sft": 1.4980494976043701, "losses/total": 0.5545798540115356, "ref_logps/chosen": -28.078506469726562, "ref_logps/rejected": -31.829872131347656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.055696625262498856, "rewards/margins": 0.09567467868328094, "rewards/rejected": -0.1513713002204895, "step": 359 }, { "epoch": 0.34, "grad_norm": 20.74492686530194, "learning_rate": 4.910139294746037e-07, "logps/chosen": -28.581064224243164, "logps/rejected": -29.10474395751953, "loss": 0.6674, "losses/dpo": 0.6472219824790955, "losses/sft": 0.9012381434440613, "losses/total": 0.6472219824790955, "ref_logps/chosen": -27.4820499420166, "ref_logps/rejected": -27.389663696289062, "rewards/accuracies": 0.75, "rewards/chosen": -0.10990135371685028, "rewards/margins": 0.06160668283700943, "rewards/rejected": -0.1715080440044403, "step": 360 }, { "epoch": 0.34, "grad_norm": 21.90310701326177, "learning_rate": 4.909461389324614e-07, "logps/chosen": -36.66258239746094, "logps/rejected": -42.73207092285156, "loss": 0.6038, "losses/dpo": 0.8390791416168213, "losses/sft": 2.0136446952819824, "losses/total": 0.8390791416168213, "ref_logps/chosen": -35.49272155761719, "ref_logps/rejected": -39.329200744628906, "rewards/accuracies": 0.75, "rewards/chosen": -0.11698609590530396, "rewards/margins": 0.22330135107040405, "rewards/rejected": -0.340287446975708, "step": 361 }, { "epoch": 0.34, "grad_norm": 23.369356882888376, "learning_rate": 4.908780983609986e-07, "logps/chosen": -25.968961715698242, "logps/rejected": -39.226287841796875, "loss": 0.6502, "losses/dpo": 0.5260043144226074, "losses/sft": 0.906235933303833, "losses/total": 0.5260043144226074, "ref_logps/chosen": -24.765657424926758, "ref_logps/rejected": -36.80180358886719, "rewards/accuracies": 0.75, "rewards/chosen": -0.12033066898584366, "rewards/margins": 0.1221172958612442, "rewards/rejected": -0.24244795739650726, "step": 362 }, { "epoch": 0.34, "grad_norm": 21.569209975504368, "learning_rate": 4.908098078308211e-07, "logps/chosen": -23.701446533203125, "logps/rejected": -42.00910186767578, "loss": 0.6363, "losses/dpo": 0.672517716884613, "losses/sft": 0.38802841305732727, "losses/total": 0.672517716884613, "ref_logps/chosen": -22.558212280273438, "ref_logps/rejected": -39.50931167602539, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11432330310344696, "rewards/margins": 0.13565577566623688, "rewards/rejected": -0.24997907876968384, "step": 363 }, { "epoch": 0.34, "grad_norm": 19.282975524972933, "learning_rate": 4.907412674127937e-07, "logps/chosen": -21.713748931884766, "logps/rejected": -37.21073532104492, "loss": 0.6005, "losses/dpo": 0.5261354446411133, "losses/sft": 0.34772831201553345, "losses/total": 0.5261354446411133, "ref_logps/chosen": -21.729537963867188, "ref_logps/rejected": -35.15924072265625, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0015786569565534592, "rewards/margins": 0.20672793686389923, "rewards/rejected": -0.20514927804470062, "step": 364 }, { "epoch": 0.34, "grad_norm": 27.196155681928904, "learning_rate": 4.906724771780406e-07, "logps/chosen": -42.05462646484375, "logps/rejected": -42.02058792114258, "loss": 0.6588, "losses/dpo": 0.6260238289833069, "losses/sft": 0.5783752799034119, "losses/total": 0.6260238289833069, "ref_logps/chosen": -40.382137298583984, "ref_logps/rejected": -39.531158447265625, "rewards/accuracies": 0.625, "rewards/chosen": -0.1672491431236267, "rewards/margins": 0.08169403672218323, "rewards/rejected": -0.24894317984580994, "step": 365 }, { "epoch": 0.35, "grad_norm": 25.561367741588978, "learning_rate": 4.906034371979456e-07, "logps/chosen": -27.384708404541016, "logps/rejected": -54.75737380981445, "loss": 0.567, "losses/dpo": 0.7138622999191284, "losses/sft": 1.0324561595916748, "losses/total": 0.7138622999191284, "ref_logps/chosen": -26.283405303955078, "ref_logps/rejected": -50.76370620727539, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11013046652078629, "rewards/margins": 0.2892364263534546, "rewards/rejected": -0.3993668854236603, "step": 366 }, { "epoch": 0.35, "grad_norm": 19.30371927605576, "learning_rate": 4.905341475441509e-07, "logps/chosen": -28.31380271911621, "logps/rejected": -30.205793380737305, "loss": 0.5951, "losses/dpo": 0.7220141291618347, "losses/sft": 0.39221036434173584, "losses/total": 0.7220141291618347, "ref_logps/chosen": -27.780757904052734, "ref_logps/rejected": -27.158832550048828, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05330440029501915, "rewards/margins": 0.2513916492462158, "rewards/rejected": -0.30469608306884766, "step": 367 }, { "epoch": 0.35, "grad_norm": 20.516554760156364, "learning_rate": 4.904646082885586e-07, "logps/chosen": -28.83851432800293, "logps/rejected": -48.141502380371094, "loss": 0.5724, "losses/dpo": 0.5703908205032349, "losses/sft": 1.1562190055847168, "losses/total": 0.5703908205032349, "ref_logps/chosen": -28.091678619384766, "ref_logps/rejected": -44.18052673339844, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07468368113040924, "rewards/margins": 0.32141417264938354, "rewards/rejected": -0.3960978090763092, "step": 368 }, { "epoch": 0.35, "grad_norm": 27.539403846796322, "learning_rate": 4.903948195033293e-07, "logps/chosen": -41.79988098144531, "logps/rejected": -44.056968688964844, "loss": 0.7003, "losses/dpo": 1.1219971179962158, "losses/sft": 1.288548469543457, "losses/total": 1.1219971179962158, "ref_logps/chosen": -40.077579498291016, "ref_logps/rejected": -42.28183364868164, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17223018407821655, "rewards/margins": 0.005283500999212265, "rewards/rejected": -0.17751367390155792, "step": 369 }, { "epoch": 0.35, "grad_norm": 19.616042605892524, "learning_rate": 4.903247812608826e-07, "logps/chosen": -29.452102661132812, "logps/rejected": -43.17719268798828, "loss": 0.5688, "losses/dpo": 0.499711275100708, "losses/sft": 1.5470408201217651, "losses/total": 0.499711275100708, "ref_logps/chosen": -28.737245559692383, "ref_logps/rejected": -39.550048828125, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07148593664169312, "rewards/margins": 0.29122835397720337, "rewards/rejected": -0.3627142906188965, "step": 370 }, { "epoch": 0.35, "grad_norm": 25.251270077893974, "learning_rate": 4.902544936338973e-07, "logps/chosen": -31.543380737304688, "logps/rejected": -38.576377868652344, "loss": 0.6688, "losses/dpo": 0.7249348759651184, "losses/sft": 0.8469500541687012, "losses/total": 0.7249348759651184, "ref_logps/chosen": -29.963207244873047, "ref_logps/rejected": -36.29280090332031, "rewards/accuracies": 0.5625, "rewards/chosen": -0.15801742672920227, "rewards/margins": 0.07034019380807877, "rewards/rejected": -0.22835761308670044, "step": 371 }, { "epoch": 0.35, "grad_norm": 18.389489626110386, "learning_rate": 4.901839566953104e-07, "logps/chosen": -30.76398468017578, "logps/rejected": -32.75887680053711, "loss": 0.5465, "losses/dpo": 0.45750004053115845, "losses/sft": 1.0385549068450928, "losses/total": 0.45750004053115845, "ref_logps/chosen": -30.30224609375, "ref_logps/rejected": -28.93923568725586, "rewards/accuracies": 0.875, "rewards/chosen": -0.046173546463251114, "rewards/margins": 0.33579057455062866, "rewards/rejected": -0.3819641172885895, "step": 372 }, { "epoch": 0.35, "grad_norm": 21.259722804409623, "learning_rate": 4.901131705183182e-07, "logps/chosen": -32.806034088134766, "logps/rejected": -39.48468780517578, "loss": 0.6381, "losses/dpo": 0.526104211807251, "losses/sft": 1.3929250240325928, "losses/total": 0.526104211807251, "ref_logps/chosen": -31.515987396240234, "ref_logps/rejected": -36.76390838623047, "rewards/accuracies": 0.625, "rewards/chosen": -0.12900453805923462, "rewards/margins": 0.14307329058647156, "rewards/rejected": -0.2720778286457062, "step": 373 }, { "epoch": 0.35, "grad_norm": 22.459797319781558, "learning_rate": 4.900421351763754e-07, "logps/chosen": -32.5936164855957, "logps/rejected": -38.35536193847656, "loss": 0.6381, "losses/dpo": 0.8319301605224609, "losses/sft": 1.477744460105896, "losses/total": 0.8319301605224609, "ref_logps/chosen": -30.592288970947266, "ref_logps/rejected": -34.935577392578125, "rewards/accuracies": 0.5, "rewards/chosen": -0.20013296604156494, "rewards/margins": 0.14184492826461792, "rewards/rejected": -0.34197789430618286, "step": 374 }, { "epoch": 0.35, "grad_norm": 20.708217110527002, "learning_rate": 4.899708507431949e-07, "logps/chosen": -23.373992919921875, "logps/rejected": -41.4346923828125, "loss": 0.611, "losses/dpo": 0.5040609836578369, "losses/sft": 0.6924494504928589, "losses/total": 0.5040609836578369, "ref_logps/chosen": -22.32748794555664, "ref_logps/rejected": -38.145957946777344, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10465070605278015, "rewards/margins": 0.22422274947166443, "rewards/rejected": -0.3288734555244446, "step": 375 }, { "epoch": 0.35, "grad_norm": 19.723898429428072, "learning_rate": 4.898993172927487e-07, "logps/chosen": -32.310916900634766, "logps/rejected": -37.59693908691406, "loss": 0.6034, "losses/dpo": 0.6113296151161194, "losses/sft": 0.35360899567604065, "losses/total": 0.6113296151161194, "ref_logps/chosen": -31.012012481689453, "ref_logps/rejected": -33.99290466308594, "rewards/accuracies": 0.75, "rewards/chosen": -0.12989062070846558, "rewards/margins": 0.23051272332668304, "rewards/rejected": -0.3604033291339874, "step": 376 }, { "epoch": 0.36, "grad_norm": 21.157924687319255, "learning_rate": 4.898275348992668e-07, "logps/chosen": -31.026844024658203, "logps/rejected": -39.84944152832031, "loss": 0.6089, "losses/dpo": 0.5259171724319458, "losses/sft": 1.6123207807540894, "losses/total": 0.5259171724319458, "ref_logps/chosen": -28.94589614868164, "ref_logps/rejected": -35.73371124267578, "rewards/accuracies": 0.625, "rewards/chosen": -0.2080947756767273, "rewards/margins": 0.20347826182842255, "rewards/rejected": -0.41157305240631104, "step": 377 }, { "epoch": 0.36, "grad_norm": 19.606217502534523, "learning_rate": 4.897555036372377e-07, "logps/chosen": -25.448837280273438, "logps/rejected": -24.554874420166016, "loss": 0.6321, "losses/dpo": 0.6184150576591492, "losses/sft": 0.48345568776130676, "losses/total": 0.6184150576591492, "ref_logps/chosen": -24.030780792236328, "ref_logps/rejected": -21.768009185791016, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14180555939674377, "rewards/margins": 0.13688096404075623, "rewards/rejected": -0.2786865234375, "step": 378 }, { "epoch": 0.36, "grad_norm": 20.146247717433845, "learning_rate": 4.896832235814081e-07, "logps/chosen": -30.17314338684082, "logps/rejected": -29.44635772705078, "loss": 0.6634, "losses/dpo": 0.6128146052360535, "losses/sft": 0.18377666175365448, "losses/total": 0.6128146052360535, "ref_logps/chosen": -29.125961303710938, "ref_logps/rejected": -27.566429138183594, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10471824556589127, "rewards/margins": 0.08327483385801315, "rewards/rejected": -0.18799307942390442, "step": 379 }, { "epoch": 0.36, "grad_norm": 21.867836827194218, "learning_rate": 4.896106948067829e-07, "logps/chosen": -30.86905288696289, "logps/rejected": -28.35112762451172, "loss": 0.6572, "losses/dpo": 0.3719083070755005, "losses/sft": 1.5852382183074951, "losses/total": 0.3719083070755005, "ref_logps/chosen": -29.160322189331055, "ref_logps/rejected": -25.547664642333984, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17087286710739136, "rewards/margins": 0.10947339981794357, "rewards/rejected": -0.28034624457359314, "step": 380 }, { "epoch": 0.36, "grad_norm": 20.619033975328648, "learning_rate": 4.895379173886248e-07, "logps/chosen": -31.770299911499023, "logps/rejected": -32.53035354614258, "loss": 0.6429, "losses/dpo": 0.5595015287399292, "losses/sft": 0.7834732532501221, "losses/total": 0.5595015287399292, "ref_logps/chosen": -30.57036018371582, "ref_logps/rejected": -30.113784790039062, "rewards/accuracies": 0.625, "rewards/chosen": -0.11999401450157166, "rewards/margins": 0.1216631755232811, "rewards/rejected": -0.24165719747543335, "step": 381 }, { "epoch": 0.36, "grad_norm": 26.31611990135046, "learning_rate": 4.894648914024552e-07, "logps/chosen": -30.071426391601562, "logps/rejected": -31.989065170288086, "loss": 0.7042, "losses/dpo": 0.47972095012664795, "losses/sft": 0.23540224134922028, "losses/total": 0.47972095012664795, "ref_logps/chosen": -27.479122161865234, "ref_logps/rejected": -28.891719818115234, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2592305541038513, "rewards/margins": 0.05050404742360115, "rewards/rejected": -0.309734582901001, "step": 382 }, { "epoch": 0.36, "grad_norm": 22.18948145872108, "learning_rate": 4.893916169240526e-07, "logps/chosen": -30.423538208007812, "logps/rejected": -40.669063568115234, "loss": 0.6316, "losses/dpo": 0.7115437984466553, "losses/sft": 1.9202417135238647, "losses/total": 0.7115437984466553, "ref_logps/chosen": -29.228477478027344, "ref_logps/rejected": -38.00771713256836, "rewards/accuracies": 0.625, "rewards/chosen": -0.11950612813234329, "rewards/margins": 0.14662817120552063, "rewards/rejected": -0.2661343216896057, "step": 383 }, { "epoch": 0.36, "grad_norm": 20.71170202881059, "learning_rate": 4.893180940294541e-07, "logps/chosen": -27.962139129638672, "logps/rejected": -42.92897033691406, "loss": 0.5909, "losses/dpo": 0.8200230598449707, "losses/sft": 1.2890231609344482, "losses/total": 0.8200230598449707, "ref_logps/chosen": -26.298049926757812, "ref_logps/rejected": -38.309730529785156, "rewards/accuracies": 0.625, "rewards/chosen": -0.1664087027311325, "rewards/margins": 0.2955157160758972, "rewards/rejected": -0.4619244337081909, "step": 384 }, { "epoch": 0.36, "grad_norm": 23.00995881297325, "learning_rate": 4.892443227949543e-07, "logps/chosen": -36.78803253173828, "logps/rejected": -37.460968017578125, "loss": 0.6789, "losses/dpo": 0.6543883085250854, "losses/sft": 1.2758736610412598, "losses/total": 0.6543883085250854, "ref_logps/chosen": -34.90734100341797, "ref_logps/rejected": -35.177978515625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.18806922435760498, "rewards/margins": 0.04022980481386185, "rewards/rejected": -0.22829902172088623, "step": 385 }, { "epoch": 0.36, "grad_norm": 22.83399337316326, "learning_rate": 4.891703032971049e-07, "logps/chosen": -28.672210693359375, "logps/rejected": -42.1572380065918, "loss": 0.6308, "losses/dpo": 0.7403767108917236, "losses/sft": 1.7078191041946411, "losses/total": 0.7403767108917236, "ref_logps/chosen": -26.65664291381836, "ref_logps/rejected": -38.365379333496094, "rewards/accuracies": 0.75, "rewards/chosen": -0.20155677199363708, "rewards/margins": 0.17762956023216248, "rewards/rejected": -0.37918633222579956, "step": 386 }, { "epoch": 0.37, "grad_norm": 24.333825420787974, "learning_rate": 4.890960356127166e-07, "logps/chosen": -34.86188507080078, "logps/rejected": -35.90169906616211, "loss": 0.6409, "losses/dpo": 0.7884860634803772, "losses/sft": 1.6980806589126587, "losses/total": 0.7884860634803772, "ref_logps/chosen": -33.45568084716797, "ref_logps/rejected": -33.07782745361328, "rewards/accuracies": 0.75, "rewards/chosen": -0.1406201720237732, "rewards/margins": 0.14176708459854126, "rewards/rejected": -0.28238725662231445, "step": 387 }, { "epoch": 0.37, "grad_norm": 21.418017207924976, "learning_rate": 4.890215198188561e-07, "logps/chosen": -20.959854125976562, "logps/rejected": -34.5628662109375, "loss": 0.6419, "losses/dpo": 0.47178706526756287, "losses/sft": 0.7672502398490906, "losses/total": 0.47178706526756287, "ref_logps/chosen": -19.754520416259766, "ref_logps/rejected": -32.049232482910156, "rewards/accuracies": 0.75, "rewards/chosen": -0.12053336203098297, "rewards/margins": 0.13083012402057648, "rewards/rejected": -0.25136351585388184, "step": 388 }, { "epoch": 0.37, "grad_norm": 24.175861712236284, "learning_rate": 4.889467559928487e-07, "logps/chosen": -37.62225341796875, "logps/rejected": -46.091346740722656, "loss": 0.6428, "losses/dpo": 0.45363378524780273, "losses/sft": 0.4854414165019989, "losses/total": 0.45363378524780273, "ref_logps/chosen": -35.49768829345703, "ref_logps/rejected": -42.520198822021484, "rewards/accuracies": 0.5, "rewards/chosen": -0.21245652437210083, "rewards/margins": 0.14465820789337158, "rewards/rejected": -0.3571147322654724, "step": 389 }, { "epoch": 0.37, "grad_norm": 21.52181651629466, "learning_rate": 4.888717442122766e-07, "logps/chosen": -29.646495819091797, "logps/rejected": -31.48245620727539, "loss": 0.6408, "losses/dpo": 0.6850010752677917, "losses/sft": 0.14549210667610168, "losses/total": 0.6850010752677917, "ref_logps/chosen": -27.984954833984375, "ref_logps/rejected": -28.421974182128906, "rewards/accuracies": 0.5, "rewards/chosen": -0.16615404188632965, "rewards/margins": 0.13989412784576416, "rewards/rejected": -0.306048184633255, "step": 390 }, { "epoch": 0.37, "grad_norm": 16.717000748203937, "learning_rate": 4.887964845549793e-07, "logps/chosen": -23.98613739013672, "logps/rejected": -25.686290740966797, "loss": 0.6003, "losses/dpo": 0.41768231987953186, "losses/sft": 1.1737374067306519, "losses/total": 0.41768231987953186, "ref_logps/chosen": -23.234464645385742, "ref_logps/rejected": -22.75193214416504, "rewards/accuracies": 0.875, "rewards/chosen": -0.07516716420650482, "rewards/margins": 0.21826864778995514, "rewards/rejected": -0.29343581199645996, "step": 391 }, { "epoch": 0.37, "grad_norm": 22.01526746734936, "learning_rate": 4.887209770990537e-07, "logps/chosen": -26.41743278503418, "logps/rejected": -40.98262023925781, "loss": 0.6106, "losses/dpo": 0.45887959003448486, "losses/sft": 0.8386738300323486, "losses/total": 0.45887959003448486, "ref_logps/chosen": -25.037044525146484, "ref_logps/rejected": -37.61011505126953, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1380389928817749, "rewards/margins": 0.19921167194843292, "rewards/rejected": -0.337250679731369, "step": 392 }, { "epoch": 0.37, "grad_norm": 19.899292107450556, "learning_rate": 4.886452219228535e-07, "logps/chosen": -30.798587799072266, "logps/rejected": -27.202404022216797, "loss": 0.6383, "losses/dpo": 0.8797686100006104, "losses/sft": 0.6192131042480469, "losses/total": 0.8797686100006104, "ref_logps/chosen": -30.04422378540039, "ref_logps/rejected": -25.04416847229004, "rewards/accuracies": 0.625, "rewards/chosen": -0.07543622702360153, "rewards/margins": 0.14038720726966858, "rewards/rejected": -0.2158234417438507, "step": 393 }, { "epoch": 0.37, "grad_norm": 22.111104434383485, "learning_rate": 4.885692191049901e-07, "logps/chosen": -26.523475646972656, "logps/rejected": -34.50318908691406, "loss": 0.6701, "losses/dpo": 0.609727680683136, "losses/sft": 0.09800824522972107, "losses/total": 0.609727680683136, "ref_logps/chosen": -24.833927154541016, "ref_logps/rejected": -32.168487548828125, "rewards/accuracies": 0.625, "rewards/chosen": -0.16895492374897003, "rewards/margins": 0.06451530754566193, "rewards/rejected": -0.23347023129463196, "step": 394 }, { "epoch": 0.37, "grad_norm": 19.588048905711133, "learning_rate": 4.88492968724331e-07, "logps/chosen": -24.12374496459961, "logps/rejected": -32.39152526855469, "loss": 0.6087, "losses/dpo": 0.36112213134765625, "losses/sft": 0.4585651755332947, "losses/total": 0.36112213134765625, "ref_logps/chosen": -23.32042694091797, "ref_logps/rejected": -29.352216720581055, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08033184707164764, "rewards/margins": 0.22359901666641235, "rewards/rejected": -0.3039308786392212, "step": 395 }, { "epoch": 0.37, "grad_norm": 20.256418111837426, "learning_rate": 4.884164708600013e-07, "logps/chosen": -30.383647918701172, "logps/rejected": -44.68799591064453, "loss": 0.5903, "losses/dpo": 0.780085563659668, "losses/sft": 0.5970336198806763, "losses/total": 0.780085563659668, "ref_logps/chosen": -28.277254104614258, "ref_logps/rejected": -39.89811706542969, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2106393426656723, "rewards/margins": 0.2683483958244324, "rewards/rejected": -0.47898775339126587, "step": 396 }, { "epoch": 0.37, "grad_norm": 23.255233429161276, "learning_rate": 4.883397255913829e-07, "logps/chosen": -32.43157958984375, "logps/rejected": -42.06312561035156, "loss": 0.5697, "losses/dpo": 0.5645579099655151, "losses/sft": 1.9499595165252686, "losses/total": 0.5645579099655151, "ref_logps/chosen": -31.94427490234375, "ref_logps/rejected": -38.55473327636719, "rewards/accuracies": 0.875, "rewards/chosen": -0.048730138689279556, "rewards/margins": 0.3021089434623718, "rewards/rejected": -0.3508390784263611, "step": 397 }, { "epoch": 0.38, "grad_norm": 20.830239060618336, "learning_rate": 4.882627329981138e-07, "logps/chosen": -24.858821868896484, "logps/rejected": -32.773223876953125, "loss": 0.5808, "losses/dpo": 0.6898692846298218, "losses/sft": 0.0850726068019867, "losses/total": 0.6898692846298218, "ref_logps/chosen": -23.84576416015625, "ref_logps/rejected": -28.94524383544922, "rewards/accuracies": 0.75, "rewards/chosen": -0.10130578279495239, "rewards/margins": 0.28149235248565674, "rewards/rejected": -0.38279813528060913, "step": 398 }, { "epoch": 0.38, "grad_norm": 20.5879677006847, "learning_rate": 4.881854931600894e-07, "logps/chosen": -28.27510643005371, "logps/rejected": -42.519004821777344, "loss": 0.5927, "losses/dpo": 0.6908790469169617, "losses/sft": 1.3290212154388428, "losses/total": 0.6908790469169617, "ref_logps/chosen": -26.978090286254883, "ref_logps/rejected": -38.766048431396484, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12970158457756042, "rewards/margins": 0.24559378623962402, "rewards/rejected": -0.37529534101486206, "step": 399 }, { "epoch": 0.38, "grad_norm": 22.332286401266536, "learning_rate": 4.881080061574611e-07, "logps/chosen": -30.46358871459961, "logps/rejected": -36.086822509765625, "loss": 0.6047, "losses/dpo": 0.47944748401641846, "losses/sft": 0.7164310216903687, "losses/total": 0.47944748401641846, "ref_logps/chosen": -29.015804290771484, "ref_logps/rejected": -32.499202728271484, "rewards/accuracies": 0.75, "rewards/chosen": -0.14477813243865967, "rewards/margins": 0.21398362517356873, "rewards/rejected": -0.3587617576122284, "step": 400 }, { "epoch": 0.38, "grad_norm": 21.212311889110442, "learning_rate": 4.88030272070637e-07, "logps/chosen": -29.45766830444336, "logps/rejected": -34.8433837890625, "loss": 0.6696, "losses/dpo": 0.44970810413360596, "losses/sft": 1.351164698600769, "losses/total": 0.44970810413360596, "ref_logps/chosen": -28.327186584472656, "ref_logps/rejected": -33.010772705078125, "rewards/accuracies": 0.625, "rewards/chosen": -0.11304812133312225, "rewards/margins": 0.07021306455135345, "rewards/rejected": -0.1832611858844757, "step": 401 }, { "epoch": 0.38, "grad_norm": 22.616015308086993, "learning_rate": 4.879522909802817e-07, "logps/chosen": -31.214378356933594, "logps/rejected": -30.128870010375977, "loss": 0.698, "losses/dpo": 0.6186286211013794, "losses/sft": 1.0146325826644897, "losses/total": 0.6186286211013794, "ref_logps/chosen": -28.958770751953125, "ref_logps/rejected": -27.730072021484375, "rewards/accuracies": 0.5, "rewards/chosen": -0.22556093335151672, "rewards/margins": 0.014318942092359066, "rewards/rejected": -0.23987987637519836, "step": 402 }, { "epoch": 0.38, "grad_norm": 22.03767596214967, "learning_rate": 4.878740629673159e-07, "logps/chosen": -30.582326889038086, "logps/rejected": -38.715797424316406, "loss": 0.6353, "losses/dpo": 0.5317294001579285, "losses/sft": 1.1765542030334473, "losses/total": 0.5317294001579285, "ref_logps/chosen": -28.600662231445312, "ref_logps/rejected": -35.06280517578125, "rewards/accuracies": 0.625, "rewards/chosen": -0.19816671311855316, "rewards/margins": 0.1671323925256729, "rewards/rejected": -0.3652991056442261, "step": 403 }, { "epoch": 0.38, "grad_norm": 18.544310322541232, "learning_rate": 4.877955881129169e-07, "logps/chosen": -28.50820541381836, "logps/rejected": -41.887325286865234, "loss": 0.5385, "losses/dpo": 0.689644992351532, "losses/sft": 2.022465944290161, "losses/total": 0.689644992351532, "ref_logps/chosen": -27.438880920410156, "ref_logps/rejected": -36.9975700378418, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10693223029375076, "rewards/margins": 0.3820432126522064, "rewards/rejected": -0.488975465297699, "step": 404 }, { "epoch": 0.38, "grad_norm": 20.79989467238408, "learning_rate": 4.877168664985177e-07, "logps/chosen": -29.30644989013672, "logps/rejected": -39.692893981933594, "loss": 0.6167, "losses/dpo": 0.5012751817703247, "losses/sft": 0.7165572643280029, "losses/total": 0.5012751817703247, "ref_logps/chosen": -27.927154541015625, "ref_logps/rejected": -36.347923278808594, "rewards/accuracies": 0.875, "rewards/chosen": -0.13792932033538818, "rewards/margins": 0.19656822085380554, "rewards/rejected": -0.3344975411891937, "step": 405 }, { "epoch": 0.38, "grad_norm": 21.105200537512975, "learning_rate": 4.876378982058076e-07, "logps/chosen": -30.353334426879883, "logps/rejected": -40.16853713989258, "loss": 0.6514, "losses/dpo": 0.37395086884498596, "losses/sft": 1.111774206161499, "losses/total": 0.37395086884498596, "ref_logps/chosen": -27.89218521118164, "ref_logps/rejected": -36.37512969970703, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24611493945121765, "rewards/margins": 0.13322579860687256, "rewards/rejected": -0.3793407678604126, "step": 406 }, { "epoch": 0.38, "grad_norm": 19.661107272349252, "learning_rate": 4.875586833167318e-07, "logps/chosen": -33.17323303222656, "logps/rejected": -41.245948791503906, "loss": 0.5457, "losses/dpo": 0.7516378164291382, "losses/sft": 0.8122458457946777, "losses/total": 0.7516378164291382, "ref_logps/chosen": -31.21379280090332, "ref_logps/rejected": -35.504859924316406, "rewards/accuracies": 0.75, "rewards/chosen": -0.19594399631023407, "rewards/margins": 0.37816500663757324, "rewards/rejected": -0.5741089582443237, "step": 407 }, { "epoch": 0.38, "grad_norm": 20.90064754582848, "learning_rate": 4.874792219134916e-07, "logps/chosen": -29.509952545166016, "logps/rejected": -36.412200927734375, "loss": 0.6163, "losses/dpo": 0.7123246788978577, "losses/sft": 0.44740813970565796, "losses/total": 0.7123246788978577, "ref_logps/chosen": -28.323795318603516, "ref_logps/rejected": -33.16991424560547, "rewards/accuracies": 0.625, "rewards/chosen": -0.1186157837510109, "rewards/margins": 0.2056126594543457, "rewards/rejected": -0.324228435754776, "step": 408 }, { "epoch": 0.39, "grad_norm": 19.370587812045294, "learning_rate": 4.873995140785437e-07, "logps/chosen": -22.82128143310547, "logps/rejected": -30.815677642822266, "loss": 0.6467, "losses/dpo": 0.6791825294494629, "losses/sft": 1.2522047758102417, "losses/total": 0.6791825294494629, "ref_logps/chosen": -22.21221923828125, "ref_logps/rejected": -29.107898712158203, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06090622395277023, "rewards/margins": 0.10987183451652527, "rewards/rejected": -0.1707780510187149, "step": 409 }, { "epoch": 0.39, "grad_norm": 24.41689835083989, "learning_rate": 4.873195598946008e-07, "logps/chosen": -36.13754653930664, "logps/rejected": -54.15019226074219, "loss": 0.5979, "losses/dpo": 0.5844522714614868, "losses/sft": 0.6875357627868652, "losses/total": 0.5844522714614868, "ref_logps/chosen": -33.724693298339844, "ref_logps/rejected": -48.77684020996094, "rewards/accuracies": 0.75, "rewards/chosen": -0.2412852942943573, "rewards/margins": 0.296049565076828, "rewards/rejected": -0.5373348593711853, "step": 410 }, { "epoch": 0.39, "grad_norm": 19.33169685156916, "learning_rate": 4.872393594446314e-07, "logps/chosen": -26.871862411499023, "logps/rejected": -29.79359245300293, "loss": 0.6058, "losses/dpo": 0.5739963054656982, "losses/sft": 0.35412710905075073, "losses/total": 0.5739963054656982, "ref_logps/chosen": -26.100814819335938, "ref_logps/rejected": -26.780603408813477, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07710491120815277, "rewards/margins": 0.22419413924217224, "rewards/rejected": -0.3012990653514862, "step": 411 }, { "epoch": 0.39, "grad_norm": 22.800954768110646, "learning_rate": 4.871589128118592e-07, "logps/chosen": -31.6968994140625, "logps/rejected": -34.34685516357422, "loss": 0.6517, "losses/dpo": 0.8152088522911072, "losses/sft": 1.9204142093658447, "losses/total": 0.8152088522911072, "ref_logps/chosen": -30.23918914794922, "ref_logps/rejected": -31.783119201660156, "rewards/accuracies": 0.625, "rewards/chosen": -0.14577095210552216, "rewards/margins": 0.11060255020856857, "rewards/rejected": -0.25637349486351013, "step": 412 }, { "epoch": 0.39, "grad_norm": 21.655226090436248, "learning_rate": 4.870782200797634e-07, "logps/chosen": -35.8330078125, "logps/rejected": -34.89292907714844, "loss": 0.6395, "losses/dpo": 0.719232976436615, "losses/sft": 0.40368086099624634, "losses/total": 0.719232976436615, "ref_logps/chosen": -33.372825622558594, "ref_logps/rejected": -30.601058959960938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.246018186211586, "rewards/margins": 0.18316854536533356, "rewards/rejected": -0.42918676137924194, "step": 413 }, { "epoch": 0.39, "grad_norm": 19.636712590691143, "learning_rate": 4.869972813320789e-07, "logps/chosen": -24.699905395507812, "logps/rejected": -26.167667388916016, "loss": 0.6685, "losses/dpo": 0.782320499420166, "losses/sft": 0.3485712707042694, "losses/total": 0.782320499420166, "ref_logps/chosen": -23.06426239013672, "ref_logps/rejected": -23.762306213378906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1635643094778061, "rewards/margins": 0.07697182148694992, "rewards/rejected": -0.24053612351417542, "step": 414 }, { "epoch": 0.39, "grad_norm": 22.632668108501523, "learning_rate": 4.869160966527956e-07, "logps/chosen": -33.02280044555664, "logps/rejected": -36.917625427246094, "loss": 0.6433, "losses/dpo": 0.6478912234306335, "losses/sft": 0.8644539713859558, "losses/total": 0.6478912234306335, "ref_logps/chosen": -31.366558074951172, "ref_logps/rejected": -33.855743408203125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.16562438011169434, "rewards/margins": 0.14056359231472015, "rewards/rejected": -0.3061879873275757, "step": 415 }, { "epoch": 0.39, "grad_norm": 23.409303348505468, "learning_rate": 4.868346661261586e-07, "logps/chosen": -33.89098358154297, "logps/rejected": -49.6177978515625, "loss": 0.6404, "losses/dpo": 0.8882695436477661, "losses/sft": 0.9314327239990234, "losses/total": 0.8882695436477661, "ref_logps/chosen": -31.587736129760742, "ref_logps/rejected": -45.632049560546875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23032474517822266, "rewards/margins": 0.1682499647140503, "rewards/rejected": -0.39857468008995056, "step": 416 }, { "epoch": 0.39, "grad_norm": 22.433574994689895, "learning_rate": 4.867529898366681e-07, "logps/chosen": -26.372848510742188, "logps/rejected": -28.725284576416016, "loss": 0.6029, "losses/dpo": 0.7522779703140259, "losses/sft": 0.3266182243824005, "losses/total": 0.7522779703140259, "ref_logps/chosen": -25.877941131591797, "ref_logps/rejected": -26.085723876953125, "rewards/accuracies": 0.75, "rewards/chosen": -0.049490880221128464, "rewards/margins": 0.2144652158021927, "rewards/rejected": -0.26395609974861145, "step": 417 }, { "epoch": 0.39, "grad_norm": 21.226606952638665, "learning_rate": 4.866710678690797e-07, "logps/chosen": -27.859214782714844, "logps/rejected": -46.09143829345703, "loss": 0.5062, "losses/dpo": 0.38865283131599426, "losses/sft": 0.2561016082763672, "losses/total": 0.38865283131599426, "ref_logps/chosen": -27.1861572265625, "ref_logps/rejected": -40.5323486328125, "rewards/accuracies": 0.875, "rewards/chosen": -0.06730574369430542, "rewards/margins": 0.48860350251197815, "rewards/rejected": -0.555909276008606, "step": 418 }, { "epoch": 0.4, "grad_norm": 20.917640847561813, "learning_rate": 4.865889003084036e-07, "logps/chosen": -29.5806827545166, "logps/rejected": -32.348541259765625, "loss": 0.5977, "losses/dpo": 0.40858709812164307, "losses/sft": 0.8463036417961121, "losses/total": 0.40858709812164307, "ref_logps/chosen": -27.273540496826172, "ref_logps/rejected": -27.600723266601562, "rewards/accuracies": 0.75, "rewards/chosen": -0.230714350938797, "rewards/margins": 0.24406792223453522, "rewards/rejected": -0.474782258272171, "step": 419 }, { "epoch": 0.4, "grad_norm": 26.46201313764768, "learning_rate": 4.865064872399048e-07, "logps/chosen": -40.5654296875, "logps/rejected": -46.27885437011719, "loss": 0.6214, "losses/dpo": 0.5736859440803528, "losses/sft": 1.1939642429351807, "losses/total": 0.5736859440803528, "ref_logps/chosen": -38.423919677734375, "ref_logps/rejected": -42.41477966308594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2141510248184204, "rewards/margins": 0.17225618660449982, "rewards/rejected": -0.38640719652175903, "step": 420 }, { "epoch": 0.4, "grad_norm": 21.209595612975495, "learning_rate": 4.864238287491031e-07, "logps/chosen": -30.73710823059082, "logps/rejected": -42.893585205078125, "loss": 0.5692, "losses/dpo": 0.6684864163398743, "losses/sft": 1.1497727632522583, "losses/total": 0.6684864163398743, "ref_logps/chosen": -29.030933380126953, "ref_logps/rejected": -37.93590545654297, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17061766982078552, "rewards/margins": 0.32515019178390503, "rewards/rejected": -0.49576789140701294, "step": 421 }, { "epoch": 0.4, "grad_norm": 18.262061103794, "learning_rate": 4.863409249217732e-07, "logps/chosen": -27.84783172607422, "logps/rejected": -42.641334533691406, "loss": 0.545, "losses/dpo": 0.3939412534236908, "losses/sft": 1.3471300601959229, "losses/total": 0.3939412534236908, "ref_logps/chosen": -26.787628173828125, "ref_logps/rejected": -37.697628021240234, "rewards/accuracies": 0.75, "rewards/chosen": -0.1060205027461052, "rewards/margins": 0.3883499801158905, "rewards/rejected": -0.4943704903125763, "step": 422 }, { "epoch": 0.4, "grad_norm": 22.66871088697491, "learning_rate": 4.862577758439442e-07, "logps/chosen": -36.84308624267578, "logps/rejected": -36.53874588012695, "loss": 0.5927, "losses/dpo": 0.327787309885025, "losses/sft": 0.8535103797912598, "losses/total": 0.327787309885025, "ref_logps/chosen": -36.04808044433594, "ref_logps/rejected": -33.18212890625, "rewards/accuracies": 0.75, "rewards/chosen": -0.07950038462877274, "rewards/margins": 0.2561611831188202, "rewards/rejected": -0.33566156029701233, "step": 423 }, { "epoch": 0.4, "grad_norm": 22.39922437609097, "learning_rate": 4.861743816018997e-07, "logps/chosen": -31.77701759338379, "logps/rejected": -39.71210479736328, "loss": 0.6104, "losses/dpo": 0.7881492376327515, "losses/sft": 1.4704976081848145, "losses/total": 0.7881492376327515, "ref_logps/chosen": -30.122230529785156, "ref_logps/rejected": -35.95326232910156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1654786765575409, "rewards/margins": 0.2104060798883438, "rewards/rejected": -0.3758847415447235, "step": 424 }, { "epoch": 0.4, "grad_norm": 23.366880970021345, "learning_rate": 4.860907422821779e-07, "logps/chosen": -34.0506706237793, "logps/rejected": -43.08148956298828, "loss": 0.6734, "losses/dpo": 0.6192563772201538, "losses/sft": 0.7832827568054199, "losses/total": 0.6192563772201538, "ref_logps/chosen": -30.189722061157227, "ref_logps/rejected": -38.130245208740234, "rewards/accuracies": 0.625, "rewards/chosen": -0.38609492778778076, "rewards/margins": 0.10902971029281616, "rewards/rejected": -0.4951246380805969, "step": 425 }, { "epoch": 0.4, "grad_norm": 22.737070705251682, "learning_rate": 4.860068579715708e-07, "logps/chosen": -30.242517471313477, "logps/rejected": -38.24612808227539, "loss": 0.6178, "losses/dpo": 0.6836260557174683, "losses/sft": 1.679368019104004, "losses/total": 0.6836260557174683, "ref_logps/chosen": -27.992040634155273, "ref_logps/rejected": -34.240997314453125, "rewards/accuracies": 0.875, "rewards/chosen": -0.22504755854606628, "rewards/margins": 0.17546549439430237, "rewards/rejected": -0.40051305294036865, "step": 426 }, { "epoch": 0.4, "grad_norm": 19.581312706310054, "learning_rate": 4.859227287571253e-07, "logps/chosen": -27.251956939697266, "logps/rejected": -37.89329147338867, "loss": 0.6112, "losses/dpo": 0.5243713855743408, "losses/sft": 1.210069179534912, "losses/total": 0.5243713855743408, "ref_logps/chosen": -26.114824295043945, "ref_logps/rejected": -34.74456787109375, "rewards/accuracies": 0.75, "rewards/chosen": -0.11371321976184845, "rewards/margins": 0.20115916430950165, "rewards/rejected": -0.3148723840713501, "step": 427 }, { "epoch": 0.4, "grad_norm": 23.585094045218778, "learning_rate": 4.858383547261419e-07, "logps/chosen": -31.183303833007812, "logps/rejected": -43.42414093017578, "loss": 0.6421, "losses/dpo": 0.8049939870834351, "losses/sft": 1.192050814628601, "losses/total": 0.8049939870834351, "ref_logps/chosen": -28.149152755737305, "ref_logps/rejected": -39.10224151611328, "rewards/accuracies": 0.5, "rewards/chosen": -0.3034152388572693, "rewards/margins": 0.12877485156059265, "rewards/rejected": -0.43219009041786194, "step": 428 }, { "epoch": 0.4, "grad_norm": 16.300764444073643, "learning_rate": 4.857537359661757e-07, "logps/chosen": -22.7632999420166, "logps/rejected": -33.69353485107422, "loss": 0.5357, "losses/dpo": 0.5390862822532654, "losses/sft": 1.029152512550354, "losses/total": 0.5390862822532654, "ref_logps/chosen": -22.098480224609375, "ref_logps/rejected": -29.32358169555664, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06648223102092743, "rewards/margins": 0.37051326036453247, "rewards/rejected": -0.4369955062866211, "step": 429 }, { "epoch": 0.41, "grad_norm": 25.628825345503845, "learning_rate": 4.85668872565035e-07, "logps/chosen": -34.414424896240234, "logps/rejected": -47.85415267944336, "loss": 0.6169, "losses/dpo": 0.5234639644622803, "losses/sft": 0.6667687892913818, "losses/total": 0.5234639644622803, "ref_logps/chosen": -32.13954162597656, "ref_logps/rejected": -42.82835388183594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22748810052871704, "rewards/margins": 0.27509233355522156, "rewards/rejected": -0.5025804042816162, "step": 430 }, { "epoch": 0.41, "grad_norm": 26.499477955258435, "learning_rate": 4.855837646107827e-07, "logps/chosen": -30.968687057495117, "logps/rejected": -38.649803161621094, "loss": 0.7609, "losses/dpo": 0.751736044883728, "losses/sft": 0.7613048553466797, "losses/total": 0.751736044883728, "ref_logps/chosen": -25.983657836914062, "ref_logps/rejected": -33.97136688232422, "rewards/accuracies": 0.375, "rewards/chosen": -0.4985029101371765, "rewards/margins": -0.03065914660692215, "rewards/rejected": -0.46784377098083496, "step": 431 }, { "epoch": 0.41, "grad_norm": 20.15396399753046, "learning_rate": 4.854984121917348e-07, "logps/chosen": -32.02383041381836, "logps/rejected": -37.68282699584961, "loss": 0.5723, "losses/dpo": 0.6440339684486389, "losses/sft": 0.7763099074363708, "losses/total": 0.6440339684486389, "ref_logps/chosen": -30.810880661010742, "ref_logps/rejected": -33.53742218017578, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12129507958889008, "rewards/margins": 0.2932456433773041, "rewards/rejected": -0.41454067826271057, "step": 432 }, { "epoch": 0.41, "grad_norm": 20.237253548374845, "learning_rate": 4.854128153964617e-07, "logps/chosen": -27.690156936645508, "logps/rejected": -33.47569274902344, "loss": 0.6458, "losses/dpo": 0.6138634085655212, "losses/sft": 0.13169291615486145, "losses/total": 0.6138634085655212, "ref_logps/chosen": -25.001386642456055, "ref_logps/rejected": -29.285913467407227, "rewards/accuracies": 0.6875, "rewards/chosen": -0.26887694001197815, "rewards/margins": 0.1501009166240692, "rewards/rejected": -0.418977826833725, "step": 433 }, { "epoch": 0.41, "grad_norm": 20.277327309125223, "learning_rate": 4.853269743137868e-07, "logps/chosen": -27.666793823242188, "logps/rejected": -36.83775329589844, "loss": 0.5938, "losses/dpo": 0.7589706182479858, "losses/sft": 0.647279679775238, "losses/total": 0.7589706182479858, "ref_logps/chosen": -26.4132080078125, "ref_logps/rejected": -33.22043991088867, "rewards/accuracies": 0.75, "rewards/chosen": -0.1253584623336792, "rewards/margins": 0.2363731563091278, "rewards/rejected": -0.3617315888404846, "step": 434 }, { "epoch": 0.41, "grad_norm": 22.90683740347055, "learning_rate": 4.852408890327873e-07, "logps/chosen": -38.626102447509766, "logps/rejected": -35.198001861572266, "loss": 0.604, "losses/dpo": 0.581518292427063, "losses/sft": 0.47253167629241943, "losses/total": 0.581518292427063, "ref_logps/chosen": -36.473655700683594, "ref_logps/rejected": -31.033946990966797, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2152443826198578, "rewards/margins": 0.20116086304187775, "rewards/rejected": -0.41640520095825195, "step": 435 }, { "epoch": 0.41, "grad_norm": 18.654146659988278, "learning_rate": 4.851545596427938e-07, "logps/chosen": -26.606281280517578, "logps/rejected": -42.36216735839844, "loss": 0.5309, "losses/dpo": 0.4004138112068176, "losses/sft": 1.1164494752883911, "losses/total": 0.4004138112068176, "ref_logps/chosen": -24.916379928588867, "ref_logps/rejected": -36.772098541259766, "rewards/accuracies": 0.875, "rewards/chosen": -0.16899025440216064, "rewards/margins": 0.39001700282096863, "rewards/rejected": -0.5590072870254517, "step": 436 }, { "epoch": 0.41, "grad_norm": 19.179939835409172, "learning_rate": 4.850679862333898e-07, "logps/chosen": -29.103933334350586, "logps/rejected": -40.36922073364258, "loss": 0.5615, "losses/dpo": 0.7609297633171082, "losses/sft": 1.2703981399536133, "losses/total": 0.7609297633171082, "ref_logps/chosen": -27.903289794921875, "ref_logps/rejected": -35.178009033203125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12006430327892303, "rewards/margins": 0.39905691146850586, "rewards/rejected": -0.5191211700439453, "step": 437 }, { "epoch": 0.41, "grad_norm": 22.227503969022397, "learning_rate": 4.849811688944128e-07, "logps/chosen": -34.66577911376953, "logps/rejected": -37.14069747924805, "loss": 0.6067, "losses/dpo": 0.23820379376411438, "losses/sft": 1.1128441095352173, "losses/total": 0.23820379376411438, "ref_logps/chosen": -31.28900146484375, "ref_logps/rejected": -31.522403717041016, "rewards/accuracies": 0.75, "rewards/chosen": -0.3376777172088623, "rewards/margins": 0.2241516411304474, "rewards/rejected": -0.5618293285369873, "step": 438 }, { "epoch": 0.41, "grad_norm": 20.742313772821635, "learning_rate": 4.848941077159527e-07, "logps/chosen": -25.721111297607422, "logps/rejected": -38.0343017578125, "loss": 0.6373, "losses/dpo": 0.6900727152824402, "losses/sft": 0.8522011637687683, "losses/total": 0.6900727152824402, "ref_logps/chosen": -23.325658798217773, "ref_logps/rejected": -34.204490661621094, "rewards/accuracies": 0.625, "rewards/chosen": -0.2395453304052353, "rewards/margins": 0.14343582093715668, "rewards/rejected": -0.38298115134239197, "step": 439 }, { "epoch": 0.42, "grad_norm": 23.08767812484257, "learning_rate": 4.848068027883527e-07, "logps/chosen": -30.789142608642578, "logps/rejected": -38.10093688964844, "loss": 0.6362, "losses/dpo": 0.6560032367706299, "losses/sft": 0.9831251502037048, "losses/total": 0.6560032367706299, "ref_logps/chosen": -28.182161331176758, "ref_logps/rejected": -34.06071472167969, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2606978416442871, "rewards/margins": 0.14332431554794312, "rewards/rejected": -0.4040221571922302, "step": 440 }, { "epoch": 0.42, "grad_norm": 27.851913425137734, "learning_rate": 4.847192542022092e-07, "logps/chosen": -39.99571228027344, "logps/rejected": -45.295021057128906, "loss": 0.6915, "losses/dpo": 0.432949423789978, "losses/sft": 1.62623131275177, "losses/total": 0.432949423789978, "ref_logps/chosen": -35.38771057128906, "ref_logps/rejected": -39.84103012084961, "rewards/accuracies": 0.5625, "rewards/chosen": -0.46080029010772705, "rewards/margins": 0.08459851145744324, "rewards/rejected": -0.5453988313674927, "step": 441 }, { "epoch": 0.42, "grad_norm": 23.229019840345355, "learning_rate": 4.846314620483709e-07, "logps/chosen": -36.60045623779297, "logps/rejected": -44.21564483642578, "loss": 0.6345, "losses/dpo": 0.6494840979576111, "losses/sft": 1.0190538167953491, "losses/total": 0.6494840979576111, "ref_logps/chosen": -32.92999267578125, "ref_logps/rejected": -39.01611328125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3670462369918823, "rewards/margins": 0.15290744602680206, "rewards/rejected": -0.5199536681175232, "step": 442 }, { "epoch": 0.42, "grad_norm": 18.655238833754687, "learning_rate": 4.845434264179397e-07, "logps/chosen": -22.36052131652832, "logps/rejected": -31.01690673828125, "loss": 0.5968, "losses/dpo": 0.8079219460487366, "losses/sft": 1.1909449100494385, "losses/total": 0.8079219460487366, "ref_logps/chosen": -20.475078582763672, "ref_logps/rejected": -26.668344497680664, "rewards/accuracies": 0.75, "rewards/chosen": -0.18854427337646484, "rewards/margins": 0.24631206691265106, "rewards/rejected": -0.4348563551902771, "step": 443 }, { "epoch": 0.42, "grad_norm": 19.956839161871287, "learning_rate": 4.844551474022698e-07, "logps/chosen": -20.418256759643555, "logps/rejected": -32.4892692565918, "loss": 0.5896, "losses/dpo": 0.22024092078208923, "losses/sft": 0.33539894223213196, "losses/total": 0.22024092078208923, "ref_logps/chosen": -19.58808135986328, "ref_logps/rejected": -29.007001876831055, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08301743865013123, "rewards/margins": 0.2652091681957245, "rewards/rejected": -0.3482266068458557, "step": 444 }, { "epoch": 0.42, "grad_norm": 26.482347041913364, "learning_rate": 4.843666250929685e-07, "logps/chosen": -44.090721130371094, "logps/rejected": -40.01682662963867, "loss": 0.64, "losses/dpo": 0.800696611404419, "losses/sft": 1.6781063079833984, "losses/total": 0.800696611404419, "ref_logps/chosen": -40.575164794921875, "ref_logps/rejected": -34.89933776855469, "rewards/accuracies": 0.625, "rewards/chosen": -0.3515562415122986, "rewards/margins": 0.16019272804260254, "rewards/rejected": -0.5117490291595459, "step": 445 }, { "epoch": 0.42, "grad_norm": 23.038481256356892, "learning_rate": 4.842778595818949e-07, "logps/chosen": -34.075355529785156, "logps/rejected": -33.240440368652344, "loss": 0.6113, "losses/dpo": 0.746896505355835, "losses/sft": 1.2821646928787231, "losses/total": 0.746896505355835, "ref_logps/chosen": -31.556495666503906, "ref_logps/rejected": -28.315698623657227, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25188586115837097, "rewards/margins": 0.2405884861946106, "rewards/rejected": -0.4924743175506592, "step": 446 }, { "epoch": 0.42, "grad_norm": 22.817701005423192, "learning_rate": 4.84188850961161e-07, "logps/chosen": -38.441246032714844, "logps/rejected": -36.566619873046875, "loss": 0.6232, "losses/dpo": 0.4926941990852356, "losses/sft": 0.8366514444351196, "losses/total": 0.4926941990852356, "ref_logps/chosen": -35.23017501831055, "ref_logps/rejected": -31.60162353515625, "rewards/accuracies": 0.625, "rewards/chosen": -0.32110682129859924, "rewards/margins": 0.17539265751838684, "rewards/rejected": -0.49649950861930847, "step": 447 }, { "epoch": 0.42, "grad_norm": 21.957635979673604, "learning_rate": 4.840995993231308e-07, "logps/chosen": -28.640514373779297, "logps/rejected": -32.347862243652344, "loss": 0.6401, "losses/dpo": 0.5159199237823486, "losses/sft": 1.4370700120925903, "losses/total": 0.5159199237823486, "ref_logps/chosen": -27.14333724975586, "ref_logps/rejected": -28.797311782836914, "rewards/accuracies": 0.75, "rewards/chosen": -0.14971762895584106, "rewards/margins": 0.20533761382102966, "rewards/rejected": -0.3550552427768707, "step": 448 }, { "epoch": 0.42, "grad_norm": 28.17687493414049, "learning_rate": 4.840101047604204e-07, "logps/chosen": -38.27685546875, "logps/rejected": -34.174346923828125, "loss": 0.766, "losses/dpo": 0.5788336992263794, "losses/sft": 0.531764566898346, "losses/total": 0.5788336992263794, "ref_logps/chosen": -33.3394889831543, "ref_logps/rejected": -29.4987850189209, "rewards/accuracies": 0.5, "rewards/chosen": -0.49373647570610046, "rewards/margins": -0.026180289685726166, "rewards/rejected": -0.4675561785697937, "step": 449 }, { "epoch": 0.42, "grad_norm": 19.24551065840981, "learning_rate": 4.839203673658982e-07, "logps/chosen": -27.557273864746094, "logps/rejected": -39.942020416259766, "loss": 0.5152, "losses/dpo": 0.5182595252990723, "losses/sft": 0.8677736520767212, "losses/total": 0.5182595252990723, "ref_logps/chosen": -26.043155670166016, "ref_logps/rejected": -34.166282653808594, "rewards/accuracies": 0.875, "rewards/chosen": -0.1514119654893875, "rewards/margins": 0.42616206407546997, "rewards/rejected": -0.5775740146636963, "step": 450 }, { "epoch": 0.43, "grad_norm": 21.326694797666057, "learning_rate": 4.838303872326848e-07, "logps/chosen": -36.34627151489258, "logps/rejected": -35.101806640625, "loss": 0.6173, "losses/dpo": 0.893215000629425, "losses/sft": 0.6463819146156311, "losses/total": 0.893215000629425, "ref_logps/chosen": -34.786827087402344, "ref_logps/rejected": -31.612003326416016, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1559445559978485, "rewards/margins": 0.19303591549396515, "rewards/rejected": -0.34898048639297485, "step": 451 }, { "epoch": 0.43, "grad_norm": 18.65353415702731, "learning_rate": 4.83740164454152e-07, "logps/chosen": -23.336137771606445, "logps/rejected": -40.943199157714844, "loss": 0.5281, "losses/dpo": 0.6268637776374817, "losses/sft": 0.3701310455799103, "losses/total": 0.6268637776374817, "ref_logps/chosen": -21.81549835205078, "ref_logps/rejected": -34.942474365234375, "rewards/accuracies": 0.75, "rewards/chosen": -0.15206393599510193, "rewards/margins": 0.4480084776878357, "rewards/rejected": -0.6000723838806152, "step": 452 }, { "epoch": 0.43, "grad_norm": 23.949064915331682, "learning_rate": 4.83649699123924e-07, "logps/chosen": -36.53241729736328, "logps/rejected": -48.96662139892578, "loss": 0.5604, "losses/dpo": 0.3804440498352051, "losses/sft": 0.7589377164840698, "losses/total": 0.3804440498352051, "ref_logps/chosen": -34.495643615722656, "ref_logps/rejected": -43.47510528564453, "rewards/accuracies": 0.75, "rewards/chosen": -0.20367702841758728, "rewards/margins": 0.3454746603965759, "rewards/rejected": -0.5491517186164856, "step": 453 }, { "epoch": 0.43, "grad_norm": 25.92202296410594, "learning_rate": 4.835589913358764e-07, "logps/chosen": -34.65194320678711, "logps/rejected": -46.491729736328125, "loss": 0.6591, "losses/dpo": 1.2085367441177368, "losses/sft": 1.164869785308838, "losses/total": 1.2085367441177368, "ref_logps/chosen": -31.18935203552246, "ref_logps/rejected": -41.63543701171875, "rewards/accuracies": 0.75, "rewards/chosen": -0.34625929594039917, "rewards/margins": 0.1393701136112213, "rewards/rejected": -0.48562946915626526, "step": 454 }, { "epoch": 0.43, "grad_norm": 19.048102158846945, "learning_rate": 4.834680411841366e-07, "logps/chosen": -36.835506439208984, "logps/rejected": -40.33917236328125, "loss": 0.5737, "losses/dpo": 0.15573246777057648, "losses/sft": 1.400941252708435, "losses/total": 0.15573246777057648, "ref_logps/chosen": -35.36465835571289, "ref_logps/rejected": -35.502376556396484, "rewards/accuracies": 0.75, "rewards/chosen": -0.14708518981933594, "rewards/margins": 0.3365944027900696, "rewards/rejected": -0.48367956280708313, "step": 455 }, { "epoch": 0.43, "grad_norm": 21.96332551418456, "learning_rate": 4.833768487630833e-07, "logps/chosen": -27.67875862121582, "logps/rejected": -34.79352951049805, "loss": 0.6373, "losses/dpo": 0.33320969343185425, "losses/sft": 0.24470771849155426, "losses/total": 0.33320969343185425, "ref_logps/chosen": -25.361328125, "ref_logps/rejected": -31.07276153564453, "rewards/accuracies": 0.625, "rewards/chosen": -0.23174308240413666, "rewards/margins": 0.14033383131027222, "rewards/rejected": -0.37207692861557007, "step": 456 }, { "epoch": 0.43, "grad_norm": 21.990297610309167, "learning_rate": 4.832854141673467e-07, "logps/chosen": -34.17192077636719, "logps/rejected": -41.03984069824219, "loss": 0.6247, "losses/dpo": 0.5049231052398682, "losses/sft": 0.6905831098556519, "losses/total": 0.5049231052398682, "ref_logps/chosen": -31.570650100708008, "ref_logps/rejected": -35.51313781738281, "rewards/accuracies": 0.75, "rewards/chosen": -0.26012688875198364, "rewards/margins": 0.29254308342933655, "rewards/rejected": -0.5526700019836426, "step": 457 }, { "epoch": 0.43, "grad_norm": 20.049239947723684, "learning_rate": 4.831937374918083e-07, "logps/chosen": -22.5571346282959, "logps/rejected": -29.972900390625, "loss": 0.6467, "losses/dpo": 0.8807029724121094, "losses/sft": 0.8109902739524841, "losses/total": 0.8807029724121094, "ref_logps/chosen": -19.81479835510254, "ref_logps/rejected": -26.00480842590332, "rewards/accuracies": 0.5625, "rewards/chosen": -0.27423352003097534, "rewards/margins": 0.12257583439350128, "rewards/rejected": -0.39680933952331543, "step": 458 }, { "epoch": 0.43, "grad_norm": 22.30952775746806, "learning_rate": 4.83101818831601e-07, "logps/chosen": -30.730594635009766, "logps/rejected": -46.00578689575195, "loss": 0.5615, "losses/dpo": 0.3812459409236908, "losses/sft": 0.973388135433197, "losses/total": 0.3812459409236908, "ref_logps/chosen": -28.506275177001953, "ref_logps/rejected": -39.92744445800781, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22243212163448334, "rewards/margins": 0.3854019045829773, "rewards/rejected": -0.6078340411186218, "step": 459 }, { "epoch": 0.43, "grad_norm": 20.776009731784107, "learning_rate": 4.830096582821083e-07, "logps/chosen": -31.131425857543945, "logps/rejected": -36.031455993652344, "loss": 0.5854, "losses/dpo": 0.5086708068847656, "losses/sft": 1.1525204181671143, "losses/total": 0.5086708068847656, "ref_logps/chosen": -27.964733123779297, "ref_logps/rejected": -30.00271987915039, "rewards/accuracies": 0.6875, "rewards/chosen": -0.316669225692749, "rewards/margins": 0.28620412945747375, "rewards/rejected": -0.6028733849525452, "step": 460 }, { "epoch": 0.43, "grad_norm": 22.118557762259485, "learning_rate": 4.82917255938965e-07, "logps/chosen": -29.39324378967285, "logps/rejected": -45.37220001220703, "loss": 0.618, "losses/dpo": 0.9607857465744019, "losses/sft": 1.0486971139907837, "losses/total": 0.9607857465744019, "ref_logps/chosen": -26.32530975341797, "ref_logps/rejected": -40.040794372558594, "rewards/accuracies": 0.625, "rewards/chosen": -0.3067933917045593, "rewards/margins": 0.2263467013835907, "rewards/rejected": -0.5331401228904724, "step": 461 }, { "epoch": 0.44, "grad_norm": 21.734702148880668, "learning_rate": 4.828246118980571e-07, "logps/chosen": -32.119407653808594, "logps/rejected": -36.80213165283203, "loss": 0.5942, "losses/dpo": 0.6136451363563538, "losses/sft": 1.8078876733779907, "losses/total": 0.6136451363563538, "ref_logps/chosen": -29.60222625732422, "ref_logps/rejected": -31.39507293701172, "rewards/accuracies": 0.75, "rewards/chosen": -0.2517182230949402, "rewards/margins": 0.2889874279499054, "rewards/rejected": -0.540705680847168, "step": 462 }, { "epoch": 0.44, "grad_norm": 17.419009789091636, "learning_rate": 4.82731726255521e-07, "logps/chosen": -23.641727447509766, "logps/rejected": -35.954776763916016, "loss": 0.5447, "losses/dpo": 0.6672338843345642, "losses/sft": 1.0417619943618774, "losses/total": 0.6672338843345642, "ref_logps/chosen": -21.795013427734375, "ref_logps/rejected": -30.165302276611328, "rewards/accuracies": 0.8125, "rewards/chosen": -0.18467119336128235, "rewards/margins": 0.39427635073661804, "rewards/rejected": -0.5789475440979004, "step": 463 }, { "epoch": 0.44, "grad_norm": 20.558893716119247, "learning_rate": 4.826385991077438e-07, "logps/chosen": -25.923625946044922, "logps/rejected": -33.847320556640625, "loss": 0.589, "losses/dpo": 0.3828663229942322, "losses/sft": 0.497933954000473, "losses/total": 0.3828663229942322, "ref_logps/chosen": -24.524076461791992, "ref_logps/rejected": -29.65721893310547, "rewards/accuracies": 0.8125, "rewards/chosen": -0.13995490968227386, "rewards/margins": 0.27905523777008057, "rewards/rejected": -0.41901013255119324, "step": 464 }, { "epoch": 0.44, "grad_norm": 22.621051490727158, "learning_rate": 4.825452305513636e-07, "logps/chosen": -30.455610275268555, "logps/rejected": -36.673423767089844, "loss": 0.6237, "losses/dpo": 0.4364307224750519, "losses/sft": 0.5293335318565369, "losses/total": 0.4364307224750519, "ref_logps/chosen": -26.933141708374023, "ref_logps/rejected": -31.148807525634766, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3522467613220215, "rewards/margins": 0.20021480321884155, "rewards/rejected": -0.5524616241455078, "step": 465 }, { "epoch": 0.44, "grad_norm": 20.369168481469202, "learning_rate": 4.824516206832687e-07, "logps/chosen": -25.953964233398438, "logps/rejected": -33.661285400390625, "loss": 0.6122, "losses/dpo": 0.720572829246521, "losses/sft": 1.021852731704712, "losses/total": 0.720572829246521, "ref_logps/chosen": -23.76584243774414, "ref_logps/rejected": -29.035354614257812, "rewards/accuracies": 0.625, "rewards/chosen": -0.21881236135959625, "rewards/margins": 0.2437806874513626, "rewards/rejected": -0.46259304881095886, "step": 466 }, { "epoch": 0.44, "grad_norm": 25.184305087272786, "learning_rate": 4.823577696005979e-07, "logps/chosen": -33.3287467956543, "logps/rejected": -44.754798889160156, "loss": 0.6185, "losses/dpo": 0.5393975973129272, "losses/sft": 0.6021174788475037, "losses/total": 0.5393975973129272, "ref_logps/chosen": -30.971261978149414, "ref_logps/rejected": -40.091468811035156, "rewards/accuracies": 0.75, "rewards/chosen": -0.2357485294342041, "rewards/margins": 0.23058468103408813, "rewards/rejected": -0.46633321046829224, "step": 467 }, { "epoch": 0.44, "grad_norm": 16.61153143176096, "learning_rate": 4.822636774007399e-07, "logps/chosen": -26.14720916748047, "logps/rejected": -49.85420608520508, "loss": 0.4953, "losses/dpo": 0.5646136999130249, "losses/sft": 0.2426612824201584, "losses/total": 0.5646136999130249, "ref_logps/chosen": -24.576026916503906, "ref_logps/rejected": -41.82843780517578, "rewards/accuracies": 0.75, "rewards/chosen": -0.1571182906627655, "rewards/margins": 0.6454586982727051, "rewards/rejected": -0.8025769591331482, "step": 468 }, { "epoch": 0.44, "grad_norm": 22.137793168003476, "learning_rate": 4.821693441813345e-07, "logps/chosen": -29.99382781982422, "logps/rejected": -36.683475494384766, "loss": 0.5872, "losses/dpo": 0.4714118540287018, "losses/sft": 0.7136156558990479, "losses/total": 0.4714118540287018, "ref_logps/chosen": -26.27260971069336, "ref_logps/rejected": -29.965280532836914, "rewards/accuracies": 0.625, "rewards/chosen": -0.3721217215061188, "rewards/margins": 0.29969778656959534, "rewards/rejected": -0.6718195080757141, "step": 469 }, { "epoch": 0.44, "grad_norm": 17.996130039178198, "learning_rate": 4.820747700402709e-07, "logps/chosen": -28.60187530517578, "logps/rejected": -34.78827667236328, "loss": 0.5204, "losses/dpo": 0.334128201007843, "losses/sft": 1.0335664749145508, "losses/total": 0.334128201007843, "ref_logps/chosen": -26.914846420288086, "ref_logps/rejected": -28.330413818359375, "rewards/accuracies": 0.75, "rewards/chosen": -0.1687030792236328, "rewards/margins": 0.4770832657814026, "rewards/rejected": -0.6457863450050354, "step": 470 }, { "epoch": 0.44, "grad_norm": 24.29590350924425, "learning_rate": 4.819799550756884e-07, "logps/chosen": -32.35297775268555, "logps/rejected": -37.939476013183594, "loss": 0.677, "losses/dpo": 1.4558851718902588, "losses/sft": 1.3116999864578247, "losses/total": 1.4558851718902588, "ref_logps/chosen": -29.334449768066406, "ref_logps/rejected": -33.90156555175781, "rewards/accuracies": 0.6875, "rewards/chosen": -0.30185258388519287, "rewards/margins": 0.101938396692276, "rewards/rejected": -0.40379098057746887, "step": 471 }, { "epoch": 0.45, "grad_norm": 17.819419599715637, "learning_rate": 4.818848993859764e-07, "logps/chosen": -22.76861000061035, "logps/rejected": -28.216163635253906, "loss": 0.6252, "losses/dpo": 0.36636897921562195, "losses/sft": 0.6636195182800293, "losses/total": 0.36636897921562195, "ref_logps/chosen": -20.885114669799805, "ref_logps/rejected": -24.271217346191406, "rewards/accuracies": 0.5, "rewards/chosen": -0.18834969401359558, "rewards/margins": 0.20614516735076904, "rewards/rejected": -0.3944948613643646, "step": 472 }, { "epoch": 0.45, "grad_norm": 21.94740955910436, "learning_rate": 4.817896030697739e-07, "logps/chosen": -34.08820343017578, "logps/rejected": -39.33507537841797, "loss": 0.5671, "losses/dpo": 0.5589925050735474, "losses/sft": 1.2257827520370483, "losses/total": 0.5589925050735474, "ref_logps/chosen": -31.153423309326172, "ref_logps/rejected": -32.43218231201172, "rewards/accuracies": 0.8125, "rewards/chosen": -0.29347795248031616, "rewards/margins": 0.3968115746974945, "rewards/rejected": -0.6902894973754883, "step": 473 }, { "epoch": 0.45, "grad_norm": 19.851988721111635, "learning_rate": 4.816940662259697e-07, "logps/chosen": -31.1102352142334, "logps/rejected": -43.18267822265625, "loss": 0.5116, "losses/dpo": 0.4476759135723114, "losses/sft": 0.6574519872665405, "losses/total": 0.4476759135723114, "ref_logps/chosen": -29.328529357910156, "ref_logps/rejected": -36.93198013305664, "rewards/accuracies": 0.875, "rewards/chosen": -0.17817038297653198, "rewards/margins": 0.4468995928764343, "rewards/rejected": -0.6250699758529663, "step": 474 }, { "epoch": 0.45, "grad_norm": 19.10271978074273, "learning_rate": 4.815982889537024e-07, "logps/chosen": -25.716449737548828, "logps/rejected": -30.66345977783203, "loss": 0.5694, "losses/dpo": 0.5850218534469604, "losses/sft": 1.0269662141799927, "losses/total": 0.5850218534469604, "ref_logps/chosen": -24.17403221130371, "ref_logps/rejected": -26.045364379882812, "rewards/accuracies": 0.6875, "rewards/chosen": -0.15424171090126038, "rewards/margins": 0.3075675368309021, "rewards/rejected": -0.46180927753448486, "step": 475 }, { "epoch": 0.45, "grad_norm": 20.2381836650768, "learning_rate": 4.815022713523596e-07, "logps/chosen": -31.978347778320312, "logps/rejected": -54.035011291503906, "loss": 0.4944, "losses/dpo": 0.13049449026584625, "losses/sft": 0.9774156212806702, "losses/total": 0.13049449026584625, "ref_logps/chosen": -29.216815948486328, "ref_logps/rejected": -45.9149284362793, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2761533856391907, "rewards/margins": 0.5358549952507019, "rewards/rejected": -0.8120083808898926, "step": 476 }, { "epoch": 0.45, "grad_norm": 25.427153888522604, "learning_rate": 4.814060135215788e-07, "logps/chosen": -43.19718933105469, "logps/rejected": -32.45085906982422, "loss": 0.6664, "losses/dpo": 0.730678915977478, "losses/sft": 1.556956171989441, "losses/total": 0.730678915977478, "ref_logps/chosen": -39.33873748779297, "ref_logps/rejected": -27.570568084716797, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3858451843261719, "rewards/margins": 0.10218357294797897, "rewards/rejected": -0.48802876472473145, "step": 477 }, { "epoch": 0.45, "grad_norm": 19.3795115340229, "learning_rate": 4.813095155612464e-07, "logps/chosen": -32.64534378051758, "logps/rejected": -35.97815704345703, "loss": 0.5134, "losses/dpo": 0.2905964255332947, "losses/sft": 0.29150792956352234, "losses/total": 0.2905964255332947, "ref_logps/chosen": -30.663280487060547, "ref_logps/rejected": -29.58361053466797, "rewards/accuracies": 0.875, "rewards/chosen": -0.19820629060268402, "rewards/margins": 0.44124844670295715, "rewards/rejected": -0.63945472240448, "step": 478 }, { "epoch": 0.45, "grad_norm": 16.997521704283283, "learning_rate": 4.812127775714982e-07, "logps/chosen": -20.965530395507812, "logps/rejected": -30.074790954589844, "loss": 0.578, "losses/dpo": 0.7187721133232117, "losses/sft": 0.16461709141731262, "losses/total": 0.7187721133232117, "ref_logps/chosen": -19.389339447021484, "ref_logps/rejected": -25.12456512451172, "rewards/accuracies": 0.625, "rewards/chosen": -0.15761896967887878, "rewards/margins": 0.33740371465682983, "rewards/rejected": -0.495022714138031, "step": 479 }, { "epoch": 0.45, "grad_norm": 19.798387905328273, "learning_rate": 4.811157996527191e-07, "logps/chosen": -36.02048873901367, "logps/rejected": -35.85890197753906, "loss": 0.5653, "losses/dpo": 0.1819681078195572, "losses/sft": 1.0323545932769775, "losses/total": 0.1819681078195572, "ref_logps/chosen": -31.936655044555664, "ref_logps/rejected": -27.9410343170166, "rewards/accuracies": 0.6875, "rewards/chosen": -0.408383309841156, "rewards/margins": 0.3834034204483032, "rewards/rejected": -0.7917868494987488, "step": 480 }, { "epoch": 0.45, "grad_norm": 20.70847366563875, "learning_rate": 4.810185819055427e-07, "logps/chosen": -26.587875366210938, "logps/rejected": -47.18013000488281, "loss": 0.5668, "losses/dpo": 0.6423084735870361, "losses/sft": 1.2156766653060913, "losses/total": 0.6423084735870361, "ref_logps/chosen": -24.395238876342773, "ref_logps/rejected": -41.65867233276367, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21926358342170715, "rewards/margins": 0.3328820466995239, "rewards/rejected": -0.5521456003189087, "step": 481 }, { "epoch": 0.45, "grad_norm": 23.012053434414174, "learning_rate": 4.809211244308518e-07, "logps/chosen": -36.458251953125, "logps/rejected": -39.479957580566406, "loss": 0.6039, "losses/dpo": 0.7587834000587463, "losses/sft": 0.5426580905914307, "losses/total": 0.7587834000587463, "ref_logps/chosen": -33.00156784057617, "ref_logps/rejected": -33.103939056396484, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34566906094551086, "rewards/margins": 0.2919326424598694, "rewards/rejected": -0.6376016736030579, "step": 482 }, { "epoch": 0.46, "grad_norm": 21.7736116353144, "learning_rate": 4.808234273297778e-07, "logps/chosen": -36.26436996459961, "logps/rejected": -48.39561080932617, "loss": 0.526, "losses/dpo": 0.5681625604629517, "losses/sft": 1.3260248899459839, "losses/total": 0.5681625604629517, "ref_logps/chosen": -33.64049530029297, "ref_logps/rejected": -40.862037658691406, "rewards/accuracies": 0.625, "rewards/chosen": -0.26238739490509033, "rewards/margins": 0.4909699857234955, "rewards/rejected": -0.7533572912216187, "step": 483 }, { "epoch": 0.46, "grad_norm": 22.082638591607644, "learning_rate": 4.807254907037007e-07, "logps/chosen": -27.39023208618164, "logps/rejected": -38.569698333740234, "loss": 0.662, "losses/dpo": 0.6594560742378235, "losses/sft": 0.4606875479221344, "losses/total": 0.6594560742378235, "ref_logps/chosen": -24.202131271362305, "ref_logps/rejected": -34.368263244628906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.31881025433540344, "rewards/margins": 0.1013331413269043, "rewards/rejected": -0.4201434254646301, "step": 484 }, { "epoch": 0.46, "grad_norm": 19.116379446925222, "learning_rate": 4.806273146542492e-07, "logps/chosen": -25.205760955810547, "logps/rejected": -37.51335906982422, "loss": 0.5523, "losses/dpo": 0.5540833473205566, "losses/sft": 1.1509922742843628, "losses/total": 0.5540833473205566, "ref_logps/chosen": -23.062864303588867, "ref_logps/rejected": -31.771114349365234, "rewards/accuracies": 0.875, "rewards/chosen": -0.21428969502449036, "rewards/margins": 0.3599349856376648, "rewards/rejected": -0.5742246508598328, "step": 485 }, { "epoch": 0.46, "grad_norm": 20.466260983589965, "learning_rate": 4.805288992833004e-07, "logps/chosen": -37.530277252197266, "logps/rejected": -47.14744186401367, "loss": 0.5029, "losses/dpo": 0.2410556823015213, "losses/sft": 1.0101735591888428, "losses/total": 0.2410556823015213, "ref_logps/chosen": -33.12394714355469, "ref_logps/rejected": -37.3004150390625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4406331479549408, "rewards/margins": 0.5440695881843567, "rewards/rejected": -0.9847027063369751, "step": 486 }, { "epoch": 0.46, "grad_norm": 22.359351400551443, "learning_rate": 4.804302446929798e-07, "logps/chosen": -33.52158737182617, "logps/rejected": -39.432533264160156, "loss": 0.6225, "losses/dpo": 1.0031092166900635, "losses/sft": 1.3977686166763306, "losses/total": 1.0031092166900635, "ref_logps/chosen": -29.761838912963867, "ref_logps/rejected": -32.671409606933594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.37597453594207764, "rewards/margins": 0.30013763904571533, "rewards/rejected": -0.6761122941970825, "step": 487 }, { "epoch": 0.46, "grad_norm": 22.74557947394786, "learning_rate": 4.80331350985661e-07, "logps/chosen": -31.04314422607422, "logps/rejected": -41.38746643066406, "loss": 0.6208, "losses/dpo": 0.5111294984817505, "losses/sft": 1.2946304082870483, "losses/total": 0.5111294984817505, "ref_logps/chosen": -25.973636627197266, "ref_logps/rejected": -33.791221618652344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5069506168365479, "rewards/margins": 0.25267407298088074, "rewards/rejected": -0.759624719619751, "step": 488 }, { "epoch": 0.46, "grad_norm": 19.980839747981626, "learning_rate": 4.802322182639657e-07, "logps/chosen": -26.50829315185547, "logps/rejected": -44.20970916748047, "loss": 0.5206, "losses/dpo": 0.6182718276977539, "losses/sft": 0.5141971707344055, "losses/total": 0.6182718276977539, "ref_logps/chosen": -23.866548538208008, "ref_logps/rejected": -36.01105880737305, "rewards/accuracies": 0.8125, "rewards/chosen": -0.26417452096939087, "rewards/margins": 0.5556905269622803, "rewards/rejected": -0.8198649883270264, "step": 489 }, { "epoch": 0.46, "grad_norm": 23.60522664255658, "learning_rate": 4.801328466307638e-07, "logps/chosen": -23.37128448486328, "logps/rejected": -35.45518493652344, "loss": 0.6618, "losses/dpo": 0.722160816192627, "losses/sft": 1.0314329862594604, "losses/total": 0.722160816192627, "ref_logps/chosen": -20.744808197021484, "ref_logps/rejected": -31.303958892822266, "rewards/accuracies": 0.5, "rewards/chosen": -0.26264774799346924, "rewards/margins": 0.15247464179992676, "rewards/rejected": -0.415122389793396, "step": 490 }, { "epoch": 0.46, "grad_norm": 21.250412188804447, "learning_rate": 4.800332361891732e-07, "logps/chosen": -30.405906677246094, "logps/rejected": -49.63224792480469, "loss": 0.5666, "losses/dpo": 0.2999174892902374, "losses/sft": 1.4613745212554932, "losses/total": 0.2999174892902374, "ref_logps/chosen": -26.69738006591797, "ref_logps/rejected": -42.75941848754883, "rewards/accuracies": 0.6875, "rewards/chosen": -0.37085282802581787, "rewards/margins": 0.3164300322532654, "rewards/rejected": -0.6872828602790833, "step": 491 }, { "epoch": 0.46, "grad_norm": 21.007169977243258, "learning_rate": 4.799333870425592e-07, "logps/chosen": -27.50531578063965, "logps/rejected": -40.62626647949219, "loss": 0.5757, "losses/dpo": 0.37598326802253723, "losses/sft": 1.2516216039657593, "losses/total": 0.37598326802253723, "ref_logps/chosen": -23.818523406982422, "ref_logps/rejected": -33.940242767333984, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3686794638633728, "rewards/margins": 0.2999231517314911, "rewards/rejected": -0.6686025857925415, "step": 492 }, { "epoch": 0.47, "grad_norm": 18.912417009966834, "learning_rate": 4.798332992945352e-07, "logps/chosen": -26.737110137939453, "logps/rejected": -40.45068359375, "loss": 0.5074, "losses/dpo": 0.2886115610599518, "losses/sft": 0.9665390849113464, "losses/total": 0.2886115610599518, "ref_logps/chosen": -25.319332122802734, "ref_logps/rejected": -34.243709564208984, "rewards/accuracies": 0.8125, "rewards/chosen": -0.14177784323692322, "rewards/margins": 0.47891929745674133, "rewards/rejected": -0.6206971406936646, "step": 493 }, { "epoch": 0.47, "grad_norm": 24.375277193614302, "learning_rate": 4.79732973048962e-07, "logps/chosen": -25.077733993530273, "logps/rejected": -31.502710342407227, "loss": 0.7251, "losses/dpo": 0.6987410187721252, "losses/sft": 0.6126065850257874, "losses/total": 0.6987410187721252, "ref_logps/chosen": -21.261688232421875, "ref_logps/rejected": -27.346576690673828, "rewards/accuracies": 0.4375, "rewards/chosen": -0.38160446286201477, "rewards/margins": 0.03400897979736328, "rewards/rejected": -0.41561347246170044, "step": 494 }, { "epoch": 0.47, "grad_norm": 22.03281217562161, "learning_rate": 4.79632408409948e-07, "logps/chosen": -28.803396224975586, "logps/rejected": -36.924827575683594, "loss": 0.619, "losses/dpo": 0.695277750492096, "losses/sft": 0.6059163808822632, "losses/total": 0.695277750492096, "ref_logps/chosen": -25.15566635131836, "ref_logps/rejected": -30.723304748535156, "rewards/accuracies": 0.625, "rewards/chosen": -0.3647730350494385, "rewards/margins": 0.2553790509700775, "rewards/rejected": -0.6201520562171936, "step": 495 }, { "epoch": 0.47, "grad_norm": 23.522595724560716, "learning_rate": 4.795316054818489e-07, "logps/chosen": -36.71510314941406, "logps/rejected": -37.88941955566406, "loss": 0.6063, "losses/dpo": 1.3068509101867676, "losses/sft": 1.9345382452011108, "losses/total": 1.3068509101867676, "ref_logps/chosen": -32.31220245361328, "ref_logps/rejected": -30.62557029724121, "rewards/accuracies": 0.75, "rewards/chosen": -0.4402899146080017, "rewards/margins": 0.28609487414360046, "rewards/rejected": -0.7263847589492798, "step": 496 }, { "epoch": 0.47, "grad_norm": 23.00512874757826, "learning_rate": 4.794305643692676e-07, "logps/chosen": -28.658899307250977, "logps/rejected": -37.63408660888672, "loss": 0.611, "losses/dpo": 0.743156373500824, "losses/sft": 1.4323649406433105, "losses/total": 0.743156373500824, "ref_logps/chosen": -24.759206771850586, "ref_logps/rejected": -31.23900604248047, "rewards/accuracies": 0.5, "rewards/chosen": -0.3899691700935364, "rewards/margins": 0.24953874945640564, "rewards/rejected": -0.6395078897476196, "step": 497 }, { "epoch": 0.47, "grad_norm": 26.178374930782383, "learning_rate": 4.793292851770545e-07, "logps/chosen": -36.98802947998047, "logps/rejected": -45.042484283447266, "loss": 0.6331, "losses/dpo": 0.27946239709854126, "losses/sft": 1.2566426992416382, "losses/total": 0.27946239709854126, "ref_logps/chosen": -33.05995178222656, "ref_logps/rejected": -38.93373107910156, "rewards/accuracies": 0.625, "rewards/chosen": -0.39280784130096436, "rewards/margins": 0.21806704998016357, "rewards/rejected": -0.6108748912811279, "step": 498 }, { "epoch": 0.47, "grad_norm": 18.72119888127307, "learning_rate": 4.792277680103065e-07, "logps/chosen": -27.378597259521484, "logps/rejected": -38.424072265625, "loss": 0.5729, "losses/dpo": 0.3033331334590912, "losses/sft": 0.30590224266052246, "losses/total": 0.3033331334590912, "ref_logps/chosen": -25.028728485107422, "ref_logps/rejected": -32.61304473876953, "rewards/accuracies": 0.75, "rewards/chosen": -0.2349868267774582, "rewards/margins": 0.34611597657203674, "rewards/rejected": -0.5811027884483337, "step": 499 }, { "epoch": 0.47, "grad_norm": 20.093938810007597, "learning_rate": 4.79126012974368e-07, "logps/chosen": -28.19216537475586, "logps/rejected": -49.57545471191406, "loss": 0.554, "losses/dpo": 0.7035272717475891, "losses/sft": 1.2444604635238647, "losses/total": 0.7035272717475891, "ref_logps/chosen": -25.10454559326172, "ref_logps/rejected": -41.956504821777344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.30876192450523376, "rewards/margins": 0.4531335234642029, "rewards/rejected": -0.7618954181671143, "step": 500 }, { "epoch": 0.47, "grad_norm": 21.988314263609805, "learning_rate": 4.790240201748299e-07, "logps/chosen": -30.440486907958984, "logps/rejected": -36.01658630371094, "loss": 0.6774, "losses/dpo": 0.7791270613670349, "losses/sft": 1.439986228942871, "losses/total": 0.7791270613670349, "ref_logps/chosen": -27.358091354370117, "ref_logps/rejected": -31.810543060302734, "rewards/accuracies": 0.625, "rewards/chosen": -0.30823975801467896, "rewards/margins": 0.11236443370580673, "rewards/rejected": -0.4206041693687439, "step": 501 }, { "epoch": 0.47, "grad_norm": 23.31461114429232, "learning_rate": 4.789217897175299e-07, "logps/chosen": -25.91265106201172, "logps/rejected": -40.95402145385742, "loss": 0.6342, "losses/dpo": 0.5292296409606934, "losses/sft": 0.10884812474250793, "losses/total": 0.5292296409606934, "ref_logps/chosen": -21.608219146728516, "ref_logps/rejected": -34.688621520996094, "rewards/accuracies": 0.5, "rewards/chosen": -0.43044328689575195, "rewards/margins": 0.1960965394973755, "rewards/rejected": -0.6265398263931274, "step": 502 }, { "epoch": 0.47, "grad_norm": 18.212370085607606, "learning_rate": 4.788193217085523e-07, "logps/chosen": -28.37285614013672, "logps/rejected": -34.89739227294922, "loss": 0.5494, "losses/dpo": 0.585126519203186, "losses/sft": 1.1880251169204712, "losses/total": 0.585126519203186, "ref_logps/chosen": -27.217758178710938, "ref_logps/rejected": -30.17632293701172, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11550988256931305, "rewards/margins": 0.35659703612327576, "rewards/rejected": -0.4721069633960724, "step": 503 }, { "epoch": 0.48, "grad_norm": 20.88208586168674, "learning_rate": 4.787166162542281e-07, "logps/chosen": -36.64181900024414, "logps/rejected": -46.829429626464844, "loss": 0.5637, "losses/dpo": 0.8199678659439087, "losses/sft": 1.2780903577804565, "losses/total": 0.8199678659439087, "ref_logps/chosen": -32.10833740234375, "ref_logps/rejected": -38.11437225341797, "rewards/accuracies": 0.625, "rewards/chosen": -0.45334792137145996, "rewards/margins": 0.4181579351425171, "rewards/rejected": -0.871505856513977, "step": 504 }, { "epoch": 0.48, "grad_norm": 19.522565561435066, "learning_rate": 4.786136734611345e-07, "logps/chosen": -25.913551330566406, "logps/rejected": -37.72145462036133, "loss": 0.5801, "losses/dpo": 0.5296852588653564, "losses/sft": 0.06710104644298553, "losses/total": 0.5296852588653564, "ref_logps/chosen": -22.520673751831055, "ref_logps/rejected": -30.69173812866211, "rewards/accuracies": 0.8125, "rewards/chosen": -0.33928772807121277, "rewards/margins": 0.36368387937545776, "rewards/rejected": -0.7029715776443481, "step": 505 }, { "epoch": 0.48, "grad_norm": 19.81019868057095, "learning_rate": 4.785104934360948e-07, "logps/chosen": -25.22217559814453, "logps/rejected": -45.522483825683594, "loss": 0.5796, "losses/dpo": 0.5914543867111206, "losses/sft": 0.12926346063613892, "losses/total": 0.5914543867111206, "ref_logps/chosen": -20.65833282470703, "ref_logps/rejected": -37.296958923339844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.45638442039489746, "rewards/margins": 0.3661682605743408, "rewards/rejected": -0.8225526809692383, "step": 506 }, { "epoch": 0.48, "grad_norm": 24.713069958290916, "learning_rate": 4.784070762861791e-07, "logps/chosen": -35.44572830200195, "logps/rejected": -40.795204162597656, "loss": 0.6624, "losses/dpo": 0.674668550491333, "losses/sft": 0.6105311512947083, "losses/total": 0.674668550491333, "ref_logps/chosen": -29.845197677612305, "ref_logps/rejected": -33.57854461669922, "rewards/accuracies": 0.625, "rewards/chosen": -0.5600532293319702, "rewards/margins": 0.16161221265792847, "rewards/rejected": -0.7216654419898987, "step": 507 }, { "epoch": 0.48, "grad_norm": 16.61115837266059, "learning_rate": 4.783034221187027e-07, "logps/chosen": -26.411640167236328, "logps/rejected": -41.01521301269531, "loss": 0.4782, "losses/dpo": 0.6890736222267151, "losses/sft": 1.0539213418960571, "losses/total": 0.6890736222267151, "ref_logps/chosen": -22.447181701660156, "ref_logps/rejected": -31.17924690246582, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3964458703994751, "rewards/margins": 0.5871506333351135, "rewards/rejected": -0.9835965633392334, "step": 508 }, { "epoch": 0.48, "grad_norm": 20.00843776838207, "learning_rate": 4.781995310412278e-07, "logps/chosen": -23.69526481628418, "logps/rejected": -40.566062927246094, "loss": 0.622, "losses/dpo": 0.6856530904769897, "losses/sft": 1.3462116718292236, "losses/total": 0.6856530904769897, "ref_logps/chosen": -20.0004825592041, "ref_logps/rejected": -34.80134582519531, "rewards/accuracies": 0.625, "rewards/chosen": -0.3694782853126526, "rewards/margins": 0.20699310302734375, "rewards/rejected": -0.5764713883399963, "step": 509 }, { "epoch": 0.48, "grad_norm": 20.579725499848365, "learning_rate": 4.780954031615617e-07, "logps/chosen": -27.910598754882812, "logps/rejected": -42.354042053222656, "loss": 0.5262, "losses/dpo": 0.6522693634033203, "losses/sft": 0.9252116680145264, "losses/total": 0.6522693634033203, "ref_logps/chosen": -24.14780616760254, "ref_logps/rejected": -34.26959228515625, "rewards/accuracies": 0.875, "rewards/chosen": -0.3762793242931366, "rewards/margins": 0.43216589093208313, "rewards/rejected": -0.8084452152252197, "step": 510 }, { "epoch": 0.48, "grad_norm": 24.36190292440097, "learning_rate": 4.779910385877577e-07, "logps/chosen": -28.16088104248047, "logps/rejected": -39.21577835083008, "loss": 0.6085, "losses/dpo": 0.728076159954071, "losses/sft": 0.8462598919868469, "losses/total": 0.728076159954071, "ref_logps/chosen": -23.9445743560791, "ref_logps/rejected": -32.16504669189453, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4216305613517761, "rewards/margins": 0.28344249725341797, "rewards/rejected": -0.7050730586051941, "step": 511 }, { "epoch": 0.48, "grad_norm": 21.524832000705405, "learning_rate": 4.778864374281146e-07, "logps/chosen": -31.833377838134766, "logps/rejected": -42.37372970581055, "loss": 0.5825, "losses/dpo": 0.46099185943603516, "losses/sft": 1.4541256427764893, "losses/total": 0.46099185943603516, "ref_logps/chosen": -26.948040008544922, "ref_logps/rejected": -34.271881103515625, "rewards/accuracies": 0.75, "rewards/chosen": -0.4885336756706238, "rewards/margins": 0.3216516375541687, "rewards/rejected": -0.8101853132247925, "step": 512 }, { "epoch": 0.48, "grad_norm": 26.999658274290358, "learning_rate": 4.77781599791177e-07, "logps/chosen": -34.7181510925293, "logps/rejected": -40.201690673828125, "loss": 0.6907, "losses/dpo": 0.5131183862686157, "losses/sft": 0.42322930693626404, "losses/total": 0.5131183862686157, "ref_logps/chosen": -28.68212127685547, "ref_logps/rejected": -33.030513763427734, "rewards/accuracies": 0.5, "rewards/chosen": -0.6036028861999512, "rewards/margins": 0.11351491510868073, "rewards/rejected": -0.7171178460121155, "step": 513 }, { "epoch": 0.48, "grad_norm": 28.054017509660003, "learning_rate": 4.776765257857347e-07, "logps/chosen": -42.433292388916016, "logps/rejected": -43.05769729614258, "loss": 0.6544, "losses/dpo": 1.2111997604370117, "losses/sft": 1.4654971361160278, "losses/total": 1.2111997604370117, "ref_logps/chosen": -38.76290512084961, "ref_logps/rejected": -37.68411636352539, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3670389652252197, "rewards/margins": 0.17031894624233246, "rewards/rejected": -0.5373579263687134, "step": 514 }, { "epoch": 0.49, "grad_norm": 18.04367140228946, "learning_rate": 4.775712155208227e-07, "logps/chosen": -24.52401351928711, "logps/rejected": -34.571163177490234, "loss": 0.5463, "losses/dpo": 0.6503860950469971, "losses/sft": 0.8658599257469177, "losses/total": 0.6503860950469971, "ref_logps/chosen": -21.327999114990234, "ref_logps/rejected": -26.858497619628906, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3196014165878296, "rewards/margins": 0.45166510343551636, "rewards/rejected": -0.7712665796279907, "step": 515 }, { "epoch": 0.49, "grad_norm": 22.671744053485842, "learning_rate": 4.774656691057213e-07, "logps/chosen": -41.34471130371094, "logps/rejected": -38.50638961791992, "loss": 0.5912, "losses/dpo": 0.6623877286911011, "losses/sft": 0.9984578490257263, "losses/total": 0.6623877286911011, "ref_logps/chosen": -38.115394592285156, "ref_logps/rejected": -32.400108337402344, "rewards/accuracies": 0.75, "rewards/chosen": -0.3229317367076874, "rewards/margins": 0.28769612312316895, "rewards/rejected": -0.6106278896331787, "step": 516 }, { "epoch": 0.49, "grad_norm": 22.978042715340187, "learning_rate": 4.773598866499557e-07, "logps/chosen": -36.68193054199219, "logps/rejected": -62.82994842529297, "loss": 0.5336, "losses/dpo": 0.7640513181686401, "losses/sft": 2.3969545364379883, "losses/total": 0.7640513181686401, "ref_logps/chosen": -31.5223388671875, "ref_logps/rejected": -52.56169891357422, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5159590244293213, "rewards/margins": 0.5108664631843567, "rewards/rejected": -1.0268255472183228, "step": 517 }, { "epoch": 0.49, "grad_norm": 24.66954890403919, "learning_rate": 4.772538682632963e-07, "logps/chosen": -30.725370407104492, "logps/rejected": -46.2544059753418, "loss": 0.5586, "losses/dpo": 0.5972235202789307, "losses/sft": 0.9299275279045105, "losses/total": 0.5972235202789307, "ref_logps/chosen": -26.82027816772461, "ref_logps/rejected": -37.909461975097656, "rewards/accuracies": 0.75, "rewards/chosen": -0.39050906896591187, "rewards/margins": 0.44398537278175354, "rewards/rejected": -0.834494411945343, "step": 518 }, { "epoch": 0.49, "grad_norm": 22.21031831348113, "learning_rate": 4.771476140557581e-07, "logps/chosen": -34.41826629638672, "logps/rejected": -42.40753173828125, "loss": 0.5433, "losses/dpo": 0.8160485625267029, "losses/sft": 1.272996187210083, "losses/total": 0.8160485625267029, "ref_logps/chosen": -30.340150833129883, "ref_logps/rejected": -33.629615783691406, "rewards/accuracies": 0.625, "rewards/chosen": -0.4078115224838257, "rewards/margins": 0.46997934579849243, "rewards/rejected": -0.8777908086776733, "step": 519 }, { "epoch": 0.49, "grad_norm": 24.38943775987301, "learning_rate": 4.770411241376008e-07, "logps/chosen": -30.05590057373047, "logps/rejected": -39.70996856689453, "loss": 0.5824, "losses/dpo": 0.33513396978378296, "losses/sft": 0.8107466101646423, "losses/total": 0.33513396978378296, "ref_logps/chosen": -25.658926010131836, "ref_logps/rejected": -31.499971389770508, "rewards/accuracies": 0.625, "rewards/chosen": -0.4396973252296448, "rewards/margins": 0.38130244612693787, "rewards/rejected": -0.8209997415542603, "step": 520 }, { "epoch": 0.49, "grad_norm": 19.619631045714137, "learning_rate": 4.769343986193288e-07, "logps/chosen": -29.655649185180664, "logps/rejected": -44.76261520385742, "loss": 0.4914, "losses/dpo": 0.7710341215133667, "losses/sft": 1.5414313077926636, "losses/total": 0.7710341215133667, "ref_logps/chosen": -26.800487518310547, "ref_logps/rejected": -35.78729248046875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.28551599383354187, "rewards/margins": 0.6120162010192871, "rewards/rejected": -0.8975322246551514, "step": 521 }, { "epoch": 0.49, "grad_norm": 22.764757876354786, "learning_rate": 4.76827437611691e-07, "logps/chosen": -31.566993713378906, "logps/rejected": -41.050567626953125, "loss": 0.6406, "losses/dpo": 0.6504263877868652, "losses/sft": 0.9759252071380615, "losses/total": 0.6504263877868652, "ref_logps/chosen": -26.495059967041016, "ref_logps/rejected": -34.039451599121094, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5071935653686523, "rewards/margins": 0.1939181685447693, "rewards/rejected": -0.7011117339134216, "step": 522 }, { "epoch": 0.49, "grad_norm": 22.630270515663764, "learning_rate": 4.767202412256807e-07, "logps/chosen": -31.124998092651367, "logps/rejected": -31.956954956054688, "loss": 0.6485, "losses/dpo": 0.6964255571365356, "losses/sft": 0.9617919921875, "losses/total": 0.6964255571365356, "ref_logps/chosen": -27.60205078125, "ref_logps/rejected": -26.72555160522461, "rewards/accuracies": 0.5, "rewards/chosen": -0.35229483246803284, "rewards/margins": 0.17084522545337677, "rewards/rejected": -0.5231400728225708, "step": 523 }, { "epoch": 0.49, "grad_norm": 18.526643539100355, "learning_rate": 4.7661280957253514e-07, "logps/chosen": -31.340707778930664, "logps/rejected": -44.75270080566406, "loss": 0.472, "losses/dpo": 0.3367931544780731, "losses/sft": 0.7648292779922485, "losses/total": 0.3367931544780731, "ref_logps/chosen": -27.66455078125, "ref_logps/rejected": -34.25316619873047, "rewards/accuracies": 0.75, "rewards/chosen": -0.3676157593727112, "rewards/margins": 0.682337760925293, "rewards/rejected": -1.049953579902649, "step": 524 }, { "epoch": 0.5, "grad_norm": 20.712863326280445, "learning_rate": 4.7650514276373613e-07, "logps/chosen": -21.29874038696289, "logps/rejected": -32.86607360839844, "loss": 0.6378, "losses/dpo": 0.855224609375, "losses/sft": 0.9733845591545105, "losses/total": 0.855224609375, "ref_logps/chosen": -17.132991790771484, "ref_logps/rejected": -26.204504013061523, "rewards/accuracies": 0.625, "rewards/chosen": -0.41657477617263794, "rewards/margins": 0.24958235025405884, "rewards/rejected": -0.6661571264266968, "step": 525 }, { "epoch": 0.5, "grad_norm": 26.723948737286307, "learning_rate": 4.7639724091100924e-07, "logps/chosen": -34.48298645019531, "logps/rejected": -40.110809326171875, "loss": 0.7489, "losses/dpo": 1.2118955850601196, "losses/sft": 1.1409279108047485, "losses/total": 1.2118955850601196, "ref_logps/chosen": -28.40555191040039, "ref_logps/rejected": -33.156044006347656, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6077436804771423, "rewards/margins": 0.08773314952850342, "rewards/rejected": -0.6954768300056458, "step": 526 }, { "epoch": 0.5, "grad_norm": 19.459074130686506, "learning_rate": 4.7628910412632397e-07, "logps/chosen": -27.225833892822266, "logps/rejected": -35.33570861816406, "loss": 0.5739, "losses/dpo": 0.7180626392364502, "losses/sft": 0.7959280610084534, "losses/total": 0.7180626392364502, "ref_logps/chosen": -24.107845306396484, "ref_logps/rejected": -28.762706756591797, "rewards/accuracies": 0.625, "rewards/chosen": -0.3117988705635071, "rewards/margins": 0.34550154209136963, "rewards/rejected": -0.6573003530502319, "step": 527 }, { "epoch": 0.5, "grad_norm": 22.28795798400298, "learning_rate": 4.761807325218937e-07, "logps/chosen": -40.69900894165039, "logps/rejected": -45.28719711303711, "loss": 0.5736, "losses/dpo": 0.7603490352630615, "losses/sft": 1.1800177097320557, "losses/total": 0.7603490352630615, "ref_logps/chosen": -35.940181732177734, "ref_logps/rejected": -35.53713607788086, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4758826494216919, "rewards/margins": 0.4991234540939331, "rewards/rejected": -0.975006103515625, "step": 528 }, { "epoch": 0.5, "grad_norm": 21.494074305400137, "learning_rate": 4.760721262101754e-07, "logps/chosen": -28.90043830871582, "logps/rejected": -41.01263427734375, "loss": 0.5382, "losses/dpo": 0.5473275184631348, "losses/sft": 1.4694417715072632, "losses/total": 0.5473275184631348, "ref_logps/chosen": -24.554744720458984, "ref_logps/rejected": -32.09693145751953, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4345695674419403, "rewards/margins": 0.4570005536079407, "rewards/rejected": -0.8915700912475586, "step": 529 }, { "epoch": 0.5, "grad_norm": 20.96533203034334, "learning_rate": 4.7596328530386956e-07, "logps/chosen": -32.8366584777832, "logps/rejected": -42.272640228271484, "loss": 0.5669, "losses/dpo": 0.455208957195282, "losses/sft": 0.479226291179657, "losses/total": 0.455208957195282, "ref_logps/chosen": -29.11563491821289, "ref_logps/rejected": -35.198265075683594, "rewards/accuracies": 0.75, "rewards/chosen": -0.3721022307872772, "rewards/margins": 0.3353354334831238, "rewards/rejected": -0.7074376344680786, "step": 530 }, { "epoch": 0.5, "grad_norm": 21.498460041106057, "learning_rate": 4.7585420991592027e-07, "logps/chosen": -31.290058135986328, "logps/rejected": -37.923057556152344, "loss": 0.6338, "losses/dpo": 0.1958507001399994, "losses/sft": 1.3146361112594604, "losses/total": 0.1958507001399994, "ref_logps/chosen": -28.501684188842773, "ref_logps/rejected": -32.13544845581055, "rewards/accuracies": 0.75, "rewards/chosen": -0.27883732318878174, "rewards/margins": 0.29992377758026123, "rewards/rejected": -0.578761100769043, "step": 531 }, { "epoch": 0.5, "grad_norm": 20.821039967006392, "learning_rate": 4.7574490015951487e-07, "logps/chosen": -29.390140533447266, "logps/rejected": -42.0405387878418, "loss": 0.539, "losses/dpo": 0.43966686725616455, "losses/sft": 0.5378720760345459, "losses/total": 0.43966686725616455, "ref_logps/chosen": -25.358312606811523, "ref_logps/rejected": -33.804325103759766, "rewards/accuracies": 0.75, "rewards/chosen": -0.4031828045845032, "rewards/margins": 0.42043834924697876, "rewards/rejected": -0.8236211538314819, "step": 532 }, { "epoch": 0.5, "grad_norm": 22.857183010818897, "learning_rate": 4.756353561480836e-07, "logps/chosen": -28.688356399536133, "logps/rejected": -38.64572525024414, "loss": 0.6187, "losses/dpo": 0.508783221244812, "losses/sft": 0.7600058317184448, "losses/total": 0.508783221244812, "ref_logps/chosen": -24.833911895751953, "ref_logps/rejected": -30.926719665527344, "rewards/accuracies": 0.8125, "rewards/chosen": -0.38544416427612305, "rewards/margins": 0.3864567279815674, "rewards/rejected": -0.7719008922576904, "step": 533 }, { "epoch": 0.5, "grad_norm": 19.31836228915835, "learning_rate": 4.755255779953004e-07, "logps/chosen": -28.993600845336914, "logps/rejected": -52.934181213378906, "loss": 0.4988, "losses/dpo": 0.6410211324691772, "losses/sft": 1.2727450132369995, "losses/total": 0.6410211324691772, "ref_logps/chosen": -25.354705810546875, "ref_logps/rejected": -43.576175689697266, "rewards/accuracies": 0.875, "rewards/chosen": -0.36388924717903137, "rewards/margins": 0.5719113945960999, "rewards/rejected": -0.9358006715774536, "step": 534 }, { "epoch": 0.5, "grad_norm": 24.212893513001855, "learning_rate": 4.754155658150817e-07, "logps/chosen": -40.2817268371582, "logps/rejected": -45.28466033935547, "loss": 0.635, "losses/dpo": 1.005113959312439, "losses/sft": 1.3125749826431274, "losses/total": 1.005113959312439, "ref_logps/chosen": -34.083030700683594, "ref_logps/rejected": -37.02979278564453, "rewards/accuracies": 0.625, "rewards/chosen": -0.6198693513870239, "rewards/margins": 0.2056168019771576, "rewards/rejected": -0.8254861831665039, "step": 535 }, { "epoch": 0.51, "grad_norm": 19.936163929577756, "learning_rate": 4.7530531972158684e-07, "logps/chosen": -36.46882629394531, "logps/rejected": -45.693756103515625, "loss": 0.4991, "losses/dpo": 0.20779818296432495, "losses/sft": 0.46748948097229004, "losses/total": 0.20779818296432495, "ref_logps/chosen": -31.5657958984375, "ref_logps/rejected": -34.966224670410156, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4903031885623932, "rewards/margins": 0.5824503302574158, "rewards/rejected": -1.0727534294128418, "step": 536 }, { "epoch": 0.51, "grad_norm": 19.620427272516018, "learning_rate": 4.7519483982921803e-07, "logps/chosen": -25.947914123535156, "logps/rejected": -53.05496597290039, "loss": 0.4664, "losses/dpo": 0.4266441762447357, "losses/sft": 1.2050907611846924, "losses/total": 0.4266441762447357, "ref_logps/chosen": -22.614429473876953, "ref_logps/rejected": -42.024131774902344, "rewards/accuracies": 0.8125, "rewards/chosen": -0.33334869146347046, "rewards/margins": 0.7697348594665527, "rewards/rejected": -1.103083610534668, "step": 537 }, { "epoch": 0.51, "grad_norm": 24.69862821558076, "learning_rate": 4.750841262526201e-07, "logps/chosen": -32.45796203613281, "logps/rejected": -39.26926803588867, "loss": 0.6821, "losses/dpo": 0.2854974567890167, "losses/sft": 0.9219029545783997, "losses/total": 0.2854974567890167, "ref_logps/chosen": -26.640663146972656, "ref_logps/rejected": -32.30586242675781, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5817296504974365, "rewards/margins": 0.11461064219474792, "rewards/rejected": -0.6963403224945068, "step": 538 }, { "epoch": 0.51, "grad_norm": 19.0144848005665, "learning_rate": 4.749731791066802e-07, "logps/chosen": -32.12488555908203, "logps/rejected": -44.345298767089844, "loss": 0.4573, "losses/dpo": 0.7195256948471069, "losses/sft": 1.238998293876648, "losses/total": 0.7195256948471069, "ref_logps/chosen": -28.572463989257812, "ref_logps/rejected": -34.68731689453125, "rewards/accuracies": 0.875, "rewards/chosen": -0.3552422821521759, "rewards/margins": 0.6105555295944214, "rewards/rejected": -0.9657978415489197, "step": 539 }, { "epoch": 0.51, "grad_norm": 23.755913613798466, "learning_rate": 4.7486199850652803e-07, "logps/chosen": -35.811866760253906, "logps/rejected": -34.11158752441406, "loss": 0.6236, "losses/dpo": 0.45043206214904785, "losses/sft": 0.17612296342849731, "losses/total": 0.45043206214904785, "ref_logps/chosen": -31.11347770690918, "ref_logps/rejected": -26.89638328552246, "rewards/accuracies": 0.5625, "rewards/chosen": -0.46983879804611206, "rewards/margins": 0.25168153643608093, "rewards/rejected": -0.7215204238891602, "step": 540 }, { "epoch": 0.51, "grad_norm": 26.255694871156756, "learning_rate": 4.747505845675354e-07, "logps/chosen": -40.00474548339844, "logps/rejected": -41.97130584716797, "loss": 0.6382, "losses/dpo": 0.7631707191467285, "losses/sft": 1.0861709117889404, "losses/total": 0.7631707191467285, "ref_logps/chosen": -35.28369903564453, "ref_logps/rejected": -35.13319778442383, "rewards/accuracies": 0.5625, "rewards/chosen": -0.472104549407959, "rewards/margins": 0.21170610189437866, "rewards/rejected": -0.6838107109069824, "step": 541 }, { "epoch": 0.51, "grad_norm": 19.014745135190573, "learning_rate": 4.7463893740531636e-07, "logps/chosen": -21.059612274169922, "logps/rejected": -40.38677978515625, "loss": 0.5336, "losses/dpo": 0.6872146129608154, "losses/sft": 0.8823416829109192, "losses/total": 0.6872146129608154, "ref_logps/chosen": -18.78635025024414, "ref_logps/rejected": -33.75849151611328, "rewards/accuracies": 0.875, "rewards/chosen": -0.2273261696100235, "rewards/margins": 0.4355029761791229, "rewards/rejected": -0.6628291606903076, "step": 542 }, { "epoch": 0.51, "grad_norm": 19.517561270229862, "learning_rate": 4.7452705713572704e-07, "logps/chosen": -26.681987762451172, "logps/rejected": -39.22837829589844, "loss": 0.5764, "losses/dpo": 0.5984467267990112, "losses/sft": 0.8920701146125793, "losses/total": 0.5984467267990112, "ref_logps/chosen": -23.83768653869629, "ref_logps/rejected": -32.676719665527344, "rewards/accuracies": 0.75, "rewards/chosen": -0.2844300866127014, "rewards/margins": 0.3707355260848999, "rewards/rejected": -0.6551656126976013, "step": 543 }, { "epoch": 0.51, "grad_norm": 27.638110392161238, "learning_rate": 4.7441494387486524e-07, "logps/chosen": -47.917213439941406, "logps/rejected": -49.54640579223633, "loss": 0.6435, "losses/dpo": 0.8982082009315491, "losses/sft": 0.8639118075370789, "losses/total": 0.8982082009315491, "ref_logps/chosen": -41.40434265136719, "ref_logps/rejected": -40.658424377441406, "rewards/accuracies": 0.625, "rewards/chosen": -0.6512876749038696, "rewards/margins": 0.2375107705593109, "rewards/rejected": -0.8887983560562134, "step": 544 }, { "epoch": 0.51, "grad_norm": 19.32380208349953, "learning_rate": 4.7430259773907066e-07, "logps/chosen": -31.66454315185547, "logps/rejected": -43.11380386352539, "loss": 0.5611, "losses/dpo": 0.42056259512901306, "losses/sft": 0.8885305523872375, "losses/total": 0.42056259512901306, "ref_logps/chosen": -28.409225463867188, "ref_logps/rejected": -36.02820587158203, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3255317211151123, "rewards/margins": 0.38302791118621826, "rewards/rejected": -0.7085596323013306, "step": 545 }, { "epoch": 0.52, "grad_norm": 17.806421401031606, "learning_rate": 4.741900188449248e-07, "logps/chosen": -25.732181549072266, "logps/rejected": -39.864349365234375, "loss": 0.4723, "losses/dpo": 0.45400556921958923, "losses/sft": 0.9985635876655579, "losses/total": 0.45400556921958923, "ref_logps/chosen": -23.518762588500977, "ref_logps/rejected": -31.69968605041504, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22134190797805786, "rewards/margins": 0.5951242446899414, "rewards/rejected": -0.8164661526679993, "step": 546 }, { "epoch": 0.52, "grad_norm": 20.403117423290166, "learning_rate": 4.740772073092504e-07, "logps/chosen": -29.098146438598633, "logps/rejected": -36.18224334716797, "loss": 0.6086, "losses/dpo": 0.48851001262664795, "losses/sft": 1.4222182035446167, "losses/total": 0.48851001262664795, "ref_logps/chosen": -24.46573257446289, "ref_logps/rejected": -28.58106231689453, "rewards/accuracies": 0.6875, "rewards/chosen": -0.46324121952056885, "rewards/margins": 0.2968768775463104, "rewards/rejected": -0.7601180672645569, "step": 547 }, { "epoch": 0.52, "grad_norm": 19.84207849399558, "learning_rate": 4.739641632491118e-07, "logps/chosen": -25.274818420410156, "logps/rejected": -39.299530029296875, "loss": 0.5366, "losses/dpo": 0.6277439594268799, "losses/sft": 1.142492651939392, "losses/total": 0.6277439594268799, "ref_logps/chosen": -21.517349243164062, "ref_logps/rejected": -30.980457305908203, "rewards/accuracies": 0.75, "rewards/chosen": -0.37574684619903564, "rewards/margins": 0.456160306930542, "rewards/rejected": -0.8319071531295776, "step": 548 }, { "epoch": 0.52, "grad_norm": 24.001432171884318, "learning_rate": 4.738508867818146e-07, "logps/chosen": -32.708702087402344, "logps/rejected": -47.43525314331055, "loss": 0.6594, "losses/dpo": 0.6528933644294739, "losses/sft": 0.8586260080337524, "losses/total": 0.6528933644294739, "ref_logps/chosen": -27.051223754882812, "ref_logps/rejected": -39.05437469482422, "rewards/accuracies": 0.75, "rewards/chosen": -0.5657479166984558, "rewards/margins": 0.2723396420478821, "rewards/rejected": -0.8380875587463379, "step": 549 }, { "epoch": 0.52, "grad_norm": 22.17846459358022, "learning_rate": 4.7373737802490565e-07, "logps/chosen": -36.434715270996094, "logps/rejected": -42.857460021972656, "loss": 0.5443, "losses/dpo": 0.4392516613006592, "losses/sft": 1.659854531288147, "losses/total": 0.4392516613006592, "ref_logps/chosen": -31.703369140625, "ref_logps/rejected": -34.03941345214844, "rewards/accuracies": 0.875, "rewards/chosen": -0.4731343984603882, "rewards/margins": 0.40867018699645996, "rewards/rejected": -0.8818045854568481, "step": 550 }, { "epoch": 0.52, "grad_norm": 24.43533316610455, "learning_rate": 4.736236370961726e-07, "logps/chosen": -43.67321014404297, "logps/rejected": -55.47050857543945, "loss": 0.5458, "losses/dpo": 0.7144737839698792, "losses/sft": 0.5858408808708191, "losses/total": 0.7144737839698792, "ref_logps/chosen": -37.81050109863281, "ref_logps/rejected": -44.20393371582031, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5862715244293213, "rewards/margins": 0.5403856039047241, "rewards/rejected": -1.1266571283340454, "step": 551 }, { "epoch": 0.52, "grad_norm": 21.360674522282302, "learning_rate": 4.735096641136443e-07, "logps/chosen": -28.644485473632812, "logps/rejected": -46.68540573120117, "loss": 0.5512, "losses/dpo": 0.5778580904006958, "losses/sft": 0.7089519500732422, "losses/total": 0.5778580904006958, "ref_logps/chosen": -24.201061248779297, "ref_logps/rejected": -38.06681442260742, "rewards/accuracies": 0.75, "rewards/chosen": -0.4443424344062805, "rewards/margins": 0.41751644015312195, "rewards/rejected": -0.8618588447570801, "step": 552 }, { "epoch": 0.52, "grad_norm": 18.064414885164833, "learning_rate": 4.733954591955902e-07, "logps/chosen": -28.50263214111328, "logps/rejected": -52.212608337402344, "loss": 0.4074, "losses/dpo": 0.15416166186332703, "losses/sft": 0.8320966362953186, "losses/total": 0.15416166186332703, "ref_logps/chosen": -25.355655670166016, "ref_logps/rejected": -40.16828918457031, "rewards/accuracies": 0.875, "rewards/chosen": -0.3146979808807373, "rewards/margins": 0.8897335529327393, "rewards/rejected": -1.2044315338134766, "step": 553 }, { "epoch": 0.52, "grad_norm": 26.194075344659385, "learning_rate": 4.732810224605206e-07, "logps/chosen": -43.67914962768555, "logps/rejected": -40.8546142578125, "loss": 0.6526, "losses/dpo": 0.7656344771385193, "losses/sft": 1.1082179546356201, "losses/total": 0.7656344771385193, "ref_logps/chosen": -36.265628814697266, "ref_logps/rejected": -31.0140380859375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7413519620895386, "rewards/margins": 0.24270544946193695, "rewards/rejected": -0.9840574264526367, "step": 554 }, { "epoch": 0.52, "grad_norm": 25.558893779423478, "learning_rate": 4.7316635402718627e-07, "logps/chosen": -37.4716796875, "logps/rejected": -48.79682540893555, "loss": 0.5644, "losses/dpo": 0.40009069442749023, "losses/sft": 1.0308367013931274, "losses/total": 0.40009069442749023, "ref_logps/chosen": -32.79035949707031, "ref_logps/rejected": -40.28694534301758, "rewards/accuracies": 0.75, "rewards/chosen": -0.46813270449638367, "rewards/margins": 0.3828549385070801, "rewards/rejected": -0.8509876132011414, "step": 555 }, { "epoch": 0.52, "grad_norm": 22.18961580026382, "learning_rate": 4.7305145401457823e-07, "logps/chosen": -35.15283966064453, "logps/rejected": -37.55563735961914, "loss": 0.6106, "losses/dpo": 0.5182918310165405, "losses/sft": 0.40982866287231445, "losses/total": 0.5182918310165405, "ref_logps/chosen": -29.690397262573242, "ref_logps/rejected": -28.98211669921875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5462443828582764, "rewards/margins": 0.3111078143119812, "rewards/rejected": -0.8573521971702576, "step": 556 }, { "epoch": 0.53, "grad_norm": 25.743982170603065, "learning_rate": 4.729363225419283e-07, "logps/chosen": -26.600231170654297, "logps/rejected": -27.948528289794922, "loss": 0.7562, "losses/dpo": 1.258376955986023, "losses/sft": 1.398624062538147, "losses/total": 1.258376955986023, "ref_logps/chosen": -20.551395416259766, "ref_logps/rejected": -21.892274856567383, "rewards/accuracies": 0.5, "rewards/chosen": -0.6048834323883057, "rewards/margins": 0.000741809606552124, "rewards/rejected": -0.6056252121925354, "step": 557 }, { "epoch": 0.53, "grad_norm": 18.97159318577055, "learning_rate": 4.7282095972870785e-07, "logps/chosen": -34.5098876953125, "logps/rejected": -34.662933349609375, "loss": 0.5799, "losses/dpo": 0.8469346165657043, "losses/sft": 1.0381118059158325, "losses/total": 0.8469346165657043, "ref_logps/chosen": -29.925331115722656, "ref_logps/rejected": -26.742860794067383, "rewards/accuracies": 0.6875, "rewards/chosen": -0.45845556259155273, "rewards/margins": 0.33355164527893066, "rewards/rejected": -0.7920071482658386, "step": 558 }, { "epoch": 0.53, "grad_norm": 25.555159031500683, "learning_rate": 4.727053656946288e-07, "logps/chosen": -40.26702880859375, "logps/rejected": -51.89540100097656, "loss": 0.609, "losses/dpo": 0.7676990628242493, "losses/sft": 1.8109636306762695, "losses/total": 0.7676990628242493, "ref_logps/chosen": -32.79016876220703, "ref_logps/rejected": -41.38371276855469, "rewards/accuracies": 0.625, "rewards/chosen": -0.7476861476898193, "rewards/margins": 0.3034825026988983, "rewards/rejected": -1.05116868019104, "step": 559 }, { "epoch": 0.53, "grad_norm": 29.58648338607051, "learning_rate": 4.72589540559643e-07, "logps/chosen": -51.93077850341797, "logps/rejected": -50.76955795288086, "loss": 0.6571, "losses/dpo": 0.02617046982049942, "losses/sft": 1.0301814079284668, "losses/total": 0.02617046982049942, "ref_logps/chosen": -42.355873107910156, "ref_logps/rejected": -37.12807846069336, "rewards/accuracies": 0.625, "rewards/chosen": -0.9574905633926392, "rewards/margins": 0.4066574275493622, "rewards/rejected": -1.3641481399536133, "step": 560 }, { "epoch": 0.53, "grad_norm": 17.616758909377026, "learning_rate": 4.724734844439416e-07, "logps/chosen": -39.01957702636719, "logps/rejected": -46.55259704589844, "loss": 0.434, "losses/dpo": 0.4331347346305847, "losses/sft": 0.5942819714546204, "losses/total": 0.4331347346305847, "ref_logps/chosen": -33.457740783691406, "ref_logps/rejected": -34.12803268432617, "rewards/accuracies": 1.0, "rewards/chosen": -0.5561834573745728, "rewards/margins": 0.686272919178009, "rewards/rejected": -1.2424564361572266, "step": 561 }, { "epoch": 0.53, "grad_norm": 20.63035309926791, "learning_rate": 4.7235719746795603e-07, "logps/chosen": -33.15313720703125, "logps/rejected": -58.76091766357422, "loss": 0.4818, "losses/dpo": 0.6469289064407349, "losses/sft": 0.9854974150657654, "losses/total": 0.6469289064407349, "ref_logps/chosen": -27.869558334350586, "ref_logps/rejected": -46.776954650878906, "rewards/accuracies": 0.875, "rewards/chosen": -0.528357744216919, "rewards/margins": 0.6700390577316284, "rewards/rejected": -1.1983966827392578, "step": 562 }, { "epoch": 0.53, "grad_norm": 29.490297004193078, "learning_rate": 4.7224067975235703e-07, "logps/chosen": -44.28321838378906, "logps/rejected": -44.411895751953125, "loss": 0.6818, "losses/dpo": 0.4534907937049866, "losses/sft": 1.0463597774505615, "losses/total": 0.4534907937049866, "ref_logps/chosen": -36.580894470214844, "ref_logps/rejected": -34.65812683105469, "rewards/accuracies": 0.625, "rewards/chosen": -0.7702330350875854, "rewards/margins": 0.20514407753944397, "rewards/rejected": -0.975377082824707, "step": 563 }, { "epoch": 0.53, "grad_norm": 22.777506843075212, "learning_rate": 4.7212393141805485e-07, "logps/chosen": -31.598106384277344, "logps/rejected": -45.365867614746094, "loss": 0.5519, "losses/dpo": 0.3730446994304657, "losses/sft": 1.1271289587020874, "losses/total": 0.3730446994304657, "ref_logps/chosen": -25.74553680419922, "ref_logps/rejected": -34.597618103027344, "rewards/accuracies": 0.75, "rewards/chosen": -0.5852572321891785, "rewards/margins": 0.49156758189201355, "rewards/rejected": -1.0768247842788696, "step": 564 }, { "epoch": 0.53, "grad_norm": 26.788375030774247, "learning_rate": 4.7200695258619885e-07, "logps/chosen": -30.753223419189453, "logps/rejected": -45.79199981689453, "loss": 0.6565, "losses/dpo": 0.2893655300140381, "losses/sft": 1.2595425844192505, "losses/total": 0.2893655300140381, "ref_logps/chosen": -23.483325958251953, "ref_logps/rejected": -35.619571685791016, "rewards/accuracies": 0.5, "rewards/chosen": -0.7269898056983948, "rewards/margins": 0.29025325179100037, "rewards/rejected": -1.0172431468963623, "step": 565 }, { "epoch": 0.53, "grad_norm": 22.029877630668768, "learning_rate": 4.7188974337817776e-07, "logps/chosen": -29.794219970703125, "logps/rejected": -43.268638610839844, "loss": 0.569, "losses/dpo": 0.21668745577335358, "losses/sft": 0.3915039598941803, "losses/total": 0.21668745577335358, "ref_logps/chosen": -23.718460083007812, "ref_logps/rejected": -31.921222686767578, "rewards/accuracies": 0.875, "rewards/chosen": -0.6075762510299683, "rewards/margins": 0.5271652936935425, "rewards/rejected": -1.1347415447235107, "step": 566 }, { "epoch": 0.53, "grad_norm": 25.21999554680501, "learning_rate": 4.717723039156194e-07, "logps/chosen": -33.06120681762695, "logps/rejected": -48.365501403808594, "loss": 0.547, "losses/dpo": 0.1757078319787979, "losses/sft": 1.1363515853881836, "losses/total": 0.1757078319787979, "ref_logps/chosen": -27.23636245727539, "ref_logps/rejected": -38.07865905761719, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5824846029281616, "rewards/margins": 0.4461996555328369, "rewards/rejected": -1.028684139251709, "step": 567 }, { "epoch": 0.54, "grad_norm": 23.085367619288558, "learning_rate": 4.7165463432039047e-07, "logps/chosen": -31.759929656982422, "logps/rejected": -52.363826751708984, "loss": 0.599, "losses/dpo": 0.5362101793289185, "losses/sft": 1.0298957824707031, "losses/total": 0.5362101793289185, "ref_logps/chosen": -25.352155685424805, "ref_logps/rejected": -42.9951057434082, "rewards/accuracies": 0.75, "rewards/chosen": -0.640777587890625, "rewards/margins": 0.29609405994415283, "rewards/rejected": -0.9368717670440674, "step": 568 }, { "epoch": 0.54, "grad_norm": 18.48483219164379, "learning_rate": 4.7153673471459644e-07, "logps/chosen": -22.13750648498535, "logps/rejected": -41.81434631347656, "loss": 0.48, "losses/dpo": 0.504783034324646, "losses/sft": 1.1277087926864624, "losses/total": 0.504783034324646, "ref_logps/chosen": -17.0794677734375, "ref_logps/rejected": -30.415050506591797, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5058038234710693, "rewards/margins": 0.6341257095336914, "rewards/rejected": -1.1399295330047607, "step": 569 }, { "epoch": 0.54, "grad_norm": 27.809704352327593, "learning_rate": 4.714186052205815e-07, "logps/chosen": -43.68354415893555, "logps/rejected": -48.29063415527344, "loss": 0.6575, "losses/dpo": 0.6334193348884583, "losses/sft": 1.3714150190353394, "losses/total": 0.6334193348884583, "ref_logps/chosen": -36.160255432128906, "ref_logps/rejected": -38.09669494628906, "rewards/accuracies": 0.625, "rewards/chosen": -0.7523289918899536, "rewards/margins": 0.26706463098526, "rewards/rejected": -1.0193936824798584, "step": 570 }, { "epoch": 0.54, "grad_norm": 16.313636985839484, "learning_rate": 4.7130024596092837e-07, "logps/chosen": -29.036954879760742, "logps/rejected": -52.381622314453125, "loss": 0.4566, "losses/dpo": 0.7628687620162964, "losses/sft": 0.15001346170902252, "losses/total": 0.7628687620162964, "ref_logps/chosen": -23.443113327026367, "ref_logps/rejected": -39.699806213378906, "rewards/accuracies": 0.75, "rewards/chosen": -0.5593841075897217, "rewards/margins": 0.7087972164154053, "rewards/rejected": -1.268181324005127, "step": 571 }, { "epoch": 0.54, "grad_norm": 20.552481710734654, "learning_rate": 4.7118165705845823e-07, "logps/chosen": -29.779216766357422, "logps/rejected": -47.340293884277344, "loss": 0.47, "losses/dpo": 0.34655410051345825, "losses/sft": 1.0014570951461792, "losses/total": 0.34655410051345825, "ref_logps/chosen": -24.992294311523438, "ref_logps/rejected": -36.33269500732422, "rewards/accuracies": 0.875, "rewards/chosen": -0.4786921441555023, "rewards/margins": 0.6220673322677612, "rewards/rejected": -1.100759506225586, "step": 572 }, { "epoch": 0.54, "grad_norm": 24.9661058854814, "learning_rate": 4.7106283863623054e-07, "logps/chosen": -33.242916107177734, "logps/rejected": -39.73655319213867, "loss": 0.6821, "losses/dpo": 0.791109025478363, "losses/sft": 0.6915169954299927, "losses/total": 0.791109025478363, "ref_logps/chosen": -26.861543655395508, "ref_logps/rejected": -31.96761131286621, "rewards/accuracies": 0.5625, "rewards/chosen": -0.63813716173172, "rewards/margins": 0.13875716924667358, "rewards/rejected": -0.7768943309783936, "step": 573 }, { "epoch": 0.54, "grad_norm": 25.804195238566116, "learning_rate": 4.709437908175429e-07, "logps/chosen": -38.6502571105957, "logps/rejected": -48.42251205444336, "loss": 0.6175, "losses/dpo": 0.21912655234336853, "losses/sft": 0.8507135510444641, "losses/total": 0.21912655234336853, "ref_logps/chosen": -31.190631866455078, "ref_logps/rejected": -38.33055877685547, "rewards/accuracies": 0.625, "rewards/chosen": -0.745962381362915, "rewards/margins": 0.26323288679122925, "rewards/rejected": -1.009195327758789, "step": 574 }, { "epoch": 0.54, "grad_norm": 19.103672828521578, "learning_rate": 4.708245137259311e-07, "logps/chosen": -27.754709243774414, "logps/rejected": -52.17005920410156, "loss": 0.4313, "losses/dpo": 0.09946773201227188, "losses/sft": 0.7843036651611328, "losses/total": 0.09946773201227188, "ref_logps/chosen": -22.06188201904297, "ref_logps/rejected": -39.20614242553711, "rewards/accuracies": 0.875, "rewards/chosen": -0.5692826509475708, "rewards/margins": 0.7271088361740112, "rewards/rejected": -1.2963916063308716, "step": 575 }, { "epoch": 0.54, "grad_norm": 25.972683221523788, "learning_rate": 4.707050074851686e-07, "logps/chosen": -35.8011589050293, "logps/rejected": -51.24067687988281, "loss": 0.5972, "losses/dpo": 2.282541036605835, "losses/sft": 1.6199406385421753, "losses/total": 2.282541036605835, "ref_logps/chosen": -27.534360885620117, "ref_logps/rejected": -37.890228271484375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8266799449920654, "rewards/margins": 0.508364737033844, "rewards/rejected": -1.3350447416305542, "step": 576 }, { "epoch": 0.54, "grad_norm": 33.39703791791137, "learning_rate": 4.705852722192669e-07, "logps/chosen": -46.81102752685547, "logps/rejected": -35.571170806884766, "loss": 0.8271, "losses/dpo": 0.6630995273590088, "losses/sft": 1.0458322763442993, "losses/total": 0.6630995273590088, "ref_logps/chosen": -40.05646896362305, "ref_logps/rejected": -30.29071807861328, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6754561066627502, "rewards/margins": -0.1474110186100006, "rewards/rejected": -0.5280450582504272, "step": 577 }, { "epoch": 0.55, "grad_norm": 25.325218513340314, "learning_rate": 4.70465308052475e-07, "logps/chosen": -31.38156509399414, "logps/rejected": -34.052146911621094, "loss": 0.7254, "losses/dpo": 0.8127146363258362, "losses/sft": 0.689734935760498, "losses/total": 0.8127146363258362, "ref_logps/chosen": -25.963520050048828, "ref_logps/rejected": -27.505022048950195, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5418046712875366, "rewards/margins": 0.11290813982486725, "rewards/rejected": -0.6547127962112427, "step": 578 }, { "epoch": 0.55, "grad_norm": 21.335049516680254, "learning_rate": 4.703451151092793e-07, "logps/chosen": -31.94198226928711, "logps/rejected": -46.72318649291992, "loss": 0.5573, "losses/dpo": 0.7167991995811462, "losses/sft": 1.178222894668579, "losses/total": 0.7167991995811462, "ref_logps/chosen": -27.205501556396484, "ref_logps/rejected": -37.47165298461914, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4736480414867401, "rewards/margins": 0.45150524377822876, "rewards/rejected": -0.9251533150672913, "step": 579 }, { "epoch": 0.55, "grad_norm": 22.97091424932587, "learning_rate": 4.702246935144041e-07, "logps/chosen": -37.725711822509766, "logps/rejected": -52.61585998535156, "loss": 0.4453, "losses/dpo": 0.4600352346897125, "losses/sft": 0.8874503374099731, "losses/total": 0.4600352346897125, "ref_logps/chosen": -33.90260696411133, "ref_logps/rejected": -40.472782135009766, "rewards/accuracies": 0.75, "rewards/chosen": -0.38231027126312256, "rewards/margins": 0.8319973349571228, "rewards/rejected": -1.2143075466156006, "step": 580 }, { "epoch": 0.55, "grad_norm": 19.188277045775294, "learning_rate": 4.701040433928105e-07, "logps/chosen": -27.1820068359375, "logps/rejected": -46.2115478515625, "loss": 0.4827, "losses/dpo": 0.3878229558467865, "losses/sft": 0.5608516931533813, "losses/total": 0.3878229558467865, "ref_logps/chosen": -23.80061149597168, "ref_logps/rejected": -36.975154876708984, "rewards/accuracies": 0.875, "rewards/chosen": -0.3381396234035492, "rewards/margins": 0.5854997038841248, "rewards/rejected": -0.9236392974853516, "step": 581 }, { "epoch": 0.55, "grad_norm": 22.730757693173647, "learning_rate": 4.6998316486969676e-07, "logps/chosen": -33.41335678100586, "logps/rejected": -38.742530822753906, "loss": 0.5443, "losses/dpo": 0.348469614982605, "losses/sft": 0.8435642719268799, "losses/total": 0.348469614982605, "ref_logps/chosen": -28.817617416381836, "ref_logps/rejected": -28.064167022705078, "rewards/accuracies": 0.8125, "rewards/chosen": -0.45957374572753906, "rewards/margins": 0.6082627773284912, "rewards/rejected": -1.0678365230560303, "step": 582 }, { "epoch": 0.55, "grad_norm": 21.362568498172056, "learning_rate": 4.698620580704984e-07, "logps/chosen": -31.365894317626953, "logps/rejected": -43.017005920410156, "loss": 0.533, "losses/dpo": 0.5548902750015259, "losses/sft": 0.7529851794242859, "losses/total": 0.5548902750015259, "ref_logps/chosen": -27.327373504638672, "ref_logps/rejected": -33.749454498291016, "rewards/accuracies": 0.6875, "rewards/chosen": -0.40385228395462036, "rewards/margins": 0.5229029655456543, "rewards/rejected": -0.9267551898956299, "step": 583 }, { "epoch": 0.55, "grad_norm": 20.712781752757085, "learning_rate": 4.6974072312088775e-07, "logps/chosen": -32.139461517333984, "logps/rejected": -38.428504943847656, "loss": 0.581, "losses/dpo": 0.29830318689346313, "losses/sft": 0.5864368677139282, "losses/total": 0.29830318689346313, "ref_logps/chosen": -27.36843490600586, "ref_logps/rejected": -30.202486038208008, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4771028757095337, "rewards/margins": 0.3454991579055786, "rewards/rejected": -0.8226020336151123, "step": 584 }, { "epoch": 0.55, "grad_norm": 17.04460051302615, "learning_rate": 4.696191601467737e-07, "logps/chosen": -26.504016876220703, "logps/rejected": -38.49720001220703, "loss": 0.4827, "losses/dpo": 0.13656137883663177, "losses/sft": 1.093996524810791, "losses/total": 0.13656137883663177, "ref_logps/chosen": -23.350017547607422, "ref_logps/rejected": -28.602643966674805, "rewards/accuracies": 0.75, "rewards/chosen": -0.3153999447822571, "rewards/margins": 0.6740556359291077, "rewards/rejected": -0.98945552110672, "step": 585 }, { "epoch": 0.55, "grad_norm": 25.084721240015625, "learning_rate": 4.6949736927430204e-07, "logps/chosen": -34.883209228515625, "logps/rejected": -45.59157943725586, "loss": 0.7187, "losses/dpo": 2.018975019454956, "losses/sft": 1.6614842414855957, "losses/total": 2.018975019454956, "ref_logps/chosen": -26.808069229125977, "ref_logps/rejected": -34.66655349731445, "rewards/accuracies": 0.625, "rewards/chosen": -0.807513952255249, "rewards/margins": 0.28498873114585876, "rewards/rejected": -1.0925025939941406, "step": 586 }, { "epoch": 0.55, "grad_norm": 25.335359894752894, "learning_rate": 4.693753506298548e-07, "logps/chosen": -36.404720306396484, "logps/rejected": -39.93024444580078, "loss": 0.6416, "losses/dpo": 0.4563181400299072, "losses/sft": 0.3944336175918579, "losses/total": 0.4563181400299072, "ref_logps/chosen": -29.889217376708984, "ref_logps/rejected": -31.665119171142578, "rewards/accuracies": 0.625, "rewards/chosen": -0.6515501737594604, "rewards/margins": 0.17496268451213837, "rewards/rejected": -0.8265129327774048, "step": 587 }, { "epoch": 0.55, "grad_norm": 23.873962138659802, "learning_rate": 4.692531043400506e-07, "logps/chosen": -37.91200256347656, "logps/rejected": -46.74785614013672, "loss": 0.553, "losses/dpo": 0.6461275815963745, "losses/sft": 0.5408849716186523, "losses/total": 0.6461275815963745, "ref_logps/chosen": -31.845205307006836, "ref_logps/rejected": -34.516456604003906, "rewards/accuracies": 0.75, "rewards/chosen": -0.6066795587539673, "rewards/margins": 0.6164602637290955, "rewards/rejected": -1.223139762878418, "step": 588 }, { "epoch": 0.56, "grad_norm": 19.816072790792685, "learning_rate": 4.6913063053174407e-07, "logps/chosen": -32.38018035888672, "logps/rejected": -47.817222595214844, "loss": 0.5248, "losses/dpo": 0.1777152270078659, "losses/sft": 0.483010470867157, "losses/total": 0.1777152270078659, "ref_logps/chosen": -26.455446243286133, "ref_logps/rejected": -36.815425872802734, "rewards/accuracies": 0.75, "rewards/chosen": -0.5924732089042664, "rewards/margins": 0.5077061057090759, "rewards/rejected": -1.1001793146133423, "step": 589 }, { "epoch": 0.56, "grad_norm": 23.423869909585427, "learning_rate": 4.6900792933202606e-07, "logps/chosen": -37.518218994140625, "logps/rejected": -50.518882751464844, "loss": 0.5019, "losses/dpo": 0.5128961801528931, "losses/sft": 1.5512409210205078, "losses/total": 0.5128961801528931, "ref_logps/chosen": -31.292449951171875, "ref_logps/rejected": -39.07789611816406, "rewards/accuracies": 0.75, "rewards/chosen": -0.6225770711898804, "rewards/margins": 0.5215213298797607, "rewards/rejected": -1.1440984010696411, "step": 590 }, { "epoch": 0.56, "grad_norm": 23.861315361722355, "learning_rate": 4.6888500086822336e-07, "logps/chosen": -47.38700866699219, "logps/rejected": -47.669403076171875, "loss": 0.5471, "losses/dpo": 0.645943820476532, "losses/sft": 0.8027803301811218, "losses/total": 0.645943820476532, "ref_logps/chosen": -39.68552780151367, "ref_logps/rejected": -34.9899787902832, "rewards/accuracies": 0.75, "rewards/chosen": -0.7701483964920044, "rewards/margins": 0.4977937936782837, "rewards/rejected": -1.2679420709609985, "step": 591 }, { "epoch": 0.56, "grad_norm": 21.9551032504596, "learning_rate": 4.687618452678985e-07, "logps/chosen": -39.772377014160156, "logps/rejected": -45.549400329589844, "loss": 0.5957, "losses/dpo": 0.10585377365350723, "losses/sft": 0.6975957751274109, "losses/total": 0.10585377365350723, "ref_logps/chosen": -31.989707946777344, "ref_logps/rejected": -32.38483428955078, "rewards/accuracies": 0.6875, "rewards/chosen": -0.778266966342926, "rewards/margins": 0.5381898880004883, "rewards/rejected": -1.3164567947387695, "step": 592 }, { "epoch": 0.56, "grad_norm": 27.339268842547163, "learning_rate": 4.6863846265885e-07, "logps/chosen": -40.04603576660156, "logps/rejected": -41.13397216796875, "loss": 0.5584, "losses/dpo": 0.6659671664237976, "losses/sft": 0.7946620583534241, "losses/total": 0.6659671664237976, "ref_logps/chosen": -34.16004943847656, "ref_logps/rejected": -30.640335083007812, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5885986685752869, "rewards/margins": 0.46076470613479614, "rewards/rejected": -1.049363374710083, "step": 593 }, { "epoch": 0.56, "grad_norm": 21.29181133600788, "learning_rate": 4.6851485316911164e-07, "logps/chosen": -28.284412384033203, "logps/rejected": -46.71199035644531, "loss": 0.5124, "losses/dpo": 0.05180954188108444, "losses/sft": 0.7350914478302002, "losses/total": 0.05180954188108444, "ref_logps/chosen": -22.368579864501953, "ref_logps/rejected": -34.479129791259766, "rewards/accuracies": 0.8125, "rewards/chosen": -0.591583251953125, "rewards/margins": 0.6317030191421509, "rewards/rejected": -1.2232862710952759, "step": 594 }, { "epoch": 0.56, "grad_norm": 21.63820431723281, "learning_rate": 4.6839101692695263e-07, "logps/chosen": -33.72478485107422, "logps/rejected": -43.5667839050293, "loss": 0.5552, "losses/dpo": 0.4324324131011963, "losses/sft": 0.8324698209762573, "losses/total": 0.4324324131011963, "ref_logps/chosen": -27.767677307128906, "ref_logps/rejected": -32.9699592590332, "rewards/accuracies": 0.75, "rewards/chosen": -0.5957105159759521, "rewards/margins": 0.46397197246551514, "rewards/rejected": -1.0596824884414673, "step": 595 }, { "epoch": 0.56, "grad_norm": 15.876463655221562, "learning_rate": 4.6826695406087775e-07, "logps/chosen": -22.1149959564209, "logps/rejected": -42.46928405761719, "loss": 0.4557, "losses/dpo": 0.07756131887435913, "losses/sft": 0.31581467390060425, "losses/total": 0.07756131887435913, "ref_logps/chosen": -19.53905487060547, "ref_logps/rejected": -32.233863830566406, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2575939893722534, "rewards/margins": 0.7659485936164856, "rewards/rejected": -1.0235426425933838, "step": 596 }, { "epoch": 0.56, "grad_norm": 23.53673943895533, "learning_rate": 4.681426646996267e-07, "logps/chosen": -38.8846435546875, "logps/rejected": -48.398719787597656, "loss": 0.5414, "losses/dpo": 0.6461108922958374, "losses/sft": 0.9861966371536255, "losses/total": 0.6461108922958374, "ref_logps/chosen": -32.361385345458984, "ref_logps/rejected": -37.041141510009766, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6523258686065674, "rewards/margins": 0.4834321439266205, "rewards/rejected": -1.1357579231262207, "step": 597 }, { "epoch": 0.56, "grad_norm": 30.950277689356128, "learning_rate": 4.6801814897217427e-07, "logps/chosen": -52.887306213378906, "logps/rejected": -51.24040222167969, "loss": 0.6914, "losses/dpo": 1.8203119039535522, "losses/sft": 1.9396673440933228, "losses/total": 1.8203119039535522, "ref_logps/chosen": -43.928466796875, "ref_logps/rejected": -39.63343048095703, "rewards/accuracies": 0.625, "rewards/chosen": -0.8958839178085327, "rewards/margins": 0.26481375098228455, "rewards/rejected": -1.1606976985931396, "step": 598 }, { "epoch": 0.57, "grad_norm": 19.39926676379286, "learning_rate": 4.678934070077303e-07, "logps/chosen": -31.72031593322754, "logps/rejected": -52.4933967590332, "loss": 0.4764, "losses/dpo": 0.6359044909477234, "losses/sft": 0.8590939044952393, "losses/total": 0.6359044909477234, "ref_logps/chosen": -26.193706512451172, "ref_logps/rejected": -39.518192291259766, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5526609420776367, "rewards/margins": 0.7448594570159912, "rewards/rejected": -1.297520399093628, "step": 599 }, { "epoch": 0.57, "grad_norm": 17.527579333891943, "learning_rate": 4.677684389357391e-07, "logps/chosen": -34.964088439941406, "logps/rejected": -46.78453826904297, "loss": 0.5019, "losses/dpo": 0.25319546461105347, "losses/sft": 0.6913333535194397, "losses/total": 0.25319546461105347, "ref_logps/chosen": -31.562774658203125, "ref_logps/rejected": -37.6266975402832, "rewards/accuracies": 0.75, "rewards/chosen": -0.34013134241104126, "rewards/margins": 0.5756528377532959, "rewards/rejected": -0.9157841801643372, "step": 600 }, { "epoch": 0.57, "grad_norm": 23.794524458776824, "learning_rate": 4.6764324488588e-07, "logps/chosen": -35.40614318847656, "logps/rejected": -38.31875991821289, "loss": 0.6738, "losses/dpo": 0.695321798324585, "losses/sft": 0.48255568742752075, "losses/total": 0.695321798324585, "ref_logps/chosen": -29.182710647583008, "ref_logps/rejected": -30.585338592529297, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6223431825637817, "rewards/margins": 0.15099895000457764, "rewards/rejected": -0.7733421325683594, "step": 601 }, { "epoch": 0.57, "grad_norm": 14.585083766265681, "learning_rate": 4.675178249880664e-07, "logps/chosen": -30.663942337036133, "logps/rejected": -55.972328186035156, "loss": 0.3362, "losses/dpo": 0.2918541431427002, "losses/sft": 2.1336348056793213, "losses/total": 0.2918541431427002, "ref_logps/chosen": -26.443281173706055, "ref_logps/rejected": -41.725669860839844, "rewards/accuracies": 0.875, "rewards/chosen": -0.4220660924911499, "rewards/margins": 1.002599835395813, "rewards/rejected": -1.424665927886963, "step": 602 }, { "epoch": 0.57, "grad_norm": 20.77240573838031, "learning_rate": 4.6739217937244644e-07, "logps/chosen": -32.41156005859375, "logps/rejected": -38.6456184387207, "loss": 0.5672, "losses/dpo": 0.5742881298065186, "losses/sft": 1.5287419557571411, "losses/total": 0.5742881298065186, "ref_logps/chosen": -26.812389373779297, "ref_logps/rejected": -28.686336517333984, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5599167943000793, "rewards/margins": 0.43601125478744507, "rewards/rejected": -0.9959280490875244, "step": 603 }, { "epoch": 0.57, "grad_norm": 18.979555806259125, "learning_rate": 4.6726630816940217e-07, "logps/chosen": -29.911788940429688, "logps/rejected": -47.23920440673828, "loss": 0.4789, "losses/dpo": 0.2950848340988159, "losses/sft": 2.154348373413086, "losses/total": 0.2950848340988159, "ref_logps/chosen": -24.706966400146484, "ref_logps/rejected": -35.58237075805664, "rewards/accuracies": 0.75, "rewards/chosen": -0.5204820036888123, "rewards/margins": 0.645201563835144, "rewards/rejected": -1.1656835079193115, "step": 604 }, { "epoch": 0.57, "grad_norm": 19.66330388087272, "learning_rate": 4.6714021150955004e-07, "logps/chosen": -34.29392623901367, "logps/rejected": -50.238868713378906, "loss": 0.4401, "losses/dpo": 0.2269349992275238, "losses/sft": 1.0105233192443848, "losses/total": 0.2269349992275238, "ref_logps/chosen": -27.63511848449707, "ref_logps/rejected": -36.54885482788086, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6658809185028076, "rewards/margins": 0.7031206488609314, "rewards/rejected": -1.3690016269683838, "step": 605 }, { "epoch": 0.57, "grad_norm": 24.79467931436912, "learning_rate": 4.6701388952374013e-07, "logps/chosen": -48.10850524902344, "logps/rejected": -63.78190231323242, "loss": 0.5208, "losses/dpo": 0.28270864486694336, "losses/sft": 1.5895428657531738, "losses/total": 0.28270864486694336, "ref_logps/chosen": -40.67289352416992, "ref_logps/rejected": -47.72718811035156, "rewards/accuracies": 0.75, "rewards/chosen": -0.7435615062713623, "rewards/margins": 0.8619096279144287, "rewards/rejected": -1.605471134185791, "step": 606 }, { "epoch": 0.57, "grad_norm": 23.981301317066187, "learning_rate": 4.668873423430567e-07, "logps/chosen": -42.65095901489258, "logps/rejected": -59.621910095214844, "loss": 0.5403, "losses/dpo": 0.8153530359268188, "losses/sft": 1.7010388374328613, "losses/total": 0.8153530359268188, "ref_logps/chosen": -32.98031997680664, "ref_logps/rejected": -44.725616455078125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9670640230178833, "rewards/margins": 0.5225658416748047, "rewards/rejected": -1.4896297454833984, "step": 607 }, { "epoch": 0.57, "grad_norm": 24.88719262166803, "learning_rate": 4.6676057009881733e-07, "logps/chosen": -43.809967041015625, "logps/rejected": -46.025699615478516, "loss": 0.5575, "losses/dpo": 0.24096116423606873, "losses/sft": 1.849166989326477, "losses/total": 0.24096116423606873, "ref_logps/chosen": -37.029457092285156, "ref_logps/rejected": -33.63835525512695, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6780507564544678, "rewards/margins": 0.56068354845047, "rewards/rejected": -1.2387343645095825, "step": 608 }, { "epoch": 0.57, "grad_norm": 22.821932635216406, "learning_rate": 4.6663357292257344e-07, "logps/chosen": -37.427947998046875, "logps/rejected": -39.60730743408203, "loss": 0.5612, "losses/dpo": 0.7370954751968384, "losses/sft": 1.3768842220306396, "losses/total": 0.7370954751968384, "ref_logps/chosen": -32.175315856933594, "ref_logps/rejected": -30.195545196533203, "rewards/accuracies": 0.75, "rewards/chosen": -0.525263249874115, "rewards/margins": 0.41591283679008484, "rewards/rejected": -0.9411760568618774, "step": 609 }, { "epoch": 0.58, "grad_norm": 22.060641818589815, "learning_rate": 4.6650635094610966e-07, "logps/chosen": -31.485471725463867, "logps/rejected": -39.83027648925781, "loss": 0.5891, "losses/dpo": 0.6842118501663208, "losses/sft": 1.5637197494506836, "losses/total": 0.6842118501663208, "ref_logps/chosen": -25.982315063476562, "ref_logps/rejected": -30.74991798400879, "rewards/accuracies": 0.75, "rewards/chosen": -0.5503154397010803, "rewards/margins": 0.3577204644680023, "rewards/rejected": -0.908035933971405, "step": 610 }, { "epoch": 0.58, "grad_norm": 22.16177398498296, "learning_rate": 4.66378904301444e-07, "logps/chosen": -36.685302734375, "logps/rejected": -40.66183090209961, "loss": 0.529, "losses/dpo": 1.0408539772033691, "losses/sft": 2.4851295948028564, "losses/total": 1.0408539772033691, "ref_logps/chosen": -31.378982543945312, "ref_logps/rejected": -30.54393768310547, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5306322574615479, "rewards/margins": 0.4811570644378662, "rewards/rejected": -1.011789321899414, "step": 611 }, { "epoch": 0.58, "grad_norm": 20.45212273839421, "learning_rate": 4.662512331208276e-07, "logps/chosen": -41.113067626953125, "logps/rejected": -53.82804489135742, "loss": 0.3744, "losses/dpo": 0.5808403491973877, "losses/sft": 1.557922601699829, "losses/total": 0.5808403491973877, "ref_logps/chosen": -33.95191955566406, "ref_logps/rejected": -36.624473571777344, "rewards/accuracies": 0.875, "rewards/chosen": -0.7161147594451904, "rewards/margins": 1.0042425394058228, "rewards/rejected": -1.7203572988510132, "step": 612 }, { "epoch": 0.58, "grad_norm": 23.96363485318395, "learning_rate": 4.661233375367446e-07, "logps/chosen": -39.62434387207031, "logps/rejected": -64.38750457763672, "loss": 0.4773, "losses/dpo": 0.5720837116241455, "losses/sft": 2.122967481613159, "losses/total": 0.5720837116241455, "ref_logps/chosen": -31.596784591674805, "ref_logps/rejected": -50.066402435302734, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8027559518814087, "rewards/margins": 0.6293543577194214, "rewards/rejected": -1.4321101903915405, "step": 613 }, { "epoch": 0.58, "grad_norm": 23.52337071736161, "learning_rate": 4.6599521768191195e-07, "logps/chosen": -32.68368911743164, "logps/rejected": -37.72242736816406, "loss": 0.5666, "losses/dpo": 0.8125581741333008, "losses/sft": 1.3638566732406616, "losses/total": 0.8125581741333008, "ref_logps/chosen": -26.549983978271484, "ref_logps/rejected": -27.998682022094727, "rewards/accuracies": 0.75, "rewards/chosen": -0.6133705973625183, "rewards/margins": 0.3590039014816284, "rewards/rejected": -0.972374439239502, "step": 614 }, { "epoch": 0.58, "grad_norm": 23.968173483000268, "learning_rate": 4.658668736892795e-07, "logps/chosen": -42.20924377441406, "logps/rejected": -55.53137969970703, "loss": 0.6298, "losses/dpo": 1.203316330909729, "losses/sft": 1.2178432941436768, "losses/total": 1.203316330909729, "ref_logps/chosen": -33.58540344238281, "ref_logps/rejected": -41.64684295654297, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8623842000961304, "rewards/margins": 0.5260697603225708, "rewards/rejected": -1.3884539604187012, "step": 615 }, { "epoch": 0.58, "grad_norm": 27.769045220137414, "learning_rate": 4.6573830569202934e-07, "logps/chosen": -49.16379928588867, "logps/rejected": -52.543941497802734, "loss": 0.6117, "losses/dpo": 0.7592117190361023, "losses/sft": 0.651050329208374, "losses/total": 0.7592117190361023, "ref_logps/chosen": -40.852745056152344, "ref_logps/rejected": -41.03955078125, "rewards/accuracies": 0.625, "rewards/chosen": -0.8311052918434143, "rewards/margins": 0.3193340003490448, "rewards/rejected": -1.1504392623901367, "step": 616 }, { "epoch": 0.58, "grad_norm": 23.892730279897577, "learning_rate": 4.6560951382357625e-07, "logps/chosen": -34.264163970947266, "logps/rejected": -35.901206970214844, "loss": 0.6209, "losses/dpo": 0.9199696779251099, "losses/sft": 1.6595240831375122, "losses/total": 0.9199696779251099, "ref_logps/chosen": -29.490917205810547, "ref_logps/rejected": -29.2386474609375, "rewards/accuracies": 0.625, "rewards/chosen": -0.4773246645927429, "rewards/margins": 0.18893127143383026, "rewards/rejected": -0.6662559509277344, "step": 617 }, { "epoch": 0.58, "grad_norm": 20.673342094672098, "learning_rate": 4.6548049821756747e-07, "logps/chosen": -35.53408432006836, "logps/rejected": -46.88211441040039, "loss": 0.521, "losses/dpo": 0.16088257730007172, "losses/sft": 1.2200039625167847, "losses/total": 0.16088257730007172, "ref_logps/chosen": -30.479114532470703, "ref_logps/rejected": -35.60508728027344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5054967403411865, "rewards/margins": 0.6222060918807983, "rewards/rejected": -1.1277027130126953, "step": 618 }, { "epoch": 0.58, "grad_norm": 31.475436370714355, "learning_rate": 4.653512590078821e-07, "logps/chosen": -48.169158935546875, "logps/rejected": -42.80339431762695, "loss": 0.7342, "losses/dpo": 1.3381986618041992, "losses/sft": 1.0880893468856812, "losses/total": 1.3381986618041992, "ref_logps/chosen": -37.35893630981445, "ref_logps/rejected": -31.376632690429688, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0810219049453735, "rewards/margins": 0.06165421009063721, "rewards/rejected": -1.1426761150360107, "step": 619 }, { "epoch": 0.58, "grad_norm": 17.631472908140424, "learning_rate": 4.652217963286314e-07, "logps/chosen": -25.29452133178711, "logps/rejected": -49.89160919189453, "loss": 0.4375, "losses/dpo": 0.3674740791320801, "losses/sft": 0.8061797618865967, "losses/total": 0.3674740791320801, "ref_logps/chosen": -20.606304168701172, "ref_logps/rejected": -36.47036361694336, "rewards/accuracies": 0.9375, "rewards/chosen": -0.46882158517837524, "rewards/margins": 0.8733025789260864, "rewards/rejected": -1.3421242237091064, "step": 620 }, { "epoch": 0.59, "grad_norm": 18.97881318011441, "learning_rate": 4.6509211031415856e-07, "logps/chosen": -37.91680145263672, "logps/rejected": -48.864749908447266, "loss": 0.5028, "losses/dpo": 0.7684571743011475, "losses/sft": 1.4244568347930908, "losses/total": 0.7684571743011475, "ref_logps/chosen": -31.04085922241211, "ref_logps/rejected": -35.51007080078125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6875939965248108, "rewards/margins": 0.6478739380836487, "rewards/rejected": -1.3354679346084595, "step": 621 }, { "epoch": 0.59, "grad_norm": 19.273094620678812, "learning_rate": 4.6496220109903847e-07, "logps/chosen": -33.745819091796875, "logps/rejected": -46.927547454833984, "loss": 0.4585, "losses/dpo": 0.37398049235343933, "losses/sft": 1.7120610475540161, "losses/total": 0.37398049235343933, "ref_logps/chosen": -28.208049774169922, "ref_logps/rejected": -34.55374526977539, "rewards/accuracies": 0.75, "rewards/chosen": -0.5537770986557007, "rewards/margins": 0.6836031675338745, "rewards/rejected": -1.2373802661895752, "step": 622 }, { "epoch": 0.59, "grad_norm": 35.56257603255109, "learning_rate": 4.648320688180778e-07, "logps/chosen": -54.49860763549805, "logps/rejected": -47.41084289550781, "loss": 0.7643, "losses/dpo": 0.32909321784973145, "losses/sft": 0.7180881500244141, "losses/total": 0.32909321784973145, "ref_logps/chosen": -43.05466842651367, "ref_logps/rejected": -35.88060760498047, "rewards/accuracies": 0.4375, "rewards/chosen": -1.144394040107727, "rewards/margins": 0.008629336953163147, "rewards/rejected": -1.153023362159729, "step": 623 }, { "epoch": 0.59, "grad_norm": 25.122632048236667, "learning_rate": 4.647017136063143e-07, "logps/chosen": -34.25472640991211, "logps/rejected": -34.19500732421875, "loss": 0.7257, "losses/dpo": 0.6531023979187012, "losses/sft": 0.050412170588970184, "losses/total": 0.6531023979187012, "ref_logps/chosen": -25.99571990966797, "ref_logps/rejected": -24.788116455078125, "rewards/accuracies": 0.5, "rewards/chosen": -0.8259005546569824, "rewards/margins": 0.11478828638792038, "rewards/rejected": -0.9406888484954834, "step": 624 }, { "epoch": 0.59, "grad_norm": 17.421690944324833, "learning_rate": 4.645711355990175e-07, "logps/chosen": -25.02206802368164, "logps/rejected": -34.58924102783203, "loss": 0.4911, "losses/dpo": 0.39220261573791504, "losses/sft": 1.461242914199829, "losses/total": 0.39220261573791504, "ref_logps/chosen": -20.492618560791016, "ref_logps/rejected": -24.700130462646484, "rewards/accuracies": 0.875, "rewards/chosen": -0.4529449939727783, "rewards/margins": 0.5359658598899841, "rewards/rejected": -0.9889107942581177, "step": 625 }, { "epoch": 0.59, "grad_norm": 19.775090107851042, "learning_rate": 4.644403349316879e-07, "logps/chosen": -28.817867279052734, "logps/rejected": -34.52857971191406, "loss": 0.5104, "losses/dpo": 0.41135331988334656, "losses/sft": 1.368776798248291, "losses/total": 0.41135331988334656, "ref_logps/chosen": -23.316478729248047, "ref_logps/rejected": -24.240234375, "rewards/accuracies": 0.75, "rewards/chosen": -0.550138533115387, "rewards/margins": 0.4786961078643799, "rewards/rejected": -1.028834581375122, "step": 626 }, { "epoch": 0.59, "grad_norm": 17.389100349868105, "learning_rate": 4.643093117400571e-07, "logps/chosen": -25.839157104492188, "logps/rejected": -40.062339782714844, "loss": 0.4659, "losses/dpo": 0.33898019790649414, "losses/sft": 1.1247072219848633, "losses/total": 0.33898019790649414, "ref_logps/chosen": -22.86089324951172, "ref_logps/rejected": -30.416826248168945, "rewards/accuracies": 0.8125, "rewards/chosen": -0.29782634973526, "rewards/margins": 0.6667252779006958, "rewards/rejected": -0.964551568031311, "step": 627 }, { "epoch": 0.59, "grad_norm": 19.880921029083275, "learning_rate": 4.641780661600874e-07, "logps/chosen": -35.73851776123047, "logps/rejected": -43.23097229003906, "loss": 0.504, "losses/dpo": 0.20673610270023346, "losses/sft": 2.17807936668396, "losses/total": 0.20673610270023346, "ref_logps/chosen": -31.068315505981445, "ref_logps/rejected": -32.57802963256836, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4670202136039734, "rewards/margins": 0.5982741713523865, "rewards/rejected": -1.0652943849563599, "step": 628 }, { "epoch": 0.59, "grad_norm": 23.450212002405483, "learning_rate": 4.640465983279724e-07, "logps/chosen": -33.53006362915039, "logps/rejected": -38.750640869140625, "loss": 0.5864, "losses/dpo": 0.16570882499217987, "losses/sft": 0.5928378105163574, "losses/total": 0.16570882499217987, "ref_logps/chosen": -24.754228591918945, "ref_logps/rejected": -25.109800338745117, "rewards/accuracies": 0.625, "rewards/chosen": -0.8775835037231445, "rewards/margins": 0.4865005612373352, "rewards/rejected": -1.3640841245651245, "step": 629 }, { "epoch": 0.59, "grad_norm": 23.291616936695533, "learning_rate": 4.6391490838013573e-07, "logps/chosen": -34.0415153503418, "logps/rejected": -42.2414665222168, "loss": 0.552, "losses/dpo": 0.73856520652771, "losses/sft": 1.584169864654541, "losses/total": 0.73856520652771, "ref_logps/chosen": -30.140647888183594, "ref_logps/rejected": -33.97486114501953, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3900867700576782, "rewards/margins": 0.43657374382019043, "rewards/rejected": -0.8266605734825134, "step": 630 }, { "epoch": 0.6, "grad_norm": 22.058294717770014, "learning_rate": 4.6378299645323193e-07, "logps/chosen": -33.35873794555664, "logps/rejected": -58.91335678100586, "loss": 0.4836, "losses/dpo": 0.49759209156036377, "losses/sft": 1.1706128120422363, "losses/total": 0.49759209156036377, "ref_logps/chosen": -27.683073043823242, "ref_logps/rejected": -45.55229568481445, "rewards/accuracies": 0.75, "rewards/chosen": -0.5675666332244873, "rewards/margins": 0.7685397267341614, "rewards/rejected": -1.3361064195632935, "step": 631 }, { "epoch": 0.6, "grad_norm": 26.239494048812244, "learning_rate": 4.6365086268414576e-07, "logps/chosen": -30.65104103088379, "logps/rejected": -45.34071350097656, "loss": 0.641, "losses/dpo": 0.8863164186477661, "losses/sft": 1.4360121488571167, "losses/total": 0.8863164186477661, "ref_logps/chosen": -23.08618927001953, "ref_logps/rejected": -34.670013427734375, "rewards/accuracies": 0.75, "rewards/chosen": -0.7564851641654968, "rewards/margins": 0.310585081577301, "rewards/rejected": -1.0670702457427979, "step": 632 }, { "epoch": 0.6, "grad_norm": 34.021664193704105, "learning_rate": 4.6351850720999206e-07, "logps/chosen": -30.647188186645508, "logps/rejected": -41.78785705566406, "loss": 0.6394, "losses/dpo": 0.44184452295303345, "losses/sft": 0.7362833023071289, "losses/total": 0.44184452295303345, "ref_logps/chosen": -25.249923706054688, "ref_logps/rejected": -32.368465423583984, "rewards/accuracies": 0.625, "rewards/chosen": -0.5397261381149292, "rewards/margins": 0.4022131562232971, "rewards/rejected": -0.9419392347335815, "step": 633 }, { "epoch": 0.6, "grad_norm": 25.57056374366263, "learning_rate": 4.6338593016811595e-07, "logps/chosen": -36.243011474609375, "logps/rejected": -41.066219329833984, "loss": 0.6773, "losses/dpo": 0.30450981855392456, "losses/sft": 1.2758631706237793, "losses/total": 0.30450981855392456, "ref_logps/chosen": -28.268802642822266, "ref_logps/rejected": -31.27225112915039, "rewards/accuracies": 0.5, "rewards/chosen": -0.7974207997322083, "rewards/margins": 0.181975856423378, "rewards/rejected": -0.9793967008590698, "step": 634 }, { "epoch": 0.6, "grad_norm": 21.67802840174242, "learning_rate": 4.6325313169609226e-07, "logps/chosen": -38.94700622558594, "logps/rejected": -50.35860061645508, "loss": 0.5852, "losses/dpo": 0.3802967965602875, "losses/sft": 1.4586256742477417, "losses/total": 0.3802967965602875, "ref_logps/chosen": -31.317119598388672, "ref_logps/rejected": -38.593894958496094, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7629886865615845, "rewards/margins": 0.4134817123413086, "rewards/rejected": -1.1764705181121826, "step": 635 }, { "epoch": 0.6, "grad_norm": 21.390395383193187, "learning_rate": 4.6312011193172567e-07, "logps/chosen": -39.482757568359375, "logps/rejected": -47.164981842041016, "loss": 0.5443, "losses/dpo": 1.2792094945907593, "losses/sft": 1.5407575368881226, "losses/total": 1.2792094945907593, "ref_logps/chosen": -32.25577926635742, "ref_logps/rejected": -35.32502746582031, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7226977348327637, "rewards/margins": 0.46129798889160156, "rewards/rejected": -1.1839957237243652, "step": 636 }, { "epoch": 0.6, "grad_norm": 30.61052501875098, "learning_rate": 4.629868710130507e-07, "logps/chosen": -46.504886627197266, "logps/rejected": -41.666744232177734, "loss": 0.711, "losses/dpo": 0.5685024261474609, "losses/sft": 0.18232198059558868, "losses/total": 0.5685024261474609, "ref_logps/chosen": -37.64099884033203, "ref_logps/rejected": -30.399089813232422, "rewards/accuracies": 0.625, "rewards/chosen": -0.8863887786865234, "rewards/margins": 0.24037665128707886, "rewards/rejected": -1.126765489578247, "step": 637 }, { "epoch": 0.6, "grad_norm": 23.297457196865953, "learning_rate": 4.6285340907833093e-07, "logps/chosen": -41.85067367553711, "logps/rejected": -44.21183776855469, "loss": 0.5687, "losses/dpo": 0.8732883930206299, "losses/sft": 1.028587818145752, "losses/total": 0.8732883930206299, "ref_logps/chosen": -35.1544303894043, "ref_logps/rejected": -31.92450714111328, "rewards/accuracies": 0.625, "rewards/chosen": -0.6696243286132812, "rewards/margins": 0.5591084957122803, "rewards/rejected": -1.2287328243255615, "step": 638 }, { "epoch": 0.6, "grad_norm": 18.673126965243725, "learning_rate": 4.627197262660597e-07, "logps/chosen": -29.189401626586914, "logps/rejected": -45.99738311767578, "loss": 0.4465, "losses/dpo": 0.03463166952133179, "losses/sft": 0.23882247507572174, "losses/total": 0.03463166952133179, "ref_logps/chosen": -23.828828811645508, "ref_logps/rejected": -32.126251220703125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.53605717420578, "rewards/margins": 0.8510556817054749, "rewards/rejected": -1.3871128559112549, "step": 639 }, { "epoch": 0.6, "grad_norm": 29.735586810031165, "learning_rate": 4.625858227149594e-07, "logps/chosen": -34.96662139892578, "logps/rejected": -36.88236999511719, "loss": 0.8782, "losses/dpo": 0.15424950420856476, "losses/sft": 0.7729324102401733, "losses/total": 0.15424950420856476, "ref_logps/chosen": -26.231365203857422, "ref_logps/rejected": -29.119739532470703, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8735252618789673, "rewards/margins": -0.09726211428642273, "rewards/rejected": -0.7762631177902222, "step": 640 }, { "epoch": 0.6, "grad_norm": 20.329504642146517, "learning_rate": 4.624516985639814e-07, "logps/chosen": -25.548473358154297, "logps/rejected": -42.162654876708984, "loss": 0.5815, "losses/dpo": 0.7025285959243774, "losses/sft": 0.891549289226532, "losses/total": 0.7025285959243774, "ref_logps/chosen": -19.184640884399414, "ref_logps/rejected": -30.47906494140625, "rewards/accuracies": 0.625, "rewards/chosen": -0.6363832950592041, "rewards/margins": 0.5319756269454956, "rewards/rejected": -1.1683589220046997, "step": 641 }, { "epoch": 0.61, "grad_norm": 24.029813492087023, "learning_rate": 4.623173539523061e-07, "logps/chosen": -39.08476257324219, "logps/rejected": -48.41978454589844, "loss": 0.5504, "losses/dpo": 0.18632851541042328, "losses/sft": 1.4873589277267456, "losses/total": 0.18632851541042328, "ref_logps/chosen": -31.40216827392578, "ref_logps/rejected": -35.39824676513672, "rewards/accuracies": 0.75, "rewards/chosen": -0.7682597637176514, "rewards/margins": 0.5338944792747498, "rewards/rejected": -1.3021541833877563, "step": 642 }, { "epoch": 0.61, "grad_norm": 15.273320448702446, "learning_rate": 4.621827890193427e-07, "logps/chosen": -23.972314834594727, "logps/rejected": -40.824134826660156, "loss": 0.4632, "losses/dpo": 0.358480840921402, "losses/sft": 1.1795464754104614, "losses/total": 0.358480840921402, "ref_logps/chosen": -19.636611938476562, "ref_logps/rejected": -28.77640724182129, "rewards/accuracies": 0.75, "rewards/chosen": -0.4335702657699585, "rewards/margins": 0.7712026834487915, "rewards/rejected": -1.20477294921875, "step": 643 }, { "epoch": 0.61, "grad_norm": 35.79871898947879, "learning_rate": 4.6204800390472885e-07, "logps/chosen": -44.61802673339844, "logps/rejected": -52.128910064697266, "loss": 0.7247, "losses/dpo": 0.019710179418325424, "losses/sft": 0.4274463355541229, "losses/total": 0.019710179418325424, "ref_logps/chosen": -36.797550201416016, "ref_logps/rejected": -40.897186279296875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7820476293563843, "rewards/margins": 0.34112483263015747, "rewards/rejected": -1.1231725215911865, "step": 644 }, { "epoch": 0.61, "grad_norm": 17.78748257480198, "learning_rate": 4.619129987483308e-07, "logps/chosen": -31.207473754882812, "logps/rejected": -50.23456573486328, "loss": 0.4617, "losses/dpo": 0.2611769139766693, "losses/sft": 1.2258033752441406, "losses/total": 0.2611769139766693, "ref_logps/chosen": -26.032188415527344, "ref_logps/rejected": -37.08369445800781, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5175287127494812, "rewards/margins": 0.797558605670929, "rewards/rejected": -1.3150873184204102, "step": 645 }, { "epoch": 0.61, "grad_norm": 16.534140103684408, "learning_rate": 4.617777736902432e-07, "logps/chosen": -27.663543701171875, "logps/rejected": -47.688255310058594, "loss": 0.4338, "losses/dpo": 0.3754425346851349, "losses/sft": 0.4922862648963928, "losses/total": 0.3754425346851349, "ref_logps/chosen": -23.296323776245117, "ref_logps/rejected": -34.9271240234375, "rewards/accuracies": 0.75, "rewards/chosen": -0.4367222487926483, "rewards/margins": 0.8393905162811279, "rewards/rejected": -1.2761127948760986, "step": 646 }, { "epoch": 0.61, "grad_norm": 21.13896635296367, "learning_rate": 4.6164232887078875e-07, "logps/chosen": -37.36035919189453, "logps/rejected": -42.021244049072266, "loss": 0.5006, "losses/dpo": 0.16717474162578583, "losses/sft": 0.6927555203437805, "losses/total": 0.16717474162578583, "ref_logps/chosen": -31.62629508972168, "ref_logps/rejected": -29.939422607421875, "rewards/accuracies": 0.625, "rewards/chosen": -0.5734068155288696, "rewards/margins": 0.6347751617431641, "rewards/rejected": -1.2081820964813232, "step": 647 }, { "epoch": 0.61, "grad_norm": 25.709092654657102, "learning_rate": 4.615066644305183e-07, "logps/chosen": -31.437528610229492, "logps/rejected": -43.46318054199219, "loss": 0.7094, "losses/dpo": 0.46931934356689453, "losses/sft": 1.4208474159240723, "losses/total": 0.46931934356689453, "ref_logps/chosen": -23.988082885742188, "ref_logps/rejected": -35.094146728515625, "rewards/accuracies": 0.5, "rewards/chosen": -0.7449445724487305, "rewards/margins": 0.09195896238088608, "rewards/rejected": -0.8369035124778748, "step": 648 }, { "epoch": 0.61, "grad_norm": 23.123696854530472, "learning_rate": 4.613707805102105e-07, "logps/chosen": -38.48326873779297, "logps/rejected": -41.09943771362305, "loss": 0.629, "losses/dpo": 0.8105164170265198, "losses/sft": 1.6897635459899902, "losses/total": 0.8105164170265198, "ref_logps/chosen": -27.105119705200195, "ref_logps/rejected": -26.49690818786621, "rewards/accuracies": 0.75, "rewards/chosen": -1.1378153562545776, "rewards/margins": 0.3224376440048218, "rewards/rejected": -1.4602530002593994, "step": 649 }, { "epoch": 0.61, "grad_norm": 21.31828911705751, "learning_rate": 4.61234677250872e-07, "logps/chosen": -37.70623779296875, "logps/rejected": -46.435340881347656, "loss": 0.5302, "losses/dpo": 0.318496435880661, "losses/sft": 1.1439001560211182, "losses/total": 0.318496435880661, "ref_logps/chosen": -31.141956329345703, "ref_logps/rejected": -33.89915466308594, "rewards/accuracies": 0.75, "rewards/chosen": -0.656428337097168, "rewards/margins": 0.5971906781196594, "rewards/rejected": -1.2536189556121826, "step": 650 }, { "epoch": 0.61, "grad_norm": 22.966528187672544, "learning_rate": 4.610983547937366e-07, "logps/chosen": -37.471160888671875, "logps/rejected": -47.14955520629883, "loss": 0.5415, "losses/dpo": 0.7830667495727539, "losses/sft": 1.0724693536758423, "losses/total": 0.7830667495727539, "ref_logps/chosen": -30.12961196899414, "ref_logps/rejected": -35.50242614746094, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7341550588607788, "rewards/margins": 0.4305577874183655, "rewards/rejected": -1.164712905883789, "step": 651 }, { "epoch": 0.62, "grad_norm": 17.96838072018895, "learning_rate": 4.609618132802661e-07, "logps/chosen": -31.057676315307617, "logps/rejected": -53.43559265136719, "loss": 0.421, "losses/dpo": 0.251058429479599, "losses/sft": 0.9489599466323853, "losses/total": 0.251058429479599, "ref_logps/chosen": -26.077110290527344, "ref_logps/rejected": -40.952537536621094, "rewards/accuracies": 0.9375, "rewards/chosen": -0.49805647134780884, "rewards/margins": 0.7502493858337402, "rewards/rejected": -1.2483057975769043, "step": 652 }, { "epoch": 0.62, "grad_norm": 21.680874622294645, "learning_rate": 4.608250528521491e-07, "logps/chosen": -28.846532821655273, "logps/rejected": -39.01908874511719, "loss": 0.5931, "losses/dpo": 0.34047481417655945, "losses/sft": 0.36448934674263, "losses/total": 0.34047481417655945, "ref_logps/chosen": -23.528907775878906, "ref_logps/rejected": -29.941177368164062, "rewards/accuracies": 0.625, "rewards/chosen": -0.5317626595497131, "rewards/margins": 0.37602874636650085, "rewards/rejected": -0.9077913761138916, "step": 653 }, { "epoch": 0.62, "grad_norm": 23.024965203703577, "learning_rate": 4.6068807365130187e-07, "logps/chosen": -37.83990478515625, "logps/rejected": -44.84632873535156, "loss": 0.5705, "losses/dpo": 0.2196911871433258, "losses/sft": 1.3661961555480957, "losses/total": 0.2196911871433258, "ref_logps/chosen": -30.344175338745117, "ref_logps/rejected": -32.78589630126953, "rewards/accuracies": 0.875, "rewards/chosen": -0.7495729923248291, "rewards/margins": 0.45647042989730835, "rewards/rejected": -1.2060434818267822, "step": 654 }, { "epoch": 0.62, "grad_norm": 23.13133098946377, "learning_rate": 4.605508758198673e-07, "logps/chosen": -39.57470703125, "logps/rejected": -59.07281494140625, "loss": 0.4662, "losses/dpo": 1.0505043268203735, "losses/sft": 1.134100317955017, "losses/total": 1.0505043268203735, "ref_logps/chosen": -31.81458282470703, "ref_logps/rejected": -43.16560363769531, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7760123014450073, "rewards/margins": 0.8147090077400208, "rewards/rejected": -1.5907213687896729, "step": 655 }, { "epoch": 0.62, "grad_norm": 24.034659872703305, "learning_rate": 4.604134595002154e-07, "logps/chosen": -37.15745544433594, "logps/rejected": -39.89818572998047, "loss": 0.6154, "losses/dpo": 0.1559189260005951, "losses/sft": 0.8651077151298523, "losses/total": 0.1559189260005951, "ref_logps/chosen": -29.728649139404297, "ref_logps/rejected": -27.778579711914062, "rewards/accuracies": 0.6875, "rewards/chosen": -0.742880642414093, "rewards/margins": 0.46908000111579895, "rewards/rejected": -1.2119606733322144, "step": 656 }, { "epoch": 0.62, "grad_norm": 21.587618405247202, "learning_rate": 4.6027582483494265e-07, "logps/chosen": -34.60535430908203, "logps/rejected": -54.39714050292969, "loss": 0.5836, "losses/dpo": 0.28300365805625916, "losses/sft": 1.029249906539917, "losses/total": 0.28300365805625916, "ref_logps/chosen": -24.982383728027344, "ref_logps/rejected": -38.586463928222656, "rewards/accuracies": 0.625, "rewards/chosen": -0.9622969627380371, "rewards/margins": 0.6187713146209717, "rewards/rejected": -1.5810683965682983, "step": 657 }, { "epoch": 0.62, "grad_norm": 25.803337902758567, "learning_rate": 4.6013797196687243e-07, "logps/chosen": -36.65309143066406, "logps/rejected": -45.32371520996094, "loss": 0.5431, "losses/dpo": 0.5297002792358398, "losses/sft": 0.08460690081119537, "losses/total": 0.5297002792358398, "ref_logps/chosen": -29.288745880126953, "ref_logps/rejected": -33.099029541015625, "rewards/accuracies": 0.75, "rewards/chosen": -0.7364347577095032, "rewards/margins": 0.4860335886478424, "rewards/rejected": -1.222468376159668, "step": 658 }, { "epoch": 0.62, "grad_norm": 18.870490668437284, "learning_rate": 4.599999010390543e-07, "logps/chosen": -32.70846176147461, "logps/rejected": -41.165462493896484, "loss": 0.4939, "losses/dpo": 0.28260067105293274, "losses/sft": 0.5844208002090454, "losses/total": 0.28260067105293274, "ref_logps/chosen": -27.137252807617188, "ref_logps/rejected": -29.89151382446289, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5571209192276001, "rewards/margins": 0.5702742338180542, "rewards/rejected": -1.1273950338363647, "step": 659 }, { "epoch": 0.62, "grad_norm": 22.24116171304526, "learning_rate": 4.598616121947642e-07, "logps/chosen": -34.17109680175781, "logps/rejected": -45.435264587402344, "loss": 0.5615, "losses/dpo": 0.6335158944129944, "losses/sft": 0.5961390137672424, "losses/total": 0.6335158944129944, "ref_logps/chosen": -27.397300720214844, "ref_logps/rejected": -33.96364212036133, "rewards/accuracies": 0.75, "rewards/chosen": -0.6773796081542969, "rewards/margins": 0.4697827398777008, "rewards/rejected": -1.1471623182296753, "step": 660 }, { "epoch": 0.62, "grad_norm": 17.438169740493013, "learning_rate": 4.597231055775041e-07, "logps/chosen": -33.43608474731445, "logps/rejected": -50.024757385253906, "loss": 0.4114, "losses/dpo": 0.2168746143579483, "losses/sft": 0.9500698447227478, "losses/total": 0.2168746143579483, "ref_logps/chosen": -25.805261611938477, "ref_logps/rejected": -32.76909255981445, "rewards/accuracies": 0.875, "rewards/chosen": -0.7630825042724609, "rewards/margins": 0.9624840021133423, "rewards/rejected": -1.7255663871765137, "step": 661 }, { "epoch": 0.62, "grad_norm": 26.269731943597872, "learning_rate": 4.595843813310022e-07, "logps/chosen": -38.41364288330078, "logps/rejected": -42.88172149658203, "loss": 0.618, "losses/dpo": 0.1381654143333435, "losses/sft": 0.9243693351745605, "losses/total": 0.1381654143333435, "ref_logps/chosen": -30.239341735839844, "ref_logps/rejected": -30.720844268798828, "rewards/accuracies": 0.625, "rewards/chosen": -0.8174301385879517, "rewards/margins": 0.3986576497554779, "rewards/rejected": -1.216087818145752, "step": 662 }, { "epoch": 0.63, "grad_norm": 21.320328677337343, "learning_rate": 4.594454395992122e-07, "logps/chosen": -32.272064208984375, "logps/rejected": -44.05171203613281, "loss": 0.5905, "losses/dpo": 0.7469954490661621, "losses/sft": 1.271263599395752, "losses/total": 0.7469954490661621, "ref_logps/chosen": -23.918167114257812, "ref_logps/rejected": -32.264556884765625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8353894948959351, "rewards/margins": 0.34332627058029175, "rewards/rejected": -1.178715705871582, "step": 663 }, { "epoch": 0.63, "grad_norm": 22.60249799414985, "learning_rate": 4.5930628052631383e-07, "logps/chosen": -34.254173278808594, "logps/rejected": -51.84258270263672, "loss": 0.5574, "losses/dpo": 0.3175211548805237, "losses/sft": 1.8833116292953491, "losses/total": 0.3175211548805237, "ref_logps/chosen": -27.748220443725586, "ref_logps/rejected": -40.30290222167969, "rewards/accuracies": 0.75, "rewards/chosen": -0.6505955457687378, "rewards/margins": 0.5033724308013916, "rewards/rejected": -1.153968095779419, "step": 664 }, { "epoch": 0.63, "grad_norm": 19.401934137784117, "learning_rate": 4.59166904256712e-07, "logps/chosen": -30.67557144165039, "logps/rejected": -44.66902160644531, "loss": 0.4761, "losses/dpo": 0.9349763989448547, "losses/sft": 1.1101778745651245, "losses/total": 0.9349763989448547, "ref_logps/chosen": -25.320152282714844, "ref_logps/rejected": -32.088096618652344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5355420708656311, "rewards/margins": 0.7225500345230103, "rewards/rejected": -1.2580920457839966, "step": 665 }, { "epoch": 0.63, "grad_norm": 20.637360698966265, "learning_rate": 4.590273109350373e-07, "logps/chosen": -32.112037658691406, "logps/rejected": -43.586708068847656, "loss": 0.5812, "losses/dpo": 0.8486281633377075, "losses/sft": 1.4523260593414307, "losses/total": 0.8486281633377075, "ref_logps/chosen": -25.250368118286133, "ref_logps/rejected": -32.93247604370117, "rewards/accuracies": 0.625, "rewards/chosen": -0.6861666440963745, "rewards/margins": 0.3792562484741211, "rewards/rejected": -1.065422773361206, "step": 666 }, { "epoch": 0.63, "grad_norm": 22.18349252025121, "learning_rate": 4.5888750070614547e-07, "logps/chosen": -38.42280197143555, "logps/rejected": -49.803794860839844, "loss": 0.5599, "losses/dpo": 1.0317864418029785, "losses/sft": 1.3979573249816895, "losses/total": 1.0317864418029785, "ref_logps/chosen": -30.432329177856445, "ref_logps/rejected": -37.432289123535156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7990473508834839, "rewards/margins": 0.4381027817726135, "rewards/rejected": -1.2371501922607422, "step": 667 }, { "epoch": 0.63, "grad_norm": 27.119619275136294, "learning_rate": 4.5874747371511714e-07, "logps/chosen": -40.10429000854492, "logps/rejected": -43.74522399902344, "loss": 0.671, "losses/dpo": 0.2170935869216919, "losses/sft": 0.6776456236839294, "losses/total": 0.2170935869216919, "ref_logps/chosen": -31.57549285888672, "ref_logps/rejected": -32.55165481567383, "rewards/accuracies": 0.5, "rewards/chosen": -0.8528798222541809, "rewards/margins": 0.26647692918777466, "rewards/rejected": -1.1193568706512451, "step": 668 }, { "epoch": 0.63, "grad_norm": 22.190989375747616, "learning_rate": 4.586072301072582e-07, "logps/chosen": -32.89898681640625, "logps/rejected": -44.962242126464844, "loss": 0.507, "losses/dpo": 0.18903322517871857, "losses/sft": 1.514735460281372, "losses/total": 0.18903322517871857, "ref_logps/chosen": -26.459993362426758, "ref_logps/rejected": -32.892784118652344, "rewards/accuracies": 0.875, "rewards/chosen": -0.6438995599746704, "rewards/margins": 0.5630461573600769, "rewards/rejected": -1.2069456577301025, "step": 669 }, { "epoch": 0.63, "grad_norm": 20.257010872276364, "learning_rate": 4.5846677002809903e-07, "logps/chosen": -30.500038146972656, "logps/rejected": -40.23486328125, "loss": 0.5447, "losses/dpo": 0.7869978547096252, "losses/sft": 1.115344524383545, "losses/total": 0.7869978547096252, "ref_logps/chosen": -23.7115478515625, "ref_logps/rejected": -28.370376586914062, "rewards/accuracies": 0.75, "rewards/chosen": -0.6788489818572998, "rewards/margins": 0.5075997114181519, "rewards/rejected": -1.1864486932754517, "step": 670 }, { "epoch": 0.63, "grad_norm": 21.148120466024647, "learning_rate": 4.583260936233949e-07, "logps/chosen": -31.876201629638672, "logps/rejected": -51.29749298095703, "loss": 0.4507, "losses/dpo": 0.863013505935669, "losses/sft": 1.3745990991592407, "losses/total": 0.863013505935669, "ref_logps/chosen": -25.599708557128906, "ref_logps/rejected": -35.642425537109375, "rewards/accuracies": 0.75, "rewards/chosen": -0.627649188041687, "rewards/margins": 0.937857449054718, "rewards/rejected": -1.5655066967010498, "step": 671 }, { "epoch": 0.63, "grad_norm": 21.624120207311126, "learning_rate": 4.5818520103912526e-07, "logps/chosen": -39.315528869628906, "logps/rejected": -61.198936462402344, "loss": 0.4061, "losses/dpo": 0.5152442455291748, "losses/sft": 0.6920422315597534, "losses/total": 0.5152442455291748, "ref_logps/chosen": -31.1480655670166, "ref_logps/rejected": -44.18769073486328, "rewards/accuracies": 0.875, "rewards/chosen": -0.8167464733123779, "rewards/margins": 0.8843779563903809, "rewards/rejected": -1.7011244297027588, "step": 672 }, { "epoch": 0.63, "grad_norm": 21.1001758972848, "learning_rate": 4.5804409242149425e-07, "logps/chosen": -32.91680908203125, "logps/rejected": -60.4622688293457, "loss": 0.4674, "losses/dpo": 0.042661964893341064, "losses/sft": 0.45299598574638367, "losses/total": 0.042661964893341064, "ref_logps/chosen": -24.64590835571289, "ref_logps/rejected": -43.56279754638672, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8270902633666992, "rewards/margins": 0.8628571033477783, "rewards/rejected": -1.6899473667144775, "step": 673 }, { "epoch": 0.64, "grad_norm": 18.854425406239336, "learning_rate": 4.579027679169298e-07, "logps/chosen": -28.871871948242188, "logps/rejected": -37.98689651489258, "loss": 0.5091, "losses/dpo": 0.5709837079048157, "losses/sft": 1.6384685039520264, "losses/total": 0.5709837079048157, "ref_logps/chosen": -23.26822280883789, "ref_logps/rejected": -27.608154296875, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5603649020195007, "rewards/margins": 0.47750937938690186, "rewards/rejected": -1.0378742218017578, "step": 674 }, { "epoch": 0.64, "grad_norm": 24.750039669922774, "learning_rate": 4.5776122767208427e-07, "logps/chosen": -29.183185577392578, "logps/rejected": -42.37678527832031, "loss": 0.7144, "losses/dpo": 1.0311615467071533, "losses/sft": 1.212329387664795, "losses/total": 1.0311615467071533, "ref_logps/chosen": -20.780838012695312, "ref_logps/rejected": -31.796184539794922, "rewards/accuracies": 0.3125, "rewards/chosen": -0.8402348756790161, "rewards/margins": 0.21782508492469788, "rewards/rejected": -1.0580599308013916, "step": 675 }, { "epoch": 0.64, "grad_norm": 16.395477265997688, "learning_rate": 4.5761947183383356e-07, "logps/chosen": -28.94422721862793, "logps/rejected": -32.9804801940918, "loss": 0.5177, "losses/dpo": 0.2998391091823578, "losses/sft": 1.2929158210754395, "losses/total": 0.2998391091823578, "ref_logps/chosen": -23.753856658935547, "ref_logps/rejected": -22.95147705078125, "rewards/accuracies": 0.75, "rewards/chosen": -0.5190368890762329, "rewards/margins": 0.48386335372924805, "rewards/rejected": -1.002900242805481, "step": 676 }, { "epoch": 0.64, "grad_norm": 18.631036824681082, "learning_rate": 4.5747750054927744e-07, "logps/chosen": -38.26249694824219, "logps/rejected": -53.238487243652344, "loss": 0.5847, "losses/dpo": 0.0705828070640564, "losses/sft": 1.6856552362442017, "losses/total": 0.0705828070640564, "ref_logps/chosen": -29.87721061706543, "ref_logps/rejected": -37.20977783203125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8385285139083862, "rewards/margins": 0.7643423080444336, "rewards/rejected": -1.6028707027435303, "step": 677 }, { "epoch": 0.64, "grad_norm": 15.941859860414615, "learning_rate": 4.573353139657392e-07, "logps/chosen": -29.076147079467773, "logps/rejected": -48.93705749511719, "loss": 0.4007, "losses/dpo": 0.38076967000961304, "losses/sft": 0.10715598613023758, "losses/total": 0.38076967000961304, "ref_logps/chosen": -25.295879364013672, "ref_logps/rejected": -35.112056732177734, "rewards/accuracies": 0.875, "rewards/chosen": -0.3780268728733063, "rewards/margins": 1.004473090171814, "rewards/rejected": -1.3824999332427979, "step": 678 }, { "epoch": 0.64, "grad_norm": 24.41658414403987, "learning_rate": 4.5719291223076556e-07, "logps/chosen": -29.425901412963867, "logps/rejected": -52.62269592285156, "loss": 0.5992, "losses/dpo": 0.6852480173110962, "losses/sft": 0.3314376771450043, "losses/total": 0.6852480173110962, "ref_logps/chosen": -20.715105056762695, "ref_logps/rejected": -40.97241973876953, "rewards/accuracies": 0.75, "rewards/chosen": -0.8710796236991882, "rewards/margins": 0.29394808411598206, "rewards/rejected": -1.1650277376174927, "step": 679 }, { "epoch": 0.64, "grad_norm": 23.771183369135805, "learning_rate": 4.570502954921266e-07, "logps/chosen": -40.36071014404297, "logps/rejected": -50.63197326660156, "loss": 0.624, "losses/dpo": 1.2234665155410767, "losses/sft": 2.103851318359375, "losses/total": 1.2234665155410767, "ref_logps/chosen": -29.87540626525879, "ref_logps/rejected": -36.66088104248047, "rewards/accuracies": 0.5625, "rewards/chosen": -1.048530101776123, "rewards/margins": 0.3485797643661499, "rewards/rejected": -1.3971097469329834, "step": 680 }, { "epoch": 0.64, "grad_norm": 17.81323719556886, "learning_rate": 4.569074638978153e-07, "logps/chosen": -29.867050170898438, "logps/rejected": -42.170570373535156, "loss": 0.3899, "losses/dpo": 0.08487270027399063, "losses/sft": 0.8708683848381042, "losses/total": 0.08487270027399063, "ref_logps/chosen": -25.47060775756836, "ref_logps/rejected": -26.769893646240234, "rewards/accuracies": 0.8125, "rewards/chosen": -0.43964439630508423, "rewards/margins": 1.1004228591918945, "rewards/rejected": -1.540067195892334, "step": 681 }, { "epoch": 0.64, "grad_norm": 23.590472473604837, "learning_rate": 4.567644175960479e-07, "logps/chosen": -41.13995361328125, "logps/rejected": -56.45172882080078, "loss": 0.4964, "losses/dpo": 0.25475844740867615, "losses/sft": 1.0040078163146973, "losses/total": 0.25475844740867615, "ref_logps/chosen": -32.4641227722168, "ref_logps/rejected": -41.60316467285156, "rewards/accuracies": 0.8125, "rewards/chosen": -0.867583155632019, "rewards/margins": 0.6172729134559631, "rewards/rejected": -1.484856128692627, "step": 682 }, { "epoch": 0.64, "grad_norm": 23.100088261138467, "learning_rate": 4.56621156735263e-07, "logps/chosen": -38.94739532470703, "logps/rejected": -43.87704849243164, "loss": 0.5836, "losses/dpo": 0.21341775357723236, "losses/sft": 0.6885048151016235, "losses/total": 0.21341775357723236, "ref_logps/chosen": -29.639806747436523, "ref_logps/rejected": -30.54530143737793, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9307588338851929, "rewards/margins": 0.4024158716201782, "rewards/rejected": -1.333174705505371, "step": 683 }, { "epoch": 0.65, "grad_norm": 26.793286329789762, "learning_rate": 4.5647768146412236e-07, "logps/chosen": -39.19150924682617, "logps/rejected": -46.973480224609375, "loss": 0.6801, "losses/dpo": 1.0767107009887695, "losses/sft": 2.0346744060516357, "losses/total": 1.0767107009887695, "ref_logps/chosen": -27.93883514404297, "ref_logps/rejected": -30.944236755371094, "rewards/accuracies": 0.75, "rewards/chosen": -1.1252678632736206, "rewards/margins": 0.47765636444091797, "rewards/rejected": -1.6029242277145386, "step": 684 }, { "epoch": 0.65, "grad_norm": 29.840446891787234, "learning_rate": 4.563339919315098e-07, "logps/chosen": -44.572296142578125, "logps/rejected": -51.172760009765625, "loss": 0.6183, "losses/dpo": 0.2724364995956421, "losses/sft": 0.682341992855072, "losses/total": 0.2724364995956421, "ref_logps/chosen": -33.94873809814453, "ref_logps/rejected": -37.69361877441406, "rewards/accuracies": 0.625, "rewards/chosen": -1.0623555183410645, "rewards/margins": 0.28555828332901, "rewards/rejected": -1.3479139804840088, "step": 685 }, { "epoch": 0.65, "grad_norm": 26.25235804162874, "learning_rate": 4.561900882865317e-07, "logps/chosen": -31.406354904174805, "logps/rejected": -51.719242095947266, "loss": 0.5659, "losses/dpo": 0.3012601435184479, "losses/sft": 1.7957420349121094, "losses/total": 0.3012601435184479, "ref_logps/chosen": -23.76095962524414, "ref_logps/rejected": -39.17911148071289, "rewards/accuracies": 0.75, "rewards/chosen": -0.7645397782325745, "rewards/margins": 0.48947304487228394, "rewards/rejected": -1.2540128231048584, "step": 686 }, { "epoch": 0.65, "grad_norm": 23.72748805487029, "learning_rate": 4.560459706785167e-07, "logps/chosen": -28.444488525390625, "logps/rejected": -36.682334899902344, "loss": 0.6556, "losses/dpo": 0.35205620527267456, "losses/sft": 0.7227797508239746, "losses/total": 0.35205620527267456, "ref_logps/chosen": -20.871185302734375, "ref_logps/rejected": -25.545751571655273, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7573302984237671, "rewards/margins": 0.3563278913497925, "rewards/rejected": -1.1136581897735596, "step": 687 }, { "epoch": 0.65, "grad_norm": 19.058636988355005, "learning_rate": 4.559016392570151e-07, "logps/chosen": -30.449050903320312, "logps/rejected": -50.204166412353516, "loss": 0.4536, "losses/dpo": 0.1104518324136734, "losses/sft": 0.6518766283988953, "losses/total": 0.1104518324136734, "ref_logps/chosen": -23.813325881958008, "ref_logps/rejected": -35.488861083984375, "rewards/accuracies": 0.75, "rewards/chosen": -0.6635724902153015, "rewards/margins": 0.8079580664634705, "rewards/rejected": -1.471530556678772, "step": 688 }, { "epoch": 0.65, "grad_norm": 23.39858905901387, "learning_rate": 4.557570941717996e-07, "logps/chosen": -32.887779235839844, "logps/rejected": -42.28535079956055, "loss": 0.6122, "losses/dpo": 0.6438087224960327, "losses/sft": 0.3263789713382721, "losses/total": 0.6438087224960327, "ref_logps/chosen": -24.755664825439453, "ref_logps/rejected": -30.527873992919922, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8132115602493286, "rewards/margins": 0.3625361919403076, "rewards/rejected": -1.1757476329803467, "step": 689 }, { "epoch": 0.65, "grad_norm": 24.99813795204149, "learning_rate": 4.5561233557286415e-07, "logps/chosen": -38.1380615234375, "logps/rejected": -43.84553909301758, "loss": 0.4975, "losses/dpo": 0.5712835192680359, "losses/sft": 1.0790847539901733, "losses/total": 0.5712835192680359, "ref_logps/chosen": -30.84027862548828, "ref_logps/rejected": -29.446197509765625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.729778528213501, "rewards/margins": 0.710155725479126, "rewards/rejected": -1.439934253692627, "step": 690 }, { "epoch": 0.65, "grad_norm": 26.46027900414484, "learning_rate": 4.5546736361042457e-07, "logps/chosen": -43.571075439453125, "logps/rejected": -50.74225616455078, "loss": 0.6234, "losses/dpo": 0.44270655512809753, "losses/sft": 1.6004340648651123, "losses/total": 0.44270655512809753, "ref_logps/chosen": -34.32265090942383, "ref_logps/rejected": -37.550926208496094, "rewards/accuracies": 0.625, "rewards/chosen": -0.9248427152633667, "rewards/margins": 0.39429017901420593, "rewards/rejected": -1.319132924079895, "step": 691 }, { "epoch": 0.65, "grad_norm": 22.06548672671967, "learning_rate": 4.5532217843491795e-07, "logps/chosen": -27.960529327392578, "logps/rejected": -45.689476013183594, "loss": 0.5259, "losses/dpo": 0.30199187994003296, "losses/sft": 1.3436036109924316, "losses/total": 0.30199187994003296, "ref_logps/chosen": -21.15851593017578, "ref_logps/rejected": -32.037593841552734, "rewards/accuracies": 0.875, "rewards/chosen": -0.6802015900611877, "rewards/margins": 0.6849867701530457, "rewards/rejected": -1.3651883602142334, "step": 692 }, { "epoch": 0.65, "grad_norm": 30.0634329394328, "learning_rate": 4.551767801970025e-07, "logps/chosen": -52.16084671020508, "logps/rejected": -59.136688232421875, "loss": 0.6581, "losses/dpo": 0.8153606653213501, "losses/sft": 0.9420509338378906, "losses/total": 0.8153606653213501, "ref_logps/chosen": -40.549869537353516, "ref_logps/rejected": -43.87236022949219, "rewards/accuracies": 0.625, "rewards/chosen": -1.1610976457595825, "rewards/margins": 0.3653346300125122, "rewards/rejected": -1.5264323949813843, "step": 693 }, { "epoch": 0.65, "grad_norm": 18.756431730473206, "learning_rate": 4.550311690475579e-07, "logps/chosen": -33.51912307739258, "logps/rejected": -36.26432800292969, "loss": 0.5032, "losses/dpo": 0.6389309763908386, "losses/sft": 0.7770150899887085, "losses/total": 0.6389309763908386, "ref_logps/chosen": -27.83419418334961, "ref_logps/rejected": -25.235010147094727, "rewards/accuracies": 0.875, "rewards/chosen": -0.5684926509857178, "rewards/margins": 0.5344387888908386, "rewards/rejected": -1.1029313802719116, "step": 694 }, { "epoch": 0.66, "grad_norm": 31.637013882072434, "learning_rate": 4.548853451376844e-07, "logps/chosen": -38.6067008972168, "logps/rejected": -58.0496940612793, "loss": 0.5907, "losses/dpo": 0.39298009872436523, "losses/sft": 0.07310919463634491, "losses/total": 0.39298009872436523, "ref_logps/chosen": -31.252952575683594, "ref_logps/rejected": -45.846744537353516, "rewards/accuracies": 0.625, "rewards/chosen": -0.735375165939331, "rewards/margins": 0.48491978645324707, "rewards/rejected": -1.2202949523925781, "step": 695 }, { "epoch": 0.66, "grad_norm": 16.306470416920476, "learning_rate": 4.5473930861870324e-07, "logps/chosen": -24.507862091064453, "logps/rejected": -40.679351806640625, "loss": 0.4056, "losses/dpo": 0.7688062787055969, "losses/sft": 0.3262580931186676, "losses/total": 0.7688062787055969, "ref_logps/chosen": -20.67083740234375, "ref_logps/rejected": -28.198843002319336, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3837025463581085, "rewards/margins": 0.8643486499786377, "rewards/rejected": -1.2480511665344238, "step": 696 }, { "epoch": 0.66, "grad_norm": 23.231521506392696, "learning_rate": 4.545930596421562e-07, "logps/chosen": -32.33563995361328, "logps/rejected": -36.51070785522461, "loss": 0.6218, "losses/dpo": 1.303257942199707, "losses/sft": 0.7641470432281494, "losses/total": 1.303257942199707, "ref_logps/chosen": -24.74269676208496, "ref_logps/rejected": -25.970592498779297, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7592943906784058, "rewards/margins": 0.294717013835907, "rewards/rejected": -1.054011344909668, "step": 697 }, { "epoch": 0.66, "grad_norm": 15.411363927123839, "learning_rate": 4.544465983598056e-07, "logps/chosen": -39.996070861816406, "logps/rejected": -60.4014892578125, "loss": 0.3821, "losses/dpo": 1.393168568611145, "losses/sft": 1.356032133102417, "losses/total": 1.393168568611145, "ref_logps/chosen": -31.453659057617188, "ref_logps/rejected": -40.811553955078125, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8542409539222717, "rewards/margins": 1.1047519445419312, "rewards/rejected": -1.9589929580688477, "step": 698 }, { "epoch": 0.66, "grad_norm": 19.549828315085303, "learning_rate": 4.5429992492363387e-07, "logps/chosen": -30.658708572387695, "logps/rejected": -54.38196563720703, "loss": 0.4522, "losses/dpo": 0.4743472933769226, "losses/sft": 0.6787049770355225, "losses/total": 0.4743472933769226, "ref_logps/chosen": -24.14516830444336, "ref_logps/rejected": -39.642337799072266, "rewards/accuracies": 0.875, "rewards/chosen": -0.651354193687439, "rewards/margins": 0.822608232498169, "rewards/rejected": -1.4739625453948975, "step": 699 }, { "epoch": 0.66, "grad_norm": 33.200361623911945, "learning_rate": 4.5415303948584395e-07, "logps/chosen": -43.18952941894531, "logps/rejected": -39.40531921386719, "loss": 0.8218, "losses/dpo": 1.5312392711639404, "losses/sft": 1.4325966835021973, "losses/total": 1.5312392711639404, "ref_logps/chosen": -33.612205505371094, "ref_logps/rejected": -29.579666137695312, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9577322006225586, "rewards/margins": 0.024833299219608307, "rewards/rejected": -0.9825655221939087, "step": 700 }, { "epoch": 0.66, "grad_norm": 23.799919807797657, "learning_rate": 4.5400594219885837e-07, "logps/chosen": -44.53129577636719, "logps/rejected": -47.40294647216797, "loss": 0.548, "losses/dpo": 1.0439014434814453, "losses/sft": 1.3517040014266968, "losses/total": 1.0439014434814453, "ref_logps/chosen": -34.76280975341797, "ref_logps/rejected": -31.74169158935547, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9768490195274353, "rewards/margins": 0.5892765522003174, "rewards/rejected": -1.5661256313323975, "step": 701 }, { "epoch": 0.66, "grad_norm": 27.97865034204928, "learning_rate": 4.538586332153198e-07, "logps/chosen": -46.54060363769531, "logps/rejected": -47.17954635620117, "loss": 0.629, "losses/dpo": 0.6898346543312073, "losses/sft": 1.0533716678619385, "losses/total": 0.6898346543312073, "ref_logps/chosen": -36.52458190917969, "ref_logps/rejected": -33.389739990234375, "rewards/accuracies": 0.75, "rewards/chosen": -1.001602292060852, "rewards/margins": 0.3773784041404724, "rewards/rejected": -1.3789806365966797, "step": 702 }, { "epoch": 0.66, "grad_norm": 21.19792028895112, "learning_rate": 4.5371111268809035e-07, "logps/chosen": -34.2764892578125, "logps/rejected": -51.764915466308594, "loss": 0.4311, "losses/dpo": 0.23411624133586884, "losses/sft": 0.9186955690383911, "losses/total": 0.23411624133586884, "ref_logps/chosen": -27.707693099975586, "ref_logps/rejected": -35.88692855834961, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6568797826766968, "rewards/margins": 0.9309190511703491, "rewards/rejected": -1.587798833847046, "step": 703 }, { "epoch": 0.66, "grad_norm": 30.500404527240367, "learning_rate": 4.535633807702519e-07, "logps/chosen": -38.468387603759766, "logps/rejected": -40.54704284667969, "loss": 0.7087, "losses/dpo": 0.6931471824645996, "losses/sft": 0.3318515717983246, "losses/total": 0.6931471824645996, "ref_logps/chosen": -29.41427230834961, "ref_logps/rejected": -30.153968811035156, "rewards/accuracies": 0.5625, "rewards/chosen": -0.905411422252655, "rewards/margins": 0.1338963806629181, "rewards/rejected": -1.0393078327178955, "step": 704 }, { "epoch": 0.67, "grad_norm": 21.322453555121523, "learning_rate": 4.534154376151056e-07, "logps/chosen": -30.86666488647461, "logps/rejected": -48.839698791503906, "loss": 0.4894, "losses/dpo": 0.4169134795665741, "losses/sft": 0.7061957716941833, "losses/total": 0.4169134795665741, "ref_logps/chosen": -24.4953670501709, "ref_logps/rejected": -35.96376419067383, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6371296644210815, "rewards/margins": 0.6504635810852051, "rewards/rejected": -1.2875933647155762, "step": 705 }, { "epoch": 0.67, "grad_norm": 16.11513524018605, "learning_rate": 4.532672833761716e-07, "logps/chosen": -25.46825408935547, "logps/rejected": -34.4082145690918, "loss": 0.5307, "losses/dpo": 0.4193875789642334, "losses/sft": 0.9455954432487488, "losses/total": 0.4193875789642334, "ref_logps/chosen": -20.422740936279297, "ref_logps/rejected": -24.2021484375, "rewards/accuracies": 0.75, "rewards/chosen": -0.5045512318611145, "rewards/margins": 0.5160553455352783, "rewards/rejected": -1.020606517791748, "step": 706 }, { "epoch": 0.67, "grad_norm": 30.644337202004376, "learning_rate": 4.531189182071893e-07, "logps/chosen": -44.46723175048828, "logps/rejected": -53.621734619140625, "loss": 0.6603, "losses/dpo": 1.284367322921753, "losses/sft": 0.9783662557601929, "losses/total": 1.284367322921753, "ref_logps/chosen": -35.025184631347656, "ref_logps/rejected": -39.658409118652344, "rewards/accuracies": 0.75, "rewards/chosen": -0.9442046284675598, "rewards/margins": 0.4521276652812958, "rewards/rejected": -1.3963322639465332, "step": 707 }, { "epoch": 0.67, "grad_norm": 27.840447025508922, "learning_rate": 4.529703422621171e-07, "logps/chosen": -45.862831115722656, "logps/rejected": -45.69800567626953, "loss": 0.7489, "losses/dpo": 1.7383277416229248, "losses/sft": 1.1671968698501587, "losses/total": 1.7383277416229248, "ref_logps/chosen": -34.610591888427734, "ref_logps/rejected": -33.139984130859375, "rewards/accuracies": 0.5, "rewards/chosen": -1.1252241134643555, "rewards/margins": 0.13057786226272583, "rewards/rejected": -1.2558019161224365, "step": 708 }, { "epoch": 0.67, "grad_norm": 20.8997777184491, "learning_rate": 4.528215556951317e-07, "logps/chosen": -28.112709045410156, "logps/rejected": -45.7291259765625, "loss": 0.5396, "losses/dpo": 0.5762552618980408, "losses/sft": 0.2404518723487854, "losses/total": 0.5762552618980408, "ref_logps/chosen": -22.413528442382812, "ref_logps/rejected": -33.95429229736328, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5699183940887451, "rewards/margins": 0.6075651049613953, "rewards/rejected": -1.1774835586547852, "step": 709 }, { "epoch": 0.67, "grad_norm": 17.33690689063853, "learning_rate": 4.526725586606288e-07, "logps/chosen": -24.95415496826172, "logps/rejected": -42.135250091552734, "loss": 0.5289, "losses/dpo": 0.2902252972126007, "losses/sft": 0.8214869499206543, "losses/total": 0.2902252972126007, "ref_logps/chosen": -19.107738494873047, "ref_logps/rejected": -30.701969146728516, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5846416354179382, "rewards/margins": 0.5586864352226257, "rewards/rejected": -1.1433281898498535, "step": 710 }, { "epoch": 0.67, "grad_norm": 22.07404550463283, "learning_rate": 4.5252335131322226e-07, "logps/chosen": -37.217445373535156, "logps/rejected": -44.518714904785156, "loss": 0.5145, "losses/dpo": 0.6979540586471558, "losses/sft": 1.7021360397338867, "losses/total": 0.6979540586471558, "ref_logps/chosen": -30.09398078918457, "ref_logps/rejected": -30.443058013916016, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7123462557792664, "rewards/margins": 0.6952196359634399, "rewards/rejected": -1.407565951347351, "step": 711 }, { "epoch": 0.67, "grad_norm": 25.249238729141194, "learning_rate": 4.5237393380774427e-07, "logps/chosen": -40.69586944580078, "logps/rejected": -47.13969421386719, "loss": 0.5443, "losses/dpo": 0.7391864061355591, "losses/sft": 0.9201348423957825, "losses/total": 0.7391864061355591, "ref_logps/chosen": -33.56379318237305, "ref_logps/rejected": -34.56399154663086, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7132078409194946, "rewards/margins": 0.5443626046180725, "rewards/rejected": -1.257570505142212, "step": 712 }, { "epoch": 0.67, "grad_norm": 21.982619366259822, "learning_rate": 4.52224306299245e-07, "logps/chosen": -29.528980255126953, "logps/rejected": -39.814151763916016, "loss": 0.635, "losses/dpo": 0.8328795433044434, "losses/sft": 0.7114535570144653, "losses/total": 0.8328795433044434, "ref_logps/chosen": -21.87472152709961, "ref_logps/rejected": -28.65117073059082, "rewards/accuracies": 0.625, "rewards/chosen": -0.765425980091095, "rewards/margins": 0.3508721590042114, "rewards/rejected": -1.1162981986999512, "step": 713 }, { "epoch": 0.67, "grad_norm": 28.056777268544714, "learning_rate": 4.5207446894299273e-07, "logps/chosen": -38.01008605957031, "logps/rejected": -47.942893981933594, "loss": 0.649, "losses/dpo": 1.1741633415222168, "losses/sft": 1.9230245351791382, "losses/total": 1.1741633415222168, "ref_logps/chosen": -27.887706756591797, "ref_logps/rejected": -35.683563232421875, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0122381448745728, "rewards/margins": 0.21369482576847076, "rewards/rejected": -1.2259329557418823, "step": 714 }, { "epoch": 0.67, "grad_norm": 20.36411392286873, "learning_rate": 4.5192442189447334e-07, "logps/chosen": -30.457794189453125, "logps/rejected": -33.841156005859375, "loss": 0.508, "losses/dpo": 0.5476698875427246, "losses/sft": 0.939591109752655, "losses/total": 0.5476698875427246, "ref_logps/chosen": -23.975496292114258, "ref_logps/rejected": -21.520843505859375, "rewards/accuracies": 0.75, "rewards/chosen": -0.6482297778129578, "rewards/margins": 0.5838016271591187, "rewards/rejected": -1.2320313453674316, "step": 715 }, { "epoch": 0.68, "grad_norm": 22.573894952255436, "learning_rate": 4.5177416530939027e-07, "logps/chosen": -32.86807632446289, "logps/rejected": -47.08106231689453, "loss": 0.4663, "losses/dpo": 0.5233097076416016, "losses/sft": 0.5943374037742615, "losses/total": 0.5233097076416016, "ref_logps/chosen": -25.912254333496094, "ref_logps/rejected": -32.37773513793945, "rewards/accuracies": 0.75, "rewards/chosen": -0.6955820322036743, "rewards/margins": 0.7747507095336914, "rewards/rejected": -1.4703328609466553, "step": 716 }, { "epoch": 0.68, "grad_norm": 24.027545699757894, "learning_rate": 4.516236993436645e-07, "logps/chosen": -31.577335357666016, "logps/rejected": -43.85548400878906, "loss": 0.6295, "losses/dpo": 0.2507880926132202, "losses/sft": 1.3239359855651855, "losses/total": 0.2507880926132202, "ref_logps/chosen": -23.404964447021484, "ref_logps/rejected": -32.08576583862305, "rewards/accuracies": 0.625, "rewards/chosen": -0.8172370195388794, "rewards/margins": 0.35973483324050903, "rewards/rejected": -1.1769717931747437, "step": 717 }, { "epoch": 0.68, "grad_norm": 21.96974617830752, "learning_rate": 4.5147302415343437e-07, "logps/chosen": -26.541934967041016, "logps/rejected": -31.932861328125, "loss": 0.6551, "losses/dpo": 0.9574466943740845, "losses/sft": 1.4927828311920166, "losses/total": 0.9574466943740845, "ref_logps/chosen": -19.738330841064453, "ref_logps/rejected": -22.816003799438477, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6803605556488037, "rewards/margins": 0.23132525384426117, "rewards/rejected": -0.9116858243942261, "step": 718 }, { "epoch": 0.68, "grad_norm": 26.06408288753011, "learning_rate": 4.513221398950551e-07, "logps/chosen": -38.546234130859375, "logps/rejected": -47.15898895263672, "loss": 0.5269, "losses/dpo": 0.24055661261081696, "losses/sft": 0.5394267439842224, "losses/total": 0.24055661261081696, "ref_logps/chosen": -30.086565017700195, "ref_logps/rejected": -31.872222900390625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.845967173576355, "rewards/margins": 0.6827090382575989, "rewards/rejected": -1.5286762714385986, "step": 719 }, { "epoch": 0.68, "grad_norm": 23.276859566612618, "learning_rate": 4.5117104672509897e-07, "logps/chosen": -32.35856246948242, "logps/rejected": -45.153587341308594, "loss": 0.6105, "losses/dpo": 0.32772964239120483, "losses/sft": 0.980364978313446, "losses/total": 0.32772964239120483, "ref_logps/chosen": -24.457948684692383, "ref_logps/rejected": -33.75373840332031, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7900612354278564, "rewards/margins": 0.34992390871047974, "rewards/rejected": -1.139985203742981, "step": 720 }, { "epoch": 0.68, "grad_norm": 18.892684756162204, "learning_rate": 4.5101974480035513e-07, "logps/chosen": -31.64666748046875, "logps/rejected": -48.360870361328125, "loss": 0.4686, "losses/dpo": 0.2495608925819397, "losses/sft": 1.9971565008163452, "losses/total": 0.2495608925819397, "ref_logps/chosen": -24.102935791015625, "ref_logps/rejected": -33.889678955078125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7543730735778809, "rewards/margins": 0.6927462816238403, "rewards/rejected": -1.4471193552017212, "step": 721 }, { "epoch": 0.68, "grad_norm": 24.06626107334786, "learning_rate": 4.5086823427782925e-07, "logps/chosen": -32.05658721923828, "logps/rejected": -52.376625061035156, "loss": 0.4955, "losses/dpo": 0.26718083024024963, "losses/sft": 0.7038200497627258, "losses/total": 0.26718083024024963, "ref_logps/chosen": -25.531944274902344, "ref_logps/rejected": -38.00464630126953, "rewards/accuracies": 0.75, "rewards/chosen": -0.6524641513824463, "rewards/margins": 0.7847338318824768, "rewards/rejected": -1.4371979236602783, "step": 722 }, { "epoch": 0.68, "grad_norm": 20.333403545148276, "learning_rate": 4.5071651531474353e-07, "logps/chosen": -27.73233985900879, "logps/rejected": -57.066986083984375, "loss": 0.4847, "losses/dpo": 0.3839556872844696, "losses/sft": 0.7916404604911804, "losses/total": 0.3839556872844696, "ref_logps/chosen": -20.7528076171875, "ref_logps/rejected": -43.236328125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6979530453681946, "rewards/margins": 0.685112714767456, "rewards/rejected": -1.3830657005310059, "step": 723 }, { "epoch": 0.68, "grad_norm": 23.70464063585009, "learning_rate": 4.5056458806853635e-07, "logps/chosen": -53.825233459472656, "logps/rejected": -57.67127990722656, "loss": 0.5426, "losses/dpo": 0.32945388555526733, "losses/sft": 1.655349612236023, "losses/total": 0.32945388555526733, "ref_logps/chosen": -43.42021942138672, "ref_logps/rejected": -42.298667907714844, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0405011177062988, "rewards/margins": 0.49676045775413513, "rewards/rejected": -1.5372614860534668, "step": 724 }, { "epoch": 0.68, "grad_norm": 23.886848942466724, "learning_rate": 4.5041245269686244e-07, "logps/chosen": -43.97332763671875, "logps/rejected": -44.40788269042969, "loss": 0.5541, "losses/dpo": 0.8595328330993652, "losses/sft": 1.9126083850860596, "losses/total": 0.8595328330993652, "ref_logps/chosen": -34.84081268310547, "ref_logps/rejected": -30.713600158691406, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9132518172264099, "rewards/margins": 0.45617616176605225, "rewards/rejected": -1.3694279193878174, "step": 725 }, { "epoch": 0.68, "grad_norm": 24.978384799293806, "learning_rate": 4.5026010935759225e-07, "logps/chosen": -32.12104034423828, "logps/rejected": -42.26741027832031, "loss": 0.6637, "losses/dpo": 0.9262085556983948, "losses/sft": 1.3203282356262207, "losses/total": 0.9262085556983948, "ref_logps/chosen": -21.940128326416016, "ref_logps/rejected": -29.72545623779297, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0180913209915161, "rewards/margins": 0.23610396683216095, "rewards/rejected": -1.254195213317871, "step": 726 }, { "epoch": 0.69, "grad_norm": 19.899279323839547, "learning_rate": 4.5010755820881213e-07, "logps/chosen": -33.58455276489258, "logps/rejected": -45.71602249145508, "loss": 0.4684, "losses/dpo": 0.37642258405685425, "losses/sft": 0.831511378288269, "losses/total": 0.37642258405685425, "ref_logps/chosen": -27.974599838256836, "ref_logps/rejected": -32.984561920166016, "rewards/accuracies": 0.875, "rewards/chosen": -0.5609954595565796, "rewards/margins": 0.7121507525444031, "rewards/rejected": -1.2731462717056274, "step": 727 }, { "epoch": 0.69, "grad_norm": 21.657570300593434, "learning_rate": 4.499547994088242e-07, "logps/chosen": -40.328304290771484, "logps/rejected": -48.98722839355469, "loss": 0.4809, "losses/dpo": 0.22476086020469666, "losses/sft": 0.9079604148864746, "losses/total": 0.22476086020469666, "ref_logps/chosen": -32.07945251464844, "ref_logps/rejected": -33.680992126464844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8248854279518127, "rewards/margins": 0.7057379484176636, "rewards/rejected": -1.530623435974121, "step": 728 }, { "epoch": 0.69, "grad_norm": 27.463259234712304, "learning_rate": 4.4980183311614584e-07, "logps/chosen": -38.66835403442383, "logps/rejected": -44.43119812011719, "loss": 0.6048, "losses/dpo": 0.7560780644416809, "losses/sft": 0.6624940037727356, "losses/total": 0.7560780644416809, "ref_logps/chosen": -28.763324737548828, "ref_logps/rejected": -29.95652198791504, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9905030727386475, "rewards/margins": 0.45696479082107544, "rewards/rejected": -1.4474680423736572, "step": 729 }, { "epoch": 0.69, "grad_norm": 20.017270237234083, "learning_rate": 4.4964865948951004e-07, "logps/chosen": -27.555770874023438, "logps/rejected": -41.12815856933594, "loss": 0.5997, "losses/dpo": 0.15334464609622955, "losses/sft": 0.4839994013309479, "losses/total": 0.15334464609622955, "ref_logps/chosen": -19.195749282836914, "ref_logps/rejected": -27.818775177001953, "rewards/accuracies": 0.75, "rewards/chosen": -0.8360022306442261, "rewards/margins": 0.4949357807636261, "rewards/rejected": -1.3309379816055298, "step": 730 }, { "epoch": 0.69, "grad_norm": 21.010912976942492, "learning_rate": 4.4949527868786457e-07, "logps/chosen": -29.342227935791016, "logps/rejected": -46.88103485107422, "loss": 0.5675, "losses/dpo": 0.2869590222835541, "losses/sft": 1.253127932548523, "losses/total": 0.2869590222835541, "ref_logps/chosen": -22.19078826904297, "ref_logps/rejected": -33.045108795166016, "rewards/accuracies": 0.625, "rewards/chosen": -0.7151440382003784, "rewards/margins": 0.6684483289718628, "rewards/rejected": -1.3835923671722412, "step": 731 }, { "epoch": 0.69, "grad_norm": 23.85837756188027, "learning_rate": 4.493416908703724e-07, "logps/chosen": -38.563499450683594, "logps/rejected": -45.39124298095703, "loss": 0.6797, "losses/dpo": 0.7321650385856628, "losses/sft": 1.8692052364349365, "losses/total": 0.7321650385856628, "ref_logps/chosen": -27.51198387145996, "ref_logps/rejected": -32.542083740234375, "rewards/accuracies": 0.5625, "rewards/chosen": -1.105151891708374, "rewards/margins": 0.17976412177085876, "rewards/rejected": -1.2849159240722656, "step": 732 }, { "epoch": 0.69, "grad_norm": 32.723557717183134, "learning_rate": 4.491878961964115e-07, "logps/chosen": -42.88732147216797, "logps/rejected": -44.758399963378906, "loss": 0.8378, "losses/dpo": 0.4112456440925598, "losses/sft": 0.3314531445503235, "losses/total": 0.4112456440925598, "ref_logps/chosen": -32.59047317504883, "ref_logps/rejected": -34.059303283691406, "rewards/accuracies": 0.625, "rewards/chosen": -1.0296850204467773, "rewards/margins": 0.04022429138422012, "rewards/rejected": -1.0699093341827393, "step": 733 }, { "epoch": 0.69, "grad_norm": 23.221748450642757, "learning_rate": 4.4903389482557414e-07, "logps/chosen": -31.786951065063477, "logps/rejected": -35.56210708618164, "loss": 0.6797, "losses/dpo": 0.4866293668746948, "losses/sft": 0.3974372148513794, "losses/total": 0.4866293668746948, "ref_logps/chosen": -24.579883575439453, "ref_logps/rejected": -27.06547737121582, "rewards/accuracies": 0.625, "rewards/chosen": -0.7207068204879761, "rewards/margins": 0.1289558708667755, "rewards/rejected": -0.8496626615524292, "step": 734 }, { "epoch": 0.69, "grad_norm": 32.249836364135284, "learning_rate": 4.488796869176672e-07, "logps/chosen": -46.613548278808594, "logps/rejected": -60.135196685791016, "loss": 0.909, "losses/dpo": 0.5588258504867554, "losses/sft": 0.34238579869270325, "losses/total": 0.5588258504867554, "ref_logps/chosen": -32.325313568115234, "ref_logps/rejected": -42.523277282714844, "rewards/accuracies": 0.625, "rewards/chosen": -1.428823471069336, "rewards/margins": 0.3323689103126526, "rewards/rejected": -1.7611923217773438, "step": 735 }, { "epoch": 0.69, "grad_norm": 22.246174971366592, "learning_rate": 4.4872527263271193e-07, "logps/chosen": -26.918468475341797, "logps/rejected": -35.74522399902344, "loss": 0.6003, "losses/dpo": 0.7845463156700134, "losses/sft": 1.3946210145950317, "losses/total": 0.7845463156700134, "ref_logps/chosen": -20.85462760925293, "ref_logps/rejected": -25.733612060546875, "rewards/accuracies": 0.75, "rewards/chosen": -0.6063840985298157, "rewards/margins": 0.3947767913341522, "rewards/rejected": -1.0011608600616455, "step": 736 }, { "epoch": 0.7, "grad_norm": 34.39392216235178, "learning_rate": 4.485706521309437e-07, "logps/chosen": -42.562198638916016, "logps/rejected": -50.908287048339844, "loss": 0.8425, "losses/dpo": 0.9778814911842346, "losses/sft": 2.3338799476623535, "losses/total": 0.9778814911842346, "ref_logps/chosen": -31.59320640563965, "ref_logps/rejected": -40.29045486450195, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0968990325927734, "rewards/margins": -0.035115569829940796, "rewards/rejected": -1.0617834329605103, "step": 737 }, { "epoch": 0.7, "grad_norm": 20.005741958699847, "learning_rate": 4.4841582557281205e-07, "logps/chosen": -26.327632904052734, "logps/rejected": -31.987152099609375, "loss": 0.5878, "losses/dpo": 0.28847867250442505, "losses/sft": 0.4837670624256134, "losses/total": 0.28847867250442505, "ref_logps/chosen": -20.246902465820312, "ref_logps/rejected": -22.434791564941406, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6080728769302368, "rewards/margins": 0.34716323018074036, "rewards/rejected": -0.9552360773086548, "step": 738 }, { "epoch": 0.7, "grad_norm": 29.54608143415062, "learning_rate": 4.4826079311898e-07, "logps/chosen": -43.99919891357422, "logps/rejected": -52.904396057128906, "loss": 0.5802, "losses/dpo": 0.824455976486206, "losses/sft": 1.3810769319534302, "losses/total": 0.824455976486206, "ref_logps/chosen": -34.97930145263672, "ref_logps/rejected": -37.5146369934082, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9019895792007446, "rewards/margins": 0.636986255645752, "rewards/rejected": -1.5389758348464966, "step": 739 }, { "epoch": 0.7, "grad_norm": 22.256377806333127, "learning_rate": 4.4810555493032453e-07, "logps/chosen": -34.92280960083008, "logps/rejected": -57.45033264160156, "loss": 0.5027, "losses/dpo": 0.2637943625450134, "losses/sft": 1.1914517879486084, "losses/total": 0.2637943625450134, "ref_logps/chosen": -26.127151489257812, "ref_logps/rejected": -42.08104705810547, "rewards/accuracies": 0.75, "rewards/chosen": -0.8795657753944397, "rewards/margins": 0.657362699508667, "rewards/rejected": -1.536928415298462, "step": 740 }, { "epoch": 0.7, "grad_norm": 23.901907786631053, "learning_rate": 4.479501111679359e-07, "logps/chosen": -28.57849884033203, "logps/rejected": -42.568199157714844, "loss": 0.5853, "losses/dpo": 0.7217780947685242, "losses/sft": 0.810937762260437, "losses/total": 0.7217780947685242, "ref_logps/chosen": -22.235092163085938, "ref_logps/rejected": -31.069368362426758, "rewards/accuracies": 0.625, "rewards/chosen": -0.6343404054641724, "rewards/margins": 0.5155429244041443, "rewards/rejected": -1.1498832702636719, "step": 741 }, { "epoch": 0.7, "grad_norm": 26.368315008599335, "learning_rate": 4.4779446199311786e-07, "logps/chosen": -39.28992462158203, "logps/rejected": -42.9508056640625, "loss": 0.7231, "losses/dpo": 0.5220909118652344, "losses/sft": 1.094911813735962, "losses/total": 0.5220909118652344, "ref_logps/chosen": -30.968833923339844, "ref_logps/rejected": -32.5638427734375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8321091532707214, "rewards/margins": 0.20658737421035767, "rewards/rejected": -1.038696527481079, "step": 742 }, { "epoch": 0.7, "grad_norm": 22.53021344866796, "learning_rate": 4.476386075673873e-07, "logps/chosen": -31.912944793701172, "logps/rejected": -49.7503662109375, "loss": 0.5509, "losses/dpo": 0.44273945689201355, "losses/sft": 0.3387399911880493, "losses/total": 0.44273945689201355, "ref_logps/chosen": -25.372827529907227, "ref_logps/rejected": -38.39241027832031, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6540118455886841, "rewards/margins": 0.4817838668823242, "rewards/rejected": -1.1357957124710083, "step": 743 }, { "epoch": 0.7, "grad_norm": 27.002665955699793, "learning_rate": 4.4748254805247386e-07, "logps/chosen": -34.54560470581055, "logps/rejected": -48.87393569946289, "loss": 0.5708, "losses/dpo": 0.694816529750824, "losses/sft": 1.9376435279846191, "losses/total": 0.694816529750824, "ref_logps/chosen": -25.5460205078125, "ref_logps/rejected": -34.45051574707031, "rewards/accuracies": 0.75, "rewards/chosen": -0.8999583721160889, "rewards/margins": 0.5423834323883057, "rewards/rejected": -1.4423418045043945, "step": 744 }, { "epoch": 0.7, "grad_norm": 18.951639422050402, "learning_rate": 4.473262836103203e-07, "logps/chosen": -22.315444946289062, "logps/rejected": -42.0109977722168, "loss": 0.492, "losses/dpo": 0.3386684060096741, "losses/sft": 0.17181357741355896, "losses/total": 0.3386684060096741, "ref_logps/chosen": -16.87876319885254, "ref_logps/rejected": -30.33389663696289, "rewards/accuracies": 0.75, "rewards/chosen": -0.543668270111084, "rewards/margins": 0.6240416765213013, "rewards/rejected": -1.1677100658416748, "step": 745 }, { "epoch": 0.7, "grad_norm": 21.751154871779352, "learning_rate": 4.4716981440308187e-07, "logps/chosen": -33.44097137451172, "logps/rejected": -34.02304458618164, "loss": 0.6224, "losses/dpo": 0.5474454164505005, "losses/sft": 1.5842310190200806, "losses/total": 0.5474454164505005, "ref_logps/chosen": -27.494487762451172, "ref_logps/rejected": -24.841358184814453, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5946485996246338, "rewards/margins": 0.32352012395858765, "rewards/rejected": -0.9181686639785767, "step": 746 }, { "epoch": 0.7, "grad_norm": 25.469837704313573, "learning_rate": 4.4701314059312644e-07, "logps/chosen": -46.62129211425781, "logps/rejected": -62.049617767333984, "loss": 0.4649, "losses/dpo": 0.5219026803970337, "losses/sft": 1.2183892726898193, "losses/total": 0.5219026803970337, "ref_logps/chosen": -36.96009063720703, "ref_logps/rejected": -45.46171569824219, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9661200046539307, "rewards/margins": 0.6926704049110413, "rewards/rejected": -1.6587903499603271, "step": 747 }, { "epoch": 0.71, "grad_norm": 24.31149810945654, "learning_rate": 4.46856262343034e-07, "logps/chosen": -31.74755096435547, "logps/rejected": -36.64699172973633, "loss": 0.6215, "losses/dpo": 0.5231785774230957, "losses/sft": 1.0829013586044312, "losses/total": 0.5231785774230957, "ref_logps/chosen": -24.726699829101562, "ref_logps/rejected": -25.796592712402344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.702085018157959, "rewards/margins": 0.38295528292655945, "rewards/rejected": -1.0850403308868408, "step": 748 }, { "epoch": 0.71, "grad_norm": 15.565784204213845, "learning_rate": 4.466991798155969e-07, "logps/chosen": -28.327404022216797, "logps/rejected": -63.36408996582031, "loss": 0.3217, "losses/dpo": 0.31101471185684204, "losses/sft": 2.144118309020996, "losses/total": 0.31101471185684204, "ref_logps/chosen": -20.604839324951172, "ref_logps/rejected": -42.29961395263672, "rewards/accuracies": 0.875, "rewards/chosen": -0.7722562551498413, "rewards/margins": 1.3341915607452393, "rewards/rejected": -2.106447696685791, "step": 749 }, { "epoch": 0.71, "grad_norm": 23.932601884330648, "learning_rate": 4.465418931738192e-07, "logps/chosen": -41.356101989746094, "logps/rejected": -52.230628967285156, "loss": 0.5107, "losses/dpo": 0.4346042573451996, "losses/sft": 1.114706039428711, "losses/total": 0.4346042573451996, "ref_logps/chosen": -32.94683074951172, "ref_logps/rejected": -37.41435623168945, "rewards/accuracies": 0.75, "rewards/chosen": -0.8409270644187927, "rewards/margins": 0.6407003402709961, "rewards/rejected": -1.4816274642944336, "step": 750 }, { "epoch": 0.71, "grad_norm": 17.865230274603274, "learning_rate": 4.4638440258091715e-07, "logps/chosen": -41.50404739379883, "logps/rejected": -56.95954513549805, "loss": 0.3845, "losses/dpo": 0.3835565149784088, "losses/sft": 1.2222304344177246, "losses/total": 0.3835565149784088, "ref_logps/chosen": -35.39565658569336, "ref_logps/rejected": -41.134803771972656, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6108390092849731, "rewards/margins": 0.9716354608535767, "rewards/rejected": -1.5824744701385498, "step": 751 }, { "epoch": 0.71, "grad_norm": 40.74616313224949, "learning_rate": 4.462267082003183e-07, "logps/chosen": -46.226409912109375, "logps/rejected": -56.09168243408203, "loss": 0.882, "losses/dpo": 1.4855986833572388, "losses/sft": 1.9368057250976562, "losses/total": 1.4855986833572388, "ref_logps/chosen": -32.157066345214844, "ref_logps/rejected": -41.52873229980469, "rewards/accuracies": 0.5625, "rewards/chosen": -1.406934380531311, "rewards/margins": 0.049361079931259155, "rewards/rejected": -1.4562954902648926, "step": 752 }, { "epoch": 0.71, "grad_norm": 22.825633558484636, "learning_rate": 4.460688101956617e-07, "logps/chosen": -33.622161865234375, "logps/rejected": -41.863807678222656, "loss": 0.4925, "losses/dpo": 0.3373573422431946, "losses/sft": 1.0098227262496948, "losses/total": 0.3373573422431946, "ref_logps/chosen": -27.610979080200195, "ref_logps/rejected": -28.58438491821289, "rewards/accuracies": 0.75, "rewards/chosen": -0.6011184453964233, "rewards/margins": 0.7268238663673401, "rewards/rejected": -1.3279423713684082, "step": 753 }, { "epoch": 0.71, "grad_norm": 25.538370945055647, "learning_rate": 4.459107087307978e-07, "logps/chosen": -36.65918731689453, "logps/rejected": -44.48274230957031, "loss": 0.6369, "losses/dpo": 0.5279362797737122, "losses/sft": 1.050249457359314, "losses/total": 0.5279362797737122, "ref_logps/chosen": -26.293296813964844, "ref_logps/rejected": -31.16249656677246, "rewards/accuracies": 0.625, "rewards/chosen": -1.0365886688232422, "rewards/margins": 0.29543614387512207, "rewards/rejected": -1.3320248126983643, "step": 754 }, { "epoch": 0.71, "grad_norm": 20.8829723823018, "learning_rate": 4.457524039697884e-07, "logps/chosen": -28.598487854003906, "logps/rejected": -34.80458068847656, "loss": 0.5585, "losses/dpo": 0.6450380682945251, "losses/sft": 1.2834248542785645, "losses/total": 0.6450380682945251, "ref_logps/chosen": -21.050006866455078, "ref_logps/rejected": -23.234886169433594, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7548481822013855, "rewards/margins": 0.40212148427963257, "rewards/rejected": -1.1569697856903076, "step": 755 }, { "epoch": 0.71, "grad_norm": 24.7055770011336, "learning_rate": 4.455938960769058e-07, "logps/chosen": -39.507041931152344, "logps/rejected": -49.42637252807617, "loss": 0.6014, "losses/dpo": 0.32586219906806946, "losses/sft": 1.8074880838394165, "losses/total": 0.32586219906806946, "ref_logps/chosen": -29.388355255126953, "ref_logps/rejected": -35.19071578979492, "rewards/accuracies": 0.75, "rewards/chosen": -1.0118688344955444, "rewards/margins": 0.4116969108581543, "rewards/rejected": -1.4235658645629883, "step": 756 }, { "epoch": 0.71, "grad_norm": 23.95685399433394, "learning_rate": 4.4543518521663344e-07, "logps/chosen": -36.56089401245117, "logps/rejected": -34.15040969848633, "loss": 0.6884, "losses/dpo": 0.8586512804031372, "losses/sft": 0.6849903464317322, "losses/total": 0.8586512804031372, "ref_logps/chosen": -29.161705017089844, "ref_logps/rejected": -25.781728744506836, "rewards/accuracies": 0.625, "rewards/chosen": -0.7399190068244934, "rewards/margins": 0.0969490110874176, "rewards/rejected": -0.8368679881095886, "step": 757 }, { "epoch": 0.72, "grad_norm": 22.873739196526188, "learning_rate": 4.4527627155366515e-07, "logps/chosen": -35.58399963378906, "logps/rejected": -68.24763488769531, "loss": 0.5, "losses/dpo": 0.5668648481369019, "losses/sft": 0.941761314868927, "losses/total": 0.5668648481369019, "ref_logps/chosen": -27.573129653930664, "ref_logps/rejected": -53.25929260253906, "rewards/accuracies": 0.625, "rewards/chosen": -0.8010866045951843, "rewards/margins": 0.6977477073669434, "rewards/rejected": -1.498834252357483, "step": 758 }, { "epoch": 0.72, "grad_norm": 28.7189627403496, "learning_rate": 4.451171552529054e-07, "logps/chosen": -46.035972595214844, "logps/rejected": -49.49989700317383, "loss": 0.5941, "losses/dpo": 1.3327817916870117, "losses/sft": 1.3404958248138428, "losses/total": 1.3327817916870117, "ref_logps/chosen": -36.60057830810547, "ref_logps/rejected": -34.89925765991211, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9435396194458008, "rewards/margins": 0.5165241956710815, "rewards/rejected": -1.4600638151168823, "step": 759 }, { "epoch": 0.72, "grad_norm": 21.19123761301654, "learning_rate": 4.4495783647946884e-07, "logps/chosen": -34.757835388183594, "logps/rejected": -44.74314498901367, "loss": 0.534, "losses/dpo": 0.4020719826221466, "losses/sft": 1.3995904922485352, "losses/total": 0.4020719826221466, "ref_logps/chosen": -28.274784088134766, "ref_logps/rejected": -31.619413375854492, "rewards/accuracies": 0.625, "rewards/chosen": -0.6483052968978882, "rewards/margins": 0.664067804813385, "rewards/rejected": -1.312373161315918, "step": 760 }, { "epoch": 0.72, "grad_norm": 20.851590772281295, "learning_rate": 4.4479831539868024e-07, "logps/chosen": -27.940101623535156, "logps/rejected": -47.21737289428711, "loss": 0.5602, "losses/dpo": 0.803519070148468, "losses/sft": 1.5599620342254639, "losses/total": 0.803519070148468, "ref_logps/chosen": -21.186176300048828, "ref_logps/rejected": -36.001834869384766, "rewards/accuracies": 0.75, "rewards/chosen": -0.6753925681114197, "rewards/margins": 0.4461612105369568, "rewards/rejected": -1.121553659439087, "step": 761 }, { "epoch": 0.72, "grad_norm": 23.266563787312094, "learning_rate": 4.446385921760743e-07, "logps/chosen": -36.177223205566406, "logps/rejected": -55.24497985839844, "loss": 0.4571, "losses/dpo": 0.6495215892791748, "losses/sft": 1.3079349994659424, "losses/total": 0.6495215892791748, "ref_logps/chosen": -27.100685119628906, "ref_logps/rejected": -39.59154510498047, "rewards/accuracies": 0.875, "rewards/chosen": -0.9076536893844604, "rewards/margins": 0.6576902866363525, "rewards/rejected": -1.565343976020813, "step": 762 }, { "epoch": 0.72, "grad_norm": 15.280524246627557, "learning_rate": 4.4447866697739545e-07, "logps/chosen": -27.171855926513672, "logps/rejected": -58.54140853881836, "loss": 0.3792, "losses/dpo": 0.5600853562355042, "losses/sft": 0.9636139273643494, "losses/total": 0.5600853562355042, "ref_logps/chosen": -20.958547592163086, "ref_logps/rejected": -40.496402740478516, "rewards/accuracies": 0.875, "rewards/chosen": -0.6213309168815613, "rewards/margins": 1.1831698417663574, "rewards/rejected": -1.8045008182525635, "step": 763 }, { "epoch": 0.72, "grad_norm": 21.310002837430616, "learning_rate": 4.443185399685978e-07, "logps/chosen": -29.469144821166992, "logps/rejected": -37.53219223022461, "loss": 0.5499, "losses/dpo": 0.45027899742126465, "losses/sft": 0.7892292141914368, "losses/total": 0.45027899742126465, "ref_logps/chosen": -23.88103485107422, "ref_logps/rejected": -27.207420349121094, "rewards/accuracies": 0.75, "rewards/chosen": -0.5588111281394958, "rewards/margins": 0.47366607189178467, "rewards/rejected": -1.0324771404266357, "step": 764 }, { "epoch": 0.72, "grad_norm": 18.79257415011216, "learning_rate": 4.4415821131584477e-07, "logps/chosen": -34.7292594909668, "logps/rejected": -48.325584411621094, "loss": 0.418, "losses/dpo": 1.2080755233764648, "losses/sft": 1.9372928142547607, "losses/total": 1.2080755233764648, "ref_logps/chosen": -28.297534942626953, "ref_logps/rejected": -32.810508728027344, "rewards/accuracies": 0.875, "rewards/chosen": -0.6431726217269897, "rewards/margins": 0.9083352088928223, "rewards/rejected": -1.5515079498291016, "step": 765 }, { "epoch": 0.72, "grad_norm": 19.69224718956215, "learning_rate": 4.439976811855091e-07, "logps/chosen": -33.53835678100586, "logps/rejected": -44.64678192138672, "loss": 0.5528, "losses/dpo": 0.3808399438858032, "losses/sft": 0.9396642446517944, "losses/total": 0.3808399438858032, "ref_logps/chosen": -26.764564514160156, "ref_logps/rejected": -31.84939956665039, "rewards/accuracies": 0.75, "rewards/chosen": -0.6773794293403625, "rewards/margins": 0.6023589372634888, "rewards/rejected": -1.279738426208496, "step": 766 }, { "epoch": 0.72, "grad_norm": 19.925200267803913, "learning_rate": 4.4383694974417263e-07, "logps/chosen": -41.439117431640625, "logps/rejected": -53.23297119140625, "loss": 0.4166, "losses/dpo": 0.6613832116127014, "losses/sft": 1.8116214275360107, "losses/total": 0.6613832116127014, "ref_logps/chosen": -33.56214904785156, "ref_logps/rejected": -36.904212951660156, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7876970767974854, "rewards/margins": 0.8451790809631348, "rewards/rejected": -1.6328761577606201, "step": 767 }, { "epoch": 0.72, "grad_norm": 14.411175194136048, "learning_rate": 4.4367601715862594e-07, "logps/chosen": -33.64580535888672, "logps/rejected": -57.62085723876953, "loss": 0.3623, "losses/dpo": 0.3255855143070221, "losses/sft": 0.6885007619857788, "losses/total": 0.3255855143070221, "ref_logps/chosen": -27.225322723388672, "ref_logps/rejected": -39.017295837402344, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6420484781265259, "rewards/margins": 1.2183077335357666, "rewards/rejected": -1.8603562116622925, "step": 768 }, { "epoch": 0.73, "grad_norm": 21.32733584806775, "learning_rate": 4.435148835958686e-07, "logps/chosen": -37.46600341796875, "logps/rejected": -47.59906768798828, "loss": 0.5588, "losses/dpo": 0.3947111666202545, "losses/sft": 1.7235965728759766, "losses/total": 0.3947111666202545, "ref_logps/chosen": -30.106657028198242, "ref_logps/rejected": -34.4416389465332, "rewards/accuracies": 0.75, "rewards/chosen": -0.7359346151351929, "rewards/margins": 0.579808235168457, "rewards/rejected": -1.31574285030365, "step": 769 }, { "epoch": 0.73, "grad_norm": 18.621457606577685, "learning_rate": 4.433535492231084e-07, "logps/chosen": -34.419464111328125, "logps/rejected": -51.654937744140625, "loss": 0.4207, "losses/dpo": 0.3651345372200012, "losses/sft": 0.9034489989280701, "losses/total": 0.3651345372200012, "ref_logps/chosen": -27.67188262939453, "ref_logps/rejected": -36.120182037353516, "rewards/accuracies": 0.75, "rewards/chosen": -0.6747581958770752, "rewards/margins": 0.8787178993225098, "rewards/rejected": -1.553476095199585, "step": 770 }, { "epoch": 0.73, "grad_norm": 24.005616910581004, "learning_rate": 4.4319201420776187e-07, "logps/chosen": -43.654178619384766, "logps/rejected": -49.02555847167969, "loss": 0.5363, "losses/dpo": 0.45935454964637756, "losses/sft": 1.5680694580078125, "losses/total": 0.45935454964637756, "ref_logps/chosen": -34.4882926940918, "ref_logps/rejected": -33.08019256591797, "rewards/accuracies": 0.75, "rewards/chosen": -0.9165884852409363, "rewards/margins": 0.6779478788375854, "rewards/rejected": -1.5945364236831665, "step": 771 }, { "epoch": 0.73, "grad_norm": 19.489618184890954, "learning_rate": 4.4303027871745344e-07, "logps/chosen": -36.66615295410156, "logps/rejected": -53.01757049560547, "loss": 0.4867, "losses/dpo": 0.1202298179268837, "losses/sft": 0.19592222571372986, "losses/total": 0.1202298179268837, "ref_logps/chosen": -28.141090393066406, "ref_logps/rejected": -37.50215148925781, "rewards/accuracies": 0.75, "rewards/chosen": -0.8525062799453735, "rewards/margins": 0.6990357637405396, "rewards/rejected": -1.551542043685913, "step": 772 }, { "epoch": 0.73, "grad_norm": 20.275327402073064, "learning_rate": 4.4286834292001576e-07, "logps/chosen": -33.869384765625, "logps/rejected": -40.737510681152344, "loss": 0.5223, "losses/dpo": 0.26375824213027954, "losses/sft": 1.861124038696289, "losses/total": 0.26375824213027954, "ref_logps/chosen": -26.77070426940918, "ref_logps/rejected": -27.969196319580078, "rewards/accuracies": 0.75, "rewards/chosen": -0.7098681926727295, "rewards/margins": 0.566963255405426, "rewards/rejected": -1.2768315076828003, "step": 773 }, { "epoch": 0.73, "grad_norm": 20.38879580655584, "learning_rate": 4.4270620698348924e-07, "logps/chosen": -30.570419311523438, "logps/rejected": -54.3520622253418, "loss": 0.4312, "losses/dpo": 1.029860496520996, "losses/sft": 0.8617779612541199, "losses/total": 1.029860496520996, "ref_logps/chosen": -24.3668270111084, "ref_logps/rejected": -38.423946380615234, "rewards/accuracies": 0.75, "rewards/chosen": -0.6203593015670776, "rewards/margins": 0.9724524021148682, "rewards/rejected": -1.5928117036819458, "step": 774 }, { "epoch": 0.73, "grad_norm": 25.487895828179738, "learning_rate": 4.4254387107612206e-07, "logps/chosen": -39.228904724121094, "logps/rejected": -49.49446105957031, "loss": 0.6077, "losses/dpo": 0.7324658632278442, "losses/sft": 1.3987228870391846, "losses/total": 0.7324658632278442, "ref_logps/chosen": -30.628734588623047, "ref_logps/rejected": -37.63291549682617, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8600170016288757, "rewards/margins": 0.3261373043060303, "rewards/rejected": -1.1861543655395508, "step": 775 }, { "epoch": 0.73, "grad_norm": 26.653542743457677, "learning_rate": 4.4238133536636985e-07, "logps/chosen": -40.67345428466797, "logps/rejected": -55.79542541503906, "loss": 0.5905, "losses/dpo": 0.6381421685218811, "losses/sft": 1.0707215070724487, "losses/total": 0.6381421685218811, "ref_logps/chosen": -31.191604614257812, "ref_logps/rejected": -40.16755294799805, "rewards/accuracies": 0.75, "rewards/chosen": -0.9481852054595947, "rewards/margins": 0.6146017909049988, "rewards/rejected": -1.5627870559692383, "step": 776 }, { "epoch": 0.73, "grad_norm": 21.219581282341924, "learning_rate": 4.4221860002289557e-07, "logps/chosen": -35.80842208862305, "logps/rejected": -46.609100341796875, "loss": 0.3975, "losses/dpo": 0.7607000470161438, "losses/sft": 1.6822428703308105, "losses/total": 0.7607000470161438, "ref_logps/chosen": -28.127010345458984, "ref_logps/rejected": -29.279237747192383, "rewards/accuracies": 0.875, "rewards/chosen": -0.7681411504745483, "rewards/margins": 0.9648449420928955, "rewards/rejected": -1.7329860925674438, "step": 777 }, { "epoch": 0.73, "grad_norm": 35.34593920332609, "learning_rate": 4.420556652145694e-07, "logps/chosen": -49.332496643066406, "logps/rejected": -54.578853607177734, "loss": 0.6106, "losses/dpo": 1.5398012399673462, "losses/sft": 2.5209014415740967, "losses/total": 1.5398012399673462, "ref_logps/chosen": -38.655113220214844, "ref_logps/rejected": -38.41394805908203, "rewards/accuracies": 0.625, "rewards/chosen": -1.0677385330200195, "rewards/margins": 0.548751950263977, "rewards/rejected": -1.6164904832839966, "step": 778 }, { "epoch": 0.73, "grad_norm": 15.389507003653637, "learning_rate": 4.418925311104683e-07, "logps/chosen": -28.48149299621582, "logps/rejected": -47.05885314941406, "loss": 0.3936, "losses/dpo": 0.1618538200855255, "losses/sft": 1.9322786331176758, "losses/total": 0.1618538200855255, "ref_logps/chosen": -23.4765625, "ref_logps/rejected": -32.347206115722656, "rewards/accuracies": 0.875, "rewards/chosen": -0.5004932880401611, "rewards/margins": 0.9706719517707825, "rewards/rejected": -1.4711651802062988, "step": 779 }, { "epoch": 0.74, "grad_norm": 19.27642503405622, "learning_rate": 4.4172919787987646e-07, "logps/chosen": -28.778667449951172, "logps/rejected": -47.676055908203125, "loss": 0.5244, "losses/dpo": 0.02799329161643982, "losses/sft": 1.5308761596679688, "losses/total": 0.02799329161643982, "ref_logps/chosen": -20.796648025512695, "ref_logps/rejected": -33.750274658203125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7982021570205688, "rewards/margins": 0.5943759679794312, "rewards/rejected": -1.392578125, "step": 780 }, { "epoch": 0.74, "grad_norm": 23.08866411710316, "learning_rate": 4.4156566569228426e-07, "logps/chosen": -43.250579833984375, "logps/rejected": -54.67108154296875, "loss": 0.4988, "losses/dpo": 0.8351228833198547, "losses/sft": 0.7311747074127197, "losses/total": 0.8351228833198547, "ref_logps/chosen": -34.104759216308594, "ref_logps/rejected": -38.582191467285156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9145818948745728, "rewards/margins": 0.6943067312240601, "rewards/rejected": -1.6088886260986328, "step": 781 }, { "epoch": 0.74, "grad_norm": 31.904720603289427, "learning_rate": 4.4140193471738873e-07, "logps/chosen": -49.78604507446289, "logps/rejected": -51.867645263671875, "loss": 0.7032, "losses/dpo": 1.061184287071228, "losses/sft": 1.1942272186279297, "losses/total": 1.061184287071228, "ref_logps/chosen": -37.17436599731445, "ref_logps/rejected": -38.20751953125, "rewards/accuracies": 0.625, "rewards/chosen": -1.2611675262451172, "rewards/margins": 0.10484505444765091, "rewards/rejected": -1.3660125732421875, "step": 782 }, { "epoch": 0.74, "grad_norm": 15.411419475712362, "learning_rate": 4.4123800512509323e-07, "logps/chosen": -37.29464340209961, "logps/rejected": -55.014068603515625, "loss": 0.4378, "losses/dpo": 0.8244131803512573, "losses/sft": 1.2818955183029175, "losses/total": 0.8244131803512573, "ref_logps/chosen": -29.41126251220703, "ref_logps/rejected": -38.13100814819336, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7883378863334656, "rewards/margins": 0.8999679088592529, "rewards/rejected": -1.6883058547973633, "step": 783 }, { "epoch": 0.74, "grad_norm": 20.36848082927671, "learning_rate": 4.4107387708550713e-07, "logps/chosen": -39.260009765625, "logps/rejected": -51.587554931640625, "loss": 0.4719, "losses/dpo": 0.6253898739814758, "losses/sft": 1.0595015287399292, "losses/total": 0.6253898739814758, "ref_logps/chosen": -30.431055068969727, "ref_logps/rejected": -35.6278076171875, "rewards/accuracies": 0.75, "rewards/chosen": -0.8828953504562378, "rewards/margins": 0.7130794525146484, "rewards/rejected": -1.5959746837615967, "step": 784 }, { "epoch": 0.74, "grad_norm": 23.017620614991465, "learning_rate": 4.4090955076894583e-07, "logps/chosen": -43.82636642456055, "logps/rejected": -52.46995544433594, "loss": 0.4643, "losses/dpo": 0.24967744946479797, "losses/sft": 0.9550115466117859, "losses/total": 0.24967744946479797, "ref_logps/chosen": -35.12809371948242, "ref_logps/rejected": -34.69322204589844, "rewards/accuracies": 0.875, "rewards/chosen": -0.869827151298523, "rewards/margins": 0.9078459739685059, "rewards/rejected": -1.7776731252670288, "step": 785 }, { "epoch": 0.74, "grad_norm": 22.923239936333346, "learning_rate": 4.407450263459303e-07, "logps/chosen": -24.988426208496094, "logps/rejected": -50.2301025390625, "loss": 0.5366, "losses/dpo": 0.4219341576099396, "losses/sft": 0.8149469494819641, "losses/total": 0.4219341576099396, "ref_logps/chosen": -17.877365112304688, "ref_logps/rejected": -37.33316421508789, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7111060619354248, "rewards/margins": 0.5785875916481018, "rewards/rejected": -1.2896937131881714, "step": 786 }, { "epoch": 0.74, "grad_norm": 28.722975393305894, "learning_rate": 4.405803039871873e-07, "logps/chosen": -51.218448638916016, "logps/rejected": -58.28340530395508, "loss": 0.4655, "losses/dpo": 0.9268859624862671, "losses/sft": 1.2363007068634033, "losses/total": 0.9268859624862671, "ref_logps/chosen": -42.34202575683594, "ref_logps/rejected": -39.534908294677734, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8876423835754395, "rewards/margins": 0.9872070550918579, "rewards/rejected": -1.8748493194580078, "step": 787 }, { "epoch": 0.74, "grad_norm": 23.216854468147982, "learning_rate": 4.404153838636488e-07, "logps/chosen": -39.63623809814453, "logps/rejected": -53.14503479003906, "loss": 0.467, "losses/dpo": 1.4648592472076416, "losses/sft": 1.352638602256775, "losses/total": 1.4648592472076416, "ref_logps/chosen": -31.924592971801758, "ref_logps/rejected": -36.06179428100586, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7711645364761353, "rewards/margins": 0.9371595978736877, "rewards/rejected": -1.7083241939544678, "step": 788 }, { "epoch": 0.74, "grad_norm": 25.202047168420968, "learning_rate": 4.402502661464522e-07, "logps/chosen": -38.86884307861328, "logps/rejected": -55.89228057861328, "loss": 0.591, "losses/dpo": 0.10737673938274384, "losses/sft": 1.5068371295928955, "losses/total": 0.10737673938274384, "ref_logps/chosen": -29.189510345458984, "ref_logps/rejected": -37.685890197753906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9679332375526428, "rewards/margins": 0.852706253528595, "rewards/rejected": -1.8206393718719482, "step": 789 }, { "epoch": 0.75, "grad_norm": 18.222127843175507, "learning_rate": 4.4008495100693976e-07, "logps/chosen": -28.7501220703125, "logps/rejected": -42.97188186645508, "loss": 0.4793, "losses/dpo": 0.8989771604537964, "losses/sft": 1.334427833557129, "losses/total": 0.8989771604537964, "ref_logps/chosen": -22.390708923339844, "ref_logps/rejected": -28.716121673583984, "rewards/accuracies": 0.75, "rewards/chosen": -0.6359413862228394, "rewards/margins": 0.7896350026130676, "rewards/rejected": -1.4255763292312622, "step": 790 }, { "epoch": 0.75, "grad_norm": 30.464587046168244, "learning_rate": 4.399194386166586e-07, "logps/chosen": -39.79511260986328, "logps/rejected": -44.9297981262207, "loss": 0.7492, "losses/dpo": 0.9408731460571289, "losses/sft": 1.6686334609985352, "losses/total": 0.9408731460571289, "ref_logps/chosen": -28.29004669189453, "ref_logps/rejected": -31.347299575805664, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1505067348480225, "rewards/margins": 0.20774318277835846, "rewards/rejected": -1.3582497835159302, "step": 791 }, { "epoch": 0.75, "grad_norm": 28.207282778333944, "learning_rate": 4.3975372914736063e-07, "logps/chosen": -32.225181579589844, "logps/rejected": -55.4962272644043, "loss": 0.6162, "losses/dpo": 0.8710224628448486, "losses/sft": 0.7880999445915222, "losses/total": 0.8710224628448486, "ref_logps/chosen": -22.715282440185547, "ref_logps/rejected": -42.58079147338867, "rewards/accuracies": 0.625, "rewards/chosen": -0.950989842414856, "rewards/margins": 0.34055376052856445, "rewards/rejected": -1.2915436029434204, "step": 792 }, { "epoch": 0.75, "grad_norm": 21.752589076216346, "learning_rate": 4.395878227710024e-07, "logps/chosen": -34.757537841796875, "logps/rejected": -46.054107666015625, "loss": 0.4842, "losses/dpo": 0.4601227045059204, "losses/sft": 1.1423925161361694, "losses/total": 0.4601227045059204, "ref_logps/chosen": -27.673540115356445, "ref_logps/rejected": -32.140281677246094, "rewards/accuracies": 0.75, "rewards/chosen": -0.7083996534347534, "rewards/margins": 0.6829829216003418, "rewards/rejected": -1.3913825750350952, "step": 793 }, { "epoch": 0.75, "grad_norm": 27.657425615674622, "learning_rate": 4.3942171965974426e-07, "logps/chosen": -32.140594482421875, "logps/rejected": -46.53765106201172, "loss": 0.687, "losses/dpo": 0.2910304367542267, "losses/sft": 0.6878974437713623, "losses/total": 0.2910304367542267, "ref_logps/chosen": -22.556011199951172, "ref_logps/rejected": -33.96935272216797, "rewards/accuracies": 0.375, "rewards/chosen": -0.9584583044052124, "rewards/margins": 0.298371285200119, "rewards/rejected": -1.2568296194076538, "step": 794 }, { "epoch": 0.75, "grad_norm": 14.355109522068014, "learning_rate": 4.392554199859514e-07, "logps/chosen": -21.223255157470703, "logps/rejected": -50.706077575683594, "loss": 0.3712, "losses/dpo": 0.25541672110557556, "losses/sft": 0.2599344551563263, "losses/total": 0.25541672110557556, "ref_logps/chosen": -17.970731735229492, "ref_logps/rejected": -35.63373565673828, "rewards/accuracies": 0.8125, "rewards/chosen": -0.32525235414505005, "rewards/margins": 1.1819818019866943, "rewards/rejected": -1.5072342157363892, "step": 795 }, { "epoch": 0.75, "grad_norm": 21.00724832943907, "learning_rate": 4.3908892392219257e-07, "logps/chosen": -34.930335998535156, "logps/rejected": -51.04865264892578, "loss": 0.4321, "losses/dpo": 1.037211537361145, "losses/sft": 0.7553372979164124, "losses/total": 1.037211537361145, "ref_logps/chosen": -26.89031219482422, "ref_logps/rejected": -32.772850036621094, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8040023446083069, "rewards/margins": 1.0235779285430908, "rewards/rejected": -1.827580213546753, "step": 796 }, { "epoch": 0.75, "grad_norm": 20.72593766772843, "learning_rate": 4.3892223164124027e-07, "logps/chosen": -33.15455627441406, "logps/rejected": -43.081172943115234, "loss": 0.5826, "losses/dpo": 0.7067062258720398, "losses/sft": 1.336037516593933, "losses/total": 0.7067062258720398, "ref_logps/chosen": -23.322328567504883, "ref_logps/rejected": -29.294416427612305, "rewards/accuracies": 0.625, "rewards/chosen": -0.9832228422164917, "rewards/margins": 0.39545297622680664, "rewards/rejected": -1.3786756992340088, "step": 797 }, { "epoch": 0.75, "grad_norm": 23.645227913252707, "learning_rate": 4.387553433160709e-07, "logps/chosen": -42.26625061035156, "logps/rejected": -50.99680709838867, "loss": 0.5373, "losses/dpo": 0.4852534532546997, "losses/sft": 0.9098580479621887, "losses/total": 0.4852534532546997, "ref_logps/chosen": -31.293521881103516, "ref_logps/rejected": -34.47248077392578, "rewards/accuracies": 0.75, "rewards/chosen": -1.0972728729248047, "rewards/margins": 0.5551596283912659, "rewards/rejected": -1.6524324417114258, "step": 798 }, { "epoch": 0.75, "grad_norm": 19.554388579319085, "learning_rate": 4.38588259119864e-07, "logps/chosen": -33.870384216308594, "logps/rejected": -48.270294189453125, "loss": 0.4781, "losses/dpo": 0.4003491699695587, "losses/sft": 0.42158710956573486, "losses/total": 0.4003491699695587, "ref_logps/chosen": -23.020797729492188, "ref_logps/rejected": -30.722801208496094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0849584341049194, "rewards/margins": 0.6697909235954285, "rewards/rejected": -1.7547494173049927, "step": 799 }, { "epoch": 0.75, "grad_norm": 29.02774846227882, "learning_rate": 4.384209792260026e-07, "logps/chosen": -43.20262908935547, "logps/rejected": -48.27785110473633, "loss": 0.6333, "losses/dpo": 0.3640609383583069, "losses/sft": 1.6486001014709473, "losses/total": 0.3640609383583069, "ref_logps/chosen": -31.80328369140625, "ref_logps/rejected": -33.28325653076172, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1399343013763428, "rewards/margins": 0.359525203704834, "rewards/rejected": -1.4994595050811768, "step": 800 }, { "epoch": 0.76, "grad_norm": 41.552332112095954, "learning_rate": 4.382535038080728e-07, "logps/chosen": -61.14263153076172, "logps/rejected": -63.99385070800781, "loss": 0.7446, "losses/dpo": 0.1368061900138855, "losses/sft": 1.9303011894226074, "losses/total": 0.1368061900138855, "ref_logps/chosen": -44.60306930541992, "ref_logps/rejected": -43.706451416015625, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6539560556411743, "rewards/margins": 0.37478381395339966, "rewards/rejected": -2.0287399291992188, "step": 801 }, { "epoch": 0.76, "grad_norm": 20.3947688561762, "learning_rate": 4.380858330398632e-07, "logps/chosen": -31.896087646484375, "logps/rejected": -58.641597747802734, "loss": 0.437, "losses/dpo": 0.10228653252124786, "losses/sft": 0.5415535569190979, "losses/total": 0.10228653252124786, "ref_logps/chosen": -25.525894165039062, "ref_logps/rejected": -40.50410461425781, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6370193958282471, "rewards/margins": 1.1767299175262451, "rewards/rejected": -1.8137493133544922, "step": 802 }, { "epoch": 0.76, "grad_norm": 21.895111861363578, "learning_rate": 4.379179670953658e-07, "logps/chosen": -41.08079528808594, "logps/rejected": -50.909767150878906, "loss": 0.4575, "losses/dpo": 0.4632890224456787, "losses/sft": 0.8922792077064514, "losses/total": 0.4632890224456787, "ref_logps/chosen": -30.41971778869629, "ref_logps/rejected": -33.91250228881836, "rewards/accuracies": 0.9375, "rewards/chosen": -1.066107988357544, "rewards/margins": 0.6336187720298767, "rewards/rejected": -1.6997268199920654, "step": 803 }, { "epoch": 0.76, "grad_norm": 21.22146337056997, "learning_rate": 4.377499061487744e-07, "logps/chosen": -31.99154281616211, "logps/rejected": -45.188072204589844, "loss": 0.5255, "losses/dpo": 0.30885541439056396, "losses/sft": 0.6226474642753601, "losses/total": 0.30885541439056396, "ref_logps/chosen": -24.61892318725586, "ref_logps/rejected": -32.35627365112305, "rewards/accuracies": 0.875, "rewards/chosen": -0.7372620105743408, "rewards/margins": 0.5459173917770386, "rewards/rejected": -1.283179521560669, "step": 804 }, { "epoch": 0.76, "grad_norm": 20.824476392440477, "learning_rate": 4.375816503744857e-07, "logps/chosen": -38.44076156616211, "logps/rejected": -54.021026611328125, "loss": 0.487, "losses/dpo": 0.42200180888175964, "losses/sft": 0.6520575284957886, "losses/total": 0.42200180888175964, "ref_logps/chosen": -29.050310134887695, "ref_logps/rejected": -34.12311935424805, "rewards/accuracies": 0.75, "rewards/chosen": -0.9390450716018677, "rewards/margins": 1.0507457256317139, "rewards/rejected": -1.989790678024292, "step": 805 }, { "epoch": 0.76, "grad_norm": 20.2180641184865, "learning_rate": 4.3741319994709836e-07, "logps/chosen": -37.20660400390625, "logps/rejected": -55.291961669921875, "loss": 0.4342, "losses/dpo": 0.8890147805213928, "losses/sft": 1.810036063194275, "losses/total": 0.8890147805213928, "ref_logps/chosen": -28.058502197265625, "ref_logps/rejected": -36.05955123901367, "rewards/accuracies": 0.75, "rewards/chosen": -0.9148102402687073, "rewards/margins": 1.0084308385849, "rewards/rejected": -1.923241138458252, "step": 806 }, { "epoch": 0.76, "grad_norm": 17.564380600532655, "learning_rate": 4.37244555041413e-07, "logps/chosen": -29.315372467041016, "logps/rejected": -43.98402786254883, "loss": 0.396, "losses/dpo": 0.5654928684234619, "losses/sft": 0.8892321586608887, "losses/total": 0.5654928684234619, "ref_logps/chosen": -22.339874267578125, "ref_logps/rejected": -27.6405029296875, "rewards/accuracies": 0.875, "rewards/chosen": -0.6975498199462891, "rewards/margins": 0.9368026256561279, "rewards/rejected": -1.6343523263931274, "step": 807 }, { "epoch": 0.76, "grad_norm": 25.495775807024078, "learning_rate": 4.3707571583243207e-07, "logps/chosen": -38.94894027709961, "logps/rejected": -45.790889739990234, "loss": 0.5178, "losses/dpo": 0.4084452986717224, "losses/sft": 0.9997464418411255, "losses/total": 0.4084452986717224, "ref_logps/chosen": -30.083349227905273, "ref_logps/rejected": -31.075786590576172, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8865591287612915, "rewards/margins": 0.5849515199661255, "rewards/rejected": -1.471510648727417, "step": 808 }, { "epoch": 0.76, "grad_norm": 25.976058687243587, "learning_rate": 4.369066824953598e-07, "logps/chosen": -38.05353546142578, "logps/rejected": -43.01832962036133, "loss": 0.5936, "losses/dpo": 0.2433972954750061, "losses/sft": 0.810321569442749, "losses/total": 0.2433972954750061, "ref_logps/chosen": -29.97530746459961, "ref_logps/rejected": -29.25502586364746, "rewards/accuracies": 0.625, "rewards/chosen": -0.8078223466873169, "rewards/margins": 0.5685082674026489, "rewards/rejected": -1.3763306140899658, "step": 809 }, { "epoch": 0.76, "grad_norm": 26.822998350605296, "learning_rate": 4.367374552056016e-07, "logps/chosen": -39.60565185546875, "logps/rejected": -59.385589599609375, "loss": 0.5089, "losses/dpo": 2.511753797531128, "losses/sft": 2.832050085067749, "losses/total": 2.511753797531128, "ref_logps/chosen": -29.56186294555664, "ref_logps/rejected": -37.063011169433594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.004379153251648, "rewards/margins": 1.2278790473937988, "rewards/rejected": -2.2322583198547363, "step": 810 }, { "epoch": 0.77, "grad_norm": 19.042733022491035, "learning_rate": 4.3656803413876434e-07, "logps/chosen": -29.443523406982422, "logps/rejected": -35.71784210205078, "loss": 0.5267, "losses/dpo": 0.7451307773590088, "losses/sft": 1.0588394403457642, "losses/total": 0.7451307773590088, "ref_logps/chosen": -22.92627716064453, "ref_logps/rejected": -23.250686645507812, "rewards/accuracies": 0.625, "rewards/chosen": -0.651724636554718, "rewards/margins": 0.5949908494949341, "rewards/rejected": -1.2467154264450073, "step": 811 }, { "epoch": 0.77, "grad_norm": 17.504784157883297, "learning_rate": 4.36398419470656e-07, "logps/chosen": -31.551584243774414, "logps/rejected": -49.595741271972656, "loss": 0.4386, "losses/dpo": 0.41020897030830383, "losses/sft": 1.2353472709655762, "losses/total": 0.41020897030830383, "ref_logps/chosen": -24.514694213867188, "ref_logps/rejected": -34.7826042175293, "rewards/accuracies": 0.6875, "rewards/chosen": -0.703689455986023, "rewards/margins": 0.777624249458313, "rewards/rejected": -1.481313705444336, "step": 812 }, { "epoch": 0.77, "grad_norm": 25.950923221641222, "learning_rate": 4.362286113772853e-07, "logps/chosen": -36.85247039794922, "logps/rejected": -44.38380813598633, "loss": 0.685, "losses/dpo": 0.7452908158302307, "losses/sft": 0.6408355236053467, "losses/total": 0.7452908158302307, "ref_logps/chosen": -26.52400779724121, "ref_logps/rejected": -32.18990707397461, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0328458547592163, "rewards/margins": 0.18654440343379974, "rewards/rejected": -1.2193903923034668, "step": 813 }, { "epoch": 0.77, "grad_norm": 32.08077612644418, "learning_rate": 4.3605861003486164e-07, "logps/chosen": -35.947975158691406, "logps/rejected": -42.26429748535156, "loss": 0.8745, "losses/dpo": 2.2508819103240967, "losses/sft": 1.9072312116622925, "losses/total": 2.2508819103240967, "ref_logps/chosen": -25.940275192260742, "ref_logps/rejected": -32.01435089111328, "rewards/accuracies": 0.5, "rewards/chosen": -1.000770092010498, "rewards/margins": 0.024224787950515747, "rewards/rejected": -1.0249948501586914, "step": 814 }, { "epoch": 0.77, "grad_norm": 30.62827720485105, "learning_rate": 4.358884156197953e-07, "logps/chosen": -48.43110656738281, "logps/rejected": -44.43311309814453, "loss": 0.7359, "losses/dpo": 0.9253662824630737, "losses/sft": 0.7036754488945007, "losses/total": 0.9253662824630737, "ref_logps/chosen": -35.446510314941406, "ref_logps/rejected": -29.90973472595215, "rewards/accuracies": 0.625, "rewards/chosen": -1.2984590530395508, "rewards/margins": 0.1538783609867096, "rewards/rejected": -1.4523375034332275, "step": 815 }, { "epoch": 0.77, "grad_norm": 25.959182518191934, "learning_rate": 4.3571802830869653e-07, "logps/chosen": -43.09547424316406, "logps/rejected": -52.48502731323242, "loss": 0.5795, "losses/dpo": 0.7330676913261414, "losses/sft": 0.9109383821487427, "losses/total": 0.7330676913261414, "ref_logps/chosen": -33.0014762878418, "ref_logps/rejected": -35.23403549194336, "rewards/accuracies": 0.75, "rewards/chosen": -1.009399652481079, "rewards/margins": 0.7156994938850403, "rewards/rejected": -1.7250993251800537, "step": 816 }, { "epoch": 0.77, "grad_norm": 22.09189779611563, "learning_rate": 4.3554744827837596e-07, "logps/chosen": -36.174285888671875, "logps/rejected": -60.13335418701172, "loss": 0.4447, "losses/dpo": 1.0403589010238647, "losses/sft": 1.019152283668518, "losses/total": 1.0403589010238647, "ref_logps/chosen": -24.988567352294922, "ref_logps/rejected": -39.33331298828125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1185719966888428, "rewards/margins": 0.9614325761795044, "rewards/rejected": -2.0800044536590576, "step": 817 }, { "epoch": 0.77, "grad_norm": 19.438854368118694, "learning_rate": 4.3537667570584413e-07, "logps/chosen": -29.492258071899414, "logps/rejected": -42.7059326171875, "loss": 0.4741, "losses/dpo": 0.36728546023368835, "losses/sft": 0.9337797164916992, "losses/total": 0.36728546023368835, "ref_logps/chosen": -23.098024368286133, "ref_logps/rejected": -29.273696899414062, "rewards/accuracies": 0.875, "rewards/chosen": -0.6394232511520386, "rewards/margins": 0.7038000822067261, "rewards/rejected": -1.3432233333587646, "step": 818 }, { "epoch": 0.77, "grad_norm": 21.733275982602855, "learning_rate": 4.3520571076831134e-07, "logps/chosen": -34.52713394165039, "logps/rejected": -49.3465576171875, "loss": 0.5035, "losses/dpo": 0.10767048597335815, "losses/sft": 1.2224453687667847, "losses/total": 0.10767048597335815, "ref_logps/chosen": -25.726301193237305, "ref_logps/rejected": -33.26957321166992, "rewards/accuracies": 0.75, "rewards/chosen": -0.8800832033157349, "rewards/margins": 0.7276151776313782, "rewards/rejected": -1.6076984405517578, "step": 819 }, { "epoch": 0.77, "grad_norm": 27.88105278245411, "learning_rate": 4.3503455364318764e-07, "logps/chosen": -38.77394485473633, "logps/rejected": -66.12185668945312, "loss": 0.61, "losses/dpo": 0.2157764434814453, "losses/sft": 0.9700895547866821, "losses/total": 0.2157764434814453, "ref_logps/chosen": -26.93456268310547, "ref_logps/rejected": -49.28598403930664, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1839382648468018, "rewards/margins": 0.4996487498283386, "rewards/rejected": -1.6835870742797852, "step": 820 }, { "epoch": 0.77, "grad_norm": 18.161445067399917, "learning_rate": 4.3486320450808233e-07, "logps/chosen": -31.4815731048584, "logps/rejected": -57.334259033203125, "loss": 0.3834, "losses/dpo": 0.7016698122024536, "losses/sft": 0.7137575745582581, "losses/total": 0.7016698122024536, "ref_logps/chosen": -25.332656860351562, "ref_logps/rejected": -39.906673431396484, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6148916482925415, "rewards/margins": 1.127867341041565, "rewards/rejected": -1.7427589893341064, "step": 821 }, { "epoch": 0.78, "grad_norm": 23.09684649542129, "learning_rate": 4.346916635408041e-07, "logps/chosen": -38.860565185546875, "logps/rejected": -43.19403076171875, "loss": 0.5479, "losses/dpo": 1.0368878841400146, "losses/sft": 1.4976493120193481, "losses/total": 1.0368878841400146, "ref_logps/chosen": -30.516613006591797, "ref_logps/rejected": -30.09356689453125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8343952894210815, "rewards/margins": 0.4756510853767395, "rewards/rejected": -1.3100463151931763, "step": 822 }, { "epoch": 0.78, "grad_norm": 19.787490539970122, "learning_rate": 4.345199309193608e-07, "logps/chosen": -39.334205627441406, "logps/rejected": -49.93346405029297, "loss": 0.3799, "losses/dpo": 0.15681107342243195, "losses/sft": 0.7721452713012695, "losses/total": 0.15681107342243195, "ref_logps/chosen": -30.64920997619629, "ref_logps/rejected": -30.44471549987793, "rewards/accuracies": 0.875, "rewards/chosen": -0.8684993982315063, "rewards/margins": 1.08037531375885, "rewards/rejected": -1.9488747119903564, "step": 823 }, { "epoch": 0.78, "grad_norm": 24.22170784640041, "learning_rate": 4.343480068219588e-07, "logps/chosen": -36.199058532714844, "logps/rejected": -43.088623046875, "loss": 0.5999, "losses/dpo": 0.15817894041538239, "losses/sft": 1.0921982526779175, "losses/total": 0.15817894041538239, "ref_logps/chosen": -25.291311264038086, "ref_logps/rejected": -27.148611068725586, "rewards/accuracies": 0.6875, "rewards/chosen": -1.090774655342102, "rewards/margins": 0.5032262802124023, "rewards/rejected": -1.5940008163452148, "step": 824 }, { "epoch": 0.78, "grad_norm": 15.549095086129165, "learning_rate": 4.341758914270036e-07, "logps/chosen": -27.930805206298828, "logps/rejected": -53.262062072753906, "loss": 0.3852, "losses/dpo": 0.12852279841899872, "losses/sft": 0.6595547199249268, "losses/total": 0.12852279841899872, "ref_logps/chosen": -19.343692779541016, "ref_logps/rejected": -33.81951904296875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8587114214897156, "rewards/margins": 1.0855433940887451, "rewards/rejected": -1.9442548751831055, "step": 825 }, { "epoch": 0.78, "grad_norm": 22.424434288949893, "learning_rate": 4.3400358491309884e-07, "logps/chosen": -34.79418182373047, "logps/rejected": -71.99134063720703, "loss": 0.468, "losses/dpo": 0.15619011223316193, "losses/sft": 1.062423586845398, "losses/total": 0.15619011223316193, "ref_logps/chosen": -25.164939880371094, "ref_logps/rejected": -52.55967330932617, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9629240036010742, "rewards/margins": 0.9802433252334595, "rewards/rejected": -1.9431674480438232, "step": 826 }, { "epoch": 0.78, "grad_norm": 19.862249683502608, "learning_rate": 4.3383108745904674e-07, "logps/chosen": -39.0028190612793, "logps/rejected": -52.19498825073242, "loss": 0.4567, "losses/dpo": 0.7333137392997742, "losses/sft": 1.6686570644378662, "losses/total": 0.7333137392997742, "ref_logps/chosen": -29.313705444335938, "ref_logps/rejected": -32.50993347167969, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9689112305641174, "rewards/margins": 0.9995944499969482, "rewards/rejected": -1.968505620956421, "step": 827 }, { "epoch": 0.78, "grad_norm": 24.724975085123972, "learning_rate": 4.336583992438474e-07, "logps/chosen": -38.74475860595703, "logps/rejected": -48.986610412597656, "loss": 0.6087, "losses/dpo": 1.0818607807159424, "losses/sft": 0.9503090977668762, "losses/total": 1.0818607807159424, "ref_logps/chosen": -28.4317626953125, "ref_logps/rejected": -34.315731048583984, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0312998294830322, "rewards/margins": 0.4357880651950836, "rewards/rejected": -1.467087745666504, "step": 828 }, { "epoch": 0.78, "grad_norm": 23.81180520038289, "learning_rate": 4.3348552044669924e-07, "logps/chosen": -37.6627311706543, "logps/rejected": -59.63153839111328, "loss": 0.5625, "losses/dpo": 0.041991688311100006, "losses/sft": 0.3123105466365814, "losses/total": 0.041991688311100006, "ref_logps/chosen": -27.76708984375, "ref_logps/rejected": -43.15996551513672, "rewards/accuracies": 0.6875, "rewards/chosen": -0.989564061164856, "rewards/margins": 0.6575927734375, "rewards/rejected": -1.6471567153930664, "step": 829 }, { "epoch": 0.78, "grad_norm": 24.801287396097216, "learning_rate": 4.333124512469981e-07, "logps/chosen": -34.9952392578125, "logps/rejected": -45.895103454589844, "loss": 0.554, "losses/dpo": 0.5131391286849976, "losses/sft": 1.1208503246307373, "losses/total": 0.5131391286849976, "ref_logps/chosen": -25.20644760131836, "ref_logps/rejected": -30.754863739013672, "rewards/accuracies": 0.75, "rewards/chosen": -0.9788789749145508, "rewards/margins": 0.5351447463035583, "rewards/rejected": -1.514023780822754, "step": 830 }, { "epoch": 0.78, "grad_norm": 18.791320373592992, "learning_rate": 4.3313919182433755e-07, "logps/chosen": -32.15766906738281, "logps/rejected": -51.55867004394531, "loss": 0.4777, "losses/dpo": 0.2297465056180954, "losses/sft": 1.557383418083191, "losses/total": 0.2297465056180954, "ref_logps/chosen": -23.314697265625, "ref_logps/rejected": -34.66668701171875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8842968940734863, "rewards/margins": 0.8049015998840332, "rewards/rejected": -1.6891984939575195, "step": 831 }, { "epoch": 0.78, "grad_norm": 25.859386997480417, "learning_rate": 4.329657423585087e-07, "logps/chosen": -42.643699645996094, "logps/rejected": -58.48636245727539, "loss": 0.4566, "losses/dpo": 0.17608317732810974, "losses/sft": 0.29776668548583984, "losses/total": 0.17608317732810974, "ref_logps/chosen": -32.929744720458984, "ref_logps/rejected": -40.38896560668945, "rewards/accuracies": 0.875, "rewards/chosen": -0.9713953733444214, "rewards/margins": 0.8383443355560303, "rewards/rejected": -1.8097397089004517, "step": 832 }, { "epoch": 0.79, "grad_norm": 28.413844398033056, "learning_rate": 4.327921030294994e-07, "logps/chosen": -39.22859573364258, "logps/rejected": -56.508201599121094, "loss": 0.5281, "losses/dpo": 0.58809894323349, "losses/sft": 0.6999874711036682, "losses/total": 0.58809894323349, "ref_logps/chosen": -29.353269577026367, "ref_logps/rejected": -40.18728256225586, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9875328540802002, "rewards/margins": 0.6445591449737549, "rewards/rejected": -1.632091999053955, "step": 833 }, { "epoch": 0.79, "grad_norm": 21.29203457663904, "learning_rate": 4.3261827401749515e-07, "logps/chosen": -37.0638427734375, "logps/rejected": -47.036827087402344, "loss": 0.4839, "losses/dpo": 0.6428827047348022, "losses/sft": 0.6438477039337158, "losses/total": 0.6428827047348022, "ref_logps/chosen": -25.85205841064453, "ref_logps/rejected": -29.28852653503418, "rewards/accuracies": 0.75, "rewards/chosen": -1.1211787462234497, "rewards/margins": 0.6536511778831482, "rewards/rejected": -1.7748299837112427, "step": 834 }, { "epoch": 0.79, "grad_norm": 22.89014337876645, "learning_rate": 4.324442555028778e-07, "logps/chosen": -44.175437927246094, "logps/rejected": -56.87782669067383, "loss": 0.4934, "losses/dpo": 0.6930604577064514, "losses/sft": 0.8424225449562073, "losses/total": 0.6930604577064514, "ref_logps/chosen": -32.23529052734375, "ref_logps/rejected": -36.22230911254883, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1940147876739502, "rewards/margins": 0.8715369701385498, "rewards/rejected": -2.0655517578125, "step": 835 }, { "epoch": 0.79, "grad_norm": 37.1615837782916, "learning_rate": 4.32270047666226e-07, "logps/chosen": -59.06950759887695, "logps/rejected": -53.894073486328125, "loss": 0.8249, "losses/dpo": 0.5724746584892273, "losses/sft": 1.412003755569458, "losses/total": 0.5724746584892273, "ref_logps/chosen": -42.57904815673828, "ref_logps/rejected": -37.24115753173828, "rewards/accuracies": 0.5, "rewards/chosen": -1.6490461826324463, "rewards/margins": 0.016245156526565552, "rewards/rejected": -1.6652913093566895, "step": 836 }, { "epoch": 0.79, "grad_norm": 27.165959447954766, "learning_rate": 4.3209565068831497e-07, "logps/chosen": -33.49055480957031, "logps/rejected": -44.295143127441406, "loss": 0.6176, "losses/dpo": 0.29490843415260315, "losses/sft": 1.4679502248764038, "losses/total": 0.29490843415260315, "ref_logps/chosen": -22.781482696533203, "ref_logps/rejected": -28.715303421020508, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0709071159362793, "rewards/margins": 0.4870767295360565, "rewards/rejected": -1.5579838752746582, "step": 837 }, { "epoch": 0.79, "grad_norm": 19.299145100069556, "learning_rate": 4.31921064750116e-07, "logps/chosen": -38.19317626953125, "logps/rejected": -56.40037155151367, "loss": 0.411, "losses/dpo": 0.3233025372028351, "losses/sft": 1.8889353275299072, "losses/total": 0.3233025372028351, "ref_logps/chosen": -26.93160629272461, "ref_logps/rejected": -36.334869384765625, "rewards/accuracies": 0.875, "rewards/chosen": -1.1261570453643799, "rewards/margins": 0.8803930282592773, "rewards/rejected": -2.0065500736236572, "step": 838 }, { "epoch": 0.79, "grad_norm": 27.463249667135514, "learning_rate": 4.3174629003279656e-07, "logps/chosen": -47.103797912597656, "logps/rejected": -56.64356994628906, "loss": 0.5476, "losses/dpo": 0.7370509505271912, "losses/sft": 1.21625816822052, "losses/total": 0.7370509505271912, "ref_logps/chosen": -35.396175384521484, "ref_logps/rejected": -38.5470085144043, "rewards/accuracies": 0.6875, "rewards/chosen": -1.170762300491333, "rewards/margins": 0.6388940215110779, "rewards/rejected": -1.8096563816070557, "step": 839 }, { "epoch": 0.79, "grad_norm": 19.198048861596842, "learning_rate": 4.315713267177201e-07, "logps/chosen": -43.6738166809082, "logps/rejected": -56.37264633178711, "loss": 0.3906, "losses/dpo": 0.08474992215633392, "losses/sft": 1.0784871578216553, "losses/total": 0.08474992215633392, "ref_logps/chosen": -33.89051055908203, "ref_logps/rejected": -35.459739685058594, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9783310890197754, "rewards/margins": 1.112959861755371, "rewards/rejected": -2.0912909507751465, "step": 840 }, { "epoch": 0.79, "grad_norm": 25.164116530174255, "learning_rate": 4.3139617498644563e-07, "logps/chosen": -32.76846694946289, "logps/rejected": -44.86730194091797, "loss": 0.6456, "losses/dpo": 0.7904577255249023, "losses/sft": 1.0791305303573608, "losses/total": 0.7904577255249023, "ref_logps/chosen": -23.9481201171875, "ref_logps/rejected": -30.839088439941406, "rewards/accuracies": 0.5, "rewards/chosen": -0.8820344805717468, "rewards/margins": 0.5207870602607727, "rewards/rejected": -1.4028215408325195, "step": 841 }, { "epoch": 0.79, "grad_norm": 25.741222534457442, "learning_rate": 4.3122083502072783e-07, "logps/chosen": -36.44879150390625, "logps/rejected": -48.36648941040039, "loss": 0.6029, "losses/dpo": 0.022492455318570137, "losses/sft": 0.6729640960693359, "losses/total": 0.022492455318570137, "ref_logps/chosen": -26.176525115966797, "ref_logps/rejected": -29.90645980834961, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0272268056869507, "rewards/margins": 0.8187762498855591, "rewards/rejected": -1.8460030555725098, "step": 842 }, { "epoch": 0.8, "grad_norm": 21.514349252732675, "learning_rate": 4.310453070025165e-07, "logps/chosen": -41.34471893310547, "logps/rejected": -58.46442794799805, "loss": 0.4729, "losses/dpo": 0.46963632106781006, "losses/sft": 0.5022264719009399, "losses/total": 0.46963632106781006, "ref_logps/chosen": -28.411331176757812, "ref_logps/rejected": -37.408470153808594, "rewards/accuracies": 0.75, "rewards/chosen": -1.2933388948440552, "rewards/margins": 0.8122568130493164, "rewards/rejected": -2.105595827102661, "step": 843 }, { "epoch": 0.8, "grad_norm": 22.0555958589229, "learning_rate": 4.3086959111395684e-07, "logps/chosen": -39.74952697753906, "logps/rejected": -54.9790153503418, "loss": 0.493, "losses/dpo": 0.18897032737731934, "losses/sft": 1.6664345264434814, "losses/total": 0.18897032737731934, "ref_logps/chosen": -26.06460189819336, "ref_logps/rejected": -33.88776779174805, "rewards/accuracies": 0.75, "rewards/chosen": -1.368492603302002, "rewards/margins": 0.7406319379806519, "rewards/rejected": -2.1091246604919434, "step": 844 }, { "epoch": 0.8, "grad_norm": 19.20339676589074, "learning_rate": 4.3069368753738876e-07, "logps/chosen": -35.41682434082031, "logps/rejected": -62.827415466308594, "loss": 0.3612, "losses/dpo": 0.07280334085226059, "losses/sft": 1.1531418561935425, "losses/total": 0.07280334085226059, "ref_logps/chosen": -27.479114532470703, "ref_logps/rejected": -42.78466796875, "rewards/accuracies": 0.875, "rewards/chosen": -0.7937713861465454, "rewards/margins": 1.210503339767456, "rewards/rejected": -2.004274606704712, "step": 845 }, { "epoch": 0.8, "grad_norm": 20.44459754488772, "learning_rate": 4.305175964553471e-07, "logps/chosen": -35.639949798583984, "logps/rejected": -64.82831573486328, "loss": 0.4035, "losses/dpo": 0.10438064485788345, "losses/sft": 0.8455856442451477, "losses/total": 0.10438064485788345, "ref_logps/chosen": -25.722156524658203, "ref_logps/rejected": -44.38544464111328, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9917794466018677, "rewards/margins": 1.0525081157684326, "rewards/rejected": -2.04428768157959, "step": 846 }, { "epoch": 0.8, "grad_norm": 22.10116322671017, "learning_rate": 4.303413180505613e-07, "logps/chosen": -43.01482391357422, "logps/rejected": -63.037147521972656, "loss": 0.449, "losses/dpo": 0.049540337175130844, "losses/sft": 1.0383487939834595, "losses/total": 0.049540337175130844, "ref_logps/chosen": -33.11957931518555, "ref_logps/rejected": -45.228187561035156, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9895247220993042, "rewards/margins": 0.7913712859153748, "rewards/rejected": -1.7808959484100342, "step": 847 }, { "epoch": 0.8, "grad_norm": 15.751664557072697, "learning_rate": 4.301648525059549e-07, "logps/chosen": -32.5885009765625, "logps/rejected": -52.09130859375, "loss": 0.3592, "losses/dpo": 0.4653776288032532, "losses/sft": 1.1456605195999146, "losses/total": 0.4653776288032532, "ref_logps/chosen": -24.416004180908203, "ref_logps/rejected": -32.61113739013672, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8172492980957031, "rewards/margins": 1.130768060684204, "rewards/rejected": -1.9480174779891968, "step": 848 }, { "epoch": 0.8, "grad_norm": 21.623538738332275, "learning_rate": 4.29988200004646e-07, "logps/chosen": -47.9739990234375, "logps/rejected": -63.34143829345703, "loss": 0.408, "losses/dpo": 0.7488494515419006, "losses/sft": 0.8991531729698181, "losses/total": 0.7488494515419006, "ref_logps/chosen": -36.21205520629883, "ref_logps/rejected": -40.42082977294922, "rewards/accuracies": 0.875, "rewards/chosen": -1.176194429397583, "rewards/margins": 1.1158661842346191, "rewards/rejected": -2.292060613632202, "step": 849 }, { "epoch": 0.8, "grad_norm": 21.50025579177163, "learning_rate": 4.2981136072994654e-07, "logps/chosen": -37.829952239990234, "logps/rejected": -49.068389892578125, "loss": 0.5173, "losses/dpo": 0.3032788038253784, "losses/sft": 0.5891618728637695, "losses/total": 0.3032788038253784, "ref_logps/chosen": -30.068845748901367, "ref_logps/rejected": -34.15401840209961, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7761110067367554, "rewards/margins": 0.7153260111808777, "rewards/rejected": -1.4914369583129883, "step": 850 }, { "epoch": 0.8, "grad_norm": 27.024554626218897, "learning_rate": 4.296343348653623e-07, "logps/chosen": -40.989402770996094, "logps/rejected": -49.32255554199219, "loss": 0.6441, "losses/dpo": 1.4690272808074951, "losses/sft": 1.5781453847885132, "losses/total": 1.4690272808074951, "ref_logps/chosen": -30.04800796508789, "ref_logps/rejected": -34.02117919921875, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0941394567489624, "rewards/margins": 0.4359980821609497, "rewards/rejected": -1.530137538909912, "step": 851 }, { "epoch": 0.8, "grad_norm": 27.542430852319875, "learning_rate": 4.2945712259459243e-07, "logps/chosen": -43.318931579589844, "logps/rejected": -55.04365539550781, "loss": 0.6259, "losses/dpo": 0.49473416805267334, "losses/sft": 1.4425640106201172, "losses/total": 0.49473416805267334, "ref_logps/chosen": -31.769954681396484, "ref_logps/rejected": -39.113651275634766, "rewards/accuracies": 0.625, "rewards/chosen": -1.154897689819336, "rewards/margins": 0.438102662563324, "rewards/rejected": -1.5930002927780151, "step": 852 }, { "epoch": 0.8, "grad_norm": 25.60474579510131, "learning_rate": 4.2927972410153e-07, "logps/chosen": -34.9415168762207, "logps/rejected": -49.44129180908203, "loss": 0.6144, "losses/dpo": 0.8030616044998169, "losses/sft": 1.043968915939331, "losses/total": 0.8030616044998169, "ref_logps/chosen": -24.760311126708984, "ref_logps/rejected": -35.73371124267578, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0181204080581665, "rewards/margins": 0.3526375889778137, "rewards/rejected": -1.3707579374313354, "step": 853 }, { "epoch": 0.81, "grad_norm": 22.89537351794885, "learning_rate": 4.29102139570261e-07, "logps/chosen": -36.03483581542969, "logps/rejected": -57.90533447265625, "loss": 0.4933, "losses/dpo": 0.3417987525463104, "losses/sft": 1.8761478662490845, "losses/total": 0.3417987525463104, "ref_logps/chosen": -26.173080444335938, "ref_logps/rejected": -39.36085510253906, "rewards/accuracies": 0.75, "rewards/chosen": -0.9861757755279541, "rewards/margins": 0.8682719469070435, "rewards/rejected": -1.8544477224349976, "step": 854 }, { "epoch": 0.81, "grad_norm": 23.79855961099447, "learning_rate": 4.289243691850645e-07, "logps/chosen": -33.915985107421875, "logps/rejected": -44.962371826171875, "loss": 0.5478, "losses/dpo": 1.0747870206832886, "losses/sft": 1.4602185487747192, "losses/total": 1.0747870206832886, "ref_logps/chosen": -24.953645706176758, "ref_logps/rejected": -29.86067008972168, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8962340354919434, "rewards/margins": 0.6139360070228577, "rewards/rejected": -1.5101702213287354, "step": 855 }, { "epoch": 0.81, "grad_norm": 16.3342341238513, "learning_rate": 4.287464131304125e-07, "logps/chosen": -41.38634490966797, "logps/rejected": -68.03534698486328, "loss": 0.3542, "losses/dpo": 1.1304460763931274, "losses/sft": 1.5031330585479736, "losses/total": 1.1304460763931274, "ref_logps/chosen": -32.61249542236328, "ref_logps/rejected": -46.77125549316406, "rewards/accuracies": 0.875, "rewards/chosen": -0.8773849606513977, "rewards/margins": 1.2490243911743164, "rewards/rejected": -2.1264092922210693, "step": 856 }, { "epoch": 0.81, "grad_norm": 32.98768933346141, "learning_rate": 4.285682715909696e-07, "logps/chosen": -57.24964904785156, "logps/rejected": -55.591529846191406, "loss": 0.6924, "losses/dpo": 1.8484503030776978, "losses/sft": 1.8289012908935547, "losses/total": 1.8484503030776978, "ref_logps/chosen": -43.29078674316406, "ref_logps/rejected": -37.121009826660156, "rewards/accuracies": 0.75, "rewards/chosen": -1.395885705947876, "rewards/margins": 0.45116597414016724, "rewards/rejected": -1.847051739692688, "step": 857 }, { "epoch": 0.81, "grad_norm": 26.751526932598242, "learning_rate": 4.28389944751593e-07, "logps/chosen": -44.84846115112305, "logps/rejected": -57.519561767578125, "loss": 0.5554, "losses/dpo": 0.20904089510440826, "losses/sft": 0.9565716981887817, "losses/total": 0.20904089510440826, "ref_logps/chosen": -31.796981811523438, "ref_logps/rejected": -38.044700622558594, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3051478862762451, "rewards/margins": 0.642338752746582, "rewards/rejected": -1.9474868774414062, "step": 858 }, { "epoch": 0.81, "grad_norm": 25.365535026181732, "learning_rate": 4.2821143279733197e-07, "logps/chosen": -43.48521041870117, "logps/rejected": -44.23451232910156, "loss": 0.5993, "losses/dpo": 0.4217086434364319, "losses/sft": 0.9753214716911316, "losses/total": 0.4217086434364319, "ref_logps/chosen": -33.99661636352539, "ref_logps/rejected": -28.993146896362305, "rewards/accuracies": 0.75, "rewards/chosen": -0.9488595128059387, "rewards/margins": 0.5752773880958557, "rewards/rejected": -1.5241367816925049, "step": 859 }, { "epoch": 0.81, "grad_norm": 25.868809200477816, "learning_rate": 4.2803273591342807e-07, "logps/chosen": -43.138221740722656, "logps/rejected": -52.60883331298828, "loss": 0.6069, "losses/dpo": 1.266563892364502, "losses/sft": 1.4299288988113403, "losses/total": 1.266563892364502, "ref_logps/chosen": -31.175785064697266, "ref_logps/rejected": -36.19804382324219, "rewards/accuracies": 0.625, "rewards/chosen": -1.1962437629699707, "rewards/margins": 0.44483527541160583, "rewards/rejected": -1.641079068183899, "step": 860 }, { "epoch": 0.81, "grad_norm": 21.818718639349424, "learning_rate": 4.2785385428531475e-07, "logps/chosen": -41.55552291870117, "logps/rejected": -58.676979064941406, "loss": 0.4762, "losses/dpo": 0.4090752899646759, "losses/sft": 0.7323951721191406, "losses/total": 0.4090752899646759, "ref_logps/chosen": -33.15373229980469, "ref_logps/rejected": -43.65513610839844, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8401787281036377, "rewards/margins": 0.6620054841041565, "rewards/rejected": -1.502184271812439, "step": 861 }, { "epoch": 0.81, "grad_norm": 32.50748448085487, "learning_rate": 4.27674788098617e-07, "logps/chosen": -50.00140380859375, "logps/rejected": -56.55503845214844, "loss": 0.5932, "losses/dpo": 0.636223316192627, "losses/sft": 0.5794678330421448, "losses/total": 0.636223316192627, "ref_logps/chosen": -38.343406677246094, "ref_logps/rejected": -40.09641647338867, "rewards/accuracies": 0.75, "rewards/chosen": -1.165799617767334, "rewards/margins": 0.4800626039505005, "rewards/rejected": -1.645862102508545, "step": 862 }, { "epoch": 0.81, "grad_norm": 23.32070420204424, "learning_rate": 4.2749553753915155e-07, "logps/chosen": -35.469398498535156, "logps/rejected": -46.373130798339844, "loss": 0.5088, "losses/dpo": 0.37784063816070557, "losses/sft": 0.19384412467479706, "losses/total": 0.37784063816070557, "ref_logps/chosen": -27.120525360107422, "ref_logps/rejected": -30.732004165649414, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8348869681358337, "rewards/margins": 0.7292253971099854, "rewards/rejected": -1.5641124248504639, "step": 863 }, { "epoch": 0.82, "grad_norm": 22.336057606045586, "learning_rate": 4.2731610279292627e-07, "logps/chosen": -37.82498550415039, "logps/rejected": -58.099281311035156, "loss": 0.463, "losses/dpo": 0.07659786194562912, "losses/sft": 1.6574050188064575, "losses/total": 0.07659786194562912, "ref_logps/chosen": -27.698331832885742, "ref_logps/rejected": -38.688167572021484, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0126652717590332, "rewards/margins": 0.9284458160400391, "rewards/rejected": -1.9411110877990723, "step": 864 }, { "epoch": 0.82, "grad_norm": 21.64053567805845, "learning_rate": 4.2713648404614027e-07, "logps/chosen": -35.09897994995117, "logps/rejected": -55.40342712402344, "loss": 0.4881, "losses/dpo": 0.028586389496922493, "losses/sft": 0.69829261302948, "losses/total": 0.028586389496922493, "ref_logps/chosen": -25.35892105102539, "ref_logps/rejected": -38.064727783203125, "rewards/accuracies": 0.625, "rewards/chosen": -0.974005937576294, "rewards/margins": 0.7598639726638794, "rewards/rejected": -1.7338697910308838, "step": 865 }, { "epoch": 0.82, "grad_norm": 28.786756810047613, "learning_rate": 4.269566814851836e-07, "logps/chosen": -37.20191192626953, "logps/rejected": -49.91041564941406, "loss": 0.5891, "losses/dpo": 0.353341668844223, "losses/sft": 0.19828428328037262, "losses/total": 0.353341668844223, "ref_logps/chosen": -28.02576446533203, "ref_logps/rejected": -35.84364318847656, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9176145792007446, "rewards/margins": 0.48906266689300537, "rewards/rejected": -1.40667724609375, "step": 866 }, { "epoch": 0.82, "grad_norm": 28.109831644594845, "learning_rate": 4.2677669529663686e-07, "logps/chosen": -39.925880432128906, "logps/rejected": -41.058753967285156, "loss": 0.6955, "losses/dpo": 0.9688034057617188, "losses/sft": 1.7984353303909302, "losses/total": 0.9688034057617188, "ref_logps/chosen": -29.440811157226562, "ref_logps/rejected": -28.655174255371094, "rewards/accuracies": 0.625, "rewards/chosen": -1.0485072135925293, "rewards/margins": 0.19185052812099457, "rewards/rejected": -1.240357756614685, "step": 867 }, { "epoch": 0.82, "grad_norm": 23.81857188925514, "learning_rate": 4.265965256672715e-07, "logps/chosen": -29.71728515625, "logps/rejected": -34.753849029541016, "loss": 0.5891, "losses/dpo": 0.17340071499347687, "losses/sft": 0.8952162861824036, "losses/total": 0.17340071499347687, "ref_logps/chosen": -22.950359344482422, "ref_logps/rejected": -24.106307983398438, "rewards/accuracies": 0.625, "rewards/chosen": -0.6766927242279053, "rewards/margins": 0.3880615234375, "rewards/rejected": -1.0647542476654053, "step": 868 }, { "epoch": 0.82, "grad_norm": 22.503475229516344, "learning_rate": 4.264161727840492e-07, "logps/chosen": -31.84789276123047, "logps/rejected": -50.69245910644531, "loss": 0.5078, "losses/dpo": 0.7439978718757629, "losses/sft": 0.9980310797691345, "losses/total": 0.7439978718757629, "ref_logps/chosen": -23.545299530029297, "ref_logps/rejected": -34.66242218017578, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8302593231201172, "rewards/margins": 0.7727442383766174, "rewards/rejected": -1.6030035018920898, "step": 869 }, { "epoch": 0.82, "grad_norm": 25.202229293534604, "learning_rate": 4.2623563683412155e-07, "logps/chosen": -39.02529525756836, "logps/rejected": -57.5784912109375, "loss": 0.5543, "losses/dpo": 0.019401580095291138, "losses/sft": 1.0978034734725952, "losses/total": 0.019401580095291138, "ref_logps/chosen": -25.47161865234375, "ref_logps/rejected": -37.003108978271484, "rewards/accuracies": 0.6875, "rewards/chosen": -1.35536789894104, "rewards/margins": 0.7021702527999878, "rewards/rejected": -2.0575380325317383, "step": 870 }, { "epoch": 0.82, "grad_norm": 25.293240775506973, "learning_rate": 4.260549180048306e-07, "logps/chosen": -41.232383728027344, "logps/rejected": -62.823673248291016, "loss": 0.457, "losses/dpo": 0.4650460183620453, "losses/sft": 1.029525637626648, "losses/total": 0.4650460183620453, "ref_logps/chosen": -29.300626754760742, "ref_logps/rejected": -40.602760314941406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1931756734848022, "rewards/margins": 1.028916358947754, "rewards/rejected": -2.2220919132232666, "step": 871 }, { "epoch": 0.82, "grad_norm": 27.657791529984888, "learning_rate": 4.258740164837078e-07, "logps/chosen": -39.19025421142578, "logps/rejected": -44.481666564941406, "loss": 0.6232, "losses/dpo": 0.34446772933006287, "losses/sft": 0.4681813716888428, "losses/total": 0.34446772933006287, "ref_logps/chosen": -28.98261833190918, "ref_logps/rejected": -29.28434944152832, "rewards/accuracies": 0.625, "rewards/chosen": -1.020763874053955, "rewards/margins": 0.49896836280822754, "rewards/rejected": -1.5197322368621826, "step": 872 }, { "epoch": 0.82, "grad_norm": 29.55190238409277, "learning_rate": 4.2569293245847436e-07, "logps/chosen": -42.208290100097656, "logps/rejected": -48.73455810546875, "loss": 0.6076, "losses/dpo": 1.3910491466522217, "losses/sft": 1.5759814977645874, "losses/total": 1.3910491466522217, "ref_logps/chosen": -33.690818786621094, "ref_logps/rejected": -35.2227783203125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8517469167709351, "rewards/margins": 0.4994305670261383, "rewards/rejected": -1.3511775732040405, "step": 873 }, { "epoch": 0.82, "grad_norm": 25.307526529695977, "learning_rate": 4.255116661170407e-07, "logps/chosen": -37.2789192199707, "logps/rejected": -48.20201873779297, "loss": 0.6239, "losses/dpo": 0.7445598244667053, "losses/sft": 0.9770598411560059, "losses/total": 0.7445598244667053, "ref_logps/chosen": -28.52960205078125, "ref_logps/rejected": -35.75668716430664, "rewards/accuracies": 0.5, "rewards/chosen": -0.8749314546585083, "rewards/margins": 0.3696015775203705, "rewards/rejected": -1.2445330619812012, "step": 874 }, { "epoch": 0.83, "grad_norm": 17.149663480499395, "learning_rate": 4.2533021764750656e-07, "logps/chosen": -29.815223693847656, "logps/rejected": -44.2485466003418, "loss": 0.4453, "losses/dpo": 0.04119879752397537, "losses/sft": 1.1483876705169678, "losses/total": 0.04119879752397537, "ref_logps/chosen": -23.965961456298828, "ref_logps/rejected": -28.209945678710938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5849262475967407, "rewards/margins": 1.0189337730407715, "rewards/rejected": -1.6038601398468018, "step": 875 }, { "epoch": 0.83, "grad_norm": 24.14163085376045, "learning_rate": 4.251485872381607e-07, "logps/chosen": -46.784515380859375, "logps/rejected": -49.711273193359375, "loss": 0.5779, "losses/dpo": 0.7077220678329468, "losses/sft": 1.035994052886963, "losses/total": 0.7077220678329468, "ref_logps/chosen": -34.38722610473633, "ref_logps/rejected": -32.472938537597656, "rewards/accuracies": 0.625, "rewards/chosen": -1.2397289276123047, "rewards/margins": 0.48410508036613464, "rewards/rejected": -1.7238339185714722, "step": 876 }, { "epoch": 0.83, "grad_norm": 22.371371272407764, "learning_rate": 4.2496677507748067e-07, "logps/chosen": -26.57626724243164, "logps/rejected": -45.25965118408203, "loss": 0.5982, "losses/dpo": 1.4484498500823975, "losses/sft": 2.4833250045776367, "losses/total": 1.4484498500823975, "ref_logps/chosen": -18.971176147460938, "ref_logps/rejected": -33.432823181152344, "rewards/accuracies": 0.625, "rewards/chosen": -0.7605089545249939, "rewards/margins": 0.4221734404563904, "rewards/rejected": -1.1826823949813843, "step": 877 }, { "epoch": 0.83, "grad_norm": 32.044399847916445, "learning_rate": 4.247847813541324e-07, "logps/chosen": -40.90718078613281, "logps/rejected": -45.55347442626953, "loss": 0.7517, "losses/dpo": 2.0268614292144775, "losses/sft": 2.4159626960754395, "losses/total": 2.0268614292144775, "ref_logps/chosen": -28.795560836791992, "ref_logps/rejected": -30.236644744873047, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2111618518829346, "rewards/margins": 0.3205212354660034, "rewards/rejected": -1.531683087348938, "step": 878 }, { "epoch": 0.83, "grad_norm": 28.992556402734298, "learning_rate": 4.2460260625697064e-07, "logps/chosen": -55.45459747314453, "logps/rejected": -67.37348937988281, "loss": 0.5611, "losses/dpo": 1.3373414278030396, "losses/sft": 2.1409013271331787, "losses/total": 1.3373414278030396, "ref_logps/chosen": -42.57952880859375, "ref_logps/rejected": -48.025848388671875, "rewards/accuracies": 0.625, "rewards/chosen": -1.2875070571899414, "rewards/margins": 0.6472575068473816, "rewards/rejected": -1.9347646236419678, "step": 879 }, { "epoch": 0.83, "grad_norm": 22.689927731695136, "learning_rate": 4.24420249975038e-07, "logps/chosen": -32.00750732421875, "logps/rejected": -39.17375946044922, "loss": 0.5423, "losses/dpo": 0.30983972549438477, "losses/sft": 1.9107270240783691, "losses/total": 0.30983972549438477, "ref_logps/chosen": -24.46953010559082, "ref_logps/rejected": -26.982234954833984, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7537978887557983, "rewards/margins": 0.46535471081733704, "rewards/rejected": -1.2191526889801025, "step": 880 }, { "epoch": 0.83, "grad_norm": 20.137225823081145, "learning_rate": 4.242377126975652e-07, "logps/chosen": -35.67836380004883, "logps/rejected": -47.37017822265625, "loss": 0.4708, "losses/dpo": 0.24442127346992493, "losses/sft": 1.8829383850097656, "losses/total": 0.24442127346992493, "ref_logps/chosen": -26.496742248535156, "ref_logps/rejected": -30.625097274780273, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9181622266769409, "rewards/margins": 0.756345808506012, "rewards/rejected": -1.6745080947875977, "step": 881 }, { "epoch": 0.83, "grad_norm": 20.48566753110938, "learning_rate": 4.240549946139708e-07, "logps/chosen": -40.35795593261719, "logps/rejected": -55.129539489746094, "loss": 0.4379, "losses/dpo": 0.5003517866134644, "losses/sft": 1.5231432914733887, "losses/total": 0.5003517866134644, "ref_logps/chosen": -29.303142547607422, "ref_logps/rejected": -35.54966354370117, "rewards/accuracies": 0.8125, "rewards/chosen": -1.105481505393982, "rewards/margins": 0.8525061011314392, "rewards/rejected": -1.957987666130066, "step": 882 }, { "epoch": 0.83, "grad_norm": 19.244804024846104, "learning_rate": 4.2387209591386107e-07, "logps/chosen": -36.11845016479492, "logps/rejected": -53.083709716796875, "loss": 0.4445, "losses/dpo": 0.3409557342529297, "losses/sft": 1.4509333372116089, "losses/total": 0.3409557342529297, "ref_logps/chosen": -27.88420295715332, "ref_logps/rejected": -36.24152374267578, "rewards/accuracies": 0.875, "rewards/chosen": -0.8234250545501709, "rewards/margins": 0.8607935905456543, "rewards/rejected": -1.6842186450958252, "step": 883 }, { "epoch": 0.83, "grad_norm": 27.40011822975937, "learning_rate": 4.236890167870295e-07, "logps/chosen": -45.236454010009766, "logps/rejected": -55.905364990234375, "loss": 0.5217, "losses/dpo": 0.7337958812713623, "losses/sft": 0.4632863402366638, "losses/total": 0.7337958812713623, "ref_logps/chosen": -33.387481689453125, "ref_logps/rejected": -36.434776306152344, "rewards/accuracies": 0.75, "rewards/chosen": -1.1848974227905273, "rewards/margins": 0.7621617913246155, "rewards/rejected": -1.9470592737197876, "step": 884 }, { "epoch": 0.83, "grad_norm": 20.66722240025977, "learning_rate": 4.2350575742345705e-07, "logps/chosen": -38.14088439941406, "logps/rejected": -53.85149383544922, "loss": 0.5042, "losses/dpo": 0.06404484808444977, "losses/sft": 1.0338200330734253, "losses/total": 0.06404484808444977, "ref_logps/chosen": -25.90497398376465, "ref_logps/rejected": -34.86529541015625, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2235910892486572, "rewards/margins": 0.6750292778015137, "rewards/rejected": -1.8986204862594604, "step": 885 }, { "epoch": 0.84, "grad_norm": 28.366974837089895, "learning_rate": 4.2332231801331155e-07, "logps/chosen": -37.211952209472656, "logps/rejected": -40.66716003417969, "loss": 0.7333, "losses/dpo": 1.7431272268295288, "losses/sft": 2.0811851024627686, "losses/total": 1.7431272268295288, "ref_logps/chosen": -25.513259887695312, "ref_logps/rejected": -27.65351104736328, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1698689460754395, "rewards/margins": 0.131495863199234, "rewards/rejected": -1.3013646602630615, "step": 886 }, { "epoch": 0.84, "grad_norm": 22.461569390658266, "learning_rate": 4.231386987469477e-07, "logps/chosen": -39.5666618347168, "logps/rejected": -44.73712158203125, "loss": 0.5223, "losses/dpo": 0.24080155789852142, "losses/sft": 0.37672528624534607, "losses/total": 0.24080155789852142, "ref_logps/chosen": -29.5966796875, "ref_logps/rejected": -29.555326461791992, "rewards/accuracies": 0.75, "rewards/chosen": -0.9969984292984009, "rewards/margins": 0.5211812257766724, "rewards/rejected": -1.5181796550750732, "step": 887 }, { "epoch": 0.84, "grad_norm": 25.350502174825856, "learning_rate": 4.229548998149068e-07, "logps/chosen": -40.994319915771484, "logps/rejected": -47.68584442138672, "loss": 0.585, "losses/dpo": 0.38419023156166077, "losses/sft": 0.7617555260658264, "losses/total": 0.38419023156166077, "ref_logps/chosen": -30.611968994140625, "ref_logps/rejected": -33.29136657714844, "rewards/accuracies": 0.75, "rewards/chosen": -1.0382354259490967, "rewards/margins": 0.40121230483055115, "rewards/rejected": -1.4394476413726807, "step": 888 }, { "epoch": 0.84, "grad_norm": 23.296197995130004, "learning_rate": 4.2277092140791664e-07, "logps/chosen": -38.84124755859375, "logps/rejected": -60.473480224609375, "loss": 0.5181, "losses/dpo": 0.3058624863624573, "losses/sft": 2.290700674057007, "losses/total": 0.3058624863624573, "ref_logps/chosen": -27.394676208496094, "ref_logps/rejected": -40.191261291503906, "rewards/accuracies": 0.75, "rewards/chosen": -1.1446572542190552, "rewards/margins": 0.8835650682449341, "rewards/rejected": -2.0282223224639893, "step": 889 }, { "epoch": 0.84, "grad_norm": 23.47360022408501, "learning_rate": 4.225867637168913e-07, "logps/chosen": -41.65113067626953, "logps/rejected": -53.37971496582031, "loss": 0.5077, "losses/dpo": 0.47968441247940063, "losses/sft": 1.1210615634918213, "losses/total": 0.47968441247940063, "ref_logps/chosen": -29.941635131835938, "ref_logps/rejected": -34.634178161621094, "rewards/accuracies": 0.625, "rewards/chosen": -1.1709498167037964, "rewards/margins": 0.7036041617393494, "rewards/rejected": -1.874553918838501, "step": 890 }, { "epoch": 0.84, "grad_norm": 21.949539635882118, "learning_rate": 4.224024269329308e-07, "logps/chosen": -34.05655288696289, "logps/rejected": -55.65813446044922, "loss": 0.4906, "losses/dpo": 0.1416895091533661, "losses/sft": 0.9442422986030579, "losses/total": 0.1416895091533661, "ref_logps/chosen": -23.59194564819336, "ref_logps/rejected": -37.235755920410156, "rewards/accuracies": 0.75, "rewards/chosen": -1.0464608669281006, "rewards/margins": 0.7957769632339478, "rewards/rejected": -1.8422378301620483, "step": 891 }, { "epoch": 0.84, "grad_norm": 16.86266888581389, "learning_rate": 4.2221791124732104e-07, "logps/chosen": -41.09844207763672, "logps/rejected": -49.58598327636719, "loss": 0.3696, "losses/dpo": 0.2666790187358856, "losses/sft": 0.642557680606842, "losses/total": 0.2666790187358856, "ref_logps/chosen": -32.1427116394043, "ref_logps/rejected": -31.095916748046875, "rewards/accuracies": 0.9375, "rewards/chosen": -0.895573079586029, "rewards/margins": 0.9534338712692261, "rewards/rejected": -1.8490068912506104, "step": 892 }, { "epoch": 0.84, "grad_norm": 17.51032739617254, "learning_rate": 4.220332168515336e-07, "logps/chosen": -29.22475814819336, "logps/rejected": -41.751373291015625, "loss": 0.4841, "losses/dpo": 0.6480799317359924, "losses/sft": 1.6184110641479492, "losses/total": 0.6480799317359924, "ref_logps/chosen": -21.588714599609375, "ref_logps/rejected": -26.595481872558594, "rewards/accuracies": 0.875, "rewards/chosen": -0.7636042833328247, "rewards/margins": 0.7519844174385071, "rewards/rejected": -1.5155887603759766, "step": 893 }, { "epoch": 0.84, "grad_norm": 23.65445303722998, "learning_rate": 4.2184834393722545e-07, "logps/chosen": -36.81787872314453, "logps/rejected": -45.07358169555664, "loss": 0.549, "losses/dpo": 0.4079200029373169, "losses/sft": 0.2991778254508972, "losses/total": 0.4079200029373169, "ref_logps/chosen": -29.657514572143555, "ref_logps/rejected": -32.900047302246094, "rewards/accuracies": 0.75, "rewards/chosen": -0.7160365581512451, "rewards/margins": 0.5013170838356018, "rewards/rejected": -1.2173538208007812, "step": 894 }, { "epoch": 0.84, "grad_norm": 25.022903343138587, "learning_rate": 4.216632926962389e-07, "logps/chosen": -35.84685134887695, "logps/rejected": -37.62126159667969, "loss": 0.6184, "losses/dpo": 0.22609052062034607, "losses/sft": 0.4855944812297821, "losses/total": 0.22609052062034607, "ref_logps/chosen": -26.49825668334961, "ref_logps/rejected": -25.11414337158203, "rewards/accuracies": 0.625, "rewards/chosen": -0.9348593950271606, "rewards/margins": 0.3158528208732605, "rewards/rejected": -1.2507121562957764, "step": 895 }, { "epoch": 0.85, "grad_norm": 31.04219004721604, "learning_rate": 4.214780633206012e-07, "logps/chosen": -49.33222198486328, "logps/rejected": -47.34046173095703, "loss": 0.7146, "losses/dpo": 0.2918076813220978, "losses/sft": 1.4308329820632935, "losses/total": 0.2918076813220978, "ref_logps/chosen": -34.938323974609375, "ref_logps/rejected": -31.046127319335938, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4393898248672485, "rewards/margins": 0.19004321098327637, "rewards/rejected": -1.629433035850525, "step": 896 }, { "epoch": 0.85, "grad_norm": 22.158678915501152, "learning_rate": 4.212926560025244e-07, "logps/chosen": -32.85496520996094, "logps/rejected": -45.506874084472656, "loss": 0.4419, "losses/dpo": 0.2261766642332077, "losses/sft": 0.6468708515167236, "losses/total": 0.2261766642332077, "ref_logps/chosen": -24.889896392822266, "ref_logps/rejected": -28.904983520507812, "rewards/accuracies": 0.8125, "rewards/chosen": -0.796506941318512, "rewards/margins": 0.8636821508407593, "rewards/rejected": -1.660189151763916, "step": 897 }, { "epoch": 0.85, "grad_norm": 21.593196095656328, "learning_rate": 4.211070709344055e-07, "logps/chosen": -47.58964157104492, "logps/rejected": -61.803470611572266, "loss": 0.3767, "losses/dpo": 0.18157172203063965, "losses/sft": 0.7361206412315369, "losses/total": 0.18157172203063965, "ref_logps/chosen": -37.74618148803711, "ref_logps/rejected": -41.80682373046875, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9843460321426392, "rewards/margins": 1.0153183937072754, "rewards/rejected": -1.999664545059204, "step": 898 }, { "epoch": 0.85, "grad_norm": 30.107451536081978, "learning_rate": 4.2092130830882564e-07, "logps/chosen": -48.28242492675781, "logps/rejected": -49.177852630615234, "loss": 0.6539, "losses/dpo": 0.4532894492149353, "losses/sft": 1.109244704246521, "losses/total": 0.4532894492149353, "ref_logps/chosen": -35.44921112060547, "ref_logps/rejected": -31.92978286743164, "rewards/accuracies": 0.625, "rewards/chosen": -1.2833216190338135, "rewards/margins": 0.4414854645729065, "rewards/rejected": -1.7248070240020752, "step": 899 }, { "epoch": 0.85, "grad_norm": 21.548141463009962, "learning_rate": 4.2073536831855027e-07, "logps/chosen": -40.34624481201172, "logps/rejected": -58.552818298339844, "loss": 0.4443, "losses/dpo": 0.4098782539367676, "losses/sft": 1.62034010887146, "losses/total": 0.4098782539367676, "ref_logps/chosen": -30.996620178222656, "ref_logps/rejected": -39.21705627441406, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9349627494812012, "rewards/margins": 0.9986135363578796, "rewards/rejected": -1.9335763454437256, "step": 900 }, { "epoch": 0.85, "grad_norm": 23.735415295355338, "learning_rate": 4.2054925115652907e-07, "logps/chosen": -32.29183578491211, "logps/rejected": -46.44764709472656, "loss": 0.6138, "losses/dpo": 0.5655469298362732, "losses/sft": 0.6119101643562317, "losses/total": 0.5655469298362732, "ref_logps/chosen": -21.461463928222656, "ref_logps/rejected": -29.421070098876953, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0830371379852295, "rewards/margins": 0.6196205615997314, "rewards/rejected": -1.702657699584961, "step": 901 }, { "epoch": 0.85, "grad_norm": 24.093381496687478, "learning_rate": 4.2036295701589533e-07, "logps/chosen": -37.44984817504883, "logps/rejected": -52.437278747558594, "loss": 0.5782, "losses/dpo": 0.04512450098991394, "losses/sft": 0.6906949281692505, "losses/total": 0.04512450098991394, "ref_logps/chosen": -26.320056915283203, "ref_logps/rejected": -34.74459457397461, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1129789352416992, "rewards/margins": 0.6562893986701965, "rewards/rejected": -1.76926851272583, "step": 902 }, { "epoch": 0.85, "grad_norm": 24.649774496900328, "learning_rate": 4.2017648608996623e-07, "logps/chosen": -36.30400085449219, "logps/rejected": -48.88892364501953, "loss": 0.5608, "losses/dpo": 0.30197587609291077, "losses/sft": 1.3575855493545532, "losses/total": 0.30197587609291077, "ref_logps/chosen": -26.131393432617188, "ref_logps/rejected": -32.7120361328125, "rewards/accuracies": 0.75, "rewards/chosen": -1.0172609090805054, "rewards/margins": 0.6004276275634766, "rewards/rejected": -1.617688536643982, "step": 903 }, { "epoch": 0.85, "grad_norm": 20.701941958999168, "learning_rate": 4.199898385722421e-07, "logps/chosen": -40.8466796875, "logps/rejected": -62.95216369628906, "loss": 0.4697, "losses/dpo": 0.3783988952636719, "losses/sft": 0.8412463068962097, "losses/total": 0.3783988952636719, "ref_logps/chosen": -31.116901397705078, "ref_logps/rejected": -45.569740295410156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9729781150817871, "rewards/margins": 0.7652643918991089, "rewards/rejected": -1.738242506980896, "step": 904 }, { "epoch": 0.85, "grad_norm": 21.885979075330905, "learning_rate": 4.198030146564068e-07, "logps/chosen": -33.2156982421875, "logps/rejected": -48.059654235839844, "loss": 0.6081, "losses/dpo": 0.7103680968284607, "losses/sft": 1.8014177083969116, "losses/total": 0.7103680968284607, "ref_logps/chosen": -25.270357131958008, "ref_logps/rejected": -36.032447814941406, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7945341467857361, "rewards/margins": 0.40818628668785095, "rewards/rejected": -1.2027204036712646, "step": 905 }, { "epoch": 0.85, "grad_norm": 30.87966352623012, "learning_rate": 4.196160145363271e-07, "logps/chosen": -44.1373405456543, "logps/rejected": -50.23430633544922, "loss": 0.5997, "losses/dpo": 0.21787326037883759, "losses/sft": 0.9446054100990295, "losses/total": 0.21787326037883759, "ref_logps/chosen": -32.710533142089844, "ref_logps/rejected": -34.9009895324707, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1426810026168823, "rewards/margins": 0.39065074920654297, "rewards/rejected": -1.5333317518234253, "step": 906 }, { "epoch": 0.86, "grad_norm": 35.61824452042066, "learning_rate": 4.194288384060527e-07, "logps/chosen": -49.84856033325195, "logps/rejected": -59.60784149169922, "loss": 0.6451, "losses/dpo": 0.7903792262077332, "losses/sft": 2.0921454429626465, "losses/total": 0.7903792262077332, "ref_logps/chosen": -35.85502624511719, "ref_logps/rejected": -42.68927001953125, "rewards/accuracies": 0.625, "rewards/chosen": -1.39935302734375, "rewards/margins": 0.2925039529800415, "rewards/rejected": -1.691856861114502, "step": 907 }, { "epoch": 0.86, "grad_norm": 21.040795988728043, "learning_rate": 4.1924148645981584e-07, "logps/chosen": -31.744388580322266, "logps/rejected": -58.99102783203125, "loss": 0.4424, "losses/dpo": 0.09946266561746597, "losses/sft": 0.96051025390625, "losses/total": 0.09946266561746597, "ref_logps/chosen": -25.121173858642578, "ref_logps/rejected": -38.431861877441406, "rewards/accuracies": 0.75, "rewards/chosen": -0.6623214483261108, "rewards/margins": 1.3935949802398682, "rewards/rejected": -2.0559163093566895, "step": 908 }, { "epoch": 0.86, "grad_norm": 24.55298256867917, "learning_rate": 4.190539588920312e-07, "logps/chosen": -37.96533966064453, "logps/rejected": -48.64141082763672, "loss": 0.5516, "losses/dpo": 0.5060700178146362, "losses/sft": 1.3595590591430664, "losses/total": 0.5060700178146362, "ref_logps/chosen": -28.22365951538086, "ref_logps/rejected": -32.48709487915039, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9741681814193726, "rewards/margins": 0.6412633061408997, "rewards/rejected": -1.615431547164917, "step": 909 }, { "epoch": 0.86, "grad_norm": 19.25141638485868, "learning_rate": 4.1886625589729584e-07, "logps/chosen": -32.801734924316406, "logps/rejected": -50.66381072998047, "loss": 0.4314, "losses/dpo": 0.12299010902643204, "losses/sft": 0.5259350538253784, "losses/total": 0.12299010902643204, "ref_logps/chosen": -24.813827514648438, "ref_logps/rejected": -33.942726135253906, "rewards/accuracies": 0.875, "rewards/chosen": -0.7987908124923706, "rewards/margins": 0.8733178377151489, "rewards/rejected": -1.67210853099823, "step": 910 }, { "epoch": 0.86, "grad_norm": 27.523177920124922, "learning_rate": 4.1867837767038876e-07, "logps/chosen": -43.854923248291016, "logps/rejected": -43.86776351928711, "loss": 0.6935, "losses/dpo": 1.2161500453948975, "losses/sft": 1.8285671472549438, "losses/total": 1.2161500453948975, "ref_logps/chosen": -29.582447052001953, "ref_logps/rejected": -26.645231246948242, "rewards/accuracies": 0.625, "rewards/chosen": -1.4272481203079224, "rewards/margins": 0.2950051724910736, "rewards/rejected": -1.7222533226013184, "step": 911 }, { "epoch": 0.86, "grad_norm": 21.249846035209337, "learning_rate": 4.184903244062709e-07, "logps/chosen": -37.290870666503906, "logps/rejected": -63.22903823852539, "loss": 0.3682, "losses/dpo": 0.3751368224620819, "losses/sft": 1.876264214515686, "losses/total": 0.3751368224620819, "ref_logps/chosen": -26.00030517578125, "ref_logps/rejected": -40.87769317626953, "rewards/accuracies": 0.875, "rewards/chosen": -1.129056692123413, "rewards/margins": 1.1060776710510254, "rewards/rejected": -2.2351346015930176, "step": 912 }, { "epoch": 0.86, "grad_norm": 27.32736596516411, "learning_rate": 4.183020963000845e-07, "logps/chosen": -39.349090576171875, "logps/rejected": -51.62690353393555, "loss": 0.661, "losses/dpo": 0.1195010095834732, "losses/sft": 0.7793853282928467, "losses/total": 0.1195010095834732, "ref_logps/chosen": -25.993942260742188, "ref_logps/rejected": -33.1617431640625, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3355151414871216, "rewards/margins": 0.5110006928443909, "rewards/rejected": -1.8465158939361572, "step": 913 }, { "epoch": 0.86, "grad_norm": 19.873843795293066, "learning_rate": 4.181136935471538e-07, "logps/chosen": -34.120635986328125, "logps/rejected": -58.012611389160156, "loss": 0.3899, "losses/dpo": 0.24101115763187408, "losses/sft": 1.3617658615112305, "losses/total": 0.24101115763187408, "ref_logps/chosen": -25.903457641601562, "ref_logps/rejected": -39.11019515991211, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8217180371284485, "rewards/margins": 1.068523645401001, "rewards/rejected": -1.8902416229248047, "step": 914 }, { "epoch": 0.86, "grad_norm": 23.516324105231767, "learning_rate": 4.1792511634298365e-07, "logps/chosen": -38.14878463745117, "logps/rejected": -57.26447296142578, "loss": 0.5045, "losses/dpo": 0.06791044026613235, "losses/sft": 2.1535677909851074, "losses/total": 0.06791044026613235, "ref_logps/chosen": -26.90703582763672, "ref_logps/rejected": -39.09941101074219, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1241748332977295, "rewards/margins": 0.6923313140869141, "rewards/rejected": -1.8165061473846436, "step": 915 }, { "epoch": 0.86, "grad_norm": 21.01995078153882, "learning_rate": 4.1773636488326047e-07, "logps/chosen": -45.83652114868164, "logps/rejected": -52.887779235839844, "loss": 0.4578, "losses/dpo": 0.402238667011261, "losses/sft": 1.1336357593536377, "losses/total": 0.402238667011261, "ref_logps/chosen": -37.3010139465332, "ref_logps/rejected": -35.98963928222656, "rewards/accuracies": 0.75, "rewards/chosen": -0.8535506129264832, "rewards/margins": 0.8362632989883423, "rewards/rejected": -1.6898140907287598, "step": 916 }, { "epoch": 0.87, "grad_norm": 19.541760237111458, "learning_rate": 4.1754743936385116e-07, "logps/chosen": -34.07489013671875, "logps/rejected": -53.80813980102539, "loss": 0.4672, "losses/dpo": 0.3115488290786743, "losses/sft": 1.6388158798217773, "losses/total": 0.3115488290786743, "ref_logps/chosen": -25.176061630249023, "ref_logps/rejected": -35.571571350097656, "rewards/accuracies": 0.75, "rewards/chosen": -0.8898826241493225, "rewards/margins": 0.9337741136550903, "rewards/rejected": -1.823656678199768, "step": 917 }, { "epoch": 0.87, "grad_norm": 24.050914912153157, "learning_rate": 4.1735833998080336e-07, "logps/chosen": -34.09495162963867, "logps/rejected": -42.01456832885742, "loss": 0.6001, "losses/dpo": 0.36542776226997375, "losses/sft": 0.2020290344953537, "losses/total": 0.36542776226997375, "ref_logps/chosen": -25.51016616821289, "ref_logps/rejected": -28.448287963867188, "rewards/accuracies": 0.6875, "rewards/chosen": -0.858478307723999, "rewards/margins": 0.498149573802948, "rewards/rejected": -1.3566279411315918, "step": 918 }, { "epoch": 0.87, "grad_norm": 17.672781298163052, "learning_rate": 4.1716906693034517e-07, "logps/chosen": -34.082191467285156, "logps/rejected": -54.48222351074219, "loss": 0.4744, "losses/dpo": 0.31520146131515503, "losses/sft": 1.6116650104522705, "losses/total": 0.31520146131515503, "ref_logps/chosen": -25.027650833129883, "ref_logps/rejected": -37.58657455444336, "rewards/accuracies": 0.75, "rewards/chosen": -0.9054545164108276, "rewards/margins": 0.7841106057167053, "rewards/rejected": -1.6895650625228882, "step": 919 }, { "epoch": 0.87, "grad_norm": 23.033338188784683, "learning_rate": 4.169796204088848e-07, "logps/chosen": -44.78615188598633, "logps/rejected": -52.996185302734375, "loss": 0.5632, "losses/dpo": 0.6786547303199768, "losses/sft": 1.5458940267562866, "losses/total": 0.6786547303199768, "ref_logps/chosen": -33.22325134277344, "ref_logps/rejected": -34.295135498046875, "rewards/accuracies": 0.625, "rewards/chosen": -1.1562899351119995, "rewards/margins": 0.713815450668335, "rewards/rejected": -1.8701053857803345, "step": 920 }, { "epoch": 0.87, "grad_norm": 21.341508703453666, "learning_rate": 4.167900006130105e-07, "logps/chosen": -26.671735763549805, "logps/rejected": -37.99494934082031, "loss": 0.5483, "losses/dpo": 1.1578335762023926, "losses/sft": 1.520421028137207, "losses/total": 1.1578335762023926, "ref_logps/chosen": -19.331226348876953, "ref_logps/rejected": -25.16969108581543, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7340512275695801, "rewards/margins": 0.5484746694564819, "rewards/rejected": -1.282525897026062, "step": 921 }, { "epoch": 0.87, "grad_norm": 21.545773387460155, "learning_rate": 4.1660020773949044e-07, "logps/chosen": -38.54132080078125, "logps/rejected": -48.71319580078125, "loss": 0.4891, "losses/dpo": 0.47671496868133545, "losses/sft": 1.584112286567688, "losses/total": 0.47671496868133545, "ref_logps/chosen": -26.758333206176758, "ref_logps/rejected": -28.39618492126465, "rewards/accuracies": 0.75, "rewards/chosen": -1.1782984733581543, "rewards/margins": 0.8534023761749268, "rewards/rejected": -2.03170108795166, "step": 922 }, { "epoch": 0.87, "grad_norm": 20.211765842171626, "learning_rate": 4.164102419852722e-07, "logps/chosen": -35.64414596557617, "logps/rejected": -58.01095199584961, "loss": 0.4194, "losses/dpo": 0.1684272587299347, "losses/sft": 0.8491212129592896, "losses/total": 0.1684272587299347, "ref_logps/chosen": -27.0108642578125, "ref_logps/rejected": -38.452754974365234, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8633280992507935, "rewards/margins": 1.0924915075302124, "rewards/rejected": -1.9558196067810059, "step": 923 }, { "epoch": 0.87, "grad_norm": 19.32566237079442, "learning_rate": 4.162201035474829e-07, "logps/chosen": -37.293060302734375, "logps/rejected": -60.52928161621094, "loss": 0.4169, "losses/dpo": 0.4890132546424866, "losses/sft": 2.0359716415405273, "losses/total": 0.4890132546424866, "ref_logps/chosen": -28.863285064697266, "ref_logps/rejected": -38.64293670654297, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8429778814315796, "rewards/margins": 1.3456566333770752, "rewards/rejected": -2.1886343955993652, "step": 924 }, { "epoch": 0.87, "grad_norm": 25.438562369646444, "learning_rate": 4.160297926234288e-07, "logps/chosen": -39.23985290527344, "logps/rejected": -53.797080993652344, "loss": 0.4939, "losses/dpo": 1.1518280506134033, "losses/sft": 2.238197088241577, "losses/total": 1.1518280506134033, "ref_logps/chosen": -28.536823272705078, "ref_logps/rejected": -35.79549026489258, "rewards/accuracies": 0.6875, "rewards/chosen": -1.070302963256836, "rewards/margins": 0.7298557758331299, "rewards/rejected": -1.8001585006713867, "step": 925 }, { "epoch": 0.87, "grad_norm": 25.12653580005135, "learning_rate": 4.158393094105952e-07, "logps/chosen": -35.06109619140625, "logps/rejected": -45.30868148803711, "loss": 0.6171, "losses/dpo": 1.0147124528884888, "losses/sft": 1.2626616954803467, "losses/total": 1.0147124528884888, "ref_logps/chosen": -26.704570770263672, "ref_logps/rejected": -32.252532958984375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8356528878211975, "rewards/margins": 0.46996191143989563, "rewards/rejected": -1.305614709854126, "step": 926 }, { "epoch": 0.87, "grad_norm": 28.538080941916665, "learning_rate": 4.156486541066461e-07, "logps/chosen": -37.79431915283203, "logps/rejected": -47.0215950012207, "loss": 0.6962, "losses/dpo": 0.7076489925384521, "losses/sft": 1.7395241260528564, "losses/total": 0.7076489925384521, "ref_logps/chosen": -26.874177932739258, "ref_logps/rejected": -33.293487548828125, "rewards/accuracies": 0.5, "rewards/chosen": -1.092014193534851, "rewards/margins": 0.2807968258857727, "rewards/rejected": -1.372810959815979, "step": 927 }, { "epoch": 0.88, "grad_norm": 31.02583568717746, "learning_rate": 4.154578269094241e-07, "logps/chosen": -46.856624603271484, "logps/rejected": -48.588565826416016, "loss": 0.6968, "losses/dpo": 0.3355472981929779, "losses/sft": 1.623366117477417, "losses/total": 0.3355472981929779, "ref_logps/chosen": -34.83954620361328, "ref_logps/rejected": -34.47611618041992, "rewards/accuracies": 0.75, "rewards/chosen": -1.2017079591751099, "rewards/margins": 0.20953738689422607, "rewards/rejected": -1.411245346069336, "step": 928 }, { "epoch": 0.88, "grad_norm": 19.974365019535927, "learning_rate": 4.152668280169502e-07, "logps/chosen": -29.488571166992188, "logps/rejected": -46.755592346191406, "loss": 0.4747, "losses/dpo": 0.478587806224823, "losses/sft": 1.516053557395935, "losses/total": 0.478587806224823, "ref_logps/chosen": -20.146007537841797, "ref_logps/rejected": -29.7484130859375, "rewards/accuracies": 0.75, "rewards/chosen": -0.9342561960220337, "rewards/margins": 0.7664618492126465, "rewards/rejected": -1.7007180452346802, "step": 929 }, { "epoch": 0.88, "grad_norm": 17.106727353333355, "learning_rate": 4.1507565762742367e-07, "logps/chosen": -36.37318420410156, "logps/rejected": -55.72610092163086, "loss": 0.3667, "losses/dpo": 0.13924357295036316, "losses/sft": 1.3576077222824097, "losses/total": 0.13924357295036316, "ref_logps/chosen": -27.57881736755371, "ref_logps/rejected": -36.09309387207031, "rewards/accuracies": 0.875, "rewards/chosen": -0.879436731338501, "rewards/margins": 1.083863615989685, "rewards/rejected": -1.9633002281188965, "step": 930 }, { "epoch": 0.88, "grad_norm": 17.36052494128547, "learning_rate": 4.148843159392216e-07, "logps/chosen": -29.51447105407715, "logps/rejected": -37.97950744628906, "loss": 0.4575, "losses/dpo": 0.27825498580932617, "losses/sft": 1.104833960533142, "losses/total": 0.27825498580932617, "ref_logps/chosen": -23.042869567871094, "ref_logps/rejected": -22.743064880371094, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6471602916717529, "rewards/margins": 0.8764839172363281, "rewards/rejected": -1.523644208908081, "step": 931 }, { "epoch": 0.88, "grad_norm": 18.411405864559523, "learning_rate": 4.1469280315089873e-07, "logps/chosen": -35.15660858154297, "logps/rejected": -64.53216552734375, "loss": 0.388, "losses/dpo": 0.7594536542892456, "losses/sft": 1.0308568477630615, "losses/total": 0.7594536542892456, "ref_logps/chosen": -24.94455337524414, "ref_logps/rejected": -44.252685546875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0212055444717407, "rewards/margins": 1.0067416429519653, "rewards/rejected": -2.027947187423706, "step": 932 }, { "epoch": 0.88, "grad_norm": 23.27586127264196, "learning_rate": 4.145011194611877e-07, "logps/chosen": -40.706092834472656, "logps/rejected": -61.866783142089844, "loss": 0.4482, "losses/dpo": 0.2101690173149109, "losses/sft": 0.783534049987793, "losses/total": 0.2101690173149109, "ref_logps/chosen": -31.454662322998047, "ref_logps/rejected": -42.69493103027344, "rewards/accuracies": 0.875, "rewards/chosen": -0.9251430034637451, "rewards/margins": 0.9920419454574585, "rewards/rejected": -1.9171849489212036, "step": 933 }, { "epoch": 0.88, "grad_norm": 16.16822286082194, "learning_rate": 4.1430926506899813e-07, "logps/chosen": -30.84832763671875, "logps/rejected": -57.22462463378906, "loss": 0.3478, "losses/dpo": 0.473705530166626, "losses/sft": 1.2044432163238525, "losses/total": 0.473705530166626, "ref_logps/chosen": -21.768909454345703, "ref_logps/rejected": -37.43292999267578, "rewards/accuracies": 0.875, "rewards/chosen": -0.9079415798187256, "rewards/margins": 1.0712279081344604, "rewards/rejected": -1.9791696071624756, "step": 934 }, { "epoch": 0.88, "grad_norm": 20.88142101336967, "learning_rate": 4.141172401734169e-07, "logps/chosen": -27.629892349243164, "logps/rejected": -40.79683303833008, "loss": 0.5073, "losses/dpo": 0.6389729976654053, "losses/sft": 0.7405750155448914, "losses/total": 0.6389729976654053, "ref_logps/chosen": -20.725839614868164, "ref_logps/rejected": -25.920459747314453, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6904053688049316, "rewards/margins": 0.7972322702407837, "rewards/rejected": -1.4876375198364258, "step": 935 }, { "epoch": 0.88, "grad_norm": 28.303164222374075, "learning_rate": 4.1392504497370807e-07, "logps/chosen": -44.73594665527344, "logps/rejected": -41.179542541503906, "loss": 0.7563, "losses/dpo": 0.5812946557998657, "losses/sft": 0.8916969299316406, "losses/total": 0.5812946557998657, "ref_logps/chosen": -31.700759887695312, "ref_logps/rejected": -27.221107482910156, "rewards/accuracies": 0.4375, "rewards/chosen": -1.3035187721252441, "rewards/margins": 0.09232449531555176, "rewards/rejected": -1.395843267440796, "step": 936 }, { "epoch": 0.88, "grad_norm": 24.226974103294758, "learning_rate": 4.13732679669312e-07, "logps/chosen": -60.54051971435547, "logps/rejected": -56.063411712646484, "loss": 0.5003, "losses/dpo": 0.21512259542942047, "losses/sft": 1.5589460134506226, "losses/total": 0.21512259542942047, "ref_logps/chosen": -45.43059158325195, "ref_logps/rejected": -34.27918243408203, "rewards/accuracies": 0.75, "rewards/chosen": -1.5109926462173462, "rewards/margins": 0.6674305200576782, "rewards/rejected": -2.1784234046936035, "step": 937 }, { "epoch": 0.88, "grad_norm": 24.622923381245666, "learning_rate": 4.1354014445984596e-07, "logps/chosen": -27.552387237548828, "logps/rejected": -38.4012565612793, "loss": 0.6489, "losses/dpo": 0.5676543116569519, "losses/sft": 0.8778327107429504, "losses/total": 0.5676543116569519, "ref_logps/chosen": -19.484699249267578, "ref_logps/rejected": -27.839187622070312, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8067688941955566, "rewards/margins": 0.24943795800209045, "rewards/rejected": -1.0562069416046143, "step": 938 }, { "epoch": 0.89, "grad_norm": 27.26265633200948, "learning_rate": 4.133474395451032e-07, "logps/chosen": -47.102108001708984, "logps/rejected": -62.25968933105469, "loss": 0.5554, "losses/dpo": 0.24964337050914764, "losses/sft": 1.1658868789672852, "losses/total": 0.24964337050914764, "ref_logps/chosen": -34.893028259277344, "ref_logps/rejected": -43.97866439819336, "rewards/accuracies": 0.75, "rewards/chosen": -1.2209080457687378, "rewards/margins": 0.607194185256958, "rewards/rejected": -1.8281021118164062, "step": 939 }, { "epoch": 0.89, "grad_norm": 20.409911417285727, "learning_rate": 4.131545651250532e-07, "logps/chosen": -33.39942169189453, "logps/rejected": -50.12690353393555, "loss": 0.4873, "losses/dpo": 0.2818277180194855, "losses/sft": 1.163731336593628, "losses/total": 0.2818277180194855, "ref_logps/chosen": -22.998645782470703, "ref_logps/rejected": -33.234153747558594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0400779247283936, "rewards/margins": 0.6491969227790833, "rewards/rejected": -1.6892749071121216, "step": 940 }, { "epoch": 0.89, "grad_norm": 22.879328238992745, "learning_rate": 4.1296152139984145e-07, "logps/chosen": -42.4998779296875, "logps/rejected": -59.99407196044922, "loss": 0.4579, "losses/dpo": 0.7276193499565125, "losses/sft": 0.5507347583770752, "losses/total": 0.7276193499565125, "ref_logps/chosen": -31.505016326904297, "ref_logps/rejected": -41.084754943847656, "rewards/accuracies": 0.6875, "rewards/chosen": -1.099485993385315, "rewards/margins": 0.7914453744888306, "rewards/rejected": -1.8909313678741455, "step": 941 }, { "epoch": 0.89, "grad_norm": 22.065060785520135, "learning_rate": 4.1276830856978907e-07, "logps/chosen": -38.23424530029297, "logps/rejected": -53.54481506347656, "loss": 0.5199, "losses/dpo": 0.4069402813911438, "losses/sft": 0.9586144685745239, "losses/total": 0.4069402813911438, "ref_logps/chosen": -26.189794540405273, "ref_logps/rejected": -34.601959228515625, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2044451236724854, "rewards/margins": 0.68984055519104, "rewards/rejected": -1.8942856788635254, "step": 942 }, { "epoch": 0.89, "grad_norm": 27.096353796383955, "learning_rate": 4.125749268353925e-07, "logps/chosen": -43.002601623535156, "logps/rejected": -51.36360168457031, "loss": 0.6942, "losses/dpo": 1.0688354969024658, "losses/sft": 1.0040515661239624, "losses/total": 1.0688354969024658, "ref_logps/chosen": -29.006195068359375, "ref_logps/rejected": -34.05558395385742, "rewards/accuracies": 0.6875, "rewards/chosen": -1.399640679359436, "rewards/margins": 0.3311614692211151, "rewards/rejected": -1.7308021783828735, "step": 943 }, { "epoch": 0.89, "grad_norm": 17.151233371600284, "learning_rate": 4.123813763973237e-07, "logps/chosen": -32.12122344970703, "logps/rejected": -51.037864685058594, "loss": 0.371, "losses/dpo": 0.44943976402282715, "losses/sft": 0.6698719263076782, "losses/total": 0.44943976402282715, "ref_logps/chosen": -22.959924697875977, "ref_logps/rejected": -30.344282150268555, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9161297678947449, "rewards/margins": 1.1532281637191772, "rewards/rejected": -2.0693578720092773, "step": 944 }, { "epoch": 0.89, "grad_norm": 18.915128810345, "learning_rate": 4.1218765745642943e-07, "logps/chosen": -46.362266540527344, "logps/rejected": -61.30126953125, "loss": 0.3489, "losses/dpo": 0.2665194571018219, "losses/sft": 0.7454931735992432, "losses/total": 0.2665194571018219, "ref_logps/chosen": -35.63953399658203, "ref_logps/rejected": -37.41960906982422, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0722736120224, "rewards/margins": 1.3158924579620361, "rewards/rejected": -2.3881659507751465, "step": 945 }, { "epoch": 0.89, "grad_norm": 30.431112020968488, "learning_rate": 4.119937702137315e-07, "logps/chosen": -55.10050964355469, "logps/rejected": -56.323768615722656, "loss": 0.6864, "losses/dpo": 0.05825190618634224, "losses/sft": 0.9805903434753418, "losses/total": 0.05825190618634224, "ref_logps/chosen": -38.9305419921875, "ref_logps/rejected": -37.0120849609375, "rewards/accuracies": 0.625, "rewards/chosen": -1.6169965267181396, "rewards/margins": 0.3141716718673706, "rewards/rejected": -1.9311683177947998, "step": 946 }, { "epoch": 0.89, "grad_norm": 28.460300309390384, "learning_rate": 4.117997148704263e-07, "logps/chosen": -51.590450286865234, "logps/rejected": -52.965240478515625, "loss": 0.5853, "losses/dpo": 0.30931028723716736, "losses/sft": 0.575111448764801, "losses/total": 0.30931028723716736, "ref_logps/chosen": -38.308349609375, "ref_logps/rejected": -32.67570495605469, "rewards/accuracies": 0.6875, "rewards/chosen": -1.328209638595581, "rewards/margins": 0.7007438540458679, "rewards/rejected": -2.0289535522460938, "step": 947 }, { "epoch": 0.89, "grad_norm": 23.765590800337456, "learning_rate": 4.116054916278848e-07, "logps/chosen": -39.64250946044922, "logps/rejected": -47.443115234375, "loss": 0.5439, "losses/dpo": 0.6618918776512146, "losses/sft": 1.5694547891616821, "losses/total": 0.6618918776512146, "ref_logps/chosen": -28.88414764404297, "ref_logps/rejected": -31.335552215576172, "rewards/accuracies": 0.75, "rewards/chosen": -1.0758363008499146, "rewards/margins": 0.5349198579788208, "rewards/rejected": -1.6107561588287354, "step": 948 }, { "epoch": 0.9, "grad_norm": 20.424450421805023, "learning_rate": 4.114111006876518e-07, "logps/chosen": -36.64237594604492, "logps/rejected": -47.12493133544922, "loss": 0.4308, "losses/dpo": 0.5104063749313354, "losses/sft": 0.24806171655654907, "losses/total": 0.5104063749313354, "ref_logps/chosen": -28.47429656982422, "ref_logps/rejected": -31.90326499938965, "rewards/accuracies": 0.875, "rewards/chosen": -0.8168081641197205, "rewards/margins": 0.705358624458313, "rewards/rejected": -1.5221667289733887, "step": 949 }, { "epoch": 0.9, "grad_norm": 19.746244456312898, "learning_rate": 4.1121654225144666e-07, "logps/chosen": -32.590179443359375, "logps/rejected": -51.37494659423828, "loss": 0.3773, "losses/dpo": 1.5521235466003418, "losses/sft": 1.5323375463485718, "losses/total": 1.5521235466003418, "ref_logps/chosen": -24.619604110717773, "ref_logps/rejected": -31.256420135498047, "rewards/accuracies": 0.875, "rewards/chosen": -0.7970576286315918, "rewards/margins": 1.2147953510284424, "rewards/rejected": -2.011852979660034, "step": 950 }, { "epoch": 0.9, "grad_norm": 18.842547253217475, "learning_rate": 4.110218165211621e-07, "logps/chosen": -37.189918518066406, "logps/rejected": -52.492557525634766, "loss": 0.4448, "losses/dpo": 0.19946394860744476, "losses/sft": 1.0891224145889282, "losses/total": 0.19946394860744476, "ref_logps/chosen": -28.227340698242188, "ref_logps/rejected": -35.375343322753906, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8962579965591431, "rewards/margins": 0.8154633045196533, "rewards/rejected": -1.7117211818695068, "step": 951 }, { "epoch": 0.9, "grad_norm": 22.124104064870195, "learning_rate": 4.1082692369886474e-07, "logps/chosen": -39.60765838623047, "logps/rejected": -39.8839225769043, "loss": 0.6695, "losses/dpo": 0.6484159231185913, "losses/sft": 1.9475579261779785, "losses/total": 0.6484159231185913, "ref_logps/chosen": -29.052398681640625, "ref_logps/rejected": -27.085844039916992, "rewards/accuracies": 0.5625, "rewards/chosen": -1.055525779724121, "rewards/margins": 0.22428196668624878, "rewards/rejected": -1.2798078060150146, "step": 952 }, { "epoch": 0.9, "grad_norm": 30.654770567931347, "learning_rate": 4.106318639867943e-07, "logps/chosen": -37.44842529296875, "logps/rejected": -48.44736862182617, "loss": 0.6249, "losses/dpo": 0.29709115624427795, "losses/sft": 1.0591084957122803, "losses/total": 0.29709115624427795, "ref_logps/chosen": -26.765769958496094, "ref_logps/rejected": -32.277523040771484, "rewards/accuracies": 0.75, "rewards/chosen": -1.0682653188705444, "rewards/margins": 0.5487193465232849, "rewards/rejected": -1.6169846057891846, "step": 953 }, { "epoch": 0.9, "grad_norm": 31.0074287299833, "learning_rate": 4.10436637587364e-07, "logps/chosen": -40.96473693847656, "logps/rejected": -39.593902587890625, "loss": 0.7967, "losses/dpo": 0.5278167724609375, "losses/sft": 0.42939168214797974, "losses/total": 0.5278167724609375, "ref_logps/chosen": -28.345829010009766, "ref_logps/rejected": -26.48255157470703, "rewards/accuracies": 0.4375, "rewards/chosen": -1.2618907690048218, "rewards/margins": 0.04924415051937103, "rewards/rejected": -1.311134934425354, "step": 954 }, { "epoch": 0.9, "grad_norm": 23.9967906050299, "learning_rate": 4.102412447031598e-07, "logps/chosen": -34.390953063964844, "logps/rejected": -43.49022674560547, "loss": 0.6751, "losses/dpo": 0.18967188894748688, "losses/sft": 1.004607081413269, "losses/total": 0.18967188894748688, "ref_logps/chosen": -24.329885482788086, "ref_logps/rejected": -30.013843536376953, "rewards/accuracies": 0.5, "rewards/chosen": -1.006106972694397, "rewards/margins": 0.3415314853191376, "rewards/rejected": -1.347638487815857, "step": 955 }, { "epoch": 0.9, "grad_norm": 21.209132329179756, "learning_rate": 4.100456855369405e-07, "logps/chosen": -42.68427658081055, "logps/rejected": -58.93936538696289, "loss": 0.4397, "losses/dpo": 0.4105394780635834, "losses/sft": 2.217130184173584, "losses/total": 0.4105394780635834, "ref_logps/chosen": -32.10963821411133, "ref_logps/rejected": -39.74237823486328, "rewards/accuracies": 0.75, "rewards/chosen": -1.0574637651443481, "rewards/margins": 0.8622349500656128, "rewards/rejected": -1.919698715209961, "step": 956 }, { "epoch": 0.9, "grad_norm": 27.26125743591905, "learning_rate": 4.0984996029163757e-07, "logps/chosen": -34.47675704956055, "logps/rejected": -38.14021301269531, "loss": 0.7387, "losses/dpo": 0.3229867219924927, "losses/sft": 0.668172299861908, "losses/total": 0.3229867219924927, "ref_logps/chosen": -24.05929183959961, "ref_logps/rejected": -26.483633041381836, "rewards/accuracies": 0.5, "rewards/chosen": -1.0417463779449463, "rewards/margins": 0.12391176074743271, "rewards/rejected": -1.1656582355499268, "step": 957 }, { "epoch": 0.9, "grad_norm": 30.219191907806707, "learning_rate": 4.096540691703545e-07, "logps/chosen": -49.04740905761719, "logps/rejected": -60.62848663330078, "loss": 0.6939, "losses/dpo": 0.07222314924001694, "losses/sft": 0.9844986796379089, "losses/total": 0.07222314924001694, "ref_logps/chosen": -36.246192932128906, "ref_logps/rejected": -42.481849670410156, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2801218032836914, "rewards/margins": 0.534541666507721, "rewards/rejected": -1.8146635293960571, "step": 958 }, { "epoch": 0.9, "grad_norm": 25.042095233083053, "learning_rate": 4.094580123763672e-07, "logps/chosen": -34.09675216674805, "logps/rejected": -33.04334259033203, "loss": 0.6935, "losses/dpo": 2.321268081665039, "losses/sft": 1.3246458768844604, "losses/total": 2.321268081665039, "ref_logps/chosen": -24.71842384338379, "ref_logps/rejected": -19.325790405273438, "rewards/accuracies": 0.75, "rewards/chosen": -0.937832772731781, "rewards/margins": 0.4339222311973572, "rewards/rejected": -1.3717550039291382, "step": 959 }, { "epoch": 0.91, "grad_norm": 24.220281821602562, "learning_rate": 4.0926179011312346e-07, "logps/chosen": -45.240413665771484, "logps/rejected": -49.03614807128906, "loss": 0.4818, "losses/dpo": 0.4483388066291809, "losses/sft": 1.0254427194595337, "losses/total": 0.4483388066291809, "ref_logps/chosen": -33.312782287597656, "ref_logps/rejected": -30.057186126708984, "rewards/accuracies": 0.8125, "rewards/chosen": -1.192763090133667, "rewards/margins": 0.7051330804824829, "rewards/rejected": -1.8978960514068604, "step": 960 }, { "epoch": 0.91, "grad_norm": 27.28656665580076, "learning_rate": 4.090654025842427e-07, "logps/chosen": -36.368896484375, "logps/rejected": -46.89258575439453, "loss": 0.6717, "losses/dpo": 1.1607427597045898, "losses/sft": 1.1011731624603271, "losses/total": 1.1607427597045898, "ref_logps/chosen": -22.88170623779297, "ref_logps/rejected": -28.813852310180664, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3487193584442139, "rewards/margins": 0.4591540992259979, "rewards/rejected": -1.8078734874725342, "step": 961 }, { "epoch": 0.91, "grad_norm": 18.896708204738044, "learning_rate": 4.0886884999351585e-07, "logps/chosen": -38.43382263183594, "logps/rejected": -50.486656188964844, "loss": 0.5027, "losses/dpo": 0.7835617065429688, "losses/sft": 1.9915879964828491, "losses/total": 0.7835617065429688, "ref_logps/chosen": -26.343469619750977, "ref_logps/rejected": -31.90937042236328, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2090353965759277, "rewards/margins": 0.6486935615539551, "rewards/rejected": -1.8577288389205933, "step": 962 }, { "epoch": 0.91, "grad_norm": 23.59092058289499, "learning_rate": 4.0867213254490505e-07, "logps/chosen": -30.900407791137695, "logps/rejected": -61.05036926269531, "loss": 0.4533, "losses/dpo": 0.706386387348175, "losses/sft": 0.9897667169570923, "losses/total": 0.706386387348175, "ref_logps/chosen": -20.993370056152344, "ref_logps/rejected": -43.396636962890625, "rewards/accuracies": 0.875, "rewards/chosen": -0.9907039403915405, "rewards/margins": 0.7746692299842834, "rewards/rejected": -1.7653732299804688, "step": 963 }, { "epoch": 0.91, "grad_norm": 28.253909609690812, "learning_rate": 4.0847525044254384e-07, "logps/chosen": -38.37682342529297, "logps/rejected": -34.442474365234375, "loss": 0.7424, "losses/dpo": 0.15847434103488922, "losses/sft": 1.013485312461853, "losses/total": 0.15847434103488922, "ref_logps/chosen": -26.901519775390625, "ref_logps/rejected": -21.51749038696289, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1475303173065186, "rewards/margins": 0.1449681520462036, "rewards/rejected": -1.2924983501434326, "step": 964 }, { "epoch": 0.91, "grad_norm": 24.595751466556578, "learning_rate": 4.082782038907361e-07, "logps/chosen": -33.562076568603516, "logps/rejected": -49.32318115234375, "loss": 0.4752, "losses/dpo": 0.3901654779911041, "losses/sft": 0.8778814077377319, "losses/total": 0.3901654779911041, "ref_logps/chosen": -24.126752853393555, "ref_logps/rejected": -31.07928466796875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9435324668884277, "rewards/margins": 0.8808570504188538, "rewards/rejected": -1.8243894577026367, "step": 965 }, { "epoch": 0.91, "grad_norm": 21.476182201915616, "learning_rate": 4.0808099309395675e-07, "logps/chosen": -34.322715759277344, "logps/rejected": -45.85235595703125, "loss": 0.5419, "losses/dpo": 0.5955326557159424, "losses/sft": 0.8122118711471558, "losses/total": 0.5955326557159424, "ref_logps/chosen": -23.015098571777344, "ref_logps/rejected": -29.153017044067383, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1307618618011475, "rewards/margins": 0.5391722917556763, "rewards/rejected": -1.6699342727661133, "step": 966 }, { "epoch": 0.91, "grad_norm": 24.75390694918523, "learning_rate": 4.078836182568511e-07, "logps/chosen": -36.259525299072266, "logps/rejected": -50.369415283203125, "loss": 0.5233, "losses/dpo": 0.40763530135154724, "losses/sft": 1.6705811023712158, "losses/total": 0.40763530135154724, "ref_logps/chosen": -26.25718879699707, "ref_logps/rejected": -32.144248962402344, "rewards/accuracies": 0.75, "rewards/chosen": -1.0002340078353882, "rewards/margins": 0.8222827911376953, "rewards/rejected": -1.822516679763794, "step": 967 }, { "epoch": 0.91, "grad_norm": 28.097635177121486, "learning_rate": 4.076860795842345e-07, "logps/chosen": -49.71784591674805, "logps/rejected": -49.030792236328125, "loss": 0.6643, "losses/dpo": 0.6246939897537231, "losses/sft": 1.0372647047042847, "losses/total": 0.6246939897537231, "ref_logps/chosen": -36.09700012207031, "ref_logps/rejected": -32.444068908691406, "rewards/accuracies": 0.75, "rewards/chosen": -1.3620846271514893, "rewards/margins": 0.29658737778663635, "rewards/rejected": -1.6586718559265137, "step": 968 }, { "epoch": 0.91, "grad_norm": 24.573716930196582, "learning_rate": 4.074883772810926e-07, "logps/chosen": -42.480018615722656, "logps/rejected": -55.04345703125, "loss": 0.4933, "losses/dpo": 0.8237965703010559, "losses/sft": 1.3406672477722168, "losses/total": 0.8237965703010559, "ref_logps/chosen": -29.139511108398438, "ref_logps/rejected": -33.257659912109375, "rewards/accuracies": 0.625, "rewards/chosen": -1.3340506553649902, "rewards/margins": 0.8445292711257935, "rewards/rejected": -2.178579807281494, "step": 969 }, { "epoch": 0.92, "grad_norm": 23.128011036100165, "learning_rate": 4.0729051155258046e-07, "logps/chosen": -39.19987869262695, "logps/rejected": -63.086692810058594, "loss": 0.5392, "losses/dpo": 0.09225346893072128, "losses/sft": 1.3230981826782227, "losses/total": 0.09225346893072128, "ref_logps/chosen": -25.533851623535156, "ref_logps/rejected": -42.48691177368164, "rewards/accuracies": 0.6875, "rewards/chosen": -1.366602897644043, "rewards/margins": 0.6933755874633789, "rewards/rejected": -2.059978485107422, "step": 970 }, { "epoch": 0.92, "grad_norm": 26.76592896576607, "learning_rate": 4.070924826040231e-07, "logps/chosen": -37.07754898071289, "logps/rejected": -47.904964447021484, "loss": 0.6176, "losses/dpo": 0.22553053498268127, "losses/sft": 0.514618456363678, "losses/total": 0.22553053498268127, "ref_logps/chosen": -28.699478149414062, "ref_logps/rejected": -33.298519134521484, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8378071784973145, "rewards/margins": 0.6228376626968384, "rewards/rejected": -1.4606447219848633, "step": 971 }, { "epoch": 0.92, "grad_norm": 25.452264938490845, "learning_rate": 4.0689429064091474e-07, "logps/chosen": -41.313011169433594, "logps/rejected": -49.577064514160156, "loss": 0.5494, "losses/dpo": 0.2794289290904999, "losses/sft": 1.4381513595581055, "losses/total": 0.2794289290904999, "ref_logps/chosen": -30.086538314819336, "ref_logps/rejected": -32.28352737426758, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1226469278335571, "rewards/margins": 0.6067068576812744, "rewards/rejected": -1.7293537855148315, "step": 972 }, { "epoch": 0.92, "grad_norm": 34.13962150737034, "learning_rate": 4.066959358689187e-07, "logps/chosen": -50.42010498046875, "logps/rejected": -46.42280960083008, "loss": 0.8672, "losses/dpo": 1.1204185485839844, "losses/sft": 1.4064680337905884, "losses/total": 1.1204185485839844, "ref_logps/chosen": -35.85338592529297, "ref_logps/rejected": -32.94068908691406, "rewards/accuracies": 0.4375, "rewards/chosen": -1.456672191619873, "rewards/margins": -0.10846063494682312, "rewards/rejected": -1.3482115268707275, "step": 973 }, { "epoch": 0.92, "grad_norm": 19.20013108433908, "learning_rate": 4.0649741849386743e-07, "logps/chosen": -34.16221618652344, "logps/rejected": -57.746437072753906, "loss": 0.3702, "losses/dpo": 0.08797335624694824, "losses/sft": 0.6001022458076477, "losses/total": 0.08797335624694824, "ref_logps/chosen": -26.330663681030273, "ref_logps/rejected": -38.53510284423828, "rewards/accuracies": 0.875, "rewards/chosen": -0.7831549644470215, "rewards/margins": 1.1379785537719727, "rewards/rejected": -1.9211335182189941, "step": 974 }, { "epoch": 0.92, "grad_norm": 26.188997329829796, "learning_rate": 4.062987387217619e-07, "logps/chosen": -33.89508819580078, "logps/rejected": -46.88935089111328, "loss": 0.5848, "losses/dpo": 0.5017450451850891, "losses/sft": 1.5105183124542236, "losses/total": 0.5017450451850891, "ref_logps/chosen": -22.7584285736084, "ref_logps/rejected": -30.749298095703125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1136659383773804, "rewards/margins": 0.5003396272659302, "rewards/rejected": -1.6140055656433105, "step": 975 }, { "epoch": 0.92, "grad_norm": 20.65301359396536, "learning_rate": 4.060998967587718e-07, "logps/chosen": -35.200111389160156, "logps/rejected": -50.98758316040039, "loss": 0.4425, "losses/dpo": 0.07735816389322281, "losses/sft": 0.4294287860393524, "losses/total": 0.07735816389322281, "ref_logps/chosen": -24.71404266357422, "ref_logps/rejected": -32.60467529296875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0486066341400146, "rewards/margins": 0.7896842360496521, "rewards/rejected": -1.8382909297943115, "step": 976 }, { "epoch": 0.92, "grad_norm": 23.740628268056103, "learning_rate": 4.05900892811235e-07, "logps/chosen": -38.86762237548828, "logps/rejected": -65.00733184814453, "loss": 0.506, "losses/dpo": 0.18625126779079437, "losses/sft": 0.36629176139831543, "losses/total": 0.18625126779079437, "ref_logps/chosen": -27.494983673095703, "ref_logps/rejected": -43.85276794433594, "rewards/accuracies": 0.75, "rewards/chosen": -1.137263536453247, "rewards/margins": 0.9781923890113831, "rewards/rejected": -2.1154558658599854, "step": 977 }, { "epoch": 0.92, "grad_norm": 25.808250609652085, "learning_rate": 4.0570172708565753e-07, "logps/chosen": -32.73719787597656, "logps/rejected": -55.95555877685547, "loss": 0.6409, "losses/dpo": 0.19814392924308777, "losses/sft": 0.8002946376800537, "losses/total": 0.19814392924308777, "ref_logps/chosen": -22.79299545288086, "ref_logps/rejected": -42.247901916503906, "rewards/accuracies": 0.625, "rewards/chosen": -0.9944203495979309, "rewards/margins": 0.37634533643722534, "rewards/rejected": -1.3707656860351562, "step": 978 }, { "epoch": 0.92, "grad_norm": 24.06583763785966, "learning_rate": 4.0550239978871313e-07, "logps/chosen": -38.99336242675781, "logps/rejected": -51.01910400390625, "loss": 0.5762, "losses/dpo": 0.2684239447116852, "losses/sft": 0.9388821721076965, "losses/total": 0.2684239447116852, "ref_logps/chosen": -28.559389114379883, "ref_logps/rejected": -34.06928253173828, "rewards/accuracies": 0.75, "rewards/chosen": -1.0433971881866455, "rewards/margins": 0.6515847444534302, "rewards/rejected": -1.6949820518493652, "step": 979 }, { "epoch": 0.92, "grad_norm": 22.067005813325252, "learning_rate": 4.0530291112724344e-07, "logps/chosen": -42.85731887817383, "logps/rejected": -56.52409362792969, "loss": 0.4122, "losses/dpo": 0.7063273787498474, "losses/sft": 0.27006223797798157, "losses/total": 0.7063273787498474, "ref_logps/chosen": -31.624221801757812, "ref_logps/rejected": -35.05857467651367, "rewards/accuracies": 0.8125, "rewards/chosen": -1.123309850692749, "rewards/margins": 1.023242473602295, "rewards/rejected": -2.146552324295044, "step": 980 }, { "epoch": 0.93, "grad_norm": 24.445507522313015, "learning_rate": 4.0510326130825744e-07, "logps/chosen": -41.94757080078125, "logps/rejected": -47.99468231201172, "loss": 0.4985, "losses/dpo": 0.24275439977645874, "losses/sft": 1.8180056810379028, "losses/total": 0.24275439977645874, "ref_logps/chosen": -31.535484313964844, "ref_logps/rejected": -30.66009521484375, "rewards/accuracies": 0.75, "rewards/chosen": -1.0412089824676514, "rewards/margins": 0.6922494769096375, "rewards/rejected": -1.733458399772644, "step": 981 }, { "epoch": 0.93, "grad_norm": 27.433969867284382, "learning_rate": 4.049034505389314e-07, "logps/chosen": -45.22052001953125, "logps/rejected": -62.472923278808594, "loss": 0.5593, "losses/dpo": 1.2286272048950195, "losses/sft": 1.839874505996704, "losses/total": 1.2286272048950195, "ref_logps/chosen": -31.589874267578125, "ref_logps/rejected": -42.197235107421875, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3630645275115967, "rewards/margins": 0.6645039319992065, "rewards/rejected": -2.0275685787200928, "step": 982 }, { "epoch": 0.93, "grad_norm": 17.228304412907367, "learning_rate": 4.047034790266084e-07, "logps/chosen": -32.60816192626953, "logps/rejected": -47.907066345214844, "loss": 0.4517, "losses/dpo": 0.6266508102416992, "losses/sft": 1.0748611688613892, "losses/total": 0.6266508102416992, "ref_logps/chosen": -24.581335067749023, "ref_logps/rejected": -31.070087432861328, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8026827573776245, "rewards/margins": 0.881015419960022, "rewards/rejected": -1.6836981773376465, "step": 983 }, { "epoch": 0.93, "grad_norm": 18.73785120736326, "learning_rate": 4.045033469787985e-07, "logps/chosen": -36.096370697021484, "logps/rejected": -59.733612060546875, "loss": 0.32, "losses/dpo": 0.6229232549667358, "losses/sft": 1.6419044733047485, "losses/total": 0.6229232549667358, "ref_logps/chosen": -26.60003662109375, "ref_logps/rejected": -38.140010833740234, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9496331214904785, "rewards/margins": 1.209727168083191, "rewards/rejected": -2.15936017036438, "step": 984 }, { "epoch": 0.93, "grad_norm": 19.112629676840566, "learning_rate": 4.0430305460317846e-07, "logps/chosen": -39.98643112182617, "logps/rejected": -50.35930633544922, "loss": 0.4245, "losses/dpo": 0.48485368490219116, "losses/sft": 0.5731634497642517, "losses/total": 0.48485368490219116, "ref_logps/chosen": -28.448583602905273, "ref_logps/rejected": -28.259279251098633, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1537847518920898, "rewards/margins": 1.0562180280685425, "rewards/rejected": -2.210002899169922, "step": 985 }, { "epoch": 0.93, "grad_norm": 21.122962795945913, "learning_rate": 4.0410260210759117e-07, "logps/chosen": -27.84048080444336, "logps/rejected": -46.20421600341797, "loss": 0.5089, "losses/dpo": 0.8634471893310547, "losses/sft": 1.0503407716751099, "losses/total": 0.8634471893310547, "ref_logps/chosen": -19.59548568725586, "ref_logps/rejected": -31.975753784179688, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8244994878768921, "rewards/margins": 0.5983467102050781, "rewards/rejected": -1.4228460788726807, "step": 986 }, { "epoch": 0.93, "grad_norm": 25.316822016964338, "learning_rate": 4.0390198970004576e-07, "logps/chosen": -35.25215148925781, "logps/rejected": -50.55662536621094, "loss": 0.5119, "losses/dpo": 0.4168107509613037, "losses/sft": 1.846895456314087, "losses/total": 0.4168107509613037, "ref_logps/chosen": -25.799970626831055, "ref_logps/rejected": -34.815433502197266, "rewards/accuracies": 0.75, "rewards/chosen": -0.9452182054519653, "rewards/margins": 0.6289011240005493, "rewards/rejected": -1.574119210243225, "step": 987 }, { "epoch": 0.93, "grad_norm": 24.487130188689196, "learning_rate": 4.0370121758871735e-07, "logps/chosen": -37.78101348876953, "logps/rejected": -51.854679107666016, "loss": 0.4882, "losses/dpo": 0.2551235556602478, "losses/sft": 0.7892552018165588, "losses/total": 0.2551235556602478, "ref_logps/chosen": -28.175891876220703, "ref_logps/rejected": -35.3887939453125, "rewards/accuracies": 0.875, "rewards/chosen": -0.9605122804641724, "rewards/margins": 0.6860761642456055, "rewards/rejected": -1.6465884447097778, "step": 988 }, { "epoch": 0.93, "grad_norm": 18.5033232313906, "learning_rate": 4.035002859819467e-07, "logps/chosen": -33.05633544921875, "logps/rejected": -61.904579162597656, "loss": 0.3688, "losses/dpo": 0.723024308681488, "losses/sft": 1.7772815227508545, "losses/total": 0.723024308681488, "ref_logps/chosen": -23.704181671142578, "ref_logps/rejected": -37.59575653076172, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9352152347564697, "rewards/margins": 1.495666742324829, "rewards/rejected": -2.430881977081299, "step": 989 }, { "epoch": 0.93, "grad_norm": 23.121381277017996, "learning_rate": 4.032991950882403e-07, "logps/chosen": -43.812889099121094, "logps/rejected": -61.442771911621094, "loss": 0.4249, "losses/dpo": 1.2560138702392578, "losses/sft": 2.1441802978515625, "losses/total": 1.2560138702392578, "ref_logps/chosen": -30.75006103515625, "ref_logps/rejected": -41.02832794189453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3062825202941895, "rewards/margins": 0.7351620197296143, "rewards/rejected": -2.0414445400238037, "step": 990 }, { "epoch": 0.93, "grad_norm": 28.395701894649626, "learning_rate": 4.030979451162695e-07, "logps/chosen": -46.49436950683594, "logps/rejected": -39.80975341796875, "loss": 0.6171, "losses/dpo": 1.154015302658081, "losses/sft": 1.9275577068328857, "losses/total": 1.154015302658081, "ref_logps/chosen": -33.13420104980469, "ref_logps/rejected": -24.099079132080078, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3360165357589722, "rewards/margins": 0.2350504994392395, "rewards/rejected": -1.5710670948028564, "step": 991 }, { "epoch": 0.94, "grad_norm": 29.188041923470117, "learning_rate": 4.028965362748713e-07, "logps/chosen": -51.00653076171875, "logps/rejected": -63.400447845458984, "loss": 0.5058, "losses/dpo": 1.1077628135681152, "losses/sft": 2.8163421154022217, "losses/total": 1.1077628135681152, "ref_logps/chosen": -37.4287109375, "ref_logps/rejected": -42.334041595458984, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3577818870544434, "rewards/margins": 0.7488589286804199, "rewards/rejected": -2.1066408157348633, "step": 992 }, { "epoch": 0.94, "grad_norm": 16.730796031982308, "learning_rate": 4.02694968773047e-07, "logps/chosen": -24.482059478759766, "logps/rejected": -39.199462890625, "loss": 0.4718, "losses/dpo": 0.5782393217086792, "losses/sft": 0.14753539860248566, "losses/total": 0.5782393217086792, "ref_logps/chosen": -17.723363876342773, "ref_logps/rejected": -25.27458381652832, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6758695840835571, "rewards/margins": 0.7166181206703186, "rewards/rejected": -1.3924877643585205, "step": 993 }, { "epoch": 0.94, "grad_norm": 15.541225785106743, "learning_rate": 4.024932428199629e-07, "logps/chosen": -44.73456954956055, "logps/rejected": -60.0916633605957, "loss": 0.3327, "losses/dpo": 0.130669504404068, "losses/sft": 0.7980248332023621, "losses/total": 0.130669504404068, "ref_logps/chosen": -34.874969482421875, "ref_logps/rejected": -35.59234619140625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9859601259231567, "rewards/margins": 1.463971495628357, "rewards/rejected": -2.4499316215515137, "step": 994 }, { "epoch": 0.94, "grad_norm": 19.36175105253885, "learning_rate": 4.022913586249496e-07, "logps/chosen": -33.327613830566406, "logps/rejected": -49.69723892211914, "loss": 0.4703, "losses/dpo": 0.10184983164072037, "losses/sft": 1.3358538150787354, "losses/total": 0.10184983164072037, "ref_logps/chosen": -22.3563232421875, "ref_logps/rejected": -31.69628143310547, "rewards/accuracies": 0.8125, "rewards/chosen": -1.097128987312317, "rewards/margins": 0.7029666304588318, "rewards/rejected": -1.8000956773757935, "step": 995 }, { "epoch": 0.94, "grad_norm": 22.433857269739804, "learning_rate": 4.020893163975018e-07, "logps/chosen": -36.17875671386719, "logps/rejected": -49.34565734863281, "loss": 0.5623, "losses/dpo": 0.6116194725036621, "losses/sft": 1.0737508535385132, "losses/total": 0.6116194725036621, "ref_logps/chosen": -24.795454025268555, "ref_logps/rejected": -33.16874694824219, "rewards/accuracies": 0.75, "rewards/chosen": -1.1383302211761475, "rewards/margins": 0.4793607294559479, "rewards/rejected": -1.617690920829773, "step": 996 }, { "epoch": 0.94, "grad_norm": 23.25263700822142, "learning_rate": 4.0188711634727845e-07, "logps/chosen": -32.04694366455078, "logps/rejected": -39.556854248046875, "loss": 0.613, "losses/dpo": 0.9376345872879028, "losses/sft": 0.6084400415420532, "losses/total": 0.9376345872879028, "ref_logps/chosen": -21.26218032836914, "ref_logps/rejected": -25.59914779663086, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0784763097763062, "rewards/margins": 0.31729409098625183, "rewards/rejected": -1.3957704305648804, "step": 997 }, { "epoch": 0.94, "grad_norm": 18.426823594761306, "learning_rate": 4.0168475868410217e-07, "logps/chosen": -30.696327209472656, "logps/rejected": -61.11962890625, "loss": 0.4651, "losses/dpo": 0.8437910676002502, "losses/sft": 0.6919924020767212, "losses/total": 0.8437910676002502, "ref_logps/chosen": -20.460323333740234, "ref_logps/rejected": -41.978904724121094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0236003398895264, "rewards/margins": 0.8904717564582825, "rewards/rejected": -1.914072036743164, "step": 998 }, { "epoch": 0.94, "grad_norm": 28.853258310053132, "learning_rate": 4.0148224361795903e-07, "logps/chosen": -36.79867172241211, "logps/rejected": -46.769386291503906, "loss": 0.675, "losses/dpo": 0.8463704586029053, "losses/sft": 1.5406107902526855, "losses/total": 0.8463704586029053, "ref_logps/chosen": -25.650943756103516, "ref_logps/rejected": -32.39238739013672, "rewards/accuracies": 0.625, "rewards/chosen": -1.1147732734680176, "rewards/margins": 0.3229266405105591, "rewards/rejected": -1.4377000331878662, "step": 999 }, { "epoch": 0.94, "grad_norm": 17.938631571424203, "learning_rate": 4.012795713589984e-07, "logps/chosen": -28.109460830688477, "logps/rejected": -60.697235107421875, "loss": 0.3111, "losses/dpo": 0.02374754659831524, "losses/sft": 1.0612306594848633, "losses/total": 0.02374754659831524, "ref_logps/chosen": -18.36028289794922, "ref_logps/rejected": -34.9582633972168, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9749178886413574, "rewards/margins": 1.5989797115325928, "rewards/rejected": -2.573897361755371, "step": 1000 }, { "epoch": 0.94, "grad_norm": 22.970670898713443, "learning_rate": 4.0107674211753305e-07, "logps/chosen": -36.94134521484375, "logps/rejected": -53.56560516357422, "loss": 0.5022, "losses/dpo": 0.587539553642273, "losses/sft": 1.4679391384124756, "losses/total": 0.587539553642273, "ref_logps/chosen": -24.965801239013672, "ref_logps/rejected": -33.03172302246094, "rewards/accuracies": 0.75, "rewards/chosen": -1.197554111480713, "rewards/margins": 0.8558339476585388, "rewards/rejected": -2.0533881187438965, "step": 1001 }, { "epoch": 0.95, "grad_norm": 26.47993477500064, "learning_rate": 4.0087375610403834e-07, "logps/chosen": -35.78563690185547, "logps/rejected": -47.87857437133789, "loss": 0.5893, "losses/dpo": 0.2598097622394562, "losses/sft": 0.8641888499259949, "losses/total": 0.2598097622394562, "ref_logps/chosen": -23.96908950805664, "ref_logps/rejected": -30.619556427001953, "rewards/accuracies": 0.625, "rewards/chosen": -1.1816545724868774, "rewards/margins": 0.5442470908164978, "rewards/rejected": -1.7259016036987305, "step": 1002 }, { "epoch": 0.95, "grad_norm": 29.44034036931007, "learning_rate": 4.0067061352915255e-07, "logps/chosen": -43.72096252441406, "logps/rejected": -49.40547561645508, "loss": 0.7461, "losses/dpo": 2.0286412239074707, "losses/sft": 1.7572665214538574, "losses/total": 2.0286412239074707, "ref_logps/chosen": -29.278432846069336, "ref_logps/rejected": -32.910884857177734, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4442532062530518, "rewards/margins": 0.20520581305027008, "rewards/rejected": -1.6494591236114502, "step": 1003 }, { "epoch": 0.95, "grad_norm": 32.229228621869424, "learning_rate": 4.0046731460367624e-07, "logps/chosen": -46.41156768798828, "logps/rejected": -56.6599006652832, "loss": 0.7757, "losses/dpo": 0.19635723531246185, "losses/sft": 2.205096960067749, "losses/total": 0.19635723531246185, "ref_logps/chosen": -32.35621643066406, "ref_logps/rejected": -36.18800735473633, "rewards/accuracies": 0.625, "rewards/chosen": -1.4055352210998535, "rewards/margins": 0.641654372215271, "rewards/rejected": -2.047189474105835, "step": 1004 }, { "epoch": 0.95, "grad_norm": 18.61794420177084, "learning_rate": 4.0026385953857233e-07, "logps/chosen": -34.57862091064453, "logps/rejected": -58.40393829345703, "loss": 0.4281, "losses/dpo": 0.10062730312347412, "losses/sft": 1.4812061786651611, "losses/total": 0.10062730312347412, "ref_logps/chosen": -22.946334838867188, "ref_logps/rejected": -38.58843231201172, "rewards/accuracies": 0.875, "rewards/chosen": -1.163228988647461, "rewards/margins": 0.8183216452598572, "rewards/rejected": -1.9815505743026733, "step": 1005 }, { "epoch": 0.95, "grad_norm": 31.21490144082869, "learning_rate": 4.000602485449657e-07, "logps/chosen": -42.832969665527344, "logps/rejected": -55.2991828918457, "loss": 0.6583, "losses/dpo": 0.3773960471153259, "losses/sft": 1.58484947681427, "losses/total": 0.3773960471153259, "ref_logps/chosen": -26.625202178955078, "ref_logps/rejected": -33.1253662109375, "rewards/accuracies": 0.625, "rewards/chosen": -1.6207770109176636, "rewards/margins": 0.5966048240661621, "rewards/rejected": -2.2173819541931152, "step": 1006 }, { "epoch": 0.95, "grad_norm": 20.90800647675476, "learning_rate": 3.998564818341431e-07, "logps/chosen": -39.098785400390625, "logps/rejected": -62.0140266418457, "loss": 0.4888, "losses/dpo": 0.5720321536064148, "losses/sft": 1.311089038848877, "losses/total": 0.5720321536064148, "ref_logps/chosen": -26.291439056396484, "ref_logps/rejected": -40.23983383178711, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2807344198226929, "rewards/margins": 0.8966848850250244, "rewards/rejected": -2.1774191856384277, "step": 1007 }, { "epoch": 0.95, "grad_norm": 25.770219944516256, "learning_rate": 3.996525596175528e-07, "logps/chosen": -47.13904571533203, "logps/rejected": -71.1592788696289, "loss": 0.5285, "losses/dpo": 0.7668172717094421, "losses/sft": 1.7751970291137695, "losses/total": 0.7668172717094421, "ref_logps/chosen": -32.77466583251953, "ref_logps/rejected": -49.53199005126953, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4364383220672607, "rewards/margins": 0.7262911200523376, "rewards/rejected": -2.162729263305664, "step": 1008 }, { "epoch": 0.95, "grad_norm": 21.01016814969142, "learning_rate": 3.994484821068045e-07, "logps/chosen": -40.147857666015625, "logps/rejected": -61.054203033447266, "loss": 0.4133, "losses/dpo": 0.5575972199440002, "losses/sft": 1.284871220588684, "losses/total": 0.5575972199440002, "ref_logps/chosen": -26.64169692993164, "ref_logps/rejected": -38.23046875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.350616216659546, "rewards/margins": 0.9317572712898254, "rewards/rejected": -2.2823734283447266, "step": 1009 }, { "epoch": 0.95, "grad_norm": 21.90919232004699, "learning_rate": 3.99244249513669e-07, "logps/chosen": -39.03604507446289, "logps/rejected": -58.864891052246094, "loss": 0.4699, "losses/dpo": 0.5691840648651123, "losses/sft": 0.4264674484729767, "losses/total": 0.5691840648651123, "ref_logps/chosen": -27.08576774597168, "ref_logps/rejected": -36.57981872558594, "rewards/accuracies": 0.875, "rewards/chosen": -1.1950278282165527, "rewards/margins": 1.0334792137145996, "rewards/rejected": -2.2285070419311523, "step": 1010 }, { "epoch": 0.95, "grad_norm": 24.714624048591016, "learning_rate": 3.990398620500781e-07, "logps/chosen": -42.138832092285156, "logps/rejected": -56.63161849975586, "loss": 0.5669, "losses/dpo": 0.68064284324646, "losses/sft": 0.4383991062641144, "losses/total": 0.68064284324646, "ref_logps/chosen": -31.023162841796875, "ref_logps/rejected": -38.606449127197266, "rewards/accuracies": 0.625, "rewards/chosen": -1.1115671396255493, "rewards/margins": 0.6909497976303101, "rewards/rejected": -1.8025168180465698, "step": 1011 }, { "epoch": 0.95, "grad_norm": 30.622167118556394, "learning_rate": 3.988353199281241e-07, "logps/chosen": -48.811073303222656, "logps/rejected": -60.829505920410156, "loss": 0.6443, "losses/dpo": 0.43874111771583557, "losses/sft": 0.9627297520637512, "losses/total": 0.43874111771583557, "ref_logps/chosen": -36.68352508544922, "ref_logps/rejected": -42.83772277832031, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2127552032470703, "rewards/margins": 0.5864229202270508, "rewards/rejected": -1.799178123474121, "step": 1012 }, { "epoch": 0.96, "grad_norm": 44.22096750576931, "learning_rate": 3.986306233600602e-07, "logps/chosen": -68.57107543945312, "logps/rejected": -63.53805923461914, "loss": 0.7748, "losses/dpo": 1.1544485092163086, "losses/sft": 2.6587026119232178, "losses/total": 1.1544485092163086, "ref_logps/chosen": -50.49474334716797, "ref_logps/rejected": -41.987545013427734, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8076326847076416, "rewards/margins": 0.3474186956882477, "rewards/rejected": -2.1550514698028564, "step": 1013 }, { "epoch": 0.96, "grad_norm": 37.765744096520756, "learning_rate": 3.9842577255829944e-07, "logps/chosen": -49.34235382080078, "logps/rejected": -49.47279357910156, "loss": 0.6846, "losses/dpo": 0.02393813244998455, "losses/sft": 1.022971272468567, "losses/total": 0.02393813244998455, "ref_logps/chosen": -34.964134216308594, "ref_logps/rejected": -30.41244888305664, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4378215074539185, "rewards/margins": 0.4682128131389618, "rewards/rejected": -1.9060344696044922, "step": 1014 }, { "epoch": 0.96, "grad_norm": 21.328145813313835, "learning_rate": 3.9822076773541513e-07, "logps/chosen": -33.32415008544922, "logps/rejected": -51.33327102661133, "loss": 0.521, "losses/dpo": 1.1363649368286133, "losses/sft": 0.8718039989471436, "losses/total": 1.1363649368286133, "ref_logps/chosen": -23.884567260742188, "ref_logps/rejected": -33.56378936767578, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9439581632614136, "rewards/margins": 0.8329901695251465, "rewards/rejected": -1.77694833278656, "step": 1015 }, { "epoch": 0.96, "grad_norm": 28.887964186578245, "learning_rate": 3.9801560910414023e-07, "logps/chosen": -46.61481475830078, "logps/rejected": -51.635597229003906, "loss": 0.6077, "losses/dpo": 0.5943745970726013, "losses/sft": 1.6687411069869995, "losses/total": 0.5943745970726013, "ref_logps/chosen": -35.116031646728516, "ref_logps/rejected": -32.352294921875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1498786211013794, "rewards/margins": 0.7784517407417297, "rewards/rejected": -1.928330421447754, "step": 1016 }, { "epoch": 0.96, "grad_norm": 28.887873544815744, "learning_rate": 3.978102968773677e-07, "logps/chosen": -53.68333435058594, "logps/rejected": -60.48887634277344, "loss": 0.564, "losses/dpo": 0.4168902635574341, "losses/sft": 0.8704301118850708, "losses/total": 0.4168902635574341, "ref_logps/chosen": -40.899330139160156, "ref_logps/rejected": -40.63711929321289, "rewards/accuracies": 0.6875, "rewards/chosen": -1.278400182723999, "rewards/margins": 0.7067753672599792, "rewards/rejected": -1.985175609588623, "step": 1017 }, { "epoch": 0.96, "grad_norm": 23.338878419797535, "learning_rate": 3.9760483126814936e-07, "logps/chosen": -42.11591720581055, "logps/rejected": -49.63475036621094, "loss": 0.4575, "losses/dpo": 0.36951959133148193, "losses/sft": 1.2735604047775269, "losses/total": 0.36951959133148193, "ref_logps/chosen": -32.89154052734375, "ref_logps/rejected": -31.37501335144043, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9224375486373901, "rewards/margins": 0.9035360217094421, "rewards/rejected": -1.8259735107421875, "step": 1018 }, { "epoch": 0.96, "grad_norm": 20.682159705940162, "learning_rate": 3.9739921248969664e-07, "logps/chosen": -34.591835021972656, "logps/rejected": -51.81550216674805, "loss": 0.4842, "losses/dpo": 0.3758397400379181, "losses/sft": 1.611865758895874, "losses/total": 0.3758397400379181, "ref_logps/chosen": -25.07999038696289, "ref_logps/rejected": -36.197601318359375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9511845111846924, "rewards/margins": 0.6106051206588745, "rewards/rejected": -1.5617897510528564, "step": 1019 }, { "epoch": 0.96, "grad_norm": 27.46179336213173, "learning_rate": 3.9719344075537965e-07, "logps/chosen": -51.98396682739258, "logps/rejected": -54.85955047607422, "loss": 0.457, "losses/dpo": 0.24269317090511322, "losses/sft": 1.8757885694503784, "losses/total": 0.24269317090511322, "ref_logps/chosen": -39.94660949707031, "ref_logps/rejected": -35.01060485839844, "rewards/accuracies": 0.875, "rewards/chosen": -1.2037359476089478, "rewards/margins": 0.7811586260795593, "rewards/rejected": -1.9848945140838623, "step": 1020 }, { "epoch": 0.96, "grad_norm": 19.30766314646741, "learning_rate": 3.969875162787273e-07, "logps/chosen": -34.30625915527344, "logps/rejected": -46.29557800292969, "loss": 0.4063, "losses/dpo": 0.3795323669910431, "losses/sft": 1.0994352102279663, "losses/total": 0.3795323669910431, "ref_logps/chosen": -24.96270751953125, "ref_logps/rejected": -28.686542510986328, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9343553185462952, "rewards/margins": 0.8265483975410461, "rewards/rejected": -1.7609035968780518, "step": 1021 }, { "epoch": 0.96, "grad_norm": 21.172575742177564, "learning_rate": 3.96781439273427e-07, "logps/chosen": -44.02948760986328, "logps/rejected": -51.131099700927734, "loss": 0.4553, "losses/dpo": 1.224124789237976, "losses/sft": 1.860759973526001, "losses/total": 1.224124789237976, "ref_logps/chosen": -35.59166717529297, "ref_logps/rejected": -33.60868453979492, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8437824845314026, "rewards/margins": 0.9084591865539551, "rewards/rejected": -1.752241611480713, "step": 1022 }, { "epoch": 0.97, "grad_norm": 23.96915464412051, "learning_rate": 3.9657520995332447e-07, "logps/chosen": -43.14241027832031, "logps/rejected": -65.51487731933594, "loss": 0.4178, "losses/dpo": 0.42832469940185547, "losses/sft": 1.3191862106323242, "losses/total": 0.42832469940185547, "ref_logps/chosen": -30.299819946289062, "ref_logps/rejected": -42.029205322265625, "rewards/accuracies": 0.875, "rewards/chosen": -1.2842588424682617, "rewards/margins": 1.0643088817596436, "rewards/rejected": -2.3485677242279053, "step": 1023 }, { "epoch": 0.97, "grad_norm": 20.557916909973734, "learning_rate": 3.9636882853242345e-07, "logps/chosen": -33.9954948425293, "logps/rejected": -49.23075866699219, "loss": 0.4884, "losses/dpo": 1.00289785861969, "losses/sft": 0.902940571308136, "losses/total": 1.00289785861969, "ref_logps/chosen": -22.26241111755371, "ref_logps/rejected": -30.08566665649414, "rewards/accuracies": 0.875, "rewards/chosen": -1.1733086109161377, "rewards/margins": 0.7412006258964539, "rewards/rejected": -1.9145092964172363, "step": 1024 }, { "epoch": 0.97, "grad_norm": 28.521945311452743, "learning_rate": 3.961622952248855e-07, "logps/chosen": -48.588722229003906, "logps/rejected": -55.51066589355469, "loss": 0.624, "losses/dpo": 1.6991848945617676, "losses/sft": 1.193450927734375, "losses/total": 1.6991848945617676, "ref_logps/chosen": -32.976104736328125, "ref_logps/rejected": -35.9530029296875, "rewards/accuracies": 0.625, "rewards/chosen": -1.5612612962722778, "rewards/margins": 0.3945049047470093, "rewards/rejected": -1.955766201019287, "step": 1025 }, { "epoch": 0.97, "grad_norm": 30.373192640804415, "learning_rate": 3.959556102450298e-07, "logps/chosen": -55.0746955871582, "logps/rejected": -57.58749008178711, "loss": 0.4777, "losses/dpo": 1.622846007347107, "losses/sft": 2.0991051197052, "losses/total": 1.622846007347107, "ref_logps/chosen": -41.194847106933594, "ref_logps/rejected": -33.99305725097656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3879847526550293, "rewards/margins": 0.9714586138725281, "rewards/rejected": -2.359443187713623, "step": 1026 }, { "epoch": 0.97, "grad_norm": 24.14394465524865, "learning_rate": 3.9574877380733307e-07, "logps/chosen": -34.33084487915039, "logps/rejected": -50.13302993774414, "loss": 0.5426, "losses/dpo": 0.1624370962381363, "losses/sft": 0.9252960085868835, "losses/total": 0.1624370962381363, "ref_logps/chosen": -23.46704864501953, "ref_logps/rejected": -33.27928924560547, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0863797664642334, "rewards/margins": 0.5989944934844971, "rewards/rejected": -1.6853742599487305, "step": 1027 }, { "epoch": 0.97, "grad_norm": 27.4742115115022, "learning_rate": 3.9554178612642885e-07, "logps/chosen": -37.22035598754883, "logps/rejected": -52.454612731933594, "loss": 0.6248, "losses/dpo": 0.7873481512069702, "losses/sft": 0.7287651300430298, "losses/total": 0.7873481512069702, "ref_logps/chosen": -25.451814651489258, "ref_logps/rejected": -35.69581604003906, "rewards/accuracies": 0.625, "rewards/chosen": -1.1768542528152466, "rewards/margins": 0.49902552366256714, "rewards/rejected": -1.675879716873169, "step": 1028 }, { "epoch": 0.97, "grad_norm": 16.316523973728472, "learning_rate": 3.95334647417108e-07, "logps/chosen": -46.7923583984375, "logps/rejected": -77.90206146240234, "loss": 0.3239, "losses/dpo": 0.1296568363904953, "losses/sft": 1.867835283279419, "losses/total": 0.1296568363904953, "ref_logps/chosen": -32.03740310668945, "ref_logps/rejected": -49.21030044555664, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4754958152770996, "rewards/margins": 1.3936800956726074, "rewards/rejected": -2.869175910949707, "step": 1029 }, { "epoch": 0.97, "grad_norm": 17.944533237966045, "learning_rate": 3.9512735789431783e-07, "logps/chosen": -35.59933853149414, "logps/rejected": -48.4488410949707, "loss": 0.4178, "losses/dpo": 0.5648460984230042, "losses/sft": 1.2142046689987183, "losses/total": 0.5648460984230042, "ref_logps/chosen": -26.476839065551758, "ref_logps/rejected": -30.11578369140625, "rewards/accuracies": 0.875, "rewards/chosen": -0.9122499823570251, "rewards/margins": 0.9210560321807861, "rewards/rejected": -1.833306074142456, "step": 1030 }, { "epoch": 0.97, "grad_norm": 31.55127127141279, "learning_rate": 3.9491991777316237e-07, "logps/chosen": -43.973236083984375, "logps/rejected": -55.69775390625, "loss": 0.6914, "losses/dpo": 1.3929805755615234, "losses/sft": 2.5297605991363525, "losses/total": 1.3929805755615234, "ref_logps/chosen": -30.130294799804688, "ref_logps/rejected": -37.58891296386719, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3842942714691162, "rewards/margins": 0.4265894591808319, "rewards/rejected": -1.8108837604522705, "step": 1031 }, { "epoch": 0.97, "grad_norm": 20.48168758143476, "learning_rate": 3.947123272689018e-07, "logps/chosen": -37.257286071777344, "logps/rejected": -46.78541564941406, "loss": 0.4629, "losses/dpo": 0.07899203896522522, "losses/sft": 0.5115848779678345, "losses/total": 0.07899203896522522, "ref_logps/chosen": -26.179855346679688, "ref_logps/rejected": -26.191699981689453, "rewards/accuracies": 0.875, "rewards/chosen": -1.1077431440353394, "rewards/margins": 0.9516285061836243, "rewards/rejected": -2.0593717098236084, "step": 1032 }, { "epoch": 0.97, "grad_norm": 27.783224227245643, "learning_rate": 3.945045865969522e-07, "logps/chosen": -30.869869232177734, "logps/rejected": -40.264915466308594, "loss": 0.5858, "losses/dpo": 0.5613311529159546, "losses/sft": 1.1421786546707153, "losses/total": 0.5613311529159546, "ref_logps/chosen": -20.534080505371094, "ref_logps/rejected": -26.267133712768555, "rewards/accuracies": 0.5625, "rewards/chosen": -1.033578872680664, "rewards/margins": 0.3661993741989136, "rewards/rejected": -1.399778127670288, "step": 1033 }, { "epoch": 0.98, "grad_norm": 21.975327010766847, "learning_rate": 3.9429669597288585e-07, "logps/chosen": -46.38306427001953, "logps/rejected": -68.55596160888672, "loss": 0.4344, "losses/dpo": 0.03970367833971977, "losses/sft": 0.6249553561210632, "losses/total": 0.03970367833971977, "ref_logps/chosen": -32.92217254638672, "ref_logps/rejected": -45.737491607666016, "rewards/accuracies": 0.75, "rewards/chosen": -1.3460888862609863, "rewards/margins": 0.9357584714889526, "rewards/rejected": -2.2818474769592285, "step": 1034 }, { "epoch": 0.98, "grad_norm": 21.449679471830297, "learning_rate": 3.9408865561243033e-07, "logps/chosen": -43.42915344238281, "logps/rejected": -58.917179107666016, "loss": 0.5064, "losses/dpo": 0.6765826940536499, "losses/sft": 2.3992557525634766, "losses/total": 0.6765826940536499, "ref_logps/chosen": -32.489803314208984, "ref_logps/rejected": -39.609962463378906, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0939347743988037, "rewards/margins": 0.8367871046066284, "rewards/rejected": -1.9307219982147217, "step": 1035 }, { "epoch": 0.98, "grad_norm": 19.732371306849355, "learning_rate": 3.9388046573146867e-07, "logps/chosen": -36.013362884521484, "logps/rejected": -51.13922882080078, "loss": 0.4829, "losses/dpo": 0.05127733573317528, "losses/sft": 0.6979616284370422, "losses/total": 0.05127733573317528, "ref_logps/chosen": -24.578365325927734, "ref_logps/rejected": -32.069190979003906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1434998512268066, "rewards/margins": 0.7635036706924438, "rewards/rejected": -1.907003402709961, "step": 1036 }, { "epoch": 0.98, "grad_norm": 22.348319925432246, "learning_rate": 3.936721265460392e-07, "logps/chosen": -36.23816680908203, "logps/rejected": -51.0474967956543, "loss": 0.5223, "losses/dpo": 0.562946617603302, "losses/sft": 1.190043330192566, "losses/total": 0.562946617603302, "ref_logps/chosen": -24.711044311523438, "ref_logps/rejected": -32.236080169677734, "rewards/accuracies": 0.6875, "rewards/chosen": -1.152712345123291, "rewards/margins": 0.7284295558929443, "rewards/rejected": -1.8811419010162354, "step": 1037 }, { "epoch": 0.98, "grad_norm": 18.234227454181674, "learning_rate": 3.9346363827233496e-07, "logps/chosen": -34.26264953613281, "logps/rejected": -64.39864349365234, "loss": 0.3335, "losses/dpo": 0.03045717440545559, "losses/sft": 0.9679940342903137, "losses/total": 0.03045717440545559, "ref_logps/chosen": -21.52338409423828, "ref_logps/rejected": -38.3624267578125, "rewards/accuracies": 0.875, "rewards/chosen": -1.2739266157150269, "rewards/margins": 1.3296949863433838, "rewards/rejected": -2.603621482849121, "step": 1038 }, { "epoch": 0.98, "grad_norm": 14.41664957031219, "learning_rate": 3.9325500112670383e-07, "logps/chosen": -37.49544906616211, "logps/rejected": -55.061431884765625, "loss": 0.3404, "losses/dpo": 0.0915268212556839, "losses/sft": 1.533477783203125, "losses/total": 0.0915268212556839, "ref_logps/chosen": -29.45014190673828, "ref_logps/rejected": -34.4407958984375, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8045306205749512, "rewards/margins": 1.2575329542160034, "rewards/rejected": -2.062063694000244, "step": 1039 }, { "epoch": 0.98, "grad_norm": 24.13992950062181, "learning_rate": 3.9304621532564825e-07, "logps/chosen": -40.170684814453125, "logps/rejected": -52.89023208618164, "loss": 0.5738, "losses/dpo": 0.0517609566450119, "losses/sft": 1.6037631034851074, "losses/total": 0.0517609566450119, "ref_logps/chosen": -29.163434982299805, "ref_logps/rejected": -34.79393768310547, "rewards/accuracies": 0.75, "rewards/chosen": -1.1007249355316162, "rewards/margins": 0.7089046239852905, "rewards/rejected": -1.8096296787261963, "step": 1040 }, { "epoch": 0.98, "grad_norm": 26.481339760164598, "learning_rate": 3.928372810858247e-07, "logps/chosen": -41.433692932128906, "logps/rejected": -52.57738494873047, "loss": 0.4869, "losses/dpo": 0.45965397357940674, "losses/sft": 1.8626577854156494, "losses/total": 0.45965397357940674, "ref_logps/chosen": -30.308094024658203, "ref_logps/rejected": -31.744840621948242, "rewards/accuracies": 0.75, "rewards/chosen": -1.1125597953796387, "rewards/margins": 0.9706943035125732, "rewards/rejected": -2.083254337310791, "step": 1041 }, { "epoch": 0.98, "grad_norm": 19.1493142414542, "learning_rate": 3.9262819862404397e-07, "logps/chosen": -35.40009307861328, "logps/rejected": -45.63196563720703, "loss": 0.4323, "losses/dpo": 0.7003259062767029, "losses/sft": 0.5744147300720215, "losses/total": 0.7003259062767029, "ref_logps/chosen": -27.390735626220703, "ref_logps/rejected": -28.69588851928711, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8009358644485474, "rewards/margins": 0.8926717042922974, "rewards/rejected": -1.6936075687408447, "step": 1042 }, { "epoch": 0.98, "grad_norm": 24.79782302450129, "learning_rate": 3.924189681572703e-07, "logps/chosen": -40.3966064453125, "logps/rejected": -45.24082946777344, "loss": 0.5973, "losses/dpo": 0.4332876205444336, "losses/sft": 1.6216309070587158, "losses/total": 0.4332876205444336, "ref_logps/chosen": -29.323585510253906, "ref_logps/rejected": -30.660926818847656, "rewards/accuracies": 0.625, "rewards/chosen": -1.1073020696640015, "rewards/margins": 0.3506883382797241, "rewards/rejected": -1.4579904079437256, "step": 1043 }, { "epoch": 0.98, "grad_norm": 20.19336452235345, "learning_rate": 3.922095899026218e-07, "logps/chosen": -41.09893035888672, "logps/rejected": -73.10771942138672, "loss": 0.3614, "losses/dpo": 0.37805917859077454, "losses/sft": 1.3584108352661133, "losses/total": 0.37805917859077454, "ref_logps/chosen": -28.721651077270508, "ref_logps/rejected": -49.115325927734375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2377281188964844, "rewards/margins": 1.161511778831482, "rewards/rejected": -2.399240016937256, "step": 1044 }, { "epoch": 0.99, "grad_norm": 26.098074434838985, "learning_rate": 3.9200006407737e-07, "logps/chosen": -42.99740982055664, "logps/rejected": -43.967559814453125, "loss": 0.6492, "losses/dpo": 0.055699653923511505, "losses/sft": 1.1118806600570679, "losses/total": 0.055699653923511505, "ref_logps/chosen": -30.60862922668457, "ref_logps/rejected": -28.550750732421875, "rewards/accuracies": 0.625, "rewards/chosen": -1.2388780117034912, "rewards/margins": 0.3028026223182678, "rewards/rejected": -1.5416808128356934, "step": 1045 }, { "epoch": 0.99, "grad_norm": 22.20904719665348, "learning_rate": 3.917903908989392e-07, "logps/chosen": -38.00318145751953, "logps/rejected": -42.74497985839844, "loss": 0.5931, "losses/dpo": 1.2536778450012207, "losses/sft": 1.3586101531982422, "losses/total": 1.2536778450012207, "ref_logps/chosen": -27.790884017944336, "ref_logps/rejected": -25.340330123901367, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0212297439575195, "rewards/margins": 0.7192348837852478, "rewards/rejected": -1.740464687347412, "step": 1046 }, { "epoch": 0.99, "grad_norm": 14.575916244923102, "learning_rate": 3.9158057058490703e-07, "logps/chosen": -35.26850891113281, "logps/rejected": -52.41597366333008, "loss": 0.3275, "losses/dpo": 0.47166013717651367, "losses/sft": 1.8649122714996338, "losses/total": 0.47166013717651367, "ref_logps/chosen": -27.140335083007812, "ref_logps/rejected": -32.70242691040039, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8128176331520081, "rewards/margins": 1.1585370302200317, "rewards/rejected": -1.9713547229766846, "step": 1047 }, { "epoch": 0.99, "grad_norm": 24.71139486846611, "learning_rate": 3.913706033530034e-07, "logps/chosen": -41.3636474609375, "logps/rejected": -67.18218994140625, "loss": 0.5223, "losses/dpo": 0.06403117626905441, "losses/sft": 1.1499313116073608, "losses/total": 0.06403117626905441, "ref_logps/chosen": -30.513351440429688, "ref_logps/rejected": -47.6934814453125, "rewards/accuracies": 0.75, "rewards/chosen": -1.0850296020507812, "rewards/margins": 0.8638412356376648, "rewards/rejected": -1.9488708972930908, "step": 1048 }, { "epoch": 0.99, "grad_norm": 24.759525255611955, "learning_rate": 3.911604894211111e-07, "logps/chosen": -32.64926528930664, "logps/rejected": -45.05854415893555, "loss": 0.6448, "losses/dpo": 0.5078526139259338, "losses/sft": 1.5365383625030518, "losses/total": 0.5078526139259338, "ref_logps/chosen": -21.944854736328125, "ref_logps/rejected": -30.25153350830078, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0704410076141357, "rewards/margins": 0.41026008129119873, "rewards/rejected": -1.480701208114624, "step": 1049 }, { "epoch": 0.99, "grad_norm": 16.04731543249244, "learning_rate": 3.9095022900726487e-07, "logps/chosen": -34.157752990722656, "logps/rejected": -48.13341522216797, "loss": 0.372, "losses/dpo": 0.333607017993927, "losses/sft": 0.8511551022529602, "losses/total": 0.333607017993927, "ref_logps/chosen": -25.128292083740234, "ref_logps/rejected": -27.78917694091797, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9029462337493896, "rewards/margins": 1.1314773559570312, "rewards/rejected": -2.034423589706421, "step": 1050 }, { "epoch": 0.99, "grad_norm": 24.50125675488272, "learning_rate": 3.907398223296514e-07, "logps/chosen": -34.58292770385742, "logps/rejected": -44.53699493408203, "loss": 0.5951, "losses/dpo": 0.7936403155326843, "losses/sft": 0.7330171465873718, "losses/total": 0.7936403155326843, "ref_logps/chosen": -21.77176284790039, "ref_logps/rejected": -28.179994583129883, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2811167240142822, "rewards/margins": 0.3545833230018616, "rewards/rejected": -1.635699987411499, "step": 1051 }, { "epoch": 0.99, "grad_norm": 25.78244879671098, "learning_rate": 3.905292696066094e-07, "logps/chosen": -37.35552215576172, "logps/rejected": -48.51580810546875, "loss": 0.5979, "losses/dpo": 0.7492583394050598, "losses/sft": 1.392290711402893, "losses/total": 0.7492583394050598, "ref_logps/chosen": -26.055147171020508, "ref_logps/rejected": -32.10831069946289, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1300376653671265, "rewards/margins": 0.5107120275497437, "rewards/rejected": -1.6407495737075806, "step": 1052 }, { "epoch": 0.99, "grad_norm": 24.330079826247847, "learning_rate": 3.9031857105662894e-07, "logps/chosen": -39.96257019042969, "logps/rejected": -46.935768127441406, "loss": 0.5759, "losses/dpo": 0.10898684710264206, "losses/sft": 1.4653749465942383, "losses/total": 0.10898684710264206, "ref_logps/chosen": -28.335193634033203, "ref_logps/rejected": -29.37299346923828, "rewards/accuracies": 0.625, "rewards/chosen": -1.1627378463745117, "rewards/margins": 0.5935396552085876, "rewards/rejected": -1.7562774419784546, "step": 1053 }, { "epoch": 0.99, "grad_norm": 16.670573519041316, "learning_rate": 3.901077268983515e-07, "logps/chosen": -27.274457931518555, "logps/rejected": -49.113704681396484, "loss": 0.4626, "losses/dpo": 0.004965565167367458, "losses/sft": 0.6389699578285217, "losses/total": 0.004965565167367458, "ref_logps/chosen": -18.34235954284668, "ref_logps/rejected": -31.089006423950195, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8932098150253296, "rewards/margins": 0.909260094165802, "rewards/rejected": -1.8024699687957764, "step": 1054 }, { "epoch": 1.0, "grad_norm": 30.115765325481178, "learning_rate": 3.898967373505696e-07, "logps/chosen": -43.4399299621582, "logps/rejected": -60.42368698120117, "loss": 0.5595, "losses/dpo": 0.7572669386863708, "losses/sft": 1.4592530727386475, "losses/total": 0.7572669386863708, "ref_logps/chosen": -29.97287368774414, "ref_logps/rejected": -41.14918518066406, "rewards/accuracies": 0.75, "rewards/chosen": -1.346705675125122, "rewards/margins": 0.5807443857192993, "rewards/rejected": -1.9274500608444214, "step": 1055 }, { "epoch": 1.0, "grad_norm": 22.47626287915103, "learning_rate": 3.8968560263222674e-07, "logps/chosen": -31.3892822265625, "logps/rejected": -50.06562805175781, "loss": 0.5472, "losses/dpo": 0.587516188621521, "losses/sft": 0.7536409497261047, "losses/total": 0.587516188621521, "ref_logps/chosen": -19.469989776611328, "ref_logps/rejected": -33.05249786376953, "rewards/accuracies": 0.625, "rewards/chosen": -1.1919294595718384, "rewards/margins": 0.5093833804130554, "rewards/rejected": -1.7013128995895386, "step": 1056 }, { "epoch": 1.0, "grad_norm": 23.194137621684042, "learning_rate": 3.894743229624169e-07, "logps/chosen": -41.781646728515625, "logps/rejected": -45.311912536621094, "loss": 0.4953, "losses/dpo": 0.12354342639446259, "losses/sft": 1.2336398363113403, "losses/total": 0.12354342639446259, "ref_logps/chosen": -31.10155487060547, "ref_logps/rejected": -28.769189834594727, "rewards/accuracies": 0.875, "rewards/chosen": -1.0680090188980103, "rewards/margins": 0.5862630605697632, "rewards/rejected": -1.6542720794677734, "step": 1057 }, { "epoch": 1.0, "grad_norm": 20.34036001747499, "learning_rate": 3.892628985603846e-07, "logps/chosen": -36.076595306396484, "logps/rejected": -61.744964599609375, "loss": 0.4248, "losses/dpo": 0.07421176135540009, "losses/sft": 1.0140098333358765, "losses/total": 0.07421176135540009, "ref_logps/chosen": -24.82464599609375, "ref_logps/rejected": -39.159568786621094, "rewards/accuracies": 0.75, "rewards/chosen": -1.125195026397705, "rewards/margins": 1.1333446502685547, "rewards/rejected": -2.2585396766662598, "step": 1058 }, { "epoch": 1.0, "grad_norm": 19.34005204793847, "learning_rate": 3.890513296455246e-07, "logps/chosen": -38.32910919189453, "logps/rejected": -48.99036407470703, "loss": 0.4602, "losses/dpo": 0.21851889789104462, "losses/sft": 2.1089398860931396, "losses/total": 0.21851889789104462, "ref_logps/chosen": -30.53329086303711, "ref_logps/rejected": -33.52596664428711, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7795819044113159, "rewards/margins": 0.7668582797050476, "rewards/rejected": -1.5464402437210083, "step": 1059 }, { "epoch": 1.0, "grad_norm": 21.30058812123869, "learning_rate": 3.888396164373814e-07, "logps/chosen": -35.131500244140625, "logps/rejected": -60.61800003051758, "loss": 0.5007, "losses/dpo": 0.12881594896316528, "losses/sft": 1.1105518341064453, "losses/total": 0.12881594896316528, "ref_logps/chosen": -24.866079330444336, "ref_logps/rejected": -41.560089111328125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0265421867370605, "rewards/margins": 0.879249095916748, "rewards/rejected": -1.9057912826538086, "step": 1060 }, { "epoch": 1.0, "grad_norm": 20.773720382942113, "learning_rate": 3.886277591556495e-07, "logps/chosen": -39.28744125366211, "logps/rejected": -42.299560546875, "loss": 0.4557, "losses/dpo": 0.8129985928535461, "losses/sft": 1.4660594463348389, "losses/total": 0.8129985928535461, "ref_logps/chosen": -30.24908447265625, "ref_logps/rejected": -26.190990447998047, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9038355350494385, "rewards/margins": 0.7070214748382568, "rewards/rejected": -1.6108570098876953, "step": 1061 }, { "epoch": 1.0, "grad_norm": 15.673835998215978, "learning_rate": 3.884157580201726e-07, "logps/chosen": -35.449119567871094, "logps/rejected": -43.68218231201172, "loss": 0.3796, "losses/dpo": 0.9693501591682434, "losses/sft": 1.5548503398895264, "losses/total": 0.9693501591682434, "ref_logps/chosen": -27.734792709350586, "ref_logps/rejected": -25.159093856811523, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7714328765869141, "rewards/margins": 1.0808757543563843, "rewards/rejected": -1.8523085117340088, "step": 1062 }, { "epoch": 1.0, "grad_norm": 16.56662705480673, "learning_rate": 3.88203613250944e-07, "logps/chosen": -30.544038772583008, "logps/rejected": -50.87483215332031, "loss": 0.3934, "losses/dpo": 0.2238471657037735, "losses/sft": 0.6348114013671875, "losses/total": 0.2238471657037735, "ref_logps/chosen": -22.130605697631836, "ref_logps/rejected": -32.741111755371094, "rewards/accuracies": 0.875, "rewards/chosen": -0.841343343257904, "rewards/margins": 0.9720290899276733, "rewards/rejected": -1.8133724927902222, "step": 1063 }, { "epoch": 1.0, "grad_norm": 18.08184159646416, "learning_rate": 3.8799132506810585e-07, "logps/chosen": -32.180564880371094, "logps/rejected": -50.19245529174805, "loss": 0.4208, "losses/dpo": 0.02356860600411892, "losses/sft": 1.686329960823059, "losses/total": 0.02356860600411892, "ref_logps/chosen": -23.96695899963379, "ref_logps/rejected": -32.35799026489258, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8213602900505066, "rewards/margins": 0.9620863199234009, "rewards/rejected": -1.7834465503692627, "step": 1064 }, { "epoch": 1.0, "grad_norm": 14.024114482381082, "learning_rate": 3.8777889369194916e-07, "logps/chosen": -49.3974609375, "logps/rejected": -70.33177947998047, "loss": 0.2489, "losses/dpo": 0.010439960286021233, "losses/sft": 1.540587306022644, "losses/total": 0.010439960286021233, "ref_logps/chosen": -40.54839324951172, "ref_logps/rejected": -42.489173889160156, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8849071264266968, "rewards/margins": 1.8993539810180664, "rewards/rejected": -2.7842609882354736, "step": 1065 }, { "epoch": 1.01, "grad_norm": 19.56776548885868, "learning_rate": 3.875663193429135e-07, "logps/chosen": -35.0987434387207, "logps/rejected": -49.809444427490234, "loss": 0.4499, "losses/dpo": 1.3561466932296753, "losses/sft": 2.1643142700195312, "losses/total": 1.3561466932296753, "ref_logps/chosen": -24.303688049316406, "ref_logps/rejected": -29.933626174926758, "rewards/accuracies": 0.75, "rewards/chosen": -1.0795056819915771, "rewards/margins": 0.9080762267112732, "rewards/rejected": -1.9875819683074951, "step": 1066 }, { "epoch": 1.01, "grad_norm": 19.62666336907181, "learning_rate": 3.8735360224158695e-07, "logps/chosen": -35.721988677978516, "logps/rejected": -62.043243408203125, "loss": 0.368, "losses/dpo": 0.2223505973815918, "losses/sft": 1.3223426342010498, "losses/total": 0.2223505973815918, "ref_logps/chosen": -27.872211456298828, "ref_logps/rejected": -40.053157806396484, "rewards/accuracies": 0.875, "rewards/chosen": -0.7849777936935425, "rewards/margins": 1.4140307903289795, "rewards/rejected": -2.1990084648132324, "step": 1067 }, { "epoch": 1.01, "grad_norm": 15.37517601180276, "learning_rate": 3.8714074260870557e-07, "logps/chosen": -46.69065475463867, "logps/rejected": -62.970279693603516, "loss": 0.2811, "losses/dpo": 0.37433531880378723, "losses/sft": 1.1791850328445435, "losses/total": 0.37433531880378723, "ref_logps/chosen": -32.230812072753906, "ref_logps/rejected": -35.87079620361328, "rewards/accuracies": 1.0, "rewards/chosen": -1.4459843635559082, "rewards/margins": 1.2639636993408203, "rewards/rejected": -2.7099478244781494, "step": 1068 }, { "epoch": 1.01, "grad_norm": 14.792999152986198, "learning_rate": 3.869277406651532e-07, "logps/chosen": -42.89924621582031, "logps/rejected": -64.38871765136719, "loss": 0.2829, "losses/dpo": 0.1235465332865715, "losses/sft": 1.9096745252609253, "losses/total": 0.1235465332865715, "ref_logps/chosen": -31.678852081298828, "ref_logps/rejected": -39.52003479003906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1220393180847168, "rewards/margins": 1.3648295402526855, "rewards/rejected": -2.4868688583374023, "step": 1069 }, { "epoch": 1.01, "grad_norm": 13.559522395316344, "learning_rate": 3.867145966319618e-07, "logps/chosen": -40.809791564941406, "logps/rejected": -61.137969970703125, "loss": 0.3162, "losses/dpo": 0.07706329971551895, "losses/sft": 1.0499351024627686, "losses/total": 0.07706329971551895, "ref_logps/chosen": -29.811796188354492, "ref_logps/rejected": -36.29522705078125, "rewards/accuracies": 0.875, "rewards/chosen": -1.0997991561889648, "rewards/margins": 1.3844749927520752, "rewards/rejected": -2.48427414894104, "step": 1070 }, { "epoch": 1.01, "grad_norm": 16.38477991030903, "learning_rate": 3.8650131073031034e-07, "logps/chosen": -45.757442474365234, "logps/rejected": -62.34249496459961, "loss": 0.3434, "losses/dpo": 0.3462372422218323, "losses/sft": 1.4342024326324463, "losses/total": 0.3462372422218323, "ref_logps/chosen": -33.636199951171875, "ref_logps/rejected": -36.422462463378906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2121243476867676, "rewards/margins": 1.3798789978027344, "rewards/rejected": -2.592003345489502, "step": 1071 }, { "epoch": 1.01, "grad_norm": 15.909184687497431, "learning_rate": 3.8628788318152525e-07, "logps/chosen": -36.68939208984375, "logps/rejected": -51.055419921875, "loss": 0.3222, "losses/dpo": 0.6191872358322144, "losses/sft": 1.705318570137024, "losses/total": 0.6191872358322144, "ref_logps/chosen": -28.09721565246582, "ref_logps/rejected": -28.648197174072266, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8592178225517273, "rewards/margins": 1.3815041780471802, "rewards/rejected": -2.2407219409942627, "step": 1072 }, { "epoch": 1.01, "grad_norm": 15.240331431797916, "learning_rate": 3.860743142070797e-07, "logps/chosen": -44.02427291870117, "logps/rejected": -55.249488830566406, "loss": 0.3127, "losses/dpo": 0.45780742168426514, "losses/sft": 0.762271523475647, "losses/total": 0.45780742168426514, "ref_logps/chosen": -33.710975646972656, "ref_logps/rejected": -31.126049041748047, "rewards/accuracies": 0.9375, "rewards/chosen": -1.031329870223999, "rewards/margins": 1.381014108657837, "rewards/rejected": -2.412343978881836, "step": 1073 }, { "epoch": 1.01, "grad_norm": 22.551068042654126, "learning_rate": 3.85860604028594e-07, "logps/chosen": -41.987388610839844, "logps/rejected": -72.10792541503906, "loss": 0.3716, "losses/dpo": 0.3234601616859436, "losses/sft": 1.8225573301315308, "losses/total": 0.3234601616859436, "ref_logps/chosen": -28.661727905273438, "ref_logps/rejected": -44.75563430786133, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3325661420822144, "rewards/margins": 1.4026631116867065, "rewards/rejected": -2.7352294921875, "step": 1074 }, { "epoch": 1.01, "grad_norm": 17.93048448658171, "learning_rate": 3.8564675286783463e-07, "logps/chosen": -49.06647491455078, "logps/rejected": -67.94747924804688, "loss": 0.3236, "losses/dpo": 0.0837593674659729, "losses/sft": 0.585945188999176, "losses/total": 0.0837593674659729, "ref_logps/chosen": -35.78802490234375, "ref_logps/rejected": -41.74650573730469, "rewards/accuracies": 0.875, "rewards/chosen": -1.327844500541687, "rewards/margins": 1.2922521829605103, "rewards/rejected": -2.6200966835021973, "step": 1075 }, { "epoch": 1.02, "grad_norm": 22.623768778376846, "learning_rate": 3.8543276094671463e-07, "logps/chosen": -25.897878646850586, "logps/rejected": -47.27302169799805, "loss": 0.5346, "losses/dpo": 0.6454851627349854, "losses/sft": 0.5052352547645569, "losses/total": 0.6454851627349854, "ref_logps/chosen": -16.049158096313477, "ref_logps/rejected": -26.69660758972168, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9848721623420715, "rewards/margins": 1.0727694034576416, "rewards/rejected": -2.0576415061950684, "step": 1076 }, { "epoch": 1.02, "grad_norm": 13.845716447520177, "learning_rate": 3.852186284872928e-07, "logps/chosen": -32.02875518798828, "logps/rejected": -43.848758697509766, "loss": 0.3635, "losses/dpo": 0.1401573270559311, "losses/sft": 1.316564917564392, "losses/total": 0.1401573270559311, "ref_logps/chosen": -23.923919677734375, "ref_logps/rejected": -25.553083419799805, "rewards/accuracies": 0.875, "rewards/chosen": -0.810483455657959, "rewards/margins": 1.019084095954895, "rewards/rejected": -1.829567551612854, "step": 1077 }, { "epoch": 1.02, "grad_norm": 17.787556810772518, "learning_rate": 3.850043557117741e-07, "logps/chosen": -43.16048812866211, "logps/rejected": -65.98006439208984, "loss": 0.3585, "losses/dpo": 0.011432019993662834, "losses/sft": 1.7534499168395996, "losses/total": 0.011432019993662834, "ref_logps/chosen": -30.426984786987305, "ref_logps/rejected": -39.605560302734375, "rewards/accuracies": 0.75, "rewards/chosen": -1.2733503580093384, "rewards/margins": 1.3641002178192139, "rewards/rejected": -2.637450695037842, "step": 1078 }, { "epoch": 1.02, "grad_norm": 15.723586655125526, "learning_rate": 3.847899428425089e-07, "logps/chosen": -34.18373489379883, "logps/rejected": -65.2739028930664, "loss": 0.3384, "losses/dpo": 0.6864811778068542, "losses/sft": 1.177485466003418, "losses/total": 0.6864811778068542, "ref_logps/chosen": -22.295761108398438, "ref_logps/rejected": -39.61479187011719, "rewards/accuracies": 0.875, "rewards/chosen": -1.1887972354888916, "rewards/margins": 1.3771134614944458, "rewards/rejected": -2.565910816192627, "step": 1079 }, { "epoch": 1.02, "grad_norm": 13.852772643565904, "learning_rate": 3.84575390101993e-07, "logps/chosen": -30.309722900390625, "logps/rejected": -48.01521301269531, "loss": 0.3516, "losses/dpo": 0.2482934445142746, "losses/sft": 1.244446873664856, "losses/total": 0.2482934445142746, "ref_logps/chosen": -20.412860870361328, "ref_logps/rejected": -26.61670684814453, "rewards/accuracies": 1.0, "rewards/chosen": -0.9896861910820007, "rewards/margins": 1.1501648426055908, "rewards/rejected": -2.1398510932922363, "step": 1080 }, { "epoch": 1.02, "grad_norm": 18.57940290492782, "learning_rate": 3.843606977128674e-07, "logps/chosen": -36.46092224121094, "logps/rejected": -56.24848175048828, "loss": 0.4813, "losses/dpo": 0.06641383469104767, "losses/sft": 1.0745898485183716, "losses/total": 0.06641383469104767, "ref_logps/chosen": -23.987407684326172, "ref_logps/rejected": -32.590545654296875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.247351884841919, "rewards/margins": 1.1184420585632324, "rewards/rejected": -2.3657937049865723, "step": 1081 }, { "epoch": 1.02, "grad_norm": 13.42609879555041, "learning_rate": 3.8414586589791786e-07, "logps/chosen": -38.155517578125, "logps/rejected": -43.581268310546875, "loss": 0.2923, "losses/dpo": 0.34964147210121155, "losses/sft": 1.65849769115448, "losses/total": 0.34964147210121155, "ref_logps/chosen": -28.469024658203125, "ref_logps/rejected": -21.450204849243164, "rewards/accuracies": 1.0, "rewards/chosen": -0.9686492085456848, "rewards/margins": 1.2444570064544678, "rewards/rejected": -2.213106393814087, "step": 1082 }, { "epoch": 1.02, "grad_norm": 15.75949960307344, "learning_rate": 3.83930894880075e-07, "logps/chosen": -39.70122528076172, "logps/rejected": -57.91136932373047, "loss": 0.3523, "losses/dpo": 0.2653749883174896, "losses/sft": 1.6792843341827393, "losses/total": 0.2653749883174896, "ref_logps/chosen": -29.557796478271484, "ref_logps/rejected": -33.892906188964844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.014343023300171, "rewards/margins": 1.387502908706665, "rewards/rejected": -2.401845932006836, "step": 1083 }, { "epoch": 1.02, "grad_norm": 17.95867468729176, "learning_rate": 3.837157848824138e-07, "logps/chosen": -34.709068298339844, "logps/rejected": -41.85590744018555, "loss": 0.4163, "losses/dpo": 0.1802622526884079, "losses/sft": 1.3515468835830688, "losses/total": 0.1802622526884079, "ref_logps/chosen": -25.341049194335938, "ref_logps/rejected": -23.560579299926758, "rewards/accuracies": 0.875, "rewards/chosen": -0.9368019104003906, "rewards/margins": 0.8927311897277832, "rewards/rejected": -1.8295331001281738, "step": 1084 }, { "epoch": 1.02, "grad_norm": 19.17392792608309, "learning_rate": 3.835005361281535e-07, "logps/chosen": -46.741676330566406, "logps/rejected": -61.85373306274414, "loss": 0.3469, "losses/dpo": 1.1386730670928955, "losses/sft": 1.3995803594589233, "losses/total": 1.1386730670928955, "ref_logps/chosen": -36.29225540161133, "ref_logps/rejected": -36.068763732910156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0449419021606445, "rewards/margins": 1.533555030822754, "rewards/rejected": -2.5784969329833984, "step": 1085 }, { "epoch": 1.02, "grad_norm": 14.844655299282508, "learning_rate": 3.832851488406571e-07, "logps/chosen": -38.32896423339844, "logps/rejected": -66.20970153808594, "loss": 0.2645, "losses/dpo": 0.6931471824645996, "losses/sft": 0.2651146948337555, "losses/total": 0.6931471824645996, "ref_logps/chosen": -27.304344177246094, "ref_logps/rejected": -37.80181121826172, "rewards/accuracies": 0.875, "rewards/chosen": -1.1024619340896606, "rewards/margins": 1.7383266687393188, "rewards/rejected": -2.8407886028289795, "step": 1086 }, { "epoch": 1.03, "grad_norm": 14.84959677689981, "learning_rate": 3.8306962324343183e-07, "logps/chosen": -38.46881103515625, "logps/rejected": -54.68381118774414, "loss": 0.2923, "losses/dpo": 0.14418359100818634, "losses/sft": 1.3668935298919678, "losses/total": 0.14418359100818634, "ref_logps/chosen": -30.46726417541504, "ref_logps/rejected": -32.323509216308594, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8001548647880554, "rewards/margins": 1.435875415802002, "rewards/rejected": -2.236030101776123, "step": 1087 }, { "epoch": 1.03, "grad_norm": 19.028848961492482, "learning_rate": 3.82853959560128e-07, "logps/chosen": -39.97552490234375, "logps/rejected": -65.05609893798828, "loss": 0.3547, "losses/dpo": 0.7056108713150024, "losses/sft": 1.1432026624679565, "losses/total": 0.7056108713150024, "ref_logps/chosen": -29.216978073120117, "ref_logps/rejected": -40.59376525878906, "rewards/accuracies": 0.875, "rewards/chosen": -1.0758545398712158, "rewards/margins": 1.3703789710998535, "rewards/rejected": -2.4462335109710693, "step": 1088 }, { "epoch": 1.03, "grad_norm": 11.668218042824142, "learning_rate": 3.826381580145394e-07, "logps/chosen": -52.5631217956543, "logps/rejected": -71.39967346191406, "loss": 0.1983, "losses/dpo": 0.7678510546684265, "losses/sft": 1.9271916151046753, "losses/total": 0.7678510546684265, "ref_logps/chosen": -40.82374572753906, "ref_logps/rejected": -37.17310333251953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1739375591278076, "rewards/margins": 2.2487194538116455, "rewards/rejected": -3.422657012939453, "step": 1089 }, { "epoch": 1.03, "grad_norm": 18.46853500263163, "learning_rate": 3.824222188306029e-07, "logps/chosen": -41.39289474487305, "logps/rejected": -55.62557601928711, "loss": 0.3232, "losses/dpo": 0.5838654041290283, "losses/sft": 1.395037293434143, "losses/total": 0.5838654041290283, "ref_logps/chosen": -31.51066017150879, "ref_logps/rejected": -31.755752563476562, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9882232546806335, "rewards/margins": 1.398758888244629, "rewards/rejected": -2.3869822025299072, "step": 1090 }, { "epoch": 1.03, "grad_norm": 17.858897185986905, "learning_rate": 3.822061422323981e-07, "logps/chosen": -35.516788482666016, "logps/rejected": -59.3851318359375, "loss": 0.3949, "losses/dpo": 0.1013200432062149, "losses/sft": 0.7480210661888123, "losses/total": 0.1013200432062149, "ref_logps/chosen": -25.139387130737305, "ref_logps/rejected": -36.312171936035156, "rewards/accuracies": 0.875, "rewards/chosen": -1.0377399921417236, "rewards/margins": 1.2695565223693848, "rewards/rejected": -2.3072965145111084, "step": 1091 }, { "epoch": 1.03, "grad_norm": 30.20187654216042, "learning_rate": 3.8198992844414734e-07, "logps/chosen": -47.368404388427734, "logps/rejected": -55.79927062988281, "loss": 0.5575, "losses/dpo": 0.10762609541416168, "losses/sft": 0.9902435541152954, "losses/total": 0.10762609541416168, "ref_logps/chosen": -31.772457122802734, "ref_logps/rejected": -29.47962188720703, "rewards/accuracies": 0.625, "rewards/chosen": -1.5595943927764893, "rewards/margins": 1.0723704099655151, "rewards/rejected": -2.631964683532715, "step": 1092 }, { "epoch": 1.03, "grad_norm": 21.489145300982827, "learning_rate": 3.8177357769021514e-07, "logps/chosen": -39.879249572753906, "logps/rejected": -59.736366271972656, "loss": 0.3692, "losses/dpo": 0.13760626316070557, "losses/sft": 1.3487581014633179, "losses/total": 0.13760626316070557, "ref_logps/chosen": -28.980140686035156, "ref_logps/rejected": -35.153079986572266, "rewards/accuracies": 0.75, "rewards/chosen": -1.0899109840393066, "rewards/margins": 1.368417739868164, "rewards/rejected": -2.4583287239074707, "step": 1093 }, { "epoch": 1.03, "grad_norm": 22.240950539371305, "learning_rate": 3.815570901951082e-07, "logps/chosen": -42.242408752441406, "logps/rejected": -52.2608528137207, "loss": 0.4936, "losses/dpo": 0.30699458718299866, "losses/sft": 0.6513822674751282, "losses/total": 0.30699458718299866, "ref_logps/chosen": -30.086416244506836, "ref_logps/rejected": -31.480026245117188, "rewards/accuracies": 0.75, "rewards/chosen": -1.2155991792678833, "rewards/margins": 0.8624836206436157, "rewards/rejected": -2.07808256149292, "step": 1094 }, { "epoch": 1.03, "grad_norm": 25.801540719712825, "learning_rate": 3.8134046618347527e-07, "logps/chosen": -55.712005615234375, "logps/rejected": -55.9644775390625, "loss": 0.5654, "losses/dpo": 0.20030689239501953, "losses/sft": 0.8517318367958069, "losses/total": 0.20030689239501953, "ref_logps/chosen": -38.7349853515625, "ref_logps/rejected": -30.862821578979492, "rewards/accuracies": 0.75, "rewards/chosen": -1.697702169418335, "rewards/margins": 0.8124634027481079, "rewards/rejected": -2.5101656913757324, "step": 1095 }, { "epoch": 1.03, "grad_norm": 15.832790164603827, "learning_rate": 3.8112370588010657e-07, "logps/chosen": -47.605377197265625, "logps/rejected": -74.81767272949219, "loss": 0.3578, "losses/dpo": 0.00799266155809164, "losses/sft": 1.880340814590454, "losses/total": 0.00799266155809164, "ref_logps/chosen": -33.19274139404297, "ref_logps/rejected": -41.108177185058594, "rewards/accuracies": 0.875, "rewards/chosen": -1.4412637948989868, "rewards/margins": 1.9296857118606567, "rewards/rejected": -3.3709495067596436, "step": 1096 }, { "epoch": 1.03, "grad_norm": 19.266586669841388, "learning_rate": 3.809068095099339e-07, "logps/chosen": -44.86589813232422, "logps/rejected": -63.30792236328125, "loss": 0.3402, "losses/dpo": 1.046573519706726, "losses/sft": 1.4835861921310425, "losses/total": 1.046573519706726, "ref_logps/chosen": -33.848331451416016, "ref_logps/rejected": -35.389671325683594, "rewards/accuracies": 0.875, "rewards/chosen": -1.1017568111419678, "rewards/margins": 1.690068006515503, "rewards/rejected": -2.7918248176574707, "step": 1097 }, { "epoch": 1.04, "grad_norm": 20.14501721934402, "learning_rate": 3.8068977729803007e-07, "logps/chosen": -39.67510986328125, "logps/rejected": -60.828086853027344, "loss": 0.3727, "losses/dpo": 0.22501027584075928, "losses/sft": 2.0869338512420654, "losses/total": 0.22501027584075928, "ref_logps/chosen": -27.639320373535156, "ref_logps/rejected": -36.94578170776367, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2035787105560303, "rewards/margins": 1.1846516132354736, "rewards/rejected": -2.388230323791504, "step": 1098 }, { "epoch": 1.04, "grad_norm": 18.824022807915483, "learning_rate": 3.8047260946960915e-07, "logps/chosen": -33.929203033447266, "logps/rejected": -55.82585906982422, "loss": 0.3953, "losses/dpo": 1.6091337203979492, "losses/sft": 1.6517770290374756, "losses/total": 1.6091337203979492, "ref_logps/chosen": -23.452072143554688, "ref_logps/rejected": -33.204959869384766, "rewards/accuracies": 0.9375, "rewards/chosen": -1.047713041305542, "rewards/margins": 1.21437668800354, "rewards/rejected": -2.262089729309082, "step": 1099 }, { "epoch": 1.04, "grad_norm": 14.219494285211818, "learning_rate": 3.8025530625002556e-07, "logps/chosen": -40.5274772644043, "logps/rejected": -74.189453125, "loss": 0.2706, "losses/dpo": 0.0372844859957695, "losses/sft": 1.6270254850387573, "losses/total": 0.0372844859957695, "ref_logps/chosen": -30.959341049194336, "ref_logps/rejected": -45.16338348388672, "rewards/accuracies": 0.875, "rewards/chosen": -0.9568136930465698, "rewards/margins": 1.945793867111206, "rewards/rejected": -2.9026076793670654, "step": 1100 }, { "epoch": 1.04, "grad_norm": 21.28816835880655, "learning_rate": 3.800378678647745e-07, "logps/chosen": -36.45095443725586, "logps/rejected": -53.53318786621094, "loss": 0.4293, "losses/dpo": 0.060695722699165344, "losses/sft": 1.997143268585205, "losses/total": 0.060695722699165344, "ref_logps/chosen": -26.213611602783203, "ref_logps/rejected": -30.195266723632812, "rewards/accuracies": 0.75, "rewards/chosen": -1.0237343311309814, "rewards/margins": 1.3100576400756836, "rewards/rejected": -2.333791971206665, "step": 1101 }, { "epoch": 1.04, "grad_norm": 16.984431602318114, "learning_rate": 3.798202945394913e-07, "logps/chosen": -41.29727554321289, "logps/rejected": -71.62712860107422, "loss": 0.3391, "losses/dpo": 0.1501004993915558, "losses/sft": 0.5204476714134216, "losses/total": 0.1501004993915558, "ref_logps/chosen": -31.034170150756836, "ref_logps/rejected": -46.427833557128906, "rewards/accuracies": 0.875, "rewards/chosen": -1.0263105630874634, "rewards/margins": 1.4936189651489258, "rewards/rejected": -2.5199294090270996, "step": 1102 }, { "epoch": 1.04, "grad_norm": 18.958420559430095, "learning_rate": 3.796025864999514e-07, "logps/chosen": -34.95928955078125, "logps/rejected": -54.875404357910156, "loss": 0.388, "losses/dpo": 2.1248230934143066, "losses/sft": 2.0550990104675293, "losses/total": 2.1248230934143066, "ref_logps/chosen": -26.65707015991211, "ref_logps/rejected": -32.73448181152344, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8302218914031982, "rewards/margins": 1.3838703632354736, "rewards/rejected": -2.214092254638672, "step": 1103 }, { "epoch": 1.04, "grad_norm": 13.557599612908563, "learning_rate": 3.7938474397207004e-07, "logps/chosen": -40.55763244628906, "logps/rejected": -73.20056915283203, "loss": 0.2748, "losses/dpo": 0.41185927391052246, "losses/sft": 2.1165075302124023, "losses/total": 0.41185927391052246, "ref_logps/chosen": -28.40663719177246, "ref_logps/rejected": -42.666194915771484, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2150993347167969, "rewards/margins": 1.8383378982543945, "rewards/rejected": -3.0534374713897705, "step": 1104 }, { "epoch": 1.04, "grad_norm": 13.31177035789886, "learning_rate": 3.7916676718190186e-07, "logps/chosen": -35.45941925048828, "logps/rejected": -62.043426513671875, "loss": 0.2544, "losses/dpo": 0.3896048665046692, "losses/sft": 1.3083559274673462, "losses/total": 0.3896048665046692, "ref_logps/chosen": -25.808868408203125, "ref_logps/rejected": -35.90084457397461, "rewards/accuracies": 1.0, "rewards/chosen": -0.9650551080703735, "rewards/margins": 1.6492030620574951, "rewards/rejected": -2.614258289337158, "step": 1105 }, { "epoch": 1.04, "grad_norm": 24.301951836826014, "learning_rate": 3.7894865635564106e-07, "logps/chosen": -42.53192901611328, "logps/rejected": -50.78026580810547, "loss": 0.4597, "losses/dpo": 0.07871002703905106, "losses/sft": 1.1203540563583374, "losses/total": 0.07871002703905106, "ref_logps/chosen": -30.558195114135742, "ref_logps/rejected": -28.999996185302734, "rewards/accuracies": 0.875, "rewards/chosen": -1.197373390197754, "rewards/margins": 0.980653703212738, "rewards/rejected": -2.1780271530151367, "step": 1106 }, { "epoch": 1.04, "grad_norm": 16.571014982380365, "learning_rate": 3.787304117196207e-07, "logps/chosen": -36.11324691772461, "logps/rejected": -52.755069732666016, "loss": 0.3436, "losses/dpo": 0.03720670938491821, "losses/sft": 0.7503969669342041, "losses/total": 0.03720670938491821, "ref_logps/chosen": -25.766523361206055, "ref_logps/rejected": -31.078929901123047, "rewards/accuracies": 0.9375, "rewards/chosen": -1.034672498703003, "rewards/margins": 1.1329413652420044, "rewards/rejected": -2.167613983154297, "step": 1107 }, { "epoch": 1.05, "grad_norm": 12.482011310578331, "learning_rate": 3.785120335003128e-07, "logps/chosen": -30.518686294555664, "logps/rejected": -51.21973419189453, "loss": 0.2747, "losses/dpo": 0.14470818638801575, "losses/sft": 0.7433618903160095, "losses/total": 0.14470818638801575, "ref_logps/chosen": -22.51644515991211, "ref_logps/rejected": -28.712894439697266, "rewards/accuracies": 1.0, "rewards/chosen": -0.8002240657806396, "rewards/margins": 1.4504599571228027, "rewards/rejected": -2.2506840229034424, "step": 1108 }, { "epoch": 1.05, "grad_norm": 16.667313586766614, "learning_rate": 3.782935219243281e-07, "logps/chosen": -29.074893951416016, "logps/rejected": -46.370445251464844, "loss": 0.4429, "losses/dpo": 0.5558191537857056, "losses/sft": 0.4144127666950226, "losses/total": 0.5558191537857056, "ref_logps/chosen": -20.37546157836914, "ref_logps/rejected": -27.845596313476562, "rewards/accuracies": 0.875, "rewards/chosen": -0.8699431419372559, "rewards/margins": 0.9825418591499329, "rewards/rejected": -1.8524850606918335, "step": 1109 }, { "epoch": 1.05, "grad_norm": 20.47099493891291, "learning_rate": 3.780748772184154e-07, "logps/chosen": -43.92449951171875, "logps/rejected": -59.10591125488281, "loss": 0.3972, "losses/dpo": 0.6909176707267761, "losses/sft": 0.8648082613945007, "losses/total": 0.6909176707267761, "ref_logps/chosen": -32.02928924560547, "ref_logps/rejected": -32.77596664428711, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1895209550857544, "rewards/margins": 1.4434735774993896, "rewards/rejected": -2.6329946517944336, "step": 1110 }, { "epoch": 1.05, "grad_norm": 16.088642944549594, "learning_rate": 3.778560996094621e-07, "logps/chosen": -42.683311462402344, "logps/rejected": -59.57121276855469, "loss": 0.3169, "losses/dpo": 0.4571039080619812, "losses/sft": 1.1095010042190552, "losses/total": 0.4571039080619812, "ref_logps/chosen": -29.334747314453125, "ref_logps/rejected": -34.535301208496094, "rewards/accuracies": 1.0, "rewards/chosen": -1.3348567485809326, "rewards/margins": 1.1687347888946533, "rewards/rejected": -2.503591537475586, "step": 1111 }, { "epoch": 1.05, "grad_norm": 22.043103263215862, "learning_rate": 3.776371893244932e-07, "logps/chosen": -39.07246780395508, "logps/rejected": -50.83562469482422, "loss": 0.5179, "losses/dpo": 0.4546867609024048, "losses/sft": 2.2165632247924805, "losses/total": 0.4546867609024048, "ref_logps/chosen": -26.281269073486328, "ref_logps/rejected": -31.12175178527832, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2791199684143066, "rewards/margins": 0.6922674179077148, "rewards/rejected": -1.9713873863220215, "step": 1112 }, { "epoch": 1.05, "grad_norm": 18.60268076781509, "learning_rate": 3.774181465906714e-07, "logps/chosen": -45.89662551879883, "logps/rejected": -60.78418731689453, "loss": 0.309, "losses/dpo": 0.19460786879062653, "losses/sft": 0.924973726272583, "losses/total": 0.19460786879062653, "ref_logps/chosen": -33.91802978515625, "ref_logps/rejected": -35.48485565185547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1978594064712524, "rewards/margins": 1.3320739269256592, "rewards/rejected": -2.529933452606201, "step": 1113 }, { "epoch": 1.05, "grad_norm": 13.45817524584172, "learning_rate": 3.7719897163529704e-07, "logps/chosen": -36.08510208129883, "logps/rejected": -78.39775848388672, "loss": 0.1919, "losses/dpo": 0.08737989515066147, "losses/sft": 1.045072078704834, "losses/total": 0.08737989515066147, "ref_logps/chosen": -26.563501358032227, "ref_logps/rejected": -48.45402145385742, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9521602392196655, "rewards/margins": 2.0422139167785645, "rewards/rejected": -2.9943742752075195, "step": 1114 }, { "epoch": 1.05, "grad_norm": 18.04175826728047, "learning_rate": 3.7697966468580735e-07, "logps/chosen": -58.84900665283203, "logps/rejected": -72.17625427246094, "loss": 0.3047, "losses/dpo": 0.31800130009651184, "losses/sft": 0.7146487236022949, "losses/total": 0.31800130009651184, "ref_logps/chosen": -43.284637451171875, "ref_logps/rejected": -40.4155158996582, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5564366579055786, "rewards/margins": 1.6196370124816895, "rewards/rejected": -3.1760735511779785, "step": 1115 }, { "epoch": 1.05, "grad_norm": 16.421530847572384, "learning_rate": 3.767602259697769e-07, "logps/chosen": -31.440263748168945, "logps/rejected": -67.2935562133789, "loss": 0.3051, "losses/dpo": 0.060911960899829865, "losses/sft": 1.4599039554595947, "losses/total": 0.060911960899829865, "ref_logps/chosen": -22.380176544189453, "ref_logps/rejected": -43.92906951904297, "rewards/accuracies": 0.9375, "rewards/chosen": -0.906008780002594, "rewards/margins": 1.4304399490356445, "rewards/rejected": -2.3364486694335938, "step": 1116 }, { "epoch": 1.05, "grad_norm": 12.489556426835923, "learning_rate": 3.765406557149166e-07, "logps/chosen": -26.344642639160156, "logps/rejected": -55.011112213134766, "loss": 0.3065, "losses/dpo": 0.0809282585978508, "losses/sft": 0.8107130527496338, "losses/total": 0.0809282585978508, "ref_logps/chosen": -21.459014892578125, "ref_logps/rejected": -34.87617874145508, "rewards/accuracies": 0.875, "rewards/chosen": -0.48856282234191895, "rewards/margins": 1.5249308347702026, "rewards/rejected": -2.013493537902832, "step": 1117 }, { "epoch": 1.05, "grad_norm": 13.289658198124656, "learning_rate": 3.7632095414907424e-07, "logps/chosen": -33.79952621459961, "logps/rejected": -55.50947570800781, "loss": 0.2647, "losses/dpo": 0.048286642879247665, "losses/sft": 1.0615421533584595, "losses/total": 0.048286642879247665, "ref_logps/chosen": -24.609817504882812, "ref_logps/rejected": -31.29438018798828, "rewards/accuracies": 1.0, "rewards/chosen": -0.9189706444740295, "rewards/margins": 1.5025386810302734, "rewards/rejected": -2.421509265899658, "step": 1118 }, { "epoch": 1.06, "grad_norm": 10.791632994776428, "learning_rate": 3.7610112150023355e-07, "logps/chosen": -34.363285064697266, "logps/rejected": -63.1780891418457, "loss": 0.2028, "losses/dpo": 0.41454777121543884, "losses/sft": 0.9079773426055908, "losses/total": 0.41454777121543884, "ref_logps/chosen": -24.87932586669922, "ref_logps/rejected": -35.7593994140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.948395848274231, "rewards/margins": 1.7934728860855103, "rewards/rejected": -2.741868495941162, "step": 1119 }, { "epoch": 1.06, "grad_norm": 16.23041901982753, "learning_rate": 3.758811579965144e-07, "logps/chosen": -42.52210235595703, "logps/rejected": -67.04352569580078, "loss": 0.2952, "losses/dpo": 0.44898170232772827, "losses/sft": 1.0319433212280273, "losses/total": 0.44898170232772827, "ref_logps/chosen": -27.276161193847656, "ref_logps/rejected": -36.90192413330078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5245944261550903, "rewards/margins": 1.4895652532577515, "rewards/rejected": -3.014159679412842, "step": 1120 }, { "epoch": 1.06, "grad_norm": 26.161889770225688, "learning_rate": 3.7566106386617267e-07, "logps/chosen": -41.75128936767578, "logps/rejected": -45.5972900390625, "loss": 0.5823, "losses/dpo": 0.39460670948028564, "losses/sft": 1.6290218830108643, "losses/total": 0.39460670948028564, "ref_logps/chosen": -28.24317169189453, "ref_logps/rejected": -25.853424072265625, "rewards/accuracies": 0.75, "rewards/chosen": -1.35081148147583, "rewards/margins": 0.6235753297805786, "rewards/rejected": -1.9743866920471191, "step": 1121 }, { "epoch": 1.06, "grad_norm": 13.747382168676909, "learning_rate": 3.754408393375995e-07, "logps/chosen": -39.248046875, "logps/rejected": -59.3335075378418, "loss": 0.2632, "losses/dpo": 0.2207656055688858, "losses/sft": 1.1781947612762451, "losses/total": 0.2207656055688858, "ref_logps/chosen": -30.359527587890625, "ref_logps/rejected": -33.3444709777832, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8888522386550903, "rewards/margins": 1.7100517749786377, "rewards/rejected": -2.5989041328430176, "step": 1122 }, { "epoch": 1.06, "grad_norm": 11.8188857800886, "learning_rate": 3.752204846393214e-07, "logps/chosen": -32.01994705200195, "logps/rejected": -53.63996124267578, "loss": 0.2722, "losses/dpo": 0.22217565774917603, "losses/sft": 0.9825397729873657, "losses/total": 0.22217565774917603, "ref_logps/chosen": -23.055761337280273, "ref_logps/rejected": -28.4815731048584, "rewards/accuracies": 0.9375, "rewards/chosen": -0.896418571472168, "rewards/margins": 1.619420051574707, "rewards/rejected": -2.515838623046875, "step": 1123 }, { "epoch": 1.06, "grad_norm": 22.081823606905356, "learning_rate": 3.75e-07, "logps/chosen": -40.152244567871094, "logps/rejected": -55.49237823486328, "loss": 0.4335, "losses/dpo": 0.037700455635786057, "losses/sft": 1.4763962030410767, "losses/total": 0.037700455635786057, "ref_logps/chosen": -31.103342056274414, "ref_logps/rejected": -35.86906814575195, "rewards/accuracies": 0.75, "rewards/chosen": -0.9048903584480286, "rewards/margins": 1.057440161705017, "rewards/rejected": -1.9623305797576904, "step": 1124 }, { "epoch": 1.06, "grad_norm": 19.398066153955348, "learning_rate": 3.747793856484318e-07, "logps/chosen": -42.12263107299805, "logps/rejected": -54.187278747558594, "loss": 0.3966, "losses/dpo": 0.08542654663324356, "losses/sft": 0.38936033844947815, "losses/total": 0.08542654663324356, "ref_logps/chosen": -30.175207138061523, "ref_logps/rejected": -31.754423141479492, "rewards/accuracies": 0.875, "rewards/chosen": -1.1947426795959473, "rewards/margins": 1.0485429763793945, "rewards/rejected": -2.243285655975342, "step": 1125 }, { "epoch": 1.06, "grad_norm": 13.774018975333801, "learning_rate": 3.745586418135478e-07, "logps/chosen": -31.25288963317871, "logps/rejected": -52.20122528076172, "loss": 0.3015, "losses/dpo": 0.36921557784080505, "losses/sft": 0.3349493145942688, "losses/total": 0.36921557784080505, "ref_logps/chosen": -21.743938446044922, "ref_logps/rejected": -30.061241149902344, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9508951902389526, "rewards/margins": 1.2631030082702637, "rewards/rejected": -2.213998317718506, "step": 1126 }, { "epoch": 1.06, "grad_norm": 19.142421649967552, "learning_rate": 3.7433776872441345e-07, "logps/chosen": -36.88043212890625, "logps/rejected": -51.17293930053711, "loss": 0.4006, "losses/dpo": 0.12529900670051575, "losses/sft": 2.0387213230133057, "losses/total": 0.12529900670051575, "ref_logps/chosen": -25.264528274536133, "ref_logps/rejected": -29.2346134185791, "rewards/accuracies": 0.875, "rewards/chosen": -1.1615904569625854, "rewards/margins": 1.032241940498352, "rewards/rejected": -2.1938323974609375, "step": 1127 }, { "epoch": 1.06, "grad_norm": 18.35266420691116, "learning_rate": 3.7411676661022836e-07, "logps/chosen": -41.74523162841797, "logps/rejected": -52.81224822998047, "loss": 0.39, "losses/dpo": 0.16627496480941772, "losses/sft": 1.6007215976715088, "losses/total": 0.16627496480941772, "ref_logps/chosen": -31.160268783569336, "ref_logps/rejected": -33.11539077758789, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0584964752197266, "rewards/margins": 0.9111891984939575, "rewards/rejected": -1.9696855545043945, "step": 1128 }, { "epoch": 1.07, "grad_norm": 15.981810729293946, "learning_rate": 3.738956357003259e-07, "logps/chosen": -32.892337799072266, "logps/rejected": -55.69861602783203, "loss": 0.4271, "losses/dpo": 0.12092901021242142, "losses/sft": 1.5354416370391846, "losses/total": 0.12092901021242142, "ref_logps/chosen": -24.368255615234375, "ref_logps/rejected": -36.50320053100586, "rewards/accuracies": 0.8125, "rewards/chosen": -0.852408230304718, "rewards/margins": 1.0671336650848389, "rewards/rejected": -1.919541835784912, "step": 1129 }, { "epoch": 1.07, "grad_norm": 14.905762711005769, "learning_rate": 3.7367437622417323e-07, "logps/chosen": -30.06852149963379, "logps/rejected": -55.12791442871094, "loss": 0.3394, "losses/dpo": 0.10314831137657166, "losses/sft": 1.5138516426086426, "losses/total": 0.10314831137657166, "ref_logps/chosen": -22.721220016479492, "ref_logps/rejected": -34.985801696777344, "rewards/accuracies": 0.875, "rewards/chosen": -0.7347301840782166, "rewards/margins": 1.279481291770935, "rewards/rejected": -2.014211416244507, "step": 1130 }, { "epoch": 1.07, "grad_norm": 14.301984800429539, "learning_rate": 3.734529884113709e-07, "logps/chosen": -37.34754180908203, "logps/rejected": -60.14391326904297, "loss": 0.278, "losses/dpo": 0.204329252243042, "losses/sft": 1.3507181406021118, "losses/total": 0.204329252243042, "ref_logps/chosen": -26.989818572998047, "ref_logps/rejected": -34.680641174316406, "rewards/accuracies": 1.0, "rewards/chosen": -1.0357720851898193, "rewards/margins": 1.5105555057525635, "rewards/rejected": -2.546327590942383, "step": 1131 }, { "epoch": 1.07, "grad_norm": 13.50860000778687, "learning_rate": 3.732314724916525e-07, "logps/chosen": -32.554771423339844, "logps/rejected": -49.97675704956055, "loss": 0.3156, "losses/dpo": 1.0094473361968994, "losses/sft": 1.4226444959640503, "losses/total": 1.0094473361968994, "ref_logps/chosen": -25.62384796142578, "ref_logps/rejected": -27.81631088256836, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6930919885635376, "rewards/margins": 1.5229525566101074, "rewards/rejected": -2.2160446643829346, "step": 1132 }, { "epoch": 1.07, "grad_norm": 21.311946027059975, "learning_rate": 3.7300982869488464e-07, "logps/chosen": -44.480003356933594, "logps/rejected": -67.85069274902344, "loss": 0.391, "losses/dpo": 0.05302799120545387, "losses/sft": 1.2256252765655518, "losses/total": 0.05302799120545387, "ref_logps/chosen": -34.03656005859375, "ref_logps/rejected": -42.61008834838867, "rewards/accuracies": 0.9375, "rewards/chosen": -1.044344425201416, "rewards/margins": 1.4797159433364868, "rewards/rejected": -2.5240602493286133, "step": 1133 }, { "epoch": 1.07, "grad_norm": 17.31807101180641, "learning_rate": 3.727880572510667e-07, "logps/chosen": -41.32978439331055, "logps/rejected": -61.32545471191406, "loss": 0.3367, "losses/dpo": 0.26070013642311096, "losses/sft": 1.0575531721115112, "losses/total": 0.26070013642311096, "ref_logps/chosen": -31.065387725830078, "ref_logps/rejected": -37.722904205322266, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0264397859573364, "rewards/margins": 1.3338148593902588, "rewards/rejected": -2.3602545261383057, "step": 1134 }, { "epoch": 1.07, "grad_norm": 16.711592035608337, "learning_rate": 3.7256615839033046e-07, "logps/chosen": -46.947425842285156, "logps/rejected": -70.16259765625, "loss": 0.336, "losses/dpo": 0.11249568313360214, "losses/sft": 1.6243163347244263, "losses/total": 0.11249568313360214, "ref_logps/chosen": -33.91420364379883, "ref_logps/rejected": -44.941856384277344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.303322434425354, "rewards/margins": 1.2187515497207642, "rewards/rejected": -2.5220742225646973, "step": 1135 }, { "epoch": 1.07, "grad_norm": 14.096460947627442, "learning_rate": 3.7234413234293994e-07, "logps/chosen": -36.598358154296875, "logps/rejected": -45.75627136230469, "loss": 0.3598, "losses/dpo": 1.0519461631774902, "losses/sft": 0.792489767074585, "losses/total": 1.0519461631774902, "ref_logps/chosen": -27.261587142944336, "ref_logps/rejected": -23.902400970458984, "rewards/accuracies": 0.8125, "rewards/chosen": -0.933677077293396, "rewards/margins": 1.2517099380493164, "rewards/rejected": -2.185387134552002, "step": 1136 }, { "epoch": 1.07, "grad_norm": 14.185399134822012, "learning_rate": 3.7212197933929107e-07, "logps/chosen": -29.725921630859375, "logps/rejected": -64.5687255859375, "loss": 0.2392, "losses/dpo": 0.02677346020936966, "losses/sft": 0.7307178378105164, "losses/total": 0.02677346020936966, "ref_logps/chosen": -21.189708709716797, "ref_logps/rejected": -36.58026885986328, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8536215424537659, "rewards/margins": 1.9452241659164429, "rewards/rejected": -2.7988455295562744, "step": 1137 }, { "epoch": 1.07, "grad_norm": 15.368626164296193, "learning_rate": 3.718996996099116e-07, "logps/chosen": -28.034603118896484, "logps/rejected": -52.97584533691406, "loss": 0.3012, "losses/dpo": 0.3080601096153259, "losses/sft": 0.9264421463012695, "losses/total": 0.3080601096153259, "ref_logps/chosen": -21.430675506591797, "ref_logps/rejected": -32.42595672607422, "rewards/accuracies": 1.0, "rewards/chosen": -0.6603927612304688, "rewards/margins": 1.3945962190628052, "rewards/rejected": -2.0549890995025635, "step": 1138 }, { "epoch": 1.07, "grad_norm": 14.772014322035611, "learning_rate": 3.716772933854606e-07, "logps/chosen": -32.93284606933594, "logps/rejected": -51.13562774658203, "loss": 0.3453, "losses/dpo": 0.3063950836658478, "losses/sft": 0.7278348803520203, "losses/total": 0.3063950836658478, "ref_logps/chosen": -24.89127540588379, "ref_logps/rejected": -30.550670623779297, "rewards/accuracies": 0.9375, "rewards/chosen": -0.804157018661499, "rewards/margins": 1.2543388605117798, "rewards/rejected": -2.0584959983825684, "step": 1139 }, { "epoch": 1.08, "grad_norm": 18.350924380983965, "learning_rate": 3.714547608967288e-07, "logps/chosen": -44.15406799316406, "logps/rejected": -59.04606628417969, "loss": 0.3518, "losses/dpo": 0.041308458894491196, "losses/sft": 1.4066367149353027, "losses/total": 0.041308458894491196, "ref_logps/chosen": -35.309913635253906, "ref_logps/rejected": -37.75583267211914, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8844153881072998, "rewards/margins": 1.2446081638336182, "rewards/rejected": -2.129023551940918, "step": 1140 }, { "epoch": 1.08, "grad_norm": 15.145717014135224, "learning_rate": 3.7123210237463767e-07, "logps/chosen": -38.424495697021484, "logps/rejected": -51.382320404052734, "loss": 0.3453, "losses/dpo": 0.16456346213817596, "losses/sft": 1.756105661392212, "losses/total": 0.16456346213817596, "ref_logps/chosen": -31.482168197631836, "ref_logps/rejected": -30.108928680419922, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6942325830459595, "rewards/margins": 1.433106541633606, "rewards/rejected": -2.1273391246795654, "step": 1141 }, { "epoch": 1.08, "grad_norm": 15.933071765560923, "learning_rate": 3.710093180502394e-07, "logps/chosen": -38.660545349121094, "logps/rejected": -60.34956741333008, "loss": 0.3525, "losses/dpo": 0.4542374610900879, "losses/sft": 1.5843288898468018, "losses/total": 0.4542374610900879, "ref_logps/chosen": -26.22637176513672, "ref_logps/rejected": -35.75702667236328, "rewards/accuracies": 0.875, "rewards/chosen": -1.243417501449585, "rewards/margins": 1.215836524963379, "rewards/rejected": -2.459254026412964, "step": 1142 }, { "epoch": 1.08, "grad_norm": 16.89593588588707, "learning_rate": 3.7078640815471686e-07, "logps/chosen": -36.06999206542969, "logps/rejected": -58.069332122802734, "loss": 0.263, "losses/dpo": 0.16459773480892181, "losses/sft": 1.1021100282669067, "losses/total": 0.16459773480892181, "ref_logps/chosen": -25.913654327392578, "ref_logps/rejected": -32.255165100097656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.015633463859558, "rewards/margins": 1.5657832622528076, "rewards/rejected": -2.5814168453216553, "step": 1143 }, { "epoch": 1.08, "grad_norm": 12.203970856015362, "learning_rate": 3.705633729193832e-07, "logps/chosen": -39.959571838378906, "logps/rejected": -69.95220184326172, "loss": 0.2282, "losses/dpo": 0.15270398557186127, "losses/sft": 1.5712491273880005, "losses/total": 0.15270398557186127, "ref_logps/chosen": -29.05797576904297, "ref_logps/rejected": -40.5700798034668, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0901594161987305, "rewards/margins": 1.848052740097046, "rewards/rejected": -2.9382119178771973, "step": 1144 }, { "epoch": 1.08, "grad_norm": 14.047522077457993, "learning_rate": 3.703402125756817e-07, "logps/chosen": -39.736907958984375, "logps/rejected": -62.808753967285156, "loss": 0.2597, "losses/dpo": 0.3760320246219635, "losses/sft": 1.4344818592071533, "losses/total": 0.3760320246219635, "ref_logps/chosen": -29.187559127807617, "ref_logps/rejected": -34.78355407714844, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0549347400665283, "rewards/margins": 1.7475850582122803, "rewards/rejected": -2.8025197982788086, "step": 1145 }, { "epoch": 1.08, "grad_norm": 14.463511076794397, "learning_rate": 3.701169273551853e-07, "logps/chosen": -34.239501953125, "logps/rejected": -54.10880661010742, "loss": 0.3491, "losses/dpo": 0.203969806432724, "losses/sft": 1.5520085096359253, "losses/total": 0.203969806432724, "ref_logps/chosen": -24.517215728759766, "ref_logps/rejected": -30.166852951049805, "rewards/accuracies": 0.875, "rewards/chosen": -0.9722285866737366, "rewards/margins": 1.421966791152954, "rewards/rejected": -2.394195318222046, "step": 1146 }, { "epoch": 1.08, "grad_norm": 12.976240540985023, "learning_rate": 3.6989351748959673e-07, "logps/chosen": -28.792238235473633, "logps/rejected": -55.76020050048828, "loss": 0.2802, "losses/dpo": 0.20839889347553253, "losses/sft": 1.7647053003311157, "losses/total": 0.20839889347553253, "ref_logps/chosen": -19.54996109008789, "ref_logps/rejected": -28.534038543701172, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9242275953292847, "rewards/margins": 1.798388957977295, "rewards/rejected": -2.72261643409729, "step": 1147 }, { "epoch": 1.08, "grad_norm": 15.063738933494806, "learning_rate": 3.696699832107478e-07, "logps/chosen": -32.126747131347656, "logps/rejected": -63.32782745361328, "loss": 0.3311, "losses/dpo": 0.2737649381160736, "losses/sft": 1.1443296670913696, "losses/total": 0.2737649381160736, "ref_logps/chosen": -23.932968139648438, "ref_logps/rejected": -39.90805435180664, "rewards/accuracies": 0.875, "rewards/chosen": -0.8193778395652771, "rewards/margins": 1.522599458694458, "rewards/rejected": -2.341977119445801, "step": 1148 }, { "epoch": 1.08, "grad_norm": 14.629170869694933, "learning_rate": 3.694463247505998e-07, "logps/chosen": -25.257020950317383, "logps/rejected": -49.931941986083984, "loss": 0.3588, "losses/dpo": 0.33933940529823303, "losses/sft": 0.2589021921157837, "losses/total": 0.33933940529823303, "ref_logps/chosen": -16.831134796142578, "ref_logps/rejected": -26.699142456054688, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8425883650779724, "rewards/margins": 1.4806914329528809, "rewards/rejected": -2.323279857635498, "step": 1149 }, { "epoch": 1.08, "grad_norm": 14.944133158421707, "learning_rate": 3.6922254234124245e-07, "logps/chosen": -35.38117980957031, "logps/rejected": -55.17569351196289, "loss": 0.3435, "losses/dpo": 0.22626924514770508, "losses/sft": 0.8597681522369385, "losses/total": 0.22626924514770508, "ref_logps/chosen": -26.92217254638672, "ref_logps/rejected": -32.34584045410156, "rewards/accuracies": 0.875, "rewards/chosen": -0.8459004163742065, "rewards/margins": 1.4370851516723633, "rewards/rejected": -2.2829854488372803, "step": 1150 }, { "epoch": 1.09, "grad_norm": 22.817300464297176, "learning_rate": 3.6899863621489433e-07, "logps/chosen": -50.50436019897461, "logps/rejected": -66.15071868896484, "loss": 0.3442, "losses/dpo": 0.4697788953781128, "losses/sft": 1.634348750114441, "losses/total": 0.4697788953781128, "ref_logps/chosen": -39.071990966796875, "ref_logps/rejected": -40.59386444091797, "rewards/accuracies": 0.875, "rewards/chosen": -1.1432369947433472, "rewards/margins": 1.4124484062194824, "rewards/rejected": -2.555685520172119, "step": 1151 }, { "epoch": 1.09, "grad_norm": 19.88642328505732, "learning_rate": 3.6877460660390253e-07, "logps/chosen": -40.545894622802734, "logps/rejected": -71.78766632080078, "loss": 0.3301, "losses/dpo": 0.40462198853492737, "losses/sft": 2.0589406490325928, "losses/total": 0.40462198853492737, "ref_logps/chosen": -28.705148696899414, "ref_logps/rejected": -44.267417907714844, "rewards/accuracies": 0.875, "rewards/chosen": -1.1840746402740479, "rewards/margins": 1.5679502487182617, "rewards/rejected": -2.7520251274108887, "step": 1152 }, { "epoch": 1.09, "grad_norm": 19.190429115402903, "learning_rate": 3.68550453740742e-07, "logps/chosen": -38.496131896972656, "logps/rejected": -51.738983154296875, "loss": 0.4729, "losses/dpo": 0.06766179949045181, "losses/sft": 1.4017270803451538, "losses/total": 0.06766179949045181, "ref_logps/chosen": -27.166332244873047, "ref_logps/rejected": -30.44839096069336, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1329798698425293, "rewards/margins": 0.9960795640945435, "rewards/rejected": -2.129059314727783, "step": 1153 }, { "epoch": 1.09, "grad_norm": 12.110788083893437, "learning_rate": 3.6832617785801575e-07, "logps/chosen": -41.464176177978516, "logps/rejected": -54.93772888183594, "loss": 0.2258, "losses/dpo": 0.6445057988166809, "losses/sft": 0.8022320866584778, "losses/total": 0.6445057988166809, "ref_logps/chosen": -33.03085708618164, "ref_logps/rejected": -28.001384735107422, "rewards/accuracies": 1.0, "rewards/chosen": -0.8433316946029663, "rewards/margins": 1.8503031730651855, "rewards/rejected": -2.6936349868774414, "step": 1154 }, { "epoch": 1.09, "grad_norm": 13.845087022094274, "learning_rate": 3.681017791884543e-07, "logps/chosen": -35.6863899230957, "logps/rejected": -56.72791290283203, "loss": 0.2415, "losses/dpo": 0.18823689222335815, "losses/sft": 0.9471169710159302, "losses/total": 0.18823689222335815, "ref_logps/chosen": -26.57056427001953, "ref_logps/rejected": -31.639842987060547, "rewards/accuracies": 1.0, "rewards/chosen": -0.9115824699401855, "rewards/margins": 1.5972247123718262, "rewards/rejected": -2.5088071823120117, "step": 1155 }, { "epoch": 1.09, "grad_norm": 15.128181771114411, "learning_rate": 3.678772579649159e-07, "logps/chosen": -47.926849365234375, "logps/rejected": -68.37867736816406, "loss": 0.2441, "losses/dpo": 0.41214409470558167, "losses/sft": 1.940225601196289, "losses/total": 0.41214409470558167, "ref_logps/chosen": -31.851163864135742, "ref_logps/rejected": -36.436336517333984, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6075685024261475, "rewards/margins": 1.5866661071777344, "rewards/rejected": -3.1942343711853027, "step": 1156 }, { "epoch": 1.09, "grad_norm": 15.789415140257617, "learning_rate": 3.676526144203855e-07, "logps/chosen": -43.38681411743164, "logps/rejected": -63.054412841796875, "loss": 0.2921, "losses/dpo": 0.11054857820272446, "losses/sft": 1.8503073453903198, "losses/total": 0.11054857820272446, "ref_logps/chosen": -31.661758422851562, "ref_logps/rejected": -36.93024444580078, "rewards/accuracies": 1.0, "rewards/chosen": -1.1725056171417236, "rewards/margins": 1.4399113655090332, "rewards/rejected": -2.612417221069336, "step": 1157 }, { "epoch": 1.09, "grad_norm": 18.336129083230084, "learning_rate": 3.6742784878797556e-07, "logps/chosen": -45.04655456542969, "logps/rejected": -56.708255767822266, "loss": 0.3876, "losses/dpo": 1.0078297853469849, "losses/sft": 0.6982911825180054, "losses/total": 1.0078297853469849, "ref_logps/chosen": -32.7289924621582, "ref_logps/rejected": -32.64131164550781, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2317564487457275, "rewards/margins": 1.1749378442764282, "rewards/rejected": -2.4066944122314453, "step": 1158 }, { "epoch": 1.09, "grad_norm": 16.652529938252805, "learning_rate": 3.672029613009247e-07, "logps/chosen": -26.971498489379883, "logps/rejected": -64.96145629882812, "loss": 0.267, "losses/dpo": 0.4262029230594635, "losses/sft": 0.899478018283844, "losses/total": 0.4262029230594635, "ref_logps/chosen": -19.12271499633789, "ref_logps/rejected": -37.939247131347656, "rewards/accuracies": 0.9375, "rewards/chosen": -0.784878134727478, "rewards/margins": 1.9173431396484375, "rewards/rejected": -2.702221393585205, "step": 1159 }, { "epoch": 1.09, "grad_norm": 16.66057034711534, "learning_rate": 3.669779521925983e-07, "logps/chosen": -38.857887268066406, "logps/rejected": -52.22222137451172, "loss": 0.4106, "losses/dpo": 0.18223586678504944, "losses/sft": 1.8458176851272583, "losses/total": 0.18223586678504944, "ref_logps/chosen": -27.680484771728516, "ref_logps/rejected": -27.30859375, "rewards/accuracies": 0.875, "rewards/chosen": -1.1177400350570679, "rewards/margins": 1.3736228942871094, "rewards/rejected": -2.491363048553467, "step": 1160 }, { "epoch": 1.1, "grad_norm": 18.276503343015857, "learning_rate": 3.6675282169648786e-07, "logps/chosen": -32.42135238647461, "logps/rejected": -55.69471740722656, "loss": 0.3714, "losses/dpo": 0.06502498686313629, "losses/sft": 1.4450342655181885, "losses/total": 0.06502498686313629, "ref_logps/chosen": -22.192684173583984, "ref_logps/rejected": -31.306631088256836, "rewards/accuracies": 0.875, "rewards/chosen": -1.02286696434021, "rewards/margins": 1.4159417152404785, "rewards/rejected": -2.4388084411621094, "step": 1161 }, { "epoch": 1.1, "grad_norm": 14.16101077528635, "learning_rate": 3.665275700462108e-07, "logps/chosen": -43.86423873901367, "logps/rejected": -65.67314910888672, "loss": 0.303, "losses/dpo": 0.34109148383140564, "losses/sft": 0.25651803612709045, "losses/total": 0.34109148383140564, "ref_logps/chosen": -33.537330627441406, "ref_logps/rejected": -38.98838806152344, "rewards/accuracies": 0.875, "rewards/chosen": -1.0326908826828003, "rewards/margins": 1.6357855796813965, "rewards/rejected": -2.6684765815734863, "step": 1162 }, { "epoch": 1.1, "grad_norm": 13.774998118167181, "learning_rate": 3.6630219747551034e-07, "logps/chosen": -36.90766143798828, "logps/rejected": -63.641937255859375, "loss": 0.3227, "losses/dpo": 0.12388196587562561, "losses/sft": 1.7424076795578003, "losses/total": 0.12388196587562561, "ref_logps/chosen": -27.462276458740234, "ref_logps/rejected": -38.85765838623047, "rewards/accuracies": 0.875, "rewards/chosen": -0.9445387721061707, "rewards/margins": 1.5338889360427856, "rewards/rejected": -2.4784276485443115, "step": 1163 }, { "epoch": 1.1, "grad_norm": 10.286803465464185, "learning_rate": 3.6607670421825505e-07, "logps/chosen": -28.386877059936523, "logps/rejected": -55.94755554199219, "loss": 0.2171, "losses/dpo": 0.05209311842918396, "losses/sft": 1.2406623363494873, "losses/total": 0.05209311842918396, "ref_logps/chosen": -22.611927032470703, "ref_logps/rejected": -31.131893157958984, "rewards/accuracies": 1.0, "rewards/chosen": -0.5774952173233032, "rewards/margins": 1.9040707349777222, "rewards/rejected": -2.4815659523010254, "step": 1164 }, { "epoch": 1.1, "grad_norm": 11.786965264511803, "learning_rate": 3.658510905084389e-07, "logps/chosen": -24.33747100830078, "logps/rejected": -56.959442138671875, "loss": 0.2036, "losses/dpo": 0.15773332118988037, "losses/sft": 0.6059127449989319, "losses/total": 0.15773332118988037, "ref_logps/chosen": -17.458654403686523, "ref_logps/rejected": -31.902603149414062, "rewards/accuracies": 1.0, "rewards/chosen": -0.6878817677497864, "rewards/margins": 1.8178024291992188, "rewards/rejected": -2.5056838989257812, "step": 1165 }, { "epoch": 1.1, "grad_norm": 20.20088927658585, "learning_rate": 3.656253565801808e-07, "logps/chosen": -48.547607421875, "logps/rejected": -64.62349700927734, "loss": 0.3233, "losses/dpo": 0.28993210196495056, "losses/sft": 1.3951910734176636, "losses/total": 0.28993210196495056, "ref_logps/chosen": -35.953304290771484, "ref_logps/rejected": -34.35009002685547, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2594302892684937, "rewards/margins": 1.7679107189178467, "rewards/rejected": -3.02734112739563, "step": 1166 }, { "epoch": 1.1, "grad_norm": 14.313506993586154, "learning_rate": 3.653995026677244e-07, "logps/chosen": -34.97276306152344, "logps/rejected": -53.568389892578125, "loss": 0.2932, "losses/dpo": 0.8101403117179871, "losses/sft": 1.2462060451507568, "losses/total": 0.8101403117179871, "ref_logps/chosen": -23.96877098083496, "ref_logps/rejected": -28.537321090698242, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1003988981246948, "rewards/margins": 1.4027081727981567, "rewards/rejected": -2.5031070709228516, "step": 1167 }, { "epoch": 1.1, "grad_norm": 13.966257838720862, "learning_rate": 3.6517352900543776e-07, "logps/chosen": -37.04841613769531, "logps/rejected": -60.606109619140625, "loss": 0.3048, "losses/dpo": 0.03139398247003555, "losses/sft": 1.5524694919586182, "losses/total": 0.03139398247003555, "ref_logps/chosen": -25.816417694091797, "ref_logps/rejected": -34.64518737792969, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1232001781463623, "rewards/margins": 1.4728922843933105, "rewards/rejected": -2.596092462539673, "step": 1168 }, { "epoch": 1.1, "grad_norm": 15.809129989007044, "learning_rate": 3.649474358278134e-07, "logps/chosen": -40.89691925048828, "logps/rejected": -60.621360778808594, "loss": 0.3615, "losses/dpo": 0.8268435001373291, "losses/sft": 0.9087950587272644, "losses/total": 0.8268435001373291, "ref_logps/chosen": -30.674633026123047, "ref_logps/rejected": -35.69976043701172, "rewards/accuracies": 0.75, "rewards/chosen": -1.0222289562225342, "rewards/margins": 1.4699310064315796, "rewards/rejected": -2.4921600818634033, "step": 1169 }, { "epoch": 1.1, "grad_norm": 14.667626631504541, "learning_rate": 3.6472122336946766e-07, "logps/chosen": -34.94202423095703, "logps/rejected": -53.80838394165039, "loss": 0.2886, "losses/dpo": 0.7717955708503723, "losses/sft": 1.5399495363235474, "losses/total": 0.7717955708503723, "ref_logps/chosen": -23.668901443481445, "ref_logps/rejected": -28.355907440185547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.127312421798706, "rewards/margins": 1.4179353713989258, "rewards/rejected": -2.545247793197632, "step": 1170 }, { "epoch": 1.1, "grad_norm": 23.638453719991976, "learning_rate": 3.644948918651408e-07, "logps/chosen": -41.361114501953125, "logps/rejected": -57.94573211669922, "loss": 0.5189, "losses/dpo": 0.42834702134132385, "losses/sft": 1.2592763900756836, "losses/total": 0.42834702134132385, "ref_logps/chosen": -26.145572662353516, "ref_logps/rejected": -34.23031234741211, "rewards/accuracies": 0.75, "rewards/chosen": -1.5215537548065186, "rewards/margins": 0.849987804889679, "rewards/rejected": -2.3715415000915527, "step": 1171 }, { "epoch": 1.11, "grad_norm": 14.44212097020426, "learning_rate": 3.642684415496965e-07, "logps/chosen": -45.62644958496094, "logps/rejected": -57.094364166259766, "loss": 0.2525, "losses/dpo": 0.1321195662021637, "losses/sft": 1.3071500062942505, "losses/total": 0.1321195662021637, "ref_logps/chosen": -35.30525588989258, "ref_logps/rejected": -28.386226654052734, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0321195125579834, "rewards/margins": 1.8386942148208618, "rewards/rejected": -2.8708133697509766, "step": 1172 }, { "epoch": 1.11, "grad_norm": 18.682583501517996, "learning_rate": 3.640418726581218e-07, "logps/chosen": -50.57537078857422, "logps/rejected": -56.814796447753906, "loss": 0.347, "losses/dpo": 1.2721405029296875, "losses/sft": 1.2079070806503296, "losses/total": 1.2721405029296875, "ref_logps/chosen": -37.78252029418945, "ref_logps/rejected": -30.56707763671875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2792853116989136, "rewards/margins": 1.3454866409301758, "rewards/rejected": -2.624772071838379, "step": 1173 }, { "epoch": 1.11, "grad_norm": 20.75372270376805, "learning_rate": 3.638151854255269e-07, "logps/chosen": -45.486488342285156, "logps/rejected": -64.0670394897461, "loss": 0.4255, "losses/dpo": 0.7996751666069031, "losses/sft": 1.952994704246521, "losses/total": 0.7996751666069031, "ref_logps/chosen": -28.16813087463379, "ref_logps/rejected": -36.762088775634766, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7318356037139893, "rewards/margins": 0.9986597299575806, "rewards/rejected": -2.7304952144622803, "step": 1174 }, { "epoch": 1.11, "grad_norm": 21.622246216937413, "learning_rate": 3.635883800871446e-07, "logps/chosen": -31.899765014648438, "logps/rejected": -57.23695755004883, "loss": 0.4503, "losses/dpo": 0.024823788553476334, "losses/sft": 1.041333556175232, "losses/total": 0.024823788553476334, "ref_logps/chosen": -22.183006286621094, "ref_logps/rejected": -35.26319122314453, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9716756939888, "rewards/margins": 1.2257009744644165, "rewards/rejected": -2.1973767280578613, "step": 1175 }, { "epoch": 1.11, "grad_norm": 19.388720551151778, "learning_rate": 3.633614568783303e-07, "logps/chosen": -43.69221496582031, "logps/rejected": -70.34674835205078, "loss": 0.3278, "losses/dpo": 0.3734048008918762, "losses/sft": 0.9131475687026978, "losses/total": 0.3734048008918762, "ref_logps/chosen": -28.821697235107422, "ref_logps/rejected": -40.64659118652344, "rewards/accuracies": 0.875, "rewards/chosen": -1.4870519638061523, "rewards/margins": 1.4829635620117188, "rewards/rejected": -2.970015525817871, "step": 1176 }, { "epoch": 1.11, "grad_norm": 12.93986241662087, "learning_rate": 3.6313441603456183e-07, "logps/chosen": -41.2238883972168, "logps/rejected": -69.53762817382812, "loss": 0.2266, "losses/dpo": 0.18806269764900208, "losses/sft": 1.2533119916915894, "losses/total": 0.18806269764900208, "ref_logps/chosen": -30.53475570678711, "ref_logps/rejected": -39.543251037597656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.068913221359253, "rewards/margins": 1.9305238723754883, "rewards/rejected": -2.999437093734741, "step": 1177 }, { "epoch": 1.11, "grad_norm": 16.44782651402813, "learning_rate": 3.6290725779143907e-07, "logps/chosen": -29.507793426513672, "logps/rejected": -56.38576126098633, "loss": 0.3461, "losses/dpo": 1.2648119926452637, "losses/sft": 0.5440741777420044, "losses/total": 1.2648119926452637, "ref_logps/chosen": -21.141307830810547, "ref_logps/rejected": -35.60857391357422, "rewards/accuracies": 0.8125, "rewards/chosen": -0.83664870262146, "rewards/margins": 1.2410701513290405, "rewards/rejected": -2.077718734741211, "step": 1178 }, { "epoch": 1.11, "grad_norm": 15.166101847856538, "learning_rate": 3.6267998238468346e-07, "logps/chosen": -37.85680389404297, "logps/rejected": -54.898162841796875, "loss": 0.32, "losses/dpo": 1.1893655061721802, "losses/sft": 1.4567476511001587, "losses/total": 1.1893655061721802, "ref_logps/chosen": -27.288986206054688, "ref_logps/rejected": -30.486814498901367, "rewards/accuracies": 0.9375, "rewards/chosen": -1.056781530380249, "rewards/margins": 1.3843536376953125, "rewards/rejected": -2.4411351680755615, "step": 1179 }, { "epoch": 1.11, "grad_norm": 15.912318635388763, "learning_rate": 3.624525900501384e-07, "logps/chosen": -36.49528121948242, "logps/rejected": -58.16595458984375, "loss": 0.3938, "losses/dpo": 0.6283833384513855, "losses/sft": 1.4886921644210815, "losses/total": 0.6283833384513855, "ref_logps/chosen": -23.67521095275879, "ref_logps/rejected": -33.61874771118164, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2820072174072266, "rewards/margins": 1.1727136373519897, "rewards/rejected": -2.454720973968506, "step": 1180 }, { "epoch": 1.11, "grad_norm": 22.46537671214732, "learning_rate": 3.6222508102376836e-07, "logps/chosen": -32.0987663269043, "logps/rejected": -46.63756561279297, "loss": 0.4515, "losses/dpo": 0.23153544962406158, "losses/sft": 0.5167503952980042, "losses/total": 0.23153544962406158, "ref_logps/chosen": -22.389997482299805, "ref_logps/rejected": -26.014101028442383, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9708768725395203, "rewards/margins": 1.0914697647094727, "rewards/rejected": -2.0623464584350586, "step": 1181 }, { "epoch": 1.12, "grad_norm": 12.227059982117112, "learning_rate": 3.6199745554165906e-07, "logps/chosen": -34.12700271606445, "logps/rejected": -67.58709716796875, "loss": 0.2223, "losses/dpo": 0.13952849805355072, "losses/sft": 0.5604260563850403, "losses/total": 0.13952849805355072, "ref_logps/chosen": -21.64252281188965, "ref_logps/rejected": -36.3958625793457, "rewards/accuracies": 1.0, "rewards/chosen": -1.248448133468628, "rewards/margins": 1.8706753253936768, "rewards/rejected": -3.1191234588623047, "step": 1182 }, { "epoch": 1.12, "grad_norm": 14.819805616220549, "learning_rate": 3.61769713840017e-07, "logps/chosen": -43.9243278503418, "logps/rejected": -66.76425170898438, "loss": 0.2974, "losses/dpo": 0.04654642939567566, "losses/sft": 1.1854225397109985, "losses/total": 0.04654642939567566, "ref_logps/chosen": -30.234390258789062, "ref_logps/rejected": -38.24805450439453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3689937591552734, "rewards/margins": 1.4826258420944214, "rewards/rejected": -2.8516197204589844, "step": 1183 }, { "epoch": 1.12, "grad_norm": 28.20555546798134, "learning_rate": 3.615418561551692e-07, "logps/chosen": -42.796661376953125, "logps/rejected": -56.016902923583984, "loss": 0.4728, "losses/dpo": 1.3695757389068604, "losses/sft": 2.1342008113861084, "losses/total": 1.3695757389068604, "ref_logps/chosen": -27.28619384765625, "ref_logps/rejected": -30.57953453063965, "rewards/accuracies": 0.75, "rewards/chosen": -1.5510464906692505, "rewards/margins": 0.9926904439926147, "rewards/rejected": -2.5437369346618652, "step": 1184 }, { "epoch": 1.12, "grad_norm": 17.83530644049428, "learning_rate": 3.613138827235633e-07, "logps/chosen": -43.21452331542969, "logps/rejected": -56.60072326660156, "loss": 0.3596, "losses/dpo": 0.6823132634162903, "losses/sft": 2.6523289680480957, "losses/total": 0.6823132634162903, "ref_logps/chosen": -33.23937225341797, "ref_logps/rejected": -33.064369201660156, "rewards/accuracies": 0.875, "rewards/chosen": -0.997515082359314, "rewards/margins": 1.3561204671859741, "rewards/rejected": -2.353635311126709, "step": 1185 }, { "epoch": 1.12, "grad_norm": 21.512530458277492, "learning_rate": 3.610857937817667e-07, "logps/chosen": -50.30699157714844, "logps/rejected": -65.2159194946289, "loss": 0.4811, "losses/dpo": 0.5922908782958984, "losses/sft": 1.8395183086395264, "losses/total": 0.5922908782958984, "ref_logps/chosen": -32.95716857910156, "ref_logps/rejected": -36.78293991088867, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7349824905395508, "rewards/margins": 1.1083157062530518, "rewards/rejected": -2.8432981967926025, "step": 1186 }, { "epoch": 1.12, "grad_norm": 15.79493842150878, "learning_rate": 3.6085758956646684e-07, "logps/chosen": -36.47285461425781, "logps/rejected": -49.44633483886719, "loss": 0.3582, "losses/dpo": 0.0070981914177536964, "losses/sft": 1.1429942846298218, "losses/total": 0.0070981914177536964, "ref_logps/chosen": -28.183963775634766, "ref_logps/rejected": -27.27685546875, "rewards/accuracies": 0.875, "rewards/chosen": -0.8288891911506653, "rewards/margins": 1.3880587816238403, "rewards/rejected": -2.2169480323791504, "step": 1187 }, { "epoch": 1.12, "grad_norm": 12.200442514658837, "learning_rate": 3.606292703144709e-07, "logps/chosen": -40.96564483642578, "logps/rejected": -62.34693908691406, "loss": 0.2088, "losses/dpo": 0.09907165169715881, "losses/sft": 1.3731609582901, "losses/total": 0.09907165169715881, "ref_logps/chosen": -29.702468872070312, "ref_logps/rejected": -32.0975227355957, "rewards/accuracies": 1.0, "rewards/chosen": -1.1263175010681152, "rewards/margins": 1.8986241817474365, "rewards/rejected": -3.0249416828155518, "step": 1188 }, { "epoch": 1.12, "grad_norm": 15.342046479833975, "learning_rate": 3.604008362627052e-07, "logps/chosen": -52.87327575683594, "logps/rejected": -76.89970397949219, "loss": 0.2702, "losses/dpo": 0.45030030608177185, "losses/sft": 1.0342566967010498, "losses/total": 0.45030030608177185, "ref_logps/chosen": -39.80231857299805, "ref_logps/rejected": -47.285728454589844, "rewards/accuracies": 1.0, "rewards/chosen": -1.3070958852767944, "rewards/margins": 1.6543012857437134, "rewards/rejected": -2.961397171020508, "step": 1189 }, { "epoch": 1.12, "grad_norm": 20.765521749493644, "learning_rate": 3.6017228764821527e-07, "logps/chosen": -39.59453582763672, "logps/rejected": -53.76583480834961, "loss": 0.4736, "losses/dpo": 0.2180698662996292, "losses/sft": 0.602364182472229, "losses/total": 0.2180698662996292, "ref_logps/chosen": -27.58814811706543, "ref_logps/rejected": -28.8297061920166, "rewards/accuracies": 0.875, "rewards/chosen": -1.200639247894287, "rewards/margins": 1.2929741144180298, "rewards/rejected": -2.4936132431030273, "step": 1190 }, { "epoch": 1.12, "grad_norm": 7.702672674946654, "learning_rate": 3.599436247081656e-07, "logps/chosen": -42.14484786987305, "logps/rejected": -92.52827453613281, "loss": 0.0959, "losses/dpo": 0.01624767668545246, "losses/sft": 0.9124342799186707, "losses/total": 0.01624767668545246, "ref_logps/chosen": -31.960453033447266, "ref_logps/rejected": -51.48301696777344, "rewards/accuracies": 1.0, "rewards/chosen": -1.018439531326294, "rewards/margins": 3.0860862731933594, "rewards/rejected": -4.104525566101074, "step": 1191 }, { "epoch": 1.12, "grad_norm": 15.319720994322742, "learning_rate": 3.597148476798392e-07, "logps/chosen": -36.434452056884766, "logps/rejected": -64.49673461914062, "loss": 0.2953, "losses/dpo": 0.0895497053861618, "losses/sft": 1.7366784811019897, "losses/total": 0.0895497053861618, "ref_logps/chosen": -29.67169761657715, "ref_logps/rejected": -39.106201171875, "rewards/accuracies": 0.875, "rewards/chosen": -0.6762754321098328, "rewards/margins": 1.8627772331237793, "rewards/rejected": -2.5390524864196777, "step": 1192 }, { "epoch": 1.13, "grad_norm": 16.401285329203738, "learning_rate": 3.5948595680063766e-07, "logps/chosen": -39.04680633544922, "logps/rejected": -68.0256576538086, "loss": 0.3351, "losses/dpo": 0.3282080590724945, "losses/sft": 1.5268665552139282, "losses/total": 0.3282080590724945, "ref_logps/chosen": -26.013050079345703, "ref_logps/rejected": -40.09678649902344, "rewards/accuracies": 0.875, "rewards/chosen": -1.3033753633499146, "rewards/margins": 1.4895119667053223, "rewards/rejected": -2.7928872108459473, "step": 1193 }, { "epoch": 1.13, "grad_norm": 18.506045714071533, "learning_rate": 3.592569523080805e-07, "logps/chosen": -41.8995361328125, "logps/rejected": -70.6484375, "loss": 0.2958, "losses/dpo": 0.014705345034599304, "losses/sft": 0.46380969882011414, "losses/total": 0.014705345034599304, "ref_logps/chosen": -31.632503509521484, "ref_logps/rejected": -42.24140930175781, "rewards/accuracies": 0.875, "rewards/chosen": -1.0267032384872437, "rewards/margins": 1.8139996528625488, "rewards/rejected": -2.840703010559082, "step": 1194 }, { "epoch": 1.13, "grad_norm": 13.748652488351022, "learning_rate": 3.590278344398052e-07, "logps/chosen": -50.79377746582031, "logps/rejected": -74.806640625, "loss": 0.2288, "losses/dpo": 0.033863991498947144, "losses/sft": 0.9370471835136414, "losses/total": 0.033863991498947144, "ref_logps/chosen": -38.251564025878906, "ref_logps/rejected": -42.774654388427734, "rewards/accuracies": 1.0, "rewards/chosen": -1.254221796989441, "rewards/margins": 1.948976755142212, "rewards/rejected": -3.2031984329223633, "step": 1195 }, { "epoch": 1.13, "grad_norm": 14.716003192788008, "learning_rate": 3.587986034335669e-07, "logps/chosen": -31.877378463745117, "logps/rejected": -61.99775695800781, "loss": 0.2846, "losses/dpo": 0.24054844677448273, "losses/sft": 1.130856990814209, "losses/total": 0.24054844677448273, "ref_logps/chosen": -21.987751007080078, "ref_logps/rejected": -35.55488586425781, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9889627695083618, "rewards/margins": 1.6553246974945068, "rewards/rejected": -2.644287586212158, "step": 1196 }, { "epoch": 1.13, "grad_norm": 18.74109395258615, "learning_rate": 3.585692595272383e-07, "logps/chosen": -43.16733169555664, "logps/rejected": -73.85990905761719, "loss": 0.2508, "losses/dpo": 0.1732526272535324, "losses/sft": 1.5025995969772339, "losses/total": 0.1732526272535324, "ref_logps/chosen": -29.819351196289062, "ref_logps/rejected": -40.62004852294922, "rewards/accuracies": 0.9375, "rewards/chosen": -1.334797739982605, "rewards/margins": 1.9891891479492188, "rewards/rejected": -3.323986530303955, "step": 1197 }, { "epoch": 1.13, "grad_norm": 13.127300684014683, "learning_rate": 3.5833980295880883e-07, "logps/chosen": -34.194732666015625, "logps/rejected": -66.22953033447266, "loss": 0.2253, "losses/dpo": 0.007577438373118639, "losses/sft": 1.001930832862854, "losses/total": 0.007577438373118639, "ref_logps/chosen": -24.514118194580078, "ref_logps/rejected": -34.3181266784668, "rewards/accuracies": 1.0, "rewards/chosen": -0.9680615663528442, "rewards/margins": 2.223079204559326, "rewards/rejected": -3.191140651702881, "step": 1198 }, { "epoch": 1.13, "grad_norm": 20.198059914563043, "learning_rate": 3.581102339663853e-07, "logps/chosen": -29.482433319091797, "logps/rejected": -40.149139404296875, "loss": 0.5161, "losses/dpo": 0.1546137034893036, "losses/sft": 1.5829217433929443, "losses/total": 0.1546137034893036, "ref_logps/chosen": -18.9508056640625, "ref_logps/rejected": -21.69732666015625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0531628131866455, "rewards/margins": 0.7920184135437012, "rewards/rejected": -1.8451811075210571, "step": 1199 }, { "epoch": 1.13, "grad_norm": 25.051191678195185, "learning_rate": 3.5788055278819096e-07, "logps/chosen": -42.71043395996094, "logps/rejected": -55.583011627197266, "loss": 0.42, "losses/dpo": 1.0921741724014282, "losses/sft": 1.8698011636734009, "losses/total": 1.0921741724014282, "ref_logps/chosen": -31.3201961517334, "ref_logps/rejected": -33.697906494140625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1390239000320435, "rewards/margins": 1.0494863986968994, "rewards/rejected": -2.1885104179382324, "step": 1200 }, { "epoch": 1.13, "grad_norm": 17.289744288177435, "learning_rate": 3.5765075966256545e-07, "logps/chosen": -45.33407974243164, "logps/rejected": -61.146610260009766, "loss": 0.2863, "losses/dpo": 0.45602288842201233, "losses/sft": 0.9726884365081787, "losses/total": 0.45602288842201233, "ref_logps/chosen": -31.350421905517578, "ref_logps/rejected": -32.7921142578125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3983654975891113, "rewards/margins": 1.4370841979980469, "rewards/rejected": -2.835449695587158, "step": 1201 }, { "epoch": 1.13, "grad_norm": 15.70017153368921, "learning_rate": 3.5742085482796476e-07, "logps/chosen": -42.81208038330078, "logps/rejected": -66.9070053100586, "loss": 0.2452, "losses/dpo": 0.3487209677696228, "losses/sft": 0.5901926755905151, "losses/total": 0.3487209677696228, "ref_logps/chosen": -31.12416648864746, "ref_logps/rejected": -36.654788970947266, "rewards/accuracies": 0.9375, "rewards/chosen": -1.168791651725769, "rewards/margins": 1.8564300537109375, "rewards/rejected": -3.025221586227417, "step": 1202 }, { "epoch": 1.13, "grad_norm": 17.261734272260625, "learning_rate": 3.571908385229605e-07, "logps/chosen": -41.3569450378418, "logps/rejected": -65.97991943359375, "loss": 0.3474, "losses/dpo": 0.32060736417770386, "losses/sft": 1.9571186304092407, "losses/total": 0.32060736417770386, "ref_logps/chosen": -26.91191864013672, "ref_logps/rejected": -37.22685241699219, "rewards/accuracies": 0.8125, "rewards/chosen": -1.444502353668213, "rewards/margins": 1.43080472946167, "rewards/rejected": -2.875307083129883, "step": 1203 }, { "epoch": 1.14, "grad_norm": 19.113072313442025, "learning_rate": 3.5696071098624024e-07, "logps/chosen": -43.08905029296875, "logps/rejected": -61.04985809326172, "loss": 0.3732, "losses/dpo": 0.36203449964523315, "losses/sft": 0.6231658458709717, "losses/total": 0.36203449964523315, "ref_logps/chosen": -30.44300651550293, "ref_logps/rejected": -34.718353271484375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2646042108535767, "rewards/margins": 1.3685462474822998, "rewards/rejected": -2.633150577545166, "step": 1204 }, { "epoch": 1.14, "grad_norm": 20.98193494280687, "learning_rate": 3.567304724566069e-07, "logps/chosen": -38.79448318481445, "logps/rejected": -73.56617736816406, "loss": 0.307, "losses/dpo": 0.2912108600139618, "losses/sft": 0.2551065683364868, "losses/total": 0.2912108600139618, "ref_logps/chosen": -26.035232543945312, "ref_logps/rejected": -42.24766540527344, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2759253978729248, "rewards/margins": 1.8559255599975586, "rewards/rejected": -3.1318509578704834, "step": 1205 }, { "epoch": 1.14, "grad_norm": 14.992296489884128, "learning_rate": 3.565001231729784e-07, "logps/chosen": -43.638694763183594, "logps/rejected": -67.19139099121094, "loss": 0.2233, "losses/dpo": 0.130828857421875, "losses/sft": 0.9600661993026733, "losses/total": 0.130828857421875, "ref_logps/chosen": -34.64909362792969, "ref_logps/rejected": -38.99060821533203, "rewards/accuracies": 1.0, "rewards/chosen": -0.8989603519439697, "rewards/margins": 1.921117901802063, "rewards/rejected": -2.8200783729553223, "step": 1206 }, { "epoch": 1.14, "grad_norm": 18.870916963841285, "learning_rate": 3.562696633743878e-07, "logps/chosen": -41.06891632080078, "logps/rejected": -59.123626708984375, "loss": 0.3515, "losses/dpo": 0.612695574760437, "losses/sft": 1.3702619075775146, "losses/total": 0.612695574760437, "ref_logps/chosen": -25.19651985168457, "ref_logps/rejected": -31.778213500976562, "rewards/accuracies": 0.875, "rewards/chosen": -1.5872396230697632, "rewards/margins": 1.1473016738891602, "rewards/rejected": -2.734541177749634, "step": 1207 }, { "epoch": 1.14, "grad_norm": 19.49272055840881, "learning_rate": 3.560390932999827e-07, "logps/chosen": -47.06035614013672, "logps/rejected": -71.60382080078125, "loss": 0.2459, "losses/dpo": 0.04550513997673988, "losses/sft": 0.6519327759742737, "losses/total": 0.04550513997673988, "ref_logps/chosen": -35.88557434082031, "ref_logps/rejected": -41.15751647949219, "rewards/accuracies": 0.875, "rewards/chosen": -1.1174782514572144, "rewards/margins": 1.927152395248413, "rewards/rejected": -3.044630765914917, "step": 1208 }, { "epoch": 1.14, "grad_norm": 15.446654558075574, "learning_rate": 3.5580841318902527e-07, "logps/chosen": -36.67779541015625, "logps/rejected": -55.37245178222656, "loss": 0.3012, "losses/dpo": 0.2658255696296692, "losses/sft": 1.030027985572815, "losses/total": 0.2658255696296692, "ref_logps/chosen": -26.879119873046875, "ref_logps/rejected": -28.579235076904297, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9798673987388611, "rewards/margins": 1.6994543075561523, "rewards/rejected": -2.679321765899658, "step": 1209 }, { "epoch": 1.14, "grad_norm": 16.63789132915831, "learning_rate": 3.555776232808918e-07, "logps/chosen": -34.52616500854492, "logps/rejected": -56.48847198486328, "loss": 0.3373, "losses/dpo": 1.058458924293518, "losses/sft": 0.7464599609375, "losses/total": 1.058458924293518, "ref_logps/chosen": -24.19889259338379, "ref_logps/rejected": -33.17692565917969, "rewards/accuracies": 0.875, "rewards/chosen": -1.0327274799346924, "rewards/margins": 1.2984271049499512, "rewards/rejected": -2.3311545848846436, "step": 1210 }, { "epoch": 1.14, "grad_norm": 17.368729024378023, "learning_rate": 3.553467238150725e-07, "logps/chosen": -49.287208557128906, "logps/rejected": -87.41059875488281, "loss": 0.2254, "losses/dpo": 0.29815971851348877, "losses/sft": 0.5523221492767334, "losses/total": 0.29815971851348877, "ref_logps/chosen": -34.99468994140625, "ref_logps/rejected": -50.83233642578125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4292516708374023, "rewards/margins": 2.228574752807617, "rewards/rejected": -3.6578261852264404, "step": 1211 }, { "epoch": 1.14, "grad_norm": 20.934663044178464, "learning_rate": 3.551157150311712e-07, "logps/chosen": -45.44112014770508, "logps/rejected": -90.88417053222656, "loss": 0.3562, "losses/dpo": 0.7263717651367188, "losses/sft": 0.9131885766983032, "losses/total": 0.7263717651367188, "ref_logps/chosen": -29.107072830200195, "ref_logps/rejected": -50.51493835449219, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6334047317504883, "rewards/margins": 2.403517723083496, "rewards/rejected": -4.036922931671143, "step": 1212 }, { "epoch": 1.14, "grad_norm": 12.734014106584432, "learning_rate": 3.548845971689054e-07, "logps/chosen": -33.6397590637207, "logps/rejected": -55.064762115478516, "loss": 0.2604, "losses/dpo": 0.5367465615272522, "losses/sft": 0.41473403573036194, "losses/total": 0.5367465615272522, "ref_logps/chosen": -22.83008575439453, "ref_logps/rejected": -28.445587158203125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0809674263000488, "rewards/margins": 1.5809500217437744, "rewards/rejected": -2.6619174480438232, "step": 1213 }, { "epoch": 1.15, "grad_norm": 15.44700269331681, "learning_rate": 3.546533704681055e-07, "logps/chosen": -32.62981033325195, "logps/rejected": -54.24425506591797, "loss": 0.2938, "losses/dpo": 0.024732155725359917, "losses/sft": 1.5992426872253418, "losses/total": 0.024732155725359917, "ref_logps/chosen": -24.55284309387207, "ref_logps/rejected": -29.73065948486328, "rewards/accuracies": 0.875, "rewards/chosen": -0.8076967000961304, "rewards/margins": 1.6436631679534912, "rewards/rejected": -2.451359748840332, "step": 1214 }, { "epoch": 1.15, "grad_norm": 13.223281049562129, "learning_rate": 3.5442203516871503e-07, "logps/chosen": -41.10802459716797, "logps/rejected": -63.352783203125, "loss": 0.1954, "losses/dpo": 0.18321593105793, "losses/sft": 1.3920743465423584, "losses/total": 0.18321593105793, "ref_logps/chosen": -29.05809211730957, "ref_logps/rejected": -31.05250358581543, "rewards/accuracies": 1.0, "rewards/chosen": -1.204993486404419, "rewards/margins": 2.0250344276428223, "rewards/rejected": -3.230027675628662, "step": 1215 }, { "epoch": 1.15, "grad_norm": 17.68223098972156, "learning_rate": 3.5419059151079025e-07, "logps/chosen": -48.91592025756836, "logps/rejected": -60.02714538574219, "loss": 0.3133, "losses/dpo": 0.1533036231994629, "losses/sft": 1.6063789129257202, "losses/total": 0.1533036231994629, "ref_logps/chosen": -35.852569580078125, "ref_logps/rejected": -32.76649475097656, "rewards/accuracies": 0.875, "rewards/chosen": -1.3063350915908813, "rewards/margins": 1.4197298288345337, "rewards/rejected": -2.726064920425415, "step": 1216 }, { "epoch": 1.15, "grad_norm": 20.18147261149857, "learning_rate": 3.539590397344995e-07, "logps/chosen": -50.43798065185547, "logps/rejected": -71.77569580078125, "loss": 0.3149, "losses/dpo": 0.3783245086669922, "losses/sft": 1.982293725013733, "losses/total": 0.3783245086669922, "ref_logps/chosen": -36.827781677246094, "ref_logps/rejected": -41.33243179321289, "rewards/accuracies": 0.875, "rewards/chosen": -1.3610198497772217, "rewards/margins": 1.68330717086792, "rewards/rejected": -3.0443270206451416, "step": 1217 }, { "epoch": 1.15, "grad_norm": 15.070063108121731, "learning_rate": 3.537273800801239e-07, "logps/chosen": -43.97223663330078, "logps/rejected": -61.454654693603516, "loss": 0.3087, "losses/dpo": 0.0025001380126923323, "losses/sft": 1.6825462579727173, "losses/total": 0.0025001380126923323, "ref_logps/chosen": -31.57002830505371, "ref_logps/rejected": -33.550559997558594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2402207851409912, "rewards/margins": 1.5501888990402222, "rewards/rejected": -2.790409564971924, "step": 1218 }, { "epoch": 1.15, "grad_norm": 12.335747882308924, "learning_rate": 3.5349561278805605e-07, "logps/chosen": -34.70106506347656, "logps/rejected": -60.73857116699219, "loss": 0.2392, "losses/dpo": 0.6784620881080627, "losses/sft": 0.741628885269165, "losses/total": 0.6784620881080627, "ref_logps/chosen": -25.070159912109375, "ref_logps/rejected": -35.1430549621582, "rewards/accuracies": 1.0, "rewards/chosen": -0.9630905985832214, "rewards/margins": 1.5964609384536743, "rewards/rejected": -2.55955171585083, "step": 1219 }, { "epoch": 1.15, "grad_norm": 15.991521612031065, "learning_rate": 3.532637380988005e-07, "logps/chosen": -30.54268455505371, "logps/rejected": -60.20966339111328, "loss": 0.2827, "losses/dpo": 0.054832879453897476, "losses/sft": 0.741947591304779, "losses/total": 0.054832879453897476, "ref_logps/chosen": -20.784561157226562, "ref_logps/rejected": -34.169612884521484, "rewards/accuracies": 1.0, "rewards/chosen": -0.9758123755455017, "rewards/margins": 1.6281927824020386, "rewards/rejected": -2.6040050983428955, "step": 1220 }, { "epoch": 1.15, "grad_norm": 17.172378241091522, "learning_rate": 3.53031756252973e-07, "logps/chosen": -42.35706329345703, "logps/rejected": -87.65054321289062, "loss": 0.275, "losses/dpo": 0.19430221617221832, "losses/sft": 2.051063299179077, "losses/total": 0.19430221617221832, "ref_logps/chosen": -26.247474670410156, "ref_logps/rejected": -50.124271392822266, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6109585762023926, "rewards/margins": 2.1416690349578857, "rewards/rejected": -3.7526278495788574, "step": 1221 }, { "epoch": 1.15, "grad_norm": 18.07140283385314, "learning_rate": 3.5279966749130075e-07, "logps/chosen": -33.547847747802734, "logps/rejected": -54.77010726928711, "loss": 0.4019, "losses/dpo": 0.21409977972507477, "losses/sft": 2.0847580432891846, "losses/total": 0.21409977972507477, "ref_logps/chosen": -24.074968338012695, "ref_logps/rejected": -30.365459442138672, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9472878575325012, "rewards/margins": 1.493177056312561, "rewards/rejected": -2.440464973449707, "step": 1222 }, { "epoch": 1.15, "grad_norm": 18.822525196835066, "learning_rate": 3.5256747205462174e-07, "logps/chosen": -36.894691467285156, "logps/rejected": -59.25859451293945, "loss": 0.4563, "losses/dpo": 1.0476350784301758, "losses/sft": 1.105079174041748, "losses/total": 1.0476350784301758, "ref_logps/chosen": -26.004398345947266, "ref_logps/rejected": -36.01171112060547, "rewards/accuracies": 0.75, "rewards/chosen": -1.0890294313430786, "rewards/margins": 1.2356593608856201, "rewards/rejected": -2.324688673019409, "step": 1223 }, { "epoch": 1.15, "grad_norm": 16.962275270022964, "learning_rate": 3.523351701838847e-07, "logps/chosen": -34.656917572021484, "logps/rejected": -57.43189239501953, "loss": 0.2656, "losses/dpo": 0.3968851566314697, "losses/sft": 1.3145005702972412, "losses/total": 0.3968851566314697, "ref_logps/chosen": -26.322914123535156, "ref_logps/rejected": -31.821001052856445, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8334004282951355, "rewards/margins": 1.7276890277862549, "rewards/rejected": -2.561089277267456, "step": 1224 }, { "epoch": 1.16, "grad_norm": 18.4441284523883, "learning_rate": 3.521027621201488e-07, "logps/chosen": -39.17446517944336, "logps/rejected": -81.45719909667969, "loss": 0.2647, "losses/dpo": 0.04259949550032616, "losses/sft": 1.3361703157424927, "losses/total": 0.04259949550032616, "ref_logps/chosen": -26.619930267333984, "ref_logps/rejected": -47.98835372924805, "rewards/accuracies": 0.875, "rewards/chosen": -1.255453109741211, "rewards/margins": 2.091431140899658, "rewards/rejected": -3.346884250640869, "step": 1225 }, { "epoch": 1.16, "grad_norm": 18.908066596106224, "learning_rate": 3.518702481045835e-07, "logps/chosen": -38.768409729003906, "logps/rejected": -53.85993194580078, "loss": 0.3982, "losses/dpo": 0.10908976942300797, "losses/sft": 1.1594136953353882, "losses/total": 0.10908976942300797, "ref_logps/chosen": -26.181640625, "ref_logps/rejected": -28.01288604736328, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2586770057678223, "rewards/margins": 1.3260276317596436, "rewards/rejected": -2.584704875946045, "step": 1226 }, { "epoch": 1.16, "grad_norm": 12.475710271635421, "learning_rate": 3.5163762837846805e-07, "logps/chosen": -34.43476867675781, "logps/rejected": -61.22462844848633, "loss": 0.1951, "losses/dpo": 0.06893114000558853, "losses/sft": 0.6083346009254456, "losses/total": 0.06893114000558853, "ref_logps/chosen": -25.76443099975586, "ref_logps/rejected": -32.316993713378906, "rewards/accuracies": 1.0, "rewards/chosen": -0.8670337200164795, "rewards/margins": 2.0237300395965576, "rewards/rejected": -2.890763759613037, "step": 1227 }, { "epoch": 1.16, "grad_norm": 15.65980193171091, "learning_rate": 3.514049031831915e-07, "logps/chosen": -43.71440887451172, "logps/rejected": -68.01361846923828, "loss": 0.2796, "losses/dpo": 0.2245267778635025, "losses/sft": 1.4508087635040283, "losses/total": 0.2245267778635025, "ref_logps/chosen": -28.940048217773438, "ref_logps/rejected": -36.22004699707031, "rewards/accuracies": 0.875, "rewards/chosen": -1.4774359464645386, "rewards/margins": 1.701920986175537, "rewards/rejected": -3.179356813430786, "step": 1228 }, { "epoch": 1.16, "grad_norm": 15.738047615282786, "learning_rate": 3.5117207276025224e-07, "logps/chosen": -39.7584228515625, "logps/rejected": -59.069190979003906, "loss": 0.2995, "losses/dpo": 0.042153093963861465, "losses/sft": 1.5739824771881104, "losses/total": 0.042153093963861465, "ref_logps/chosen": -27.699844360351562, "ref_logps/rejected": -29.765899658203125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2058579921722412, "rewards/margins": 1.7244713306427002, "rewards/rejected": -2.9303293228149414, "step": 1229 }, { "epoch": 1.16, "grad_norm": 12.7278277473639, "learning_rate": 3.5093913735125804e-07, "logps/chosen": -26.690898895263672, "logps/rejected": -50.94518280029297, "loss": 0.2921, "losses/dpo": 0.5840684175491333, "losses/sft": 0.24535392224788666, "losses/total": 0.5840684175491333, "ref_logps/chosen": -17.3442440032959, "ref_logps/rejected": -26.629047393798828, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9346653819084167, "rewards/margins": 1.4969485998153687, "rewards/rejected": -2.4316139221191406, "step": 1230 }, { "epoch": 1.16, "grad_norm": 16.24925952783745, "learning_rate": 3.5070609719792547e-07, "logps/chosen": -42.47386932373047, "logps/rejected": -64.67897033691406, "loss": 0.2909, "losses/dpo": 0.7530825138092041, "losses/sft": 1.2801567316055298, "losses/total": 0.7530825138092041, "ref_logps/chosen": -32.52747344970703, "ref_logps/rejected": -38.12199401855469, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9946399331092834, "rewards/margins": 1.661057949066162, "rewards/rejected": -2.6556975841522217, "step": 1231 }, { "epoch": 1.16, "grad_norm": 14.154314777733287, "learning_rate": 3.5047295254207975e-07, "logps/chosen": -41.05751037597656, "logps/rejected": -67.32926940917969, "loss": 0.2156, "losses/dpo": 0.3394196033477783, "losses/sft": 2.2656209468841553, "losses/total": 0.3394196033477783, "ref_logps/chosen": -28.572128295898438, "ref_logps/rejected": -34.953704833984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2485384941101074, "rewards/margins": 1.9890177249908447, "rewards/rejected": -3.2375564575195312, "step": 1232 }, { "epoch": 1.16, "grad_norm": 21.40917757934772, "learning_rate": 3.5023970362565477e-07, "logps/chosen": -37.011192321777344, "logps/rejected": -56.61212158203125, "loss": 0.3222, "losses/dpo": 0.2037014663219452, "losses/sft": 0.6930399537086487, "losses/total": 0.2037014663219452, "ref_logps/chosen": -27.164884567260742, "ref_logps/rejected": -31.58550262451172, "rewards/accuracies": 0.875, "rewards/chosen": -0.9846310019493103, "rewards/margins": 1.5180307626724243, "rewards/rejected": -2.50266170501709, "step": 1233 }, { "epoch": 1.16, "grad_norm": 27.218804163204272, "learning_rate": 3.500063506906923e-07, "logps/chosen": -34.40533447265625, "logps/rejected": -38.581764221191406, "loss": 0.6949, "losses/dpo": 0.14677521586418152, "losses/sft": 1.9029178619384766, "losses/total": 0.14677521586418152, "ref_logps/chosen": -23.77239227294922, "ref_logps/rejected": -22.008411407470703, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0632944107055664, "rewards/margins": 0.5940414071083069, "rewards/rejected": -1.6573357582092285, "step": 1234 }, { "epoch": 1.17, "grad_norm": 13.29892515123349, "learning_rate": 3.497728939793423e-07, "logps/chosen": -31.669713973999023, "logps/rejected": -64.8182144165039, "loss": 0.2344, "losses/dpo": 0.056641239672899246, "losses/sft": 1.488412618637085, "losses/total": 0.056641239672899246, "ref_logps/chosen": -20.789813995361328, "ref_logps/rejected": -34.687835693359375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0879900455474854, "rewards/margins": 1.9250481128692627, "rewards/rejected": -3.013038158416748, "step": 1235 }, { "epoch": 1.17, "grad_norm": 12.864121347947833, "learning_rate": 3.4953933373386226e-07, "logps/chosen": -43.4494743347168, "logps/rejected": -64.49913787841797, "loss": 0.2327, "losses/dpo": 0.481173574924469, "losses/sft": 1.3856637477874756, "losses/total": 0.481173574924469, "ref_logps/chosen": -33.407188415527344, "ref_logps/rejected": -39.037620544433594, "rewards/accuracies": 1.0, "rewards/chosen": -1.0042288303375244, "rewards/margins": 1.5419234037399292, "rewards/rejected": -2.546152114868164, "step": 1236 }, { "epoch": 1.17, "grad_norm": 12.010100891578984, "learning_rate": 3.4930567019661716e-07, "logps/chosen": -32.28577423095703, "logps/rejected": -58.61272430419922, "loss": 0.2334, "losses/dpo": 0.02873445861041546, "losses/sft": 1.0930147171020508, "losses/total": 0.02873445861041546, "ref_logps/chosen": -23.355693817138672, "ref_logps/rejected": -31.243770599365234, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8930081129074097, "rewards/margins": 1.8438870906829834, "rewards/rejected": -2.7368953227996826, "step": 1237 }, { "epoch": 1.17, "grad_norm": 13.136355931967683, "learning_rate": 3.490719036100793e-07, "logps/chosen": -41.85881805419922, "logps/rejected": -72.16169738769531, "loss": 0.2376, "losses/dpo": 0.8222066164016724, "losses/sft": 1.4234769344329834, "losses/total": 0.8222066164016724, "ref_logps/chosen": -28.923507690429688, "ref_logps/rejected": -39.03468704223633, "rewards/accuracies": 0.875, "rewards/chosen": -1.2935307025909424, "rewards/margins": 2.019169807434082, "rewards/rejected": -3.3127007484436035, "step": 1238 }, { "epoch": 1.17, "grad_norm": 11.587362639544317, "learning_rate": 3.4883803421682774e-07, "logps/chosen": -46.98445129394531, "logps/rejected": -73.01103210449219, "loss": 0.1835, "losses/dpo": 0.34045907855033875, "losses/sft": 1.2998881340026855, "losses/total": 0.34045907855033875, "ref_logps/chosen": -34.5562744140625, "ref_logps/rejected": -40.125850677490234, "rewards/accuracies": 1.0, "rewards/chosen": -1.2428181171417236, "rewards/margins": 2.0457003116607666, "rewards/rejected": -3.2885184288024902, "step": 1239 }, { "epoch": 1.17, "grad_norm": 21.311149245447893, "learning_rate": 3.4860406225954826e-07, "logps/chosen": -57.41643524169922, "logps/rejected": -70.5974349975586, "loss": 0.3764, "losses/dpo": 0.02043716236948967, "losses/sft": 2.540926694869995, "losses/total": 0.02043716236948967, "ref_logps/chosen": -42.423309326171875, "ref_logps/rejected": -41.45166015625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4993126392364502, "rewards/margins": 1.41526460647583, "rewards/rejected": -2.9145774841308594, "step": 1240 }, { "epoch": 1.17, "grad_norm": 21.297002526173493, "learning_rate": 3.48369987981033e-07, "logps/chosen": -40.025474548339844, "logps/rejected": -56.07305145263672, "loss": 0.3902, "losses/dpo": 0.31808698177337646, "losses/sft": 1.4135850667953491, "losses/total": 0.31808698177337646, "ref_logps/chosen": -27.328516006469727, "ref_logps/rejected": -32.80506134033203, "rewards/accuracies": 0.875, "rewards/chosen": -1.26969575881958, "rewards/margins": 1.0571036338806152, "rewards/rejected": -2.3267993927001953, "step": 1241 }, { "epoch": 1.17, "grad_norm": 16.81527755371487, "learning_rate": 3.4813581162418053e-07, "logps/chosen": -45.84945297241211, "logps/rejected": -73.94810485839844, "loss": 0.2698, "losses/dpo": 0.09514369070529938, "losses/sft": 0.6840049028396606, "losses/total": 0.09514369070529938, "ref_logps/chosen": -32.63447570800781, "ref_logps/rejected": -41.50178527832031, "rewards/accuracies": 0.875, "rewards/chosen": -1.321497917175293, "rewards/margins": 1.9231343269348145, "rewards/rejected": -3.2446322441101074, "step": 1242 }, { "epoch": 1.17, "grad_norm": 13.839559274574402, "learning_rate": 3.4790153343199516e-07, "logps/chosen": -37.06706237792969, "logps/rejected": -66.54740142822266, "loss": 0.2305, "losses/dpo": 0.0421316884458065, "losses/sft": 0.838716447353363, "losses/total": 0.0421316884458065, "ref_logps/chosen": -27.32872772216797, "ref_logps/rejected": -36.1890983581543, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9738336205482483, "rewards/margins": 2.0619964599609375, "rewards/rejected": -3.035830020904541, "step": 1243 }, { "epoch": 1.17, "grad_norm": 15.358794053501768, "learning_rate": 3.4766715364758664e-07, "logps/chosen": -29.083436965942383, "logps/rejected": -59.45642852783203, "loss": 0.3104, "losses/dpo": 0.016239557415246964, "losses/sft": 0.43536704778671265, "losses/total": 0.016239557415246964, "ref_logps/chosen": -20.544082641601562, "ref_logps/rejected": -33.175357818603516, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8539355993270874, "rewards/margins": 1.7741713523864746, "rewards/rejected": -2.6281070709228516, "step": 1244 }, { "epoch": 1.17, "grad_norm": 11.959245785489339, "learning_rate": 3.4743267251417074e-07, "logps/chosen": -42.36761474609375, "logps/rejected": -75.29564666748047, "loss": 0.159, "losses/dpo": 0.04697471857070923, "losses/sft": 1.0616217851638794, "losses/total": 0.04697471857070923, "ref_logps/chosen": -28.045408248901367, "ref_logps/rejected": -39.73981857299805, "rewards/accuracies": 1.0, "rewards/chosen": -1.4322208166122437, "rewards/margins": 2.123361587524414, "rewards/rejected": -3.5555825233459473, "step": 1245 }, { "epoch": 1.18, "grad_norm": 23.397954230225473, "learning_rate": 3.4719809027506785e-07, "logps/chosen": -45.66887283325195, "logps/rejected": -69.19059753417969, "loss": 0.3817, "losses/dpo": 0.03026014193892479, "losses/sft": 1.1721794605255127, "losses/total": 0.03026014193892479, "ref_logps/chosen": -31.260103225708008, "ref_logps/rejected": -40.65625762939453, "rewards/accuracies": 0.8125, "rewards/chosen": -1.440877079963684, "rewards/margins": 1.4125571250915527, "rewards/rejected": -2.8534343242645264, "step": 1246 }, { "epoch": 1.18, "grad_norm": 10.525712279949186, "learning_rate": 3.469634071737036e-07, "logps/chosen": -33.89468002319336, "logps/rejected": -63.40309524536133, "loss": 0.2242, "losses/dpo": 0.19053758680820465, "losses/sft": 1.389938235282898, "losses/total": 0.19053758680820465, "ref_logps/chosen": -24.660158157348633, "ref_logps/rejected": -33.00480651855469, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9234521389007568, "rewards/margins": 2.1163768768310547, "rewards/rejected": -3.0398292541503906, "step": 1247 }, { "epoch": 1.18, "grad_norm": 14.580409616212433, "learning_rate": 3.467286234536081e-07, "logps/chosen": -37.84128189086914, "logps/rejected": -71.32891082763672, "loss": 0.2097, "losses/dpo": 0.05562438815832138, "losses/sft": 1.3212941884994507, "losses/total": 0.05562438815832138, "ref_logps/chosen": -25.613019943237305, "ref_logps/rejected": -37.94621276855469, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2228262424468994, "rewards/margins": 2.1154441833496094, "rewards/rejected": -3.3382701873779297, "step": 1248 }, { "epoch": 1.18, "grad_norm": 15.772861739913619, "learning_rate": 3.46493739358416e-07, "logps/chosen": -42.187923431396484, "logps/rejected": -52.97758483886719, "loss": 0.3485, "losses/dpo": 0.31670716404914856, "losses/sft": 1.4841196537017822, "losses/total": 0.31670716404914856, "ref_logps/chosen": -28.133216857910156, "ref_logps/rejected": -24.730480194091797, "rewards/accuracies": 0.875, "rewards/chosen": -1.405470609664917, "rewards/margins": 1.41923987865448, "rewards/rejected": -2.8247106075286865, "step": 1249 }, { "epoch": 1.18, "grad_norm": 17.159328282103093, "learning_rate": 3.4625875513186595e-07, "logps/chosen": -49.06797409057617, "logps/rejected": -60.93710708618164, "loss": 0.2889, "losses/dpo": 0.12318193912506104, "losses/sft": 0.36246544122695923, "losses/total": 0.12318193912506104, "ref_logps/chosen": -39.839500427246094, "ref_logps/rejected": -34.87885665893555, "rewards/accuracies": 0.875, "rewards/chosen": -0.9228471517562866, "rewards/margins": 1.6829777956008911, "rewards/rejected": -2.6058249473571777, "step": 1250 }, { "epoch": 1.18, "grad_norm": 12.62714492811886, "learning_rate": 3.460236710178006e-07, "logps/chosen": -28.811222076416016, "logps/rejected": -49.80360794067383, "loss": 0.309, "losses/dpo": 0.17912596464157104, "losses/sft": 1.485190987586975, "losses/total": 0.17912596464157104, "ref_logps/chosen": -20.42276382446289, "ref_logps/rejected": -29.112131118774414, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8388456702232361, "rewards/margins": 1.2303022146224976, "rewards/rejected": -2.069148063659668, "step": 1251 }, { "epoch": 1.18, "grad_norm": 16.701847837999487, "learning_rate": 3.4578848726016643e-07, "logps/chosen": -41.65673065185547, "logps/rejected": -72.83912658691406, "loss": 0.2689, "losses/dpo": 0.01879015751183033, "losses/sft": 0.845649778842926, "losses/total": 0.01879015751183033, "ref_logps/chosen": -28.932785034179688, "ref_logps/rejected": -41.592262268066406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2723941802978516, "rewards/margins": 1.852292537689209, "rewards/rejected": -3.1246867179870605, "step": 1252 }, { "epoch": 1.18, "grad_norm": 19.736268137804984, "learning_rate": 3.455532041030129e-07, "logps/chosen": -43.77389907836914, "logps/rejected": -55.07563018798828, "loss": 0.4275, "losses/dpo": 0.2699599266052246, "losses/sft": 0.7480846643447876, "losses/total": 0.2699599266052246, "ref_logps/chosen": -32.48611068725586, "ref_logps/rejected": -30.267650604248047, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1287789344787598, "rewards/margins": 1.3520190715789795, "rewards/rejected": -2.4807980060577393, "step": 1253 }, { "epoch": 1.18, "grad_norm": 11.526714317485535, "learning_rate": 3.4531782179049314e-07, "logps/chosen": -30.860469818115234, "logps/rejected": -52.62434387207031, "loss": 0.2533, "losses/dpo": 0.07634354382753372, "losses/sft": 1.7467312812805176, "losses/total": 0.07634354382753372, "ref_logps/chosen": -22.11982536315918, "ref_logps/rejected": -24.6275577545166, "rewards/accuracies": 1.0, "rewards/chosen": -0.8740644454956055, "rewards/margins": 1.9256147146224976, "rewards/rejected": -2.7996792793273926, "step": 1254 }, { "epoch": 1.18, "grad_norm": 14.195841952219787, "learning_rate": 3.450823405668627e-07, "logps/chosen": -34.7728271484375, "logps/rejected": -58.07316589355469, "loss": 0.2596, "losses/dpo": 0.13921256363391876, "losses/sft": 0.67213374376297, "losses/total": 0.13921256363391876, "ref_logps/chosen": -24.046913146972656, "ref_logps/rejected": -29.12393569946289, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0725913047790527, "rewards/margins": 1.8223323822021484, "rewards/rejected": -2.894923686981201, "step": 1255 }, { "epoch": 1.18, "grad_norm": 8.355137495858811, "learning_rate": 3.448467606764801e-07, "logps/chosen": -49.08402633666992, "logps/rejected": -78.47813415527344, "loss": 0.1336, "losses/dpo": 0.013440298847854137, "losses/sft": 0.4582674503326416, "losses/total": 0.013440298847854137, "ref_logps/chosen": -37.475555419921875, "ref_logps/rejected": -41.139434814453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.1608469486236572, "rewards/margins": 2.573023796081543, "rewards/rejected": -3.733870506286621, "step": 1256 }, { "epoch": 1.19, "grad_norm": 21.36012137631738, "learning_rate": 3.4461108236380597e-07, "logps/chosen": -45.10809326171875, "logps/rejected": -59.80812454223633, "loss": 0.3551, "losses/dpo": 0.023634595796465874, "losses/sft": 0.4052187204360962, "losses/total": 0.023634595796465874, "ref_logps/chosen": -33.66944122314453, "ref_logps/rejected": -32.25196838378906, "rewards/accuracies": 0.875, "rewards/chosen": -1.1438653469085693, "rewards/margins": 1.6117501258850098, "rewards/rejected": -2.755615711212158, "step": 1257 }, { "epoch": 1.19, "grad_norm": 11.676828843836931, "learning_rate": 3.4437530587340334e-07, "logps/chosen": -38.888404846191406, "logps/rejected": -68.88369750976562, "loss": 0.2016, "losses/dpo": 0.37875911593437195, "losses/sft": 0.7236414551734924, "losses/total": 0.37875911593437195, "ref_logps/chosen": -26.070194244384766, "ref_logps/rejected": -33.18937301635742, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2818210124969482, "rewards/margins": 2.287611246109009, "rewards/rejected": -3.569432258605957, "step": 1258 }, { "epoch": 1.19, "grad_norm": 15.697589180565013, "learning_rate": 3.44139431449937e-07, "logps/chosen": -38.50602722167969, "logps/rejected": -54.4706916809082, "loss": 0.3582, "losses/dpo": 0.5376030206680298, "losses/sft": 1.2436943054199219, "losses/total": 0.5376030206680298, "ref_logps/chosen": -26.03002166748047, "ref_logps/rejected": -28.188507080078125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2476006746292114, "rewards/margins": 1.3806178569793701, "rewards/rejected": -2.628218650817871, "step": 1259 }, { "epoch": 1.19, "grad_norm": 23.11593172851577, "learning_rate": 3.439034593381732e-07, "logps/chosen": -46.462646484375, "logps/rejected": -49.09725570678711, "loss": 0.4931, "losses/dpo": 0.7332343459129333, "losses/sft": 0.6821543574333191, "losses/total": 0.7332343459129333, "ref_logps/chosen": -32.414894104003906, "ref_logps/rejected": -27.12274742126465, "rewards/accuracies": 0.75, "rewards/chosen": -1.404775619506836, "rewards/margins": 0.7926750779151917, "rewards/rejected": -2.197450637817383, "step": 1260 }, { "epoch": 1.19, "grad_norm": 18.466374771757206, "learning_rate": 3.436673897829799e-07, "logps/chosen": -44.04637908935547, "logps/rejected": -51.649658203125, "loss": 0.378, "losses/dpo": 0.5597497820854187, "losses/sft": 0.28987380862236023, "losses/total": 0.5597497820854187, "ref_logps/chosen": -32.31605911254883, "ref_logps/rejected": -26.869869232177734, "rewards/accuracies": 0.875, "rewards/chosen": -1.1730320453643799, "rewards/margins": 1.3049472570419312, "rewards/rejected": -2.4779794216156006, "step": 1261 }, { "epoch": 1.19, "grad_norm": 17.05216120622346, "learning_rate": 3.4343122302932586e-07, "logps/chosen": -38.30244445800781, "logps/rejected": -57.25736618041992, "loss": 0.2965, "losses/dpo": 0.04510139301419258, "losses/sft": 1.7357890605926514, "losses/total": 0.04510139301419258, "ref_logps/chosen": -25.78046417236328, "ref_logps/rejected": -26.74411392211914, "rewards/accuracies": 0.875, "rewards/chosen": -1.2521982192993164, "rewards/margins": 1.7991273403167725, "rewards/rejected": -3.051325559616089, "step": 1262 }, { "epoch": 1.19, "grad_norm": 26.67455950045394, "learning_rate": 3.43194959322281e-07, "logps/chosen": -42.60475540161133, "logps/rejected": -51.8109130859375, "loss": 0.5544, "losses/dpo": 0.24486935138702393, "losses/sft": 2.0314342975616455, "losses/total": 0.24486935138702393, "ref_logps/chosen": -27.504955291748047, "ref_logps/rejected": -28.485431671142578, "rewards/accuracies": 0.75, "rewards/chosen": -1.5099799633026123, "rewards/margins": 0.8225686550140381, "rewards/rejected": -2.3325483798980713, "step": 1263 }, { "epoch": 1.19, "grad_norm": 12.696404189116123, "learning_rate": 3.429585989070156e-07, "logps/chosen": -30.772218704223633, "logps/rejected": -53.864105224609375, "loss": 0.2605, "losses/dpo": 0.3888619840145111, "losses/sft": 0.8748220205307007, "losses/total": 0.3888619840145111, "ref_logps/chosen": -24.181293487548828, "ref_logps/rejected": -29.229541778564453, "rewards/accuracies": 1.0, "rewards/chosen": -0.6590926647186279, "rewards/margins": 1.8043639659881592, "rewards/rejected": -2.463456630706787, "step": 1264 }, { "epoch": 1.19, "grad_norm": 16.909185528084574, "learning_rate": 3.4272214202880037e-07, "logps/chosen": -39.26023483276367, "logps/rejected": -60.74656295776367, "loss": 0.2762, "losses/dpo": 0.2529284358024597, "losses/sft": 1.3752182722091675, "losses/total": 0.2529284358024597, "ref_logps/chosen": -24.40932846069336, "ref_logps/rejected": -29.37368392944336, "rewards/accuracies": 0.875, "rewards/chosen": -1.4850904941558838, "rewards/margins": 1.6521978378295898, "rewards/rejected": -3.1372883319854736, "step": 1265 }, { "epoch": 1.19, "grad_norm": 23.898441984931765, "learning_rate": 3.424855889330062e-07, "logps/chosen": -43.918617248535156, "logps/rejected": -53.54228973388672, "loss": 0.4798, "losses/dpo": 0.4244132936000824, "losses/sft": 0.20152239501476288, "losses/total": 0.4244132936000824, "ref_logps/chosen": -29.377986907958984, "ref_logps/rejected": -27.87162208557129, "rewards/accuracies": 0.75, "rewards/chosen": -1.4540636539459229, "rewards/margins": 1.1130032539367676, "rewards/rejected": -2.5670666694641113, "step": 1266 }, { "epoch": 1.2, "grad_norm": 17.044325546479673, "learning_rate": 3.422489398651036e-07, "logps/chosen": -44.722991943359375, "logps/rejected": -67.01631164550781, "loss": 0.3353, "losses/dpo": 0.21408997476100922, "losses/sft": 0.35352039337158203, "losses/total": 0.21408997476100922, "ref_logps/chosen": -33.28497314453125, "ref_logps/rejected": -40.07537841796875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1438019275665283, "rewards/margins": 1.5502912998199463, "rewards/rejected": -2.6940934658050537, "step": 1267 }, { "epoch": 1.2, "grad_norm": 21.864655628682858, "learning_rate": 3.4201219507066303e-07, "logps/chosen": -38.20549774169922, "logps/rejected": -55.41269302368164, "loss": 0.3149, "losses/dpo": 0.03128325194120407, "losses/sft": 0.5899477601051331, "losses/total": 0.03128325194120407, "ref_logps/chosen": -29.198528289794922, "ref_logps/rejected": -31.033084869384766, "rewards/accuracies": 0.875, "rewards/chosen": -0.9006970524787903, "rewards/margins": 1.5372637510299683, "rewards/rejected": -2.437960624694824, "step": 1268 }, { "epoch": 1.2, "grad_norm": 18.757379581813105, "learning_rate": 3.41775354795354e-07, "logps/chosen": -43.80974578857422, "logps/rejected": -71.71562194824219, "loss": 0.2942, "losses/dpo": 0.07318668812513351, "losses/sft": 1.3071407079696655, "losses/total": 0.07318668812513351, "ref_logps/chosen": -31.006763458251953, "ref_logps/rejected": -38.67022705078125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.280298113822937, "rewards/margins": 2.0242414474487305, "rewards/rejected": -3.304539442062378, "step": 1269 }, { "epoch": 1.2, "grad_norm": 14.984779946191521, "learning_rate": 3.4153841928494507e-07, "logps/chosen": -36.638511657714844, "logps/rejected": -75.75128936767578, "loss": 0.2116, "losses/dpo": 0.37752699851989746, "losses/sft": 2.2250304222106934, "losses/total": 0.37752699851989746, "ref_logps/chosen": -21.532150268554688, "ref_logps/rejected": -39.24138641357422, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5106360912322998, "rewards/margins": 2.1403543949127197, "rewards/rejected": -3.6509904861450195, "step": 1270 }, { "epoch": 1.2, "grad_norm": 22.517281217245056, "learning_rate": 3.41301388785304e-07, "logps/chosen": -43.68616485595703, "logps/rejected": -56.34013366699219, "loss": 0.4906, "losses/dpo": 0.7322086095809937, "losses/sft": 0.7928885817527771, "losses/total": 0.7322086095809937, "ref_logps/chosen": -28.05410385131836, "ref_logps/rejected": -30.892017364501953, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5632057189941406, "rewards/margins": 0.9816054105758667, "rewards/rejected": -2.544811248779297, "step": 1271 }, { "epoch": 1.2, "grad_norm": 23.631649412588235, "learning_rate": 3.4106426354239666e-07, "logps/chosen": -36.91401672363281, "logps/rejected": -46.41883087158203, "loss": 0.5773, "losses/dpo": 0.8471890091896057, "losses/sft": 1.857613205909729, "losses/total": 0.8471890091896057, "ref_logps/chosen": -23.401058197021484, "ref_logps/rejected": -25.848796844482422, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3512959480285645, "rewards/margins": 0.7057077884674072, "rewards/rejected": -2.0570037364959717, "step": 1272 }, { "epoch": 1.2, "grad_norm": 20.687678042525985, "learning_rate": 3.4082704380228746e-07, "logps/chosen": -50.711639404296875, "logps/rejected": -63.28516387939453, "loss": 0.343, "losses/dpo": 0.11243638396263123, "losses/sft": 0.9730046987533569, "losses/total": 0.11243638396263123, "ref_logps/chosen": -32.32966613769531, "ref_logps/rejected": -30.414520263671875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8381972312927246, "rewards/margins": 1.4488670825958252, "rewards/rejected": -3.2870640754699707, "step": 1273 }, { "epoch": 1.2, "grad_norm": 14.973991220771678, "learning_rate": 3.4058972981113884e-07, "logps/chosen": -34.618133544921875, "logps/rejected": -65.889404296875, "loss": 0.2821, "losses/dpo": 0.003829848486930132, "losses/sft": 1.1842979192733765, "losses/total": 0.003829848486930132, "ref_logps/chosen": -24.433338165283203, "ref_logps/rejected": -36.06476974487305, "rewards/accuracies": 0.8125, "rewards/chosen": -1.018479585647583, "rewards/margins": 1.9639838933944702, "rewards/rejected": -2.9824633598327637, "step": 1274 }, { "epoch": 1.2, "grad_norm": 14.697324636897289, "learning_rate": 3.403523218152112e-07, "logps/chosen": -50.609214782714844, "logps/rejected": -73.48710632324219, "loss": 0.2398, "losses/dpo": 0.4489637315273285, "losses/sft": 0.6874176263809204, "losses/total": 0.4489637315273285, "ref_logps/chosen": -33.847660064697266, "ref_logps/rejected": -35.574623107910156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6761560440063477, "rewards/margins": 2.1150927543640137, "rewards/rejected": -3.7912487983703613, "step": 1275 }, { "epoch": 1.2, "grad_norm": 14.063660576981057, "learning_rate": 3.4011482006086213e-07, "logps/chosen": -38.836097717285156, "logps/rejected": -65.7626724243164, "loss": 0.189, "losses/dpo": 0.3338268995285034, "losses/sft": 2.0658442974090576, "losses/total": 0.3338268995285034, "ref_logps/chosen": -25.293941497802734, "ref_logps/rejected": -29.80812644958496, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3542158603668213, "rewards/margins": 2.241238832473755, "rewards/rejected": -3.595454692840576, "step": 1276 }, { "epoch": 1.2, "grad_norm": 22.338759330158403, "learning_rate": 3.3987722479454675e-07, "logps/chosen": -38.693016052246094, "logps/rejected": -57.922115325927734, "loss": 0.5926, "losses/dpo": 0.5328592658042908, "losses/sft": 0.964748203754425, "losses/total": 0.5328592658042908, "ref_logps/chosen": -22.159849166870117, "ref_logps/rejected": -31.234310150146484, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6533164978027344, "rewards/margins": 1.0154640674591064, "rewards/rejected": -2.668780565261841, "step": 1277 }, { "epoch": 1.21, "grad_norm": 12.846866507153843, "learning_rate": 3.396395362628173e-07, "logps/chosen": -39.66596603393555, "logps/rejected": -59.21913146972656, "loss": 0.2304, "losses/dpo": 0.006782022304832935, "losses/sft": 0.9138774275779724, "losses/total": 0.006782022304832935, "ref_logps/chosen": -26.888025283813477, "ref_logps/rejected": -27.797679901123047, "rewards/accuracies": 1.0, "rewards/chosen": -1.2777941226959229, "rewards/margins": 1.8643505573272705, "rewards/rejected": -3.1421449184417725, "step": 1278 }, { "epoch": 1.21, "grad_norm": 4.974835463850826, "learning_rate": 3.3940175471232257e-07, "logps/chosen": -45.94539260864258, "logps/rejected": -83.06407165527344, "loss": 0.0776, "losses/dpo": 0.48297953605651855, "losses/sft": 0.4261626899242401, "losses/total": 0.48297953605651855, "ref_logps/chosen": -33.990333557128906, "ref_logps/rejected": -39.47960662841797, "rewards/accuracies": 1.0, "rewards/chosen": -1.1955060958862305, "rewards/margins": 3.162940502166748, "rewards/rejected": -4.3584465980529785, "step": 1279 }, { "epoch": 1.21, "grad_norm": 24.537451244498754, "learning_rate": 3.39163880389808e-07, "logps/chosen": -45.62507629394531, "logps/rejected": -76.68244934082031, "loss": 0.3765, "losses/dpo": 0.17151299118995667, "losses/sft": 1.7071207761764526, "losses/total": 0.17151299118995667, "ref_logps/chosen": -29.167205810546875, "ref_logps/rejected": -44.6408576965332, "rewards/accuracies": 0.75, "rewards/chosen": -1.645787000656128, "rewards/margins": 1.5583717823028564, "rewards/rejected": -3.2041587829589844, "step": 1280 }, { "epoch": 1.21, "grad_norm": 24.15262714373449, "learning_rate": 3.3892591354211546e-07, "logps/chosen": -37.299095153808594, "logps/rejected": -62.39316177368164, "loss": 0.4795, "losses/dpo": 3.081653356552124, "losses/sft": 3.2858569622039795, "losses/total": 3.081653356552124, "ref_logps/chosen": -24.155513763427734, "ref_logps/rejected": -36.44700622558594, "rewards/accuracies": 0.875, "rewards/chosen": -1.3143582344055176, "rewards/margins": 1.2802574634552002, "rewards/rejected": -2.5946156978607178, "step": 1281 }, { "epoch": 1.21, "grad_norm": 9.312668057948759, "learning_rate": 3.386878544161825e-07, "logps/chosen": -39.50762939453125, "logps/rejected": -66.95501708984375, "loss": 0.2051, "losses/dpo": 0.4140365421772003, "losses/sft": 1.40478515625, "losses/total": 0.4140365421772003, "ref_logps/chosen": -25.325298309326172, "ref_logps/rejected": -32.33010482788086, "rewards/accuracies": 1.0, "rewards/chosen": -1.4182331562042236, "rewards/margins": 2.044257640838623, "rewards/rejected": -3.4624907970428467, "step": 1282 }, { "epoch": 1.21, "grad_norm": 12.279671804700357, "learning_rate": 3.3844970325904263e-07, "logps/chosen": -32.740936279296875, "logps/rejected": -70.29789733886719, "loss": 0.2166, "losses/dpo": 0.044993288815021515, "losses/sft": 1.5149940252304077, "losses/total": 0.044993288815021515, "ref_logps/chosen": -22.21756362915039, "ref_logps/rejected": -37.318092346191406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.052337408065796, "rewards/margins": 2.245643138885498, "rewards/rejected": -3.297980785369873, "step": 1283 }, { "epoch": 1.21, "grad_norm": 18.177542006650757, "learning_rate": 3.382114603178249e-07, "logps/chosen": -31.367218017578125, "logps/rejected": -52.16114044189453, "loss": 0.3087, "losses/dpo": 0.6630816459655762, "losses/sft": 0.7411296367645264, "losses/total": 0.6630816459655762, "ref_logps/chosen": -19.83163070678711, "ref_logps/rejected": -25.489858627319336, "rewards/accuracies": 0.875, "rewards/chosen": -1.1535589694976807, "rewards/margins": 1.5135691165924072, "rewards/rejected": -2.667128324508667, "step": 1284 }, { "epoch": 1.21, "grad_norm": 17.697172170829433, "learning_rate": 3.379731258397534e-07, "logps/chosen": -50.35810089111328, "logps/rejected": -67.22805786132812, "loss": 0.2524, "losses/dpo": 0.5808574557304382, "losses/sft": 2.7842350006103516, "losses/total": 0.5808574557304382, "ref_logps/chosen": -37.14164733886719, "ref_logps/rejected": -35.69677734375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3216452598571777, "rewards/margins": 1.8314826488494873, "rewards/rejected": -3.153128147125244, "step": 1285 }, { "epoch": 1.21, "grad_norm": 13.466546472313647, "learning_rate": 3.3773470007214746e-07, "logps/chosen": -33.53276443481445, "logps/rejected": -65.26567077636719, "loss": 0.2472, "losses/dpo": 0.05083456262946129, "losses/sft": 1.2752727270126343, "losses/total": 0.05083456262946129, "ref_logps/chosen": -22.14708709716797, "ref_logps/rejected": -34.862083435058594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1385678052902222, "rewards/margins": 1.901790738105774, "rewards/rejected": -3.040358543395996, "step": 1286 }, { "epoch": 1.21, "grad_norm": 21.011770580710685, "learning_rate": 3.374961832624209e-07, "logps/chosen": -58.864723205566406, "logps/rejected": -97.41055297851562, "loss": 0.3207, "losses/dpo": 0.0005041385884396732, "losses/sft": 2.042907953262329, "losses/total": 0.0005041385884396732, "ref_logps/chosen": -42.893341064453125, "ref_logps/rejected": -58.0426025390625, "rewards/accuracies": 0.875, "rewards/chosen": -1.597138524055481, "rewards/margins": 2.339656352996826, "rewards/rejected": -3.936795234680176, "step": 1287 }, { "epoch": 1.22, "grad_norm": 18.409013393501674, "learning_rate": 3.3725757565808217e-07, "logps/chosen": -42.010929107666016, "logps/rejected": -53.81096649169922, "loss": 0.4213, "losses/dpo": 0.12783809006214142, "losses/sft": 1.7437690496444702, "losses/total": 0.12783809006214142, "ref_logps/chosen": -27.371109008789062, "ref_logps/rejected": -29.235742568969727, "rewards/accuracies": 0.8125, "rewards/chosen": -1.463982343673706, "rewards/margins": 0.9935401678085327, "rewards/rejected": -2.457522392272949, "step": 1288 }, { "epoch": 1.22, "grad_norm": 17.455080737710276, "learning_rate": 3.3701887750673385e-07, "logps/chosen": -35.657249450683594, "logps/rejected": -60.110679626464844, "loss": 0.2987, "losses/dpo": 0.004975537303835154, "losses/sft": 1.291998267173767, "losses/total": 0.004975537303835154, "ref_logps/chosen": -23.33029556274414, "ref_logps/rejected": -30.385608673095703, "rewards/accuracies": 0.875, "rewards/chosen": -1.23269522190094, "rewards/margins": 1.7398121356964111, "rewards/rejected": -2.9725072383880615, "step": 1289 }, { "epoch": 1.22, "grad_norm": 21.316447231807093, "learning_rate": 3.367800890560727e-07, "logps/chosen": -52.807926177978516, "logps/rejected": -67.18551635742188, "loss": 0.3548, "losses/dpo": 0.021653706207871437, "losses/sft": 1.2811278104782104, "losses/total": 0.021653706207871437, "ref_logps/chosen": -36.86651611328125, "ref_logps/rejected": -34.363990783691406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5941405296325684, "rewards/margins": 1.6880121231079102, "rewards/rejected": -3.2821526527404785, "step": 1290 }, { "epoch": 1.22, "grad_norm": 16.600586751271432, "learning_rate": 3.3654121055388884e-07, "logps/chosen": -34.42720413208008, "logps/rejected": -63.42791748046875, "loss": 0.2978, "losses/dpo": 1.2636886835098267, "losses/sft": 1.8478869199752808, "losses/total": 1.2636886835098267, "ref_logps/chosen": -24.302663803100586, "ref_logps/rejected": -36.09109878540039, "rewards/accuracies": 0.875, "rewards/chosen": -1.0124540328979492, "rewards/margins": 1.7212278842926025, "rewards/rejected": -2.733682155609131, "step": 1291 }, { "epoch": 1.22, "grad_norm": 10.238091809047585, "learning_rate": 3.3630224224806614e-07, "logps/chosen": -34.877044677734375, "logps/rejected": -65.5967025756836, "loss": 0.198, "losses/dpo": 0.04755779728293419, "losses/sft": 0.5562039017677307, "losses/total": 0.04755779728293419, "ref_logps/chosen": -25.381420135498047, "ref_logps/rejected": -31.277555465698242, "rewards/accuracies": 1.0, "rewards/chosen": -0.9495624899864197, "rewards/margins": 2.4823522567749023, "rewards/rejected": -3.431914806365967, "step": 1292 }, { "epoch": 1.22, "grad_norm": 17.69739360708456, "learning_rate": 3.3606318438658135e-07, "logps/chosen": -44.81547546386719, "logps/rejected": -68.13468933105469, "loss": 0.2838, "losses/dpo": 0.26717865467071533, "losses/sft": 3.7563364505767822, "losses/total": 0.26717865467071533, "ref_logps/chosen": -29.559974670410156, "ref_logps/rejected": -34.356666564941406, "rewards/accuracies": 0.875, "rewards/chosen": -1.5255500078201294, "rewards/margins": 1.8522528409957886, "rewards/rejected": -3.377802848815918, "step": 1293 }, { "epoch": 1.22, "grad_norm": 15.246958123680269, "learning_rate": 3.3582403721750454e-07, "logps/chosen": -33.02482223510742, "logps/rejected": -56.475040435791016, "loss": 0.2866, "losses/dpo": 0.09447988867759705, "losses/sft": 1.525561809539795, "losses/total": 0.09447988867759705, "ref_logps/chosen": -22.535564422607422, "ref_logps/rejected": -27.271305084228516, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0489256381988525, "rewards/margins": 1.8714476823806763, "rewards/rejected": -2.9203734397888184, "step": 1294 }, { "epoch": 1.22, "grad_norm": 21.74828268020813, "learning_rate": 3.355848009889981e-07, "logps/chosen": -43.37589645385742, "logps/rejected": -69.08082580566406, "loss": 0.371, "losses/dpo": 0.13296382129192352, "losses/sft": 0.29417043924331665, "losses/total": 0.13296382129192352, "ref_logps/chosen": -32.58283996582031, "ref_logps/rejected": -41.96092987060547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.079305648803711, "rewards/margins": 1.632683515548706, "rewards/rejected": -2.711989164352417, "step": 1295 }, { "epoch": 1.22, "grad_norm": 12.99951287263849, "learning_rate": 3.35345475949317e-07, "logps/chosen": -35.75872039794922, "logps/rejected": -57.23997116088867, "loss": 0.211, "losses/dpo": 0.07721314579248428, "losses/sft": 0.20404919981956482, "losses/total": 0.07721314579248428, "ref_logps/chosen": -27.03713607788086, "ref_logps/rejected": -28.920351028442383, "rewards/accuracies": 1.0, "rewards/chosen": -0.8721583485603333, "rewards/margins": 1.959803581237793, "rewards/rejected": -2.8319618701934814, "step": 1296 }, { "epoch": 1.22, "grad_norm": 16.971168369713208, "learning_rate": 3.351060623468083e-07, "logps/chosen": -40.71614074707031, "logps/rejected": -71.04171752929688, "loss": 0.2715, "losses/dpo": 0.5620778799057007, "losses/sft": 1.4084666967391968, "losses/total": 0.5620778799057007, "ref_logps/chosen": -27.708553314208984, "ref_logps/rejected": -39.591697692871094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3007584810256958, "rewards/margins": 1.8442435264587402, "rewards/rejected": -3.1450018882751465, "step": 1297 }, { "epoch": 1.22, "grad_norm": 20.5552543504661, "learning_rate": 3.348665604299112e-07, "logps/chosen": -30.415069580078125, "logps/rejected": -56.826393127441406, "loss": 0.4107, "losses/dpo": 1.1108335256576538, "losses/sft": 1.283721923828125, "losses/total": 1.1108335256576538, "ref_logps/chosen": -19.870473861694336, "ref_logps/rejected": -30.335208892822266, "rewards/accuracies": 0.8125, "rewards/chosen": -1.054459810256958, "rewards/margins": 1.5946587324142456, "rewards/rejected": -2.649118423461914, "step": 1298 }, { "epoch": 1.23, "grad_norm": 9.30963046618163, "learning_rate": 3.3462697044715615e-07, "logps/chosen": -33.19233703613281, "logps/rejected": -68.5179672241211, "loss": 0.1582, "losses/dpo": 0.019849369302392006, "losses/sft": 1.1201117038726807, "losses/total": 0.019849369302392006, "ref_logps/chosen": -22.12771987915039, "ref_logps/rejected": -33.91117477416992, "rewards/accuracies": 1.0, "rewards/chosen": -1.1064614057540894, "rewards/margins": 2.354217767715454, "rewards/rejected": -3.460679531097412, "step": 1299 }, { "epoch": 1.23, "grad_norm": 19.24519688994208, "learning_rate": 3.343872926471653e-07, "logps/chosen": -42.116416931152344, "logps/rejected": -59.734046936035156, "loss": 0.2593, "losses/dpo": 0.16241112351417542, "losses/sft": 1.1847478151321411, "losses/total": 0.16241112351417542, "ref_logps/chosen": -32.55171585083008, "ref_logps/rejected": -33.28989791870117, "rewards/accuracies": 0.875, "rewards/chosen": -0.9564698934555054, "rewards/margins": 1.6879452466964722, "rewards/rejected": -2.6444153785705566, "step": 1300 }, { "epoch": 1.23, "grad_norm": 24.86790621694099, "learning_rate": 3.3414752727865175e-07, "logps/chosen": -45.73321533203125, "logps/rejected": -65.62512969970703, "loss": 0.4138, "losses/dpo": 1.2703280448913574, "losses/sft": 1.6700186729431152, "losses/total": 1.2703280448913574, "ref_logps/chosen": -30.144081115722656, "ref_logps/rejected": -32.71132278442383, "rewards/accuracies": 0.875, "rewards/chosen": -1.5589133501052856, "rewards/margins": 1.7324671745300293, "rewards/rejected": -3.2913804054260254, "step": 1301 }, { "epoch": 1.23, "grad_norm": 25.700066291079484, "learning_rate": 3.3390767459041964e-07, "logps/chosen": -43.2906494140625, "logps/rejected": -57.74649429321289, "loss": 0.4502, "losses/dpo": 0.7590955495834351, "losses/sft": 1.5504162311553955, "losses/total": 0.7590955495834351, "ref_logps/chosen": -26.00358772277832, "ref_logps/rejected": -29.032638549804688, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7287063598632812, "rewards/margins": 1.1426790952682495, "rewards/rejected": -2.8713855743408203, "step": 1302 }, { "epoch": 1.23, "grad_norm": 19.373294777492653, "learning_rate": 3.336677348313635e-07, "logps/chosen": -35.98982238769531, "logps/rejected": -58.63481140136719, "loss": 0.3179, "losses/dpo": 0.025231046602129936, "losses/sft": 0.4818132519721985, "losses/total": 0.025231046602129936, "ref_logps/chosen": -24.50233268737793, "ref_logps/rejected": -32.35173797607422, "rewards/accuracies": 0.9375, "rewards/chosen": -1.148748755455017, "rewards/margins": 1.4795585870742798, "rewards/rejected": -2.628307342529297, "step": 1303 }, { "epoch": 1.23, "grad_norm": 22.009153250935487, "learning_rate": 3.334277082504684e-07, "logps/chosen": -29.082143783569336, "logps/rejected": -41.56816101074219, "loss": 0.4528, "losses/dpo": 0.13927435874938965, "losses/sft": 0.2792842984199524, "losses/total": 0.13927435874938965, "ref_logps/chosen": -20.126379013061523, "ref_logps/rejected": -21.773983001708984, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8955765962600708, "rewards/margins": 1.0838415622711182, "rewards/rejected": -1.9794180393218994, "step": 1304 }, { "epoch": 1.23, "grad_norm": 19.33258363240833, "learning_rate": 3.3318759509680945e-07, "logps/chosen": -31.65786361694336, "logps/rejected": -59.37849426269531, "loss": 0.3478, "losses/dpo": 0.1199355348944664, "losses/sft": 0.24104608595371246, "losses/total": 0.1199355348944664, "ref_logps/chosen": -20.713666915893555, "ref_logps/rejected": -30.86538314819336, "rewards/accuracies": 0.875, "rewards/chosen": -1.0944199562072754, "rewards/margins": 1.7568914890289307, "rewards/rejected": -2.851311445236206, "step": 1305 }, { "epoch": 1.23, "grad_norm": 21.427262931415616, "learning_rate": 3.329473956195515e-07, "logps/chosen": -41.68986511230469, "logps/rejected": -74.02061462402344, "loss": 0.3866, "losses/dpo": 0.03348356857895851, "losses/sft": 1.2158939838409424, "losses/total": 0.03348356857895851, "ref_logps/chosen": -28.10188865661621, "ref_logps/rejected": -43.96015167236328, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3587979078292847, "rewards/margins": 1.64724862575531, "rewards/rejected": -3.0060462951660156, "step": 1306 }, { "epoch": 1.23, "grad_norm": 17.3722362100418, "learning_rate": 3.3270711006794916e-07, "logps/chosen": -38.973167419433594, "logps/rejected": -63.86054992675781, "loss": 0.2924, "losses/dpo": 0.17515498399734497, "losses/sft": 1.6944915056228638, "losses/total": 0.17515498399734497, "ref_logps/chosen": -25.352554321289062, "ref_logps/rejected": -29.864917755126953, "rewards/accuracies": 0.875, "rewards/chosen": -1.3620612621307373, "rewards/margins": 2.037501811981201, "rewards/rejected": -3.3995630741119385, "step": 1307 }, { "epoch": 1.23, "grad_norm": 23.919903407321602, "learning_rate": 3.3246673869134616e-07, "logps/chosen": -47.41429901123047, "logps/rejected": -61.52079772949219, "loss": 0.4625, "losses/dpo": 0.14473123848438263, "losses/sft": 1.3724931478500366, "losses/total": 0.14473123848438263, "ref_logps/chosen": -31.708065032958984, "ref_logps/rejected": -35.471675872802734, "rewards/accuracies": 0.75, "rewards/chosen": -1.5706233978271484, "rewards/margins": 1.0342891216278076, "rewards/rejected": -2.604912519454956, "step": 1308 }, { "epoch": 1.23, "grad_norm": 17.88545822288946, "learning_rate": 3.322262817391755e-07, "logps/chosen": -42.522544860839844, "logps/rejected": -70.14311218261719, "loss": 0.2482, "losses/dpo": 0.5525601506233215, "losses/sft": 0.6553017497062683, "losses/total": 0.5525601506233215, "ref_logps/chosen": -29.2100772857666, "ref_logps/rejected": -35.55570983886719, "rewards/accuracies": 0.9375, "rewards/chosen": -1.331247091293335, "rewards/margins": 2.1274943351745605, "rewards/rejected": -3.4587411880493164, "step": 1309 }, { "epoch": 1.24, "grad_norm": 20.500361449593584, "learning_rate": 3.319857394609588e-07, "logps/chosen": -36.243160247802734, "logps/rejected": -67.50515747070312, "loss": 0.2714, "losses/dpo": 0.03326806798577309, "losses/sft": 1.3428659439086914, "losses/total": 0.03326806798577309, "ref_logps/chosen": -22.419761657714844, "ref_logps/rejected": -36.684478759765625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3823397159576416, "rewards/margins": 1.6997281312942505, "rewards/rejected": -3.0820679664611816, "step": 1310 }, { "epoch": 1.24, "grad_norm": 17.85900767301599, "learning_rate": 3.3174511210630633e-07, "logps/chosen": -42.582862854003906, "logps/rejected": -74.05235290527344, "loss": 0.2027, "losses/dpo": 0.05948629230260849, "losses/sft": 1.9002223014831543, "losses/total": 0.05948629230260849, "ref_logps/chosen": -29.342405319213867, "ref_logps/rejected": -39.912899017333984, "rewards/accuracies": 1.0, "rewards/chosen": -1.3240458965301514, "rewards/margins": 2.089899778366089, "rewards/rejected": -3.413945436477661, "step": 1311 }, { "epoch": 1.24, "grad_norm": 14.241654841817063, "learning_rate": 3.315043999249166e-07, "logps/chosen": -47.93964767456055, "logps/rejected": -76.70834350585938, "loss": 0.2564, "losses/dpo": 0.04149313643574715, "losses/sft": 1.356377363204956, "losses/total": 0.04149313643574715, "ref_logps/chosen": -28.026004791259766, "ref_logps/rejected": -37.43848419189453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9913640022277832, "rewards/margins": 1.9356216192245483, "rewards/rejected": -3.926985740661621, "step": 1312 }, { "epoch": 1.24, "grad_norm": 24.606974614621265, "learning_rate": 3.3126360316657613e-07, "logps/chosen": -42.86430358886719, "logps/rejected": -48.35157012939453, "loss": 0.4778, "losses/dpo": 0.9610872268676758, "losses/sft": 1.3208250999450684, "losses/total": 0.9610872268676758, "ref_logps/chosen": -27.95205307006836, "ref_logps/rejected": -23.369556427001953, "rewards/accuracies": 0.75, "rewards/chosen": -1.491225004196167, "rewards/margins": 1.0069762468338013, "rewards/rejected": -2.498201370239258, "step": 1313 }, { "epoch": 1.24, "grad_norm": 13.418493517792065, "learning_rate": 3.310227220811593e-07, "logps/chosen": -39.66161346435547, "logps/rejected": -65.03255462646484, "loss": 0.2527, "losses/dpo": 0.06429160386323929, "losses/sft": 1.7636698484420776, "losses/total": 0.06429160386323929, "ref_logps/chosen": -24.86683464050293, "ref_logps/rejected": -31.875904083251953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.479478120803833, "rewards/margins": 1.8361867666244507, "rewards/rejected": -3.3156650066375732, "step": 1314 }, { "epoch": 1.24, "grad_norm": 15.359581029994347, "learning_rate": 3.3078175691862774e-07, "logps/chosen": -33.67129898071289, "logps/rejected": -51.054813385009766, "loss": 0.36, "losses/dpo": 0.02501952275633812, "losses/sft": 0.5470609664916992, "losses/total": 0.02501952275633812, "ref_logps/chosen": -23.429656982421875, "ref_logps/rejected": -26.531875610351562, "rewards/accuracies": 0.75, "rewards/chosen": -1.0241641998291016, "rewards/margins": 1.4281299114227295, "rewards/rejected": -2.452293872833252, "step": 1315 }, { "epoch": 1.24, "grad_norm": 17.73845151625755, "learning_rate": 3.3054070792903073e-07, "logps/chosen": -40.0555305480957, "logps/rejected": -57.17131423950195, "loss": 0.2783, "losses/dpo": 0.2942008674144745, "losses/sft": 0.5789414644241333, "losses/total": 0.2942008674144745, "ref_logps/chosen": -28.356971740722656, "ref_logps/rejected": -27.301836013793945, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1698561906814575, "rewards/margins": 1.8170913457870483, "rewards/rejected": -2.986947536468506, "step": 1316 }, { "epoch": 1.24, "grad_norm": 17.559255709183926, "learning_rate": 3.302995753625042e-07, "logps/chosen": -41.27039337158203, "logps/rejected": -63.307151794433594, "loss": 0.3363, "losses/dpo": 0.13473422825336456, "losses/sft": 1.2800145149230957, "losses/total": 0.13473422825336456, "ref_logps/chosen": -29.162837982177734, "ref_logps/rejected": -32.675106048583984, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2107555866241455, "rewards/margins": 1.8524491786956787, "rewards/rejected": -3.063204765319824, "step": 1317 }, { "epoch": 1.24, "grad_norm": 11.81902055723072, "learning_rate": 3.30058359469271e-07, "logps/chosen": -41.779685974121094, "logps/rejected": -73.80335998535156, "loss": 0.2073, "losses/dpo": 0.09581923484802246, "losses/sft": 1.6135286092758179, "losses/total": 0.09581923484802246, "ref_logps/chosen": -29.18328094482422, "ref_logps/rejected": -37.95733642578125, "rewards/accuracies": 0.875, "rewards/chosen": -1.2596404552459717, "rewards/margins": 2.3249616622924805, "rewards/rejected": -3.584602117538452, "step": 1318 }, { "epoch": 1.24, "grad_norm": 18.38235905635225, "learning_rate": 3.2981706049964033e-07, "logps/chosen": -49.35371398925781, "logps/rejected": -72.09634399414062, "loss": 0.2899, "losses/dpo": 0.10837016254663467, "losses/sft": 1.8398723602294922, "losses/total": 0.10837016254663467, "ref_logps/chosen": -34.92890548706055, "ref_logps/rejected": -40.308494567871094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4424805641174316, "rewards/margins": 1.736304759979248, "rewards/rejected": -3.1787853240966797, "step": 1319 }, { "epoch": 1.25, "grad_norm": 21.72186625743247, "learning_rate": 3.295756787040075e-07, "logps/chosen": -56.13287353515625, "logps/rejected": -63.69792556762695, "loss": 0.3761, "losses/dpo": 0.011630987748503685, "losses/sft": 1.711033582687378, "losses/total": 0.011630987748503685, "ref_logps/chosen": -38.754859924316406, "ref_logps/rejected": -31.850805282592773, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7378019094467163, "rewards/margins": 1.4469103813171387, "rewards/rejected": -3.1847124099731445, "step": 1320 }, { "epoch": 1.25, "grad_norm": 16.59479847603965, "learning_rate": 3.2933421433285423e-07, "logps/chosen": -39.04832077026367, "logps/rejected": -60.254783630371094, "loss": 0.2335, "losses/dpo": 0.032486096024513245, "losses/sft": 1.590760350227356, "losses/total": 0.032486096024513245, "ref_logps/chosen": -28.103919982910156, "ref_logps/rejected": -29.861114501953125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.094440221786499, "rewards/margins": 1.9449269771575928, "rewards/rejected": -3.039367198944092, "step": 1321 }, { "epoch": 1.25, "grad_norm": 10.630780028992673, "learning_rate": 3.2909266763674736e-07, "logps/chosen": -30.600135803222656, "logps/rejected": -64.17646789550781, "loss": 0.1779, "losses/dpo": 0.04556729644536972, "losses/sft": 0.42786240577697754, "losses/total": 0.04556729644536972, "ref_logps/chosen": -20.108219146728516, "ref_logps/rejected": -30.276708602905273, "rewards/accuracies": 1.0, "rewards/chosen": -1.0491915941238403, "rewards/margins": 2.3407840728759766, "rewards/rejected": -3.3899755477905273, "step": 1322 }, { "epoch": 1.25, "grad_norm": 17.79928992127477, "learning_rate": 3.288510388663394e-07, "logps/chosen": -40.92877960205078, "logps/rejected": -61.60036087036133, "loss": 0.3557, "losses/dpo": 0.20797428488731384, "losses/sft": 0.6011282205581665, "losses/total": 0.20797428488731384, "ref_logps/chosen": -28.43454360961914, "ref_logps/rejected": -31.711772918701172, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2494232654571533, "rewards/margins": 1.7394354343414307, "rewards/rejected": -2.988858699798584, "step": 1323 }, { "epoch": 1.25, "grad_norm": 17.352500621246985, "learning_rate": 3.286093282723682e-07, "logps/chosen": -43.602439880371094, "logps/rejected": -61.83216857910156, "loss": 0.3114, "losses/dpo": 0.420665979385376, "losses/sft": 0.4082103967666626, "losses/total": 0.420665979385376, "ref_logps/chosen": -31.120607376098633, "ref_logps/rejected": -33.616127014160156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2481837272644043, "rewards/margins": 1.573420524597168, "rewards/rejected": -2.8216042518615723, "step": 1324 }, { "epoch": 1.25, "grad_norm": 18.856148718389125, "learning_rate": 3.283675361056562e-07, "logps/chosen": -36.15111541748047, "logps/rejected": -50.17252731323242, "loss": 0.417, "losses/dpo": 0.3310679793357849, "losses/sft": 0.8636975884437561, "losses/total": 0.3310679793357849, "ref_logps/chosen": -22.06011962890625, "ref_logps/rejected": -25.567522048950195, "rewards/accuracies": 0.875, "rewards/chosen": -1.4090993404388428, "rewards/margins": 1.051401138305664, "rewards/rejected": -2.460500478744507, "step": 1325 }, { "epoch": 1.25, "grad_norm": 20.56865023040543, "learning_rate": 3.281256626171108e-07, "logps/chosen": -45.79396057128906, "logps/rejected": -79.83161926269531, "loss": 0.3464, "losses/dpo": 0.04443291202187538, "losses/sft": 1.4568358659744263, "losses/total": 0.04443291202187538, "ref_logps/chosen": -30.778833389282227, "ref_logps/rejected": -42.3217658996582, "rewards/accuracies": 0.875, "rewards/chosen": -1.5015122890472412, "rewards/margins": 2.2494733333587646, "rewards/rejected": -3.750985622406006, "step": 1326 }, { "epoch": 1.25, "grad_norm": 18.342345014312833, "learning_rate": 3.278837080577235e-07, "logps/chosen": -53.557437896728516, "logps/rejected": -75.04664611816406, "loss": 0.233, "losses/dpo": 0.05625596642494202, "losses/sft": 1.4795225858688354, "losses/total": 0.05625596642494202, "ref_logps/chosen": -37.011375427246094, "ref_logps/rejected": -39.01417922973633, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6546061038970947, "rewards/margins": 1.9486407041549683, "rewards/rejected": -3.6032469272613525, "step": 1327 }, { "epoch": 1.25, "grad_norm": 18.988391407345926, "learning_rate": 3.276416726785701e-07, "logps/chosen": -46.279441833496094, "logps/rejected": -75.1138916015625, "loss": 0.2936, "losses/dpo": 0.0020275975111871958, "losses/sft": 1.5373157262802124, "losses/total": 0.0020275975111871958, "ref_logps/chosen": -30.26971435546875, "ref_logps/rejected": -40.91531753540039, "rewards/accuracies": 0.875, "rewards/chosen": -1.60097336769104, "rewards/margins": 1.8188841342926025, "rewards/rejected": -3.4198572635650635, "step": 1328 }, { "epoch": 1.25, "grad_norm": 13.661084259630732, "learning_rate": 3.2739955673081014e-07, "logps/chosen": -40.9703369140625, "logps/rejected": -64.78068542480469, "loss": 0.2349, "losses/dpo": 0.9733134508132935, "losses/sft": 0.8587873578071594, "losses/total": 0.9733134508132935, "ref_logps/chosen": -28.43806266784668, "ref_logps/rejected": -32.33397674560547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2532275915145874, "rewards/margins": 1.9914436340332031, "rewards/rejected": -3.24467134475708, "step": 1329 }, { "epoch": 1.25, "grad_norm": 16.11426124680703, "learning_rate": 3.2715736046568705e-07, "logps/chosen": -46.05767822265625, "logps/rejected": -65.45758056640625, "loss": 0.2924, "losses/dpo": 0.2684297263622284, "losses/sft": 0.8939210176467896, "losses/total": 0.2684297263622284, "ref_logps/chosen": -28.42173194885254, "ref_logps/rejected": -34.022708892822266, "rewards/accuracies": 0.9375, "rewards/chosen": -1.763594627380371, "rewards/margins": 1.3798928260803223, "rewards/rejected": -3.1434874534606934, "step": 1330 }, { "epoch": 1.26, "grad_norm": 25.082966090757083, "learning_rate": 3.269150841345272e-07, "logps/chosen": -49.6655158996582, "logps/rejected": -64.84808349609375, "loss": 0.5349, "losses/dpo": 0.11584257334470749, "losses/sft": 1.4175316095352173, "losses/total": 0.11584257334470749, "ref_logps/chosen": -31.959745407104492, "ref_logps/rejected": -33.272552490234375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7705771923065186, "rewards/margins": 1.3869762420654297, "rewards/rejected": -3.1575534343719482, "step": 1331 }, { "epoch": 1.26, "grad_norm": 15.282818269120018, "learning_rate": 3.2667272798874033e-07, "logps/chosen": -34.5420036315918, "logps/rejected": -68.30073547363281, "loss": 0.2515, "losses/dpo": 0.0027874349616467953, "losses/sft": 0.8942425847053528, "losses/total": 0.0027874349616467953, "ref_logps/chosen": -23.386384963989258, "ref_logps/rejected": -38.50825119018555, "rewards/accuracies": 1.0, "rewards/chosen": -1.1155619621276855, "rewards/margins": 1.8636868000030518, "rewards/rejected": -2.9792490005493164, "step": 1332 }, { "epoch": 1.26, "grad_norm": 18.382953030690278, "learning_rate": 3.2643029227981886e-07, "logps/chosen": -40.57898712158203, "logps/rejected": -55.9635009765625, "loss": 0.321, "losses/dpo": 0.109612837433815, "losses/sft": 0.8761170506477356, "losses/total": 0.109612837433815, "ref_logps/chosen": -29.116920471191406, "ref_logps/rejected": -32.33693313598633, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1462066173553467, "rewards/margins": 1.2164502143859863, "rewards/rejected": -2.362656831741333, "step": 1333 }, { "epoch": 1.26, "grad_norm": 17.64754481476368, "learning_rate": 3.26187777259338e-07, "logps/chosen": -41.14385986328125, "logps/rejected": -56.11606216430664, "loss": 0.3175, "losses/dpo": 0.10707245767116547, "losses/sft": 1.6178311109542847, "losses/total": 0.10707245767116547, "ref_logps/chosen": -27.080177307128906, "ref_logps/rejected": -26.479215621948242, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4063684940338135, "rewards/margins": 1.5573160648345947, "rewards/rejected": -2.963684558868408, "step": 1334 }, { "epoch": 1.26, "grad_norm": 23.52088901092022, "learning_rate": 3.2594518317895495e-07, "logps/chosen": -40.0862922668457, "logps/rejected": -55.961692810058594, "loss": 0.4042, "losses/dpo": 0.19135548174381256, "losses/sft": 0.8598500490188599, "losses/total": 0.19135548174381256, "ref_logps/chosen": -25.18153190612793, "ref_logps/rejected": -27.781646728515625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.490476131439209, "rewards/margins": 1.327528476715088, "rewards/rejected": -2.818004608154297, "step": 1335 }, { "epoch": 1.26, "grad_norm": 9.991731600243668, "learning_rate": 3.257025102904092e-07, "logps/chosen": -40.133689880371094, "logps/rejected": -76.61393737792969, "loss": 0.1733, "losses/dpo": 0.037086356431245804, "losses/sft": 1.11514151096344, "losses/total": 0.037086356431245804, "ref_logps/chosen": -29.70978546142578, "ref_logps/rejected": -41.31547927856445, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0423901081085205, "rewards/margins": 2.4874556064605713, "rewards/rejected": -3.529845714569092, "step": 1336 }, { "epoch": 1.26, "grad_norm": 11.040045776596964, "learning_rate": 3.2545975884552176e-07, "logps/chosen": -43.54847717285156, "logps/rejected": -78.20092010498047, "loss": 0.184, "losses/dpo": 0.134330153465271, "losses/sft": 2.2850735187530518, "losses/total": 0.134330153465271, "ref_logps/chosen": -31.085721969604492, "ref_logps/rejected": -43.563499450683594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2462754249572754, "rewards/margins": 2.2174668312072754, "rewards/rejected": -3.463742256164551, "step": 1337 }, { "epoch": 1.26, "grad_norm": 16.92432685253981, "learning_rate": 3.252169290961954e-07, "logps/chosen": -32.71925354003906, "logps/rejected": -70.55846405029297, "loss": 0.2443, "losses/dpo": 0.01862584799528122, "losses/sft": 2.019355297088623, "losses/total": 0.01862584799528122, "ref_logps/chosen": -20.23807716369629, "ref_logps/rejected": -35.292816162109375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2481175661087036, "rewards/margins": 2.278447389602661, "rewards/rejected": -3.526564836502075, "step": 1338 }, { "epoch": 1.26, "grad_norm": 14.776046272458977, "learning_rate": 3.249740212944141e-07, "logps/chosen": -38.242156982421875, "logps/rejected": -77.92585754394531, "loss": 0.2214, "losses/dpo": 0.09270410984754562, "losses/sft": 1.4647746086120605, "losses/total": 0.09270410984754562, "ref_logps/chosen": -27.25220489501953, "ref_logps/rejected": -42.44512939453125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0989954471588135, "rewards/margins": 2.449077606201172, "rewards/rejected": -3.5480728149414062, "step": 1339 }, { "epoch": 1.26, "grad_norm": 29.338223888017907, "learning_rate": 3.247310356922427e-07, "logps/chosen": -35.6669807434082, "logps/rejected": -59.11750411987305, "loss": 0.6359, "losses/dpo": 0.8288795948028564, "losses/sft": 0.9778329730033875, "losses/total": 0.8288795948028564, "ref_logps/chosen": -19.304914474487305, "ref_logps/rejected": -30.3583984375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6362066268920898, "rewards/margins": 1.23970365524292, "rewards/rejected": -2.8759102821350098, "step": 1340 }, { "epoch": 1.27, "grad_norm": 15.1656328922838, "learning_rate": 3.2448797254182704e-07, "logps/chosen": -39.88763427734375, "logps/rejected": -68.71611785888672, "loss": 0.2428, "losses/dpo": 0.5392289757728577, "losses/sft": 0.807812511920929, "losses/total": 0.5392289757728577, "ref_logps/chosen": -26.56769561767578, "ref_logps/rejected": -35.557823181152344, "rewards/accuracies": 1.0, "rewards/chosen": -1.3319939374923706, "rewards/margins": 1.9838361740112305, "rewards/rejected": -3.3158302307128906, "step": 1341 }, { "epoch": 1.27, "grad_norm": 16.806400941771354, "learning_rate": 3.24244832095393e-07, "logps/chosen": -38.266082763671875, "logps/rejected": -52.94994354248047, "loss": 0.3991, "losses/dpo": 1.2014124393463135, "losses/sft": 1.9436874389648438, "losses/total": 1.2014124393463135, "ref_logps/chosen": -26.145401000976562, "ref_logps/rejected": -26.92449378967285, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2120682001113892, "rewards/margins": 1.3904770612716675, "rewards/rejected": -2.6025452613830566, "step": 1342 }, { "epoch": 1.27, "grad_norm": 20.268993746321836, "learning_rate": 3.2400161460524715e-07, "logps/chosen": -38.023033142089844, "logps/rejected": -59.956199645996094, "loss": 0.3881, "losses/dpo": 0.11271297931671143, "losses/sft": 1.0046316385269165, "losses/total": 0.11271297931671143, "ref_logps/chosen": -24.304122924804688, "ref_logps/rejected": -31.194162368774414, "rewards/accuracies": 0.875, "rewards/chosen": -1.3718910217285156, "rewards/margins": 1.5043129920959473, "rewards/rejected": -2.876204013824463, "step": 1343 }, { "epoch": 1.27, "grad_norm": 22.339421245223072, "learning_rate": 3.237583203237757e-07, "logps/chosen": -47.256439208984375, "logps/rejected": -95.50982666015625, "loss": 0.2773, "losses/dpo": 0.0003681657835841179, "losses/sft": 1.0107388496398926, "losses/total": 0.0003681657835841179, "ref_logps/chosen": -29.428592681884766, "ref_logps/rejected": -50.21084213256836, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7827849388122559, "rewards/margins": 2.7471134662628174, "rewards/rejected": -4.529898643493652, "step": 1344 }, { "epoch": 1.27, "grad_norm": 23.027922023229568, "learning_rate": 3.235149495034446e-07, "logps/chosen": -50.933921813964844, "logps/rejected": -64.36026000976562, "loss": 0.3662, "losses/dpo": 0.04062264785170555, "losses/sft": 1.2182872295379639, "losses/total": 0.04062264785170555, "ref_logps/chosen": -31.97607421875, "ref_logps/rejected": -30.913761138916016, "rewards/accuracies": 0.875, "rewards/chosen": -1.8957850933074951, "rewards/margins": 1.4488649368286133, "rewards/rejected": -3.3446500301361084, "step": 1345 }, { "epoch": 1.27, "grad_norm": 15.731203361724647, "learning_rate": 3.2327150239679936e-07, "logps/chosen": -38.80110168457031, "logps/rejected": -54.62010955810547, "loss": 0.3013, "losses/dpo": 0.1633961796760559, "losses/sft": 0.5231031775474548, "losses/total": 0.1633961796760559, "ref_logps/chosen": -24.615690231323242, "ref_logps/rejected": -24.930362701416016, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4185411930084229, "rewards/margins": 1.5504333972930908, "rewards/rejected": -2.9689745903015137, "step": 1346 }, { "epoch": 1.27, "grad_norm": 15.802411961143322, "learning_rate": 3.230279792564644e-07, "logps/chosen": -42.60869216918945, "logps/rejected": -67.26438903808594, "loss": 0.246, "losses/dpo": 0.15519705414772034, "losses/sft": 2.00467586517334, "losses/total": 0.15519705414772034, "ref_logps/chosen": -25.991539001464844, "ref_logps/rejected": -33.48909378051758, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6617152690887451, "rewards/margins": 1.7158138751983643, "rewards/rejected": -3.3775291442871094, "step": 1347 }, { "epoch": 1.27, "grad_norm": 34.873584683031936, "learning_rate": 3.227843803351434e-07, "logps/chosen": -52.32500457763672, "logps/rejected": -60.31515884399414, "loss": 0.5639, "losses/dpo": 0.3959197998046875, "losses/sft": 1.3024147748947144, "losses/total": 0.3959197998046875, "ref_logps/chosen": -34.005836486816406, "ref_logps/rejected": -31.008913040161133, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8319168090820312, "rewards/margins": 1.098707914352417, "rewards/rejected": -2.9306247234344482, "step": 1348 }, { "epoch": 1.27, "grad_norm": 14.984760584717085, "learning_rate": 3.2254070588561825e-07, "logps/chosen": -46.550052642822266, "logps/rejected": -74.95921325683594, "loss": 0.2367, "losses/dpo": 0.7506791949272156, "losses/sft": 0.9414228796958923, "losses/total": 0.7506791949272156, "ref_logps/chosen": -30.354713439941406, "ref_logps/rejected": -36.38290786743164, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6195337772369385, "rewards/margins": 2.2380971908569336, "rewards/rejected": -3.857630729675293, "step": 1349 }, { "epoch": 1.27, "grad_norm": 19.185359929067943, "learning_rate": 3.222969561607495e-07, "logps/chosen": -36.33119201660156, "logps/rejected": -75.44927215576172, "loss": 0.2876, "losses/dpo": 1.3974391222000122, "losses/sft": 1.447836995124817, "losses/total": 1.3974391222000122, "ref_logps/chosen": -26.439157485961914, "ref_logps/rejected": -41.91450119018555, "rewards/accuracies": 0.875, "rewards/chosen": -0.989203691482544, "rewards/margins": 2.3642733097076416, "rewards/rejected": -3.3534770011901855, "step": 1350 }, { "epoch": 1.27, "grad_norm": 27.200686781674772, "learning_rate": 3.220531314134758e-07, "logps/chosen": -48.340599060058594, "logps/rejected": -55.08031463623047, "loss": 0.4664, "losses/dpo": 0.029192063957452774, "losses/sft": 1.3451035022735596, "losses/total": 0.029192063957452774, "ref_logps/chosen": -32.685184478759766, "ref_logps/rejected": -27.253250122070312, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5655418634414673, "rewards/margins": 1.2171647548675537, "rewards/rejected": -2.7827064990997314, "step": 1351 }, { "epoch": 1.28, "grad_norm": 13.253563625308749, "learning_rate": 3.218092318968136e-07, "logps/chosen": -38.82443618774414, "logps/rejected": -82.58684539794922, "loss": 0.1705, "losses/dpo": 0.04752665013074875, "losses/sft": 1.0353543758392334, "losses/total": 0.04752665013074875, "ref_logps/chosen": -26.364177703857422, "ref_logps/rejected": -43.82279968261719, "rewards/accuracies": 1.0, "rewards/chosen": -1.2460256814956665, "rewards/margins": 2.630378246307373, "rewards/rejected": -3.87640380859375, "step": 1352 }, { "epoch": 1.28, "grad_norm": 15.668450010937388, "learning_rate": 3.215652578638569e-07, "logps/chosen": -44.71215057373047, "logps/rejected": -65.09803009033203, "loss": 0.278, "losses/dpo": 0.08203906565904617, "losses/sft": 1.7939281463623047, "losses/total": 0.08203906565904617, "ref_logps/chosen": -30.530597686767578, "ref_logps/rejected": -31.718385696411133, "rewards/accuracies": 0.875, "rewards/chosen": -1.4181556701660156, "rewards/margins": 1.919809103012085, "rewards/rejected": -3.3379647731781006, "step": 1353 }, { "epoch": 1.28, "grad_norm": 13.908518268723824, "learning_rate": 3.2132120956777705e-07, "logps/chosen": -54.93191909790039, "logps/rejected": -96.65616607666016, "loss": 0.1701, "losses/dpo": 0.12896223366260529, "losses/sft": 1.0173639059066772, "losses/total": 0.12896223366260529, "ref_logps/chosen": -38.17130661010742, "ref_logps/rejected": -56.246368408203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6760611534118652, "rewards/margins": 2.3649189472198486, "rewards/rejected": -4.040980339050293, "step": 1354 }, { "epoch": 1.28, "grad_norm": 17.58701584664112, "learning_rate": 3.2107708726182247e-07, "logps/chosen": -48.39830017089844, "logps/rejected": -67.48295593261719, "loss": 0.3017, "losses/dpo": 0.4826205372810364, "losses/sft": 2.4731802940368652, "losses/total": 0.4826205372810364, "ref_logps/chosen": -32.29206466674805, "ref_logps/rejected": -33.62629318237305, "rewards/accuracies": 0.875, "rewards/chosen": -1.6106231212615967, "rewards/margins": 1.7750434875488281, "rewards/rejected": -3.385666847229004, "step": 1355 }, { "epoch": 1.28, "grad_norm": 18.804317364597026, "learning_rate": 3.2083289119931845e-07, "logps/chosen": -50.2794303894043, "logps/rejected": -61.081787109375, "loss": 0.3298, "losses/dpo": 1.1905138492584229, "losses/sft": 2.3788866996765137, "losses/total": 1.1905138492584229, "ref_logps/chosen": -36.726890563964844, "ref_logps/rejected": -33.399871826171875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3552536964416504, "rewards/margins": 1.4129376411437988, "rewards/rejected": -2.768191337585449, "step": 1356 }, { "epoch": 1.28, "grad_norm": 22.373501153172004, "learning_rate": 3.2058862163366663e-07, "logps/chosen": -44.1959228515625, "logps/rejected": -76.89352416992188, "loss": 0.3124, "losses/dpo": 0.8814524412155151, "losses/sft": 2.666790246963501, "losses/total": 0.8814524412155151, "ref_logps/chosen": -32.113441467285156, "ref_logps/rejected": -40.992347717285156, "rewards/accuracies": 0.875, "rewards/chosen": -1.2082481384277344, "rewards/margins": 2.3818697929382324, "rewards/rejected": -3.590117931365967, "step": 1357 }, { "epoch": 1.28, "grad_norm": 20.88281341187721, "learning_rate": 3.2034427881834513e-07, "logps/chosen": -38.1577262878418, "logps/rejected": -67.50225830078125, "loss": 0.3491, "losses/dpo": 0.5172911286354065, "losses/sft": 1.9154108762741089, "losses/total": 0.5172911286354065, "ref_logps/chosen": -22.361751556396484, "ref_logps/rejected": -38.26085662841797, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5795974731445312, "rewards/margins": 1.3445427417755127, "rewards/rejected": -2.924140214920044, "step": 1358 }, { "epoch": 1.28, "grad_norm": 20.027995478492457, "learning_rate": 3.200998630069079e-07, "logps/chosen": -30.665260314941406, "logps/rejected": -59.51911163330078, "loss": 0.3336, "losses/dpo": 0.8535985946655273, "losses/sft": 1.173034906387329, "losses/total": 0.8535985946655273, "ref_logps/chosen": -21.29233169555664, "ref_logps/rejected": -29.71279525756836, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9372931718826294, "rewards/margins": 2.0433387756347656, "rewards/rejected": -2.9806318283081055, "step": 1359 }, { "epoch": 1.28, "grad_norm": 14.986981912986737, "learning_rate": 3.198553744529849e-07, "logps/chosen": -29.49150848388672, "logps/rejected": -51.052711486816406, "loss": 0.3223, "losses/dpo": 0.33856210112571716, "losses/sft": 0.7521783709526062, "losses/total": 0.33856210112571716, "ref_logps/chosen": -17.668827056884766, "ref_logps/rejected": -25.290678024291992, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1822683811187744, "rewards/margins": 1.3939348459243774, "rewards/rejected": -2.576202869415283, "step": 1360 }, { "epoch": 1.28, "grad_norm": 25.558994390372366, "learning_rate": 3.1961081341028134e-07, "logps/chosen": -42.91202163696289, "logps/rejected": -60.718040466308594, "loss": 0.445, "losses/dpo": 0.3875630795955658, "losses/sft": 1.0723533630371094, "losses/total": 0.3875630795955658, "ref_logps/chosen": -28.129531860351562, "ref_logps/rejected": -34.84735870361328, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4782493114471436, "rewards/margins": 1.1088188886642456, "rewards/rejected": -2.5870680809020996, "step": 1361 }, { "epoch": 1.28, "grad_norm": 21.287713017188878, "learning_rate": 3.1936618013257764e-07, "logps/chosen": -34.3732795715332, "logps/rejected": -58.87042999267578, "loss": 0.3693, "losses/dpo": 0.6931472420692444, "losses/sft": 0.5391783118247986, "losses/total": 0.6931472420692444, "ref_logps/chosen": -22.020662307739258, "ref_logps/rejected": -30.7923583984375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2352615594863892, "rewards/margins": 1.5725452899932861, "rewards/rejected": -2.807806968688965, "step": 1362 }, { "epoch": 1.29, "grad_norm": 17.967535957714606, "learning_rate": 3.191214748737294e-07, "logps/chosen": -53.40403747558594, "logps/rejected": -82.31361389160156, "loss": 0.1936, "losses/dpo": 0.524944007396698, "losses/sft": 1.8395313024520874, "losses/total": 0.524944007396698, "ref_logps/chosen": -37.013648986816406, "ref_logps/rejected": -44.74785614013672, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6390385627746582, "rewards/margins": 2.117537498474121, "rewards/rejected": -3.7565758228302, "step": 1363 }, { "epoch": 1.29, "grad_norm": 12.617902478478397, "learning_rate": 3.188766978876666e-07, "logps/chosen": -48.51924133300781, "logps/rejected": -83.20742797851562, "loss": 0.1999, "losses/dpo": 0.4864300787448883, "losses/sft": 0.721997857093811, "losses/total": 0.4864300787448883, "ref_logps/chosen": -33.81163787841797, "ref_logps/rejected": -43.07428741455078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.470760703086853, "rewards/margins": 2.542553186416626, "rewards/rejected": -4.013314247131348, "step": 1364 }, { "epoch": 1.29, "grad_norm": 14.377720488607618, "learning_rate": 3.1863184942839387e-07, "logps/chosen": -27.32819366455078, "logps/rejected": -50.76601028442383, "loss": 0.2548, "losses/dpo": 0.28711074590682983, "losses/sft": 1.7941559553146362, "losses/total": 0.28711074590682983, "ref_logps/chosen": -16.233478546142578, "ref_logps/rejected": -24.383132934570312, "rewards/accuracies": 1.0, "rewards/chosen": -1.1094715595245361, "rewards/margins": 1.5288165807724, "rewards/rejected": -2.6382880210876465, "step": 1365 }, { "epoch": 1.29, "grad_norm": 14.010861157162859, "learning_rate": 3.1838692974999003e-07, "logps/chosen": -37.334999084472656, "logps/rejected": -77.51573181152344, "loss": 0.2042, "losses/dpo": 0.17218713462352753, "losses/sft": 1.7482929229736328, "losses/total": 0.17218713462352753, "ref_logps/chosen": -24.96125602722168, "ref_logps/rejected": -38.69319152832031, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2373745441436768, "rewards/margins": 2.6448793411254883, "rewards/rejected": -3.882253646850586, "step": 1366 }, { "epoch": 1.29, "grad_norm": 20.088304990550668, "learning_rate": 3.181419391066077e-07, "logps/chosen": -60.35954284667969, "logps/rejected": -71.63055419921875, "loss": 0.2948, "losses/dpo": 0.00843609869480133, "losses/sft": 1.9930287599563599, "losses/total": 0.00843609869480133, "ref_logps/chosen": -41.48717498779297, "ref_logps/rejected": -33.311920166015625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.887237310409546, "rewards/margins": 1.9446256160736084, "rewards/rejected": -3.8318631649017334, "step": 1367 }, { "epoch": 1.29, "grad_norm": 23.266719496235726, "learning_rate": 3.178968777524732e-07, "logps/chosen": -40.71403503417969, "logps/rejected": -68.38905334472656, "loss": 0.3143, "losses/dpo": 0.48855525255203247, "losses/sft": 0.8222713470458984, "losses/total": 0.48855525255203247, "ref_logps/chosen": -25.958724975585938, "ref_logps/rejected": -36.854888916015625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4755314588546753, "rewards/margins": 1.6778850555419922, "rewards/rejected": -3.153416395187378, "step": 1368 }, { "epoch": 1.29, "grad_norm": 13.047223530446518, "learning_rate": 3.1765174594188615e-07, "logps/chosen": -40.18345642089844, "logps/rejected": -72.08241271972656, "loss": 0.1885, "losses/dpo": 0.45001670718193054, "losses/sft": 0.8514175415039062, "losses/total": 0.45001670718193054, "ref_logps/chosen": -29.096294403076172, "ref_logps/rejected": -38.93220520019531, "rewards/accuracies": 1.0, "rewards/chosen": -1.1087160110473633, "rewards/margins": 2.2063043117523193, "rewards/rejected": -3.3150205612182617, "step": 1369 }, { "epoch": 1.29, "grad_norm": 23.5477531213864, "learning_rate": 3.174065439292193e-07, "logps/chosen": -39.46501541137695, "logps/rejected": -50.162986755371094, "loss": 0.3687, "losses/dpo": 0.07804545760154724, "losses/sft": 1.0488115549087524, "losses/total": 0.07804545760154724, "ref_logps/chosen": -27.037099838256836, "ref_logps/rejected": -25.441333770751953, "rewards/accuracies": 0.875, "rewards/chosen": -1.2427911758422852, "rewards/margins": 1.2293744087219238, "rewards/rejected": -2.472165584564209, "step": 1370 }, { "epoch": 1.29, "grad_norm": 19.656025808793114, "learning_rate": 3.1716127196891847e-07, "logps/chosen": -49.680233001708984, "logps/rejected": -64.4170913696289, "loss": 0.2513, "losses/dpo": 0.49662089347839355, "losses/sft": 1.4750880002975464, "losses/total": 0.49662089347839355, "ref_logps/chosen": -33.09954071044922, "ref_logps/rejected": -31.058517456054688, "rewards/accuracies": 0.9375, "rewards/chosen": -1.658069372177124, "rewards/margins": 1.6777878999710083, "rewards/rejected": -3.335857391357422, "step": 1371 }, { "epoch": 1.29, "grad_norm": 12.718980643348077, "learning_rate": 3.169159303155017e-07, "logps/chosen": -34.31136703491211, "logps/rejected": -65.73953247070312, "loss": 0.2919, "losses/dpo": 0.0562637634575367, "losses/sft": 2.045600175857544, "losses/total": 0.0562637634575367, "ref_logps/chosen": -22.180950164794922, "ref_logps/rejected": -36.07685089111328, "rewards/accuracies": 0.875, "rewards/chosen": -1.2130416631698608, "rewards/margins": 1.7532267570495605, "rewards/rejected": -2.966268539428711, "step": 1372 }, { "epoch": 1.3, "grad_norm": 20.881406122769757, "learning_rate": 3.166705192235595e-07, "logps/chosen": -35.25584411621094, "logps/rejected": -51.54933547973633, "loss": 0.3797, "losses/dpo": 0.6470499634742737, "losses/sft": 2.1149256229400635, "losses/total": 0.6470499634742737, "ref_logps/chosen": -21.803436279296875, "ref_logps/rejected": -25.83841323852539, "rewards/accuracies": 0.875, "rewards/chosen": -1.345240831375122, "rewards/margins": 1.2258515357971191, "rewards/rejected": -2.571092367172241, "step": 1373 }, { "epoch": 1.3, "grad_norm": 18.735504245468974, "learning_rate": 3.164250389477545e-07, "logps/chosen": -55.38046646118164, "logps/rejected": -82.9821548461914, "loss": 0.2509, "losses/dpo": 0.8488867282867432, "losses/sft": 2.1723711490631104, "losses/total": 0.8488867282867432, "ref_logps/chosen": -39.15890884399414, "ref_logps/rejected": -45.93708038330078, "rewards/accuracies": 0.875, "rewards/chosen": -1.6221559047698975, "rewards/margins": 2.0823519229888916, "rewards/rejected": -3.704507827758789, "step": 1374 }, { "epoch": 1.3, "grad_norm": 25.156733315730406, "learning_rate": 3.161794897428213e-07, "logps/chosen": -47.73204803466797, "logps/rejected": -59.072059631347656, "loss": 0.5018, "losses/dpo": 0.17642253637313843, "losses/sft": 0.6446542739868164, "losses/total": 0.17642253637313843, "ref_logps/chosen": -28.811199188232422, "ref_logps/rejected": -28.10662078857422, "rewards/accuracies": 0.75, "rewards/chosen": -1.8920845985412598, "rewards/margins": 1.2044591903686523, "rewards/rejected": -3.096543788909912, "step": 1375 }, { "epoch": 1.3, "grad_norm": 15.51828730319073, "learning_rate": 3.1593387186356545e-07, "logps/chosen": -64.53388977050781, "logps/rejected": -86.65994262695312, "loss": 0.1899, "losses/dpo": 0.07716049253940582, "losses/sft": 1.4720079898834229, "losses/total": 0.07716049253940582, "ref_logps/chosen": -46.20811462402344, "ref_logps/rejected": -45.945045471191406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8325777053833008, "rewards/margins": 2.2389113903045654, "rewards/rejected": -4.071489334106445, "step": 1376 }, { "epoch": 1.3, "grad_norm": 11.146183353322492, "learning_rate": 3.1568818556486443e-07, "logps/chosen": -37.75949478149414, "logps/rejected": -78.257080078125, "loss": 0.1429, "losses/dpo": 0.11379950493574142, "losses/sft": 0.9732004404067993, "losses/total": 0.11379950493574142, "ref_logps/chosen": -27.60883140563965, "ref_logps/rejected": -43.689788818359375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0150662660598755, "rewards/margins": 2.441662311553955, "rewards/rejected": -3.456728458404541, "step": 1377 }, { "epoch": 1.3, "grad_norm": 20.738507826524017, "learning_rate": 3.1544243110166624e-07, "logps/chosen": -58.6762580871582, "logps/rejected": -62.27964401245117, "loss": 0.2886, "losses/dpo": 0.2144947052001953, "losses/sft": 0.9775461554527283, "losses/total": 0.2144947052001953, "ref_logps/chosen": -43.219417572021484, "ref_logps/rejected": -30.769149780273438, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5456838607788086, "rewards/margins": 1.6053653955459595, "rewards/rejected": -3.1510493755340576, "step": 1378 }, { "epoch": 1.3, "grad_norm": 22.027226674041515, "learning_rate": 3.1519660872898985e-07, "logps/chosen": -44.985042572021484, "logps/rejected": -51.313995361328125, "loss": 0.4138, "losses/dpo": 0.41517627239227295, "losses/sft": 0.7621329426765442, "losses/total": 0.41517627239227295, "ref_logps/chosen": -30.087188720703125, "ref_logps/rejected": -26.21489715576172, "rewards/accuracies": 0.875, "rewards/chosen": -1.4897855520248413, "rewards/margins": 1.0201241970062256, "rewards/rejected": -2.5099096298217773, "step": 1379 }, { "epoch": 1.3, "grad_norm": 16.55073163910898, "learning_rate": 3.149507187019246e-07, "logps/chosen": -36.96344757080078, "logps/rejected": -57.438873291015625, "loss": 0.2889, "losses/dpo": 0.33857062458992004, "losses/sft": 0.1767057627439499, "losses/total": 0.33857062458992004, "ref_logps/chosen": -25.429107666015625, "ref_logps/rejected": -26.369136810302734, "rewards/accuracies": 0.875, "rewards/chosen": -1.1534337997436523, "rewards/margins": 1.9535399675369263, "rewards/rejected": -3.1069741249084473, "step": 1380 }, { "epoch": 1.3, "grad_norm": 14.005389450089496, "learning_rate": 3.147047612756302e-07, "logps/chosen": -36.73299789428711, "logps/rejected": -66.06843566894531, "loss": 0.2225, "losses/dpo": 0.022921288385987282, "losses/sft": 1.0734434127807617, "losses/total": 0.022921288385987282, "ref_logps/chosen": -24.02361297607422, "ref_logps/rejected": -33.51122283935547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2709383964538574, "rewards/margins": 1.9847825765609741, "rewards/rejected": -3.255721092224121, "step": 1381 }, { "epoch": 1.3, "grad_norm": 14.547782693129502, "learning_rate": 3.1445873670533603e-07, "logps/chosen": -40.61274337768555, "logps/rejected": -65.20588684082031, "loss": 0.2539, "losses/dpo": 0.12797732651233673, "losses/sft": 0.9954688549041748, "losses/total": 0.12797732651233673, "ref_logps/chosen": -26.49566650390625, "ref_logps/rejected": -32.812103271484375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.411708116531372, "rewards/margins": 1.827670693397522, "rewards/rejected": -3.2393784523010254, "step": 1382 }, { "epoch": 1.3, "grad_norm": 14.41351301995366, "learning_rate": 3.142126452463413e-07, "logps/chosen": -41.54463195800781, "logps/rejected": -86.25028991699219, "loss": 0.204, "losses/dpo": 0.21245069801807404, "losses/sft": 1.0795797109603882, "losses/total": 0.21245069801807404, "ref_logps/chosen": -27.105928421020508, "ref_logps/rejected": -51.33068084716797, "rewards/accuracies": 1.0, "rewards/chosen": -1.4438704252243042, "rewards/margins": 2.0480902194976807, "rewards/rejected": -3.4919607639312744, "step": 1383 }, { "epoch": 1.31, "grad_norm": 18.496170304813727, "learning_rate": 3.139664871540147e-07, "logps/chosen": -43.562870025634766, "logps/rejected": -58.217464447021484, "loss": 0.2949, "losses/dpo": 0.019411930814385414, "losses/sft": 1.900407314300537, "losses/total": 0.019411930814385414, "ref_logps/chosen": -29.682071685791016, "ref_logps/rejected": -29.82047462463379, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3880798816680908, "rewards/margins": 1.4516191482543945, "rewards/rejected": -2.8396990299224854, "step": 1384 }, { "epoch": 1.31, "grad_norm": 17.616138976986985, "learning_rate": 3.137202626837939e-07, "logps/chosen": -50.840274810791016, "logps/rejected": -73.70732116699219, "loss": 0.2802, "losses/dpo": 1.513790249824524, "losses/sft": 1.9228363037109375, "losses/total": 1.513790249824524, "ref_logps/chosen": -36.26213073730469, "ref_logps/rejected": -39.47850799560547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4578145742416382, "rewards/margins": 1.9650670289993286, "rewards/rejected": -3.422881603240967, "step": 1385 }, { "epoch": 1.31, "grad_norm": 16.784062857950676, "learning_rate": 3.1347397209118555e-07, "logps/chosen": -38.83984375, "logps/rejected": -58.24061965942383, "loss": 0.275, "losses/dpo": 0.031245630234479904, "losses/sft": 1.3329898118972778, "losses/total": 0.031245630234479904, "ref_logps/chosen": -25.218080520629883, "ref_logps/rejected": -27.98609161376953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3621764183044434, "rewards/margins": 1.6632764339447021, "rewards/rejected": -3.0254528522491455, "step": 1386 }, { "epoch": 1.31, "grad_norm": 12.883369252616209, "learning_rate": 3.132276156317649e-07, "logps/chosen": -41.62300109863281, "logps/rejected": -60.34943771362305, "loss": 0.1892, "losses/dpo": 0.025257008150219917, "losses/sft": 0.9741492867469788, "losses/total": 0.025257008150219917, "ref_logps/chosen": -29.5534610748291, "ref_logps/rejected": -27.790912628173828, "rewards/accuracies": 1.0, "rewards/chosen": -1.2069542407989502, "rewards/margins": 2.048898220062256, "rewards/rejected": -3.255852222442627, "step": 1387 }, { "epoch": 1.31, "grad_norm": 15.259512111190766, "learning_rate": 3.129811935611757e-07, "logps/chosen": -39.444236755371094, "logps/rejected": -59.69005584716797, "loss": 0.2461, "losses/dpo": 0.27259424328804016, "losses/sft": 1.413808822631836, "losses/total": 0.27259424328804016, "ref_logps/chosen": -30.054428100585938, "ref_logps/rejected": -30.112937927246094, "rewards/accuracies": 0.9375, "rewards/chosen": -0.938980758190155, "rewards/margins": 2.0187315940856934, "rewards/rejected": -2.957712173461914, "step": 1388 }, { "epoch": 1.31, "grad_norm": 19.522561802986093, "learning_rate": 3.1273470613512944e-07, "logps/chosen": -43.838706970214844, "logps/rejected": -61.99713134765625, "loss": 0.3014, "losses/dpo": 0.4192206859588623, "losses/sft": 0.7934176921844482, "losses/total": 0.4192206859588623, "ref_logps/chosen": -29.05087661743164, "ref_logps/rejected": -28.246549606323242, "rewards/accuracies": 0.875, "rewards/chosen": -1.4787828922271729, "rewards/margins": 1.8962750434875488, "rewards/rejected": -3.3750579357147217, "step": 1389 }, { "epoch": 1.31, "grad_norm": 22.02601087856001, "learning_rate": 3.124881536094057e-07, "logps/chosen": -53.613433837890625, "logps/rejected": -61.42377471923828, "loss": 0.2593, "losses/dpo": 0.3724302053451538, "losses/sft": 1.218389868736267, "losses/total": 0.3724302053451538, "ref_logps/chosen": -41.250282287597656, "ref_logps/rejected": -33.55189514160156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2363152503967285, "rewards/margins": 1.5508726835250854, "rewards/rejected": -2.7871880531311035, "step": 1390 }, { "epoch": 1.31, "grad_norm": 16.339031764139776, "learning_rate": 3.1224153623985147e-07, "logps/chosen": -28.52994728088379, "logps/rejected": -46.93132781982422, "loss": 0.3292, "losses/dpo": 0.19260352849960327, "losses/sft": 0.09492281079292297, "losses/total": 0.19260352849960327, "ref_logps/chosen": -20.058406829833984, "ref_logps/rejected": -24.62704086303711, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8471540808677673, "rewards/margins": 1.383274793624878, "rewards/rejected": -2.230428695678711, "step": 1391 }, { "epoch": 1.31, "grad_norm": 14.417574079330798, "learning_rate": 3.1199485428238115e-07, "logps/chosen": -35.38548278808594, "logps/rejected": -69.1876220703125, "loss": 0.2493, "losses/dpo": 0.9543416500091553, "losses/sft": 0.7392401695251465, "losses/total": 0.9543416500091553, "ref_logps/chosen": -25.691715240478516, "ref_logps/rejected": -36.88298797607422, "rewards/accuracies": 0.875, "rewards/chosen": -0.969376802444458, "rewards/margins": 2.261086940765381, "rewards/rejected": -3.230463743209839, "step": 1392 }, { "epoch": 1.31, "grad_norm": 10.295029683495663, "learning_rate": 3.117481079929761e-07, "logps/chosen": -51.039466857910156, "logps/rejected": -79.59796142578125, "loss": 0.1643, "losses/dpo": 0.06199564039707184, "losses/sft": 2.1624906063079834, "losses/total": 0.06199564039707184, "ref_logps/chosen": -37.40200424194336, "ref_logps/rejected": -38.09355163574219, "rewards/accuracies": 1.0, "rewards/chosen": -1.3637460470199585, "rewards/margins": 2.7866954803466797, "rewards/rejected": -4.1504411697387695, "step": 1393 }, { "epoch": 1.32, "grad_norm": 16.828011662832367, "learning_rate": 3.115012976276845e-07, "logps/chosen": -53.56597137451172, "logps/rejected": -87.96880340576172, "loss": 0.2843, "losses/dpo": 0.0012601118069142103, "losses/sft": 1.4684534072875977, "losses/total": 0.0012601118069142103, "ref_logps/chosen": -36.31292724609375, "ref_logps/rejected": -47.43257141113281, "rewards/accuracies": 0.875, "rewards/chosen": -1.725304365158081, "rewards/margins": 2.3283188343048096, "rewards/rejected": -4.053623199462891, "step": 1394 }, { "epoch": 1.32, "grad_norm": 15.869607419163318, "learning_rate": 3.1125442344262087e-07, "logps/chosen": -33.77313995361328, "logps/rejected": -64.69073486328125, "loss": 0.2326, "losses/dpo": 0.05500578135251999, "losses/sft": 1.0315386056900024, "losses/total": 0.05500578135251999, "ref_logps/chosen": -23.597564697265625, "ref_logps/rejected": -34.359466552734375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0175575017929077, "rewards/margins": 2.0155699253082275, "rewards/rejected": -3.033127546310425, "step": 1395 }, { "epoch": 1.32, "grad_norm": 18.762488637913457, "learning_rate": 3.110074856939661e-07, "logps/chosen": -46.61671447753906, "logps/rejected": -65.85150146484375, "loss": 0.366, "losses/dpo": 0.7623270750045776, "losses/sft": 0.6854113936424255, "losses/total": 0.7623270750045776, "ref_logps/chosen": -30.256450653076172, "ref_logps/rejected": -35.3115348815918, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6360267400741577, "rewards/margins": 1.4179701805114746, "rewards/rejected": -3.0539968013763428, "step": 1396 }, { "epoch": 1.32, "grad_norm": 27.28451207117768, "learning_rate": 3.107604846379671e-07, "logps/chosen": -55.921382904052734, "logps/rejected": -84.86720275878906, "loss": 0.3479, "losses/dpo": 0.16975104808807373, "losses/sft": 1.0976189374923706, "losses/total": 0.16975104808807373, "ref_logps/chosen": -34.24074172973633, "ref_logps/rejected": -44.73518371582031, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1680641174316406, "rewards/margins": 1.8451370000839233, "rewards/rejected": -4.0132012367248535, "step": 1397 }, { "epoch": 1.32, "grad_norm": 22.994327347787834, "learning_rate": 3.105134205309361e-07, "logps/chosen": -50.354896545410156, "logps/rejected": -74.078857421875, "loss": 0.2725, "losses/dpo": 0.06360163539648056, "losses/sft": 0.2799937129020691, "losses/total": 0.06360163539648056, "ref_logps/chosen": -35.31947326660156, "ref_logps/rejected": -38.864418029785156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5035423040390015, "rewards/margins": 2.017902135848999, "rewards/rejected": -3.521444082260132, "step": 1398 }, { "epoch": 1.32, "grad_norm": 11.607307335389155, "learning_rate": 3.102662936292513e-07, "logps/chosen": -32.136695861816406, "logps/rejected": -59.29497146606445, "loss": 0.2138, "losses/dpo": 0.0875086560845375, "losses/sft": 2.293851852416992, "losses/total": 0.0875086560845375, "ref_logps/chosen": -21.651092529296875, "ref_logps/rejected": -29.652145385742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.0485607385635376, "rewards/margins": 1.9157218933105469, "rewards/rejected": -2.964282512664795, "step": 1399 }, { "epoch": 1.32, "grad_norm": 18.490498204340614, "learning_rate": 3.1001910418935554e-07, "logps/chosen": -39.65591049194336, "logps/rejected": -57.00837707519531, "loss": 0.3065, "losses/dpo": 0.3364455997943878, "losses/sft": 1.5451871156692505, "losses/total": 0.3364455997943878, "ref_logps/chosen": -27.037635803222656, "ref_logps/rejected": -27.542686462402344, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2618277072906494, "rewards/margins": 1.6847419738769531, "rewards/rejected": -2.9465694427490234, "step": 1400 }, { "epoch": 1.32, "grad_norm": 17.49755321577486, "learning_rate": 3.0977185246775705e-07, "logps/chosen": -41.952781677246094, "logps/rejected": -66.49818420410156, "loss": 0.3043, "losses/dpo": 0.05482149124145508, "losses/sft": 2.0683367252349854, "losses/total": 0.05482149124145508, "ref_logps/chosen": -26.273517608642578, "ref_logps/rejected": -32.13227081298828, "rewards/accuracies": 0.875, "rewards/chosen": -1.5679266452789307, "rewards/margins": 1.8686643838882446, "rewards/rejected": -3.4365909099578857, "step": 1401 }, { "epoch": 1.32, "grad_norm": 26.10177655528871, "learning_rate": 3.0952453872102825e-07, "logps/chosen": -46.199378967285156, "logps/rejected": -45.91789245605469, "loss": 0.4911, "losses/dpo": 0.31415560841560364, "losses/sft": 0.41115888953208923, "losses/total": 0.31415560841560364, "ref_logps/chosen": -32.28415298461914, "ref_logps/rejected": -21.885190963745117, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3915224075317383, "rewards/margins": 1.0117480754852295, "rewards/rejected": -2.4032704830169678, "step": 1402 }, { "epoch": 1.32, "grad_norm": 15.96710378211295, "learning_rate": 3.092771632058061e-07, "logps/chosen": -48.36985397338867, "logps/rejected": -78.32643127441406, "loss": 0.2358, "losses/dpo": 0.04179888963699341, "losses/sft": 1.4908018112182617, "losses/total": 0.04179888963699341, "ref_logps/chosen": -33.834590911865234, "ref_logps/rejected": -44.69692611694336, "rewards/accuracies": 0.875, "rewards/chosen": -1.4535266160964966, "rewards/margins": 1.909423828125, "rewards/rejected": -3.362950563430786, "step": 1403 }, { "epoch": 1.32, "grad_norm": 30.88792364071171, "learning_rate": 3.0902972617879184e-07, "logps/chosen": -41.960227966308594, "logps/rejected": -67.7916030883789, "loss": 0.4006, "losses/dpo": 0.11576896905899048, "losses/sft": 0.513933539390564, "losses/total": 0.11576896905899048, "ref_logps/chosen": -31.009445190429688, "ref_logps/rejected": -38.74067687988281, "rewards/accuracies": 0.8125, "rewards/chosen": -1.095078468322754, "rewards/margins": 1.8100143671035767, "rewards/rejected": -2.905092716217041, "step": 1404 }, { "epoch": 1.33, "grad_norm": 15.166379507238597, "learning_rate": 3.087822278967503e-07, "logps/chosen": -41.77265930175781, "logps/rejected": -68.5725326538086, "loss": 0.2483, "losses/dpo": 0.6737079620361328, "losses/sft": 1.3419865369796753, "losses/total": 0.6737079620361328, "ref_logps/chosen": -28.813377380371094, "ref_logps/rejected": -36.44935607910156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2959284782409668, "rewards/margins": 1.9163892269134521, "rewards/rejected": -3.21231746673584, "step": 1405 }, { "epoch": 1.33, "grad_norm": 16.9287394964917, "learning_rate": 3.085346686165099e-07, "logps/chosen": -37.65862274169922, "logps/rejected": -61.56184005737305, "loss": 0.3429, "losses/dpo": 0.0676048994064331, "losses/sft": 1.0760717391967773, "losses/total": 0.0676048994064331, "ref_logps/chosen": -26.596153259277344, "ref_logps/rejected": -31.68478775024414, "rewards/accuracies": 0.9375, "rewards/chosen": -1.106247067451477, "rewards/margins": 1.8814581632614136, "rewards/rejected": -2.9877052307128906, "step": 1406 }, { "epoch": 1.33, "grad_norm": 17.122364980334595, "learning_rate": 3.0828704859496246e-07, "logps/chosen": -49.540313720703125, "logps/rejected": -74.84905242919922, "loss": 0.2273, "losses/dpo": 0.05435238033533096, "losses/sft": 0.9692513942718506, "losses/total": 0.05435238033533096, "ref_logps/chosen": -31.80430030822754, "ref_logps/rejected": -36.619956970214844, "rewards/accuracies": 1.0, "rewards/chosen": -1.7736012935638428, "rewards/margins": 2.0493083000183105, "rewards/rejected": -3.8229098320007324, "step": 1407 }, { "epoch": 1.33, "grad_norm": 17.46424142764141, "learning_rate": 3.0803936808906274e-07, "logps/chosen": -41.30474853515625, "logps/rejected": -61.31260299682617, "loss": 0.2658, "losses/dpo": 0.046600744128227234, "losses/sft": 1.1723265647888184, "losses/total": 0.046600744128227234, "ref_logps/chosen": -27.957374572753906, "ref_logps/rejected": -29.93868637084961, "rewards/accuracies": 0.875, "rewards/chosen": -1.334737777709961, "rewards/margins": 1.8026540279388428, "rewards/rejected": -3.1373918056488037, "step": 1408 }, { "epoch": 1.33, "grad_norm": 9.000508084414319, "learning_rate": 3.0779162735582833e-07, "logps/chosen": -34.019142150878906, "logps/rejected": -70.99647521972656, "loss": 0.1602, "losses/dpo": 0.015147344209253788, "losses/sft": 1.0689213275909424, "losses/total": 0.015147344209253788, "ref_logps/chosen": -23.742855072021484, "ref_logps/rejected": -36.57503128051758, "rewards/accuracies": 1.0, "rewards/chosen": -1.027628779411316, "rewards/margins": 2.414515495300293, "rewards/rejected": -3.4421439170837402, "step": 1409 }, { "epoch": 1.33, "grad_norm": 22.948517357114177, "learning_rate": 3.0754382665233916e-07, "logps/chosen": -25.570220947265625, "logps/rejected": -40.958370208740234, "loss": 0.4671, "losses/dpo": 0.023334253579378128, "losses/sft": 0.7527511715888977, "losses/total": 0.023334253579378128, "ref_logps/chosen": -17.47615623474121, "ref_logps/rejected": -21.068096160888672, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8094065189361572, "rewards/margins": 1.1796208620071411, "rewards/rejected": -1.9890273809432983, "step": 1410 }, { "epoch": 1.33, "grad_norm": 20.101109047861705, "learning_rate": 3.072959662357377e-07, "logps/chosen": -47.22430419921875, "logps/rejected": -70.33983612060547, "loss": 0.3049, "losses/dpo": 0.2515951097011566, "losses/sft": 2.0519003868103027, "losses/total": 0.2515951097011566, "ref_logps/chosen": -32.38325500488281, "ref_logps/rejected": -34.746002197265625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4841053485870361, "rewards/margins": 2.0752780437469482, "rewards/rejected": -3.5593833923339844, "step": 1411 }, { "epoch": 1.33, "grad_norm": 17.911017245324427, "learning_rate": 3.070480463632281e-07, "logps/chosen": -56.77378463745117, "logps/rejected": -75.05183410644531, "loss": 0.2258, "losses/dpo": 0.021033428609371185, "losses/sft": 0.3239271342754364, "losses/total": 0.021033428609371185, "ref_logps/chosen": -42.29442596435547, "ref_logps/rejected": -37.46259307861328, "rewards/accuracies": 0.875, "rewards/chosen": -1.4479355812072754, "rewards/margins": 2.310988426208496, "rewards/rejected": -3.7589240074157715, "step": 1412 }, { "epoch": 1.33, "grad_norm": 10.884318683672445, "learning_rate": 3.0680006729207634e-07, "logps/chosen": -33.29119110107422, "logps/rejected": -67.68447875976562, "loss": 0.2029, "losses/dpo": 0.10050446540117264, "losses/sft": 1.5251775979995728, "losses/total": 0.10050446540117264, "ref_logps/chosen": -23.49117660522461, "ref_logps/rejected": -33.054344177246094, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9800010919570923, "rewards/margins": 2.4830126762390137, "rewards/rejected": -3.4630136489868164, "step": 1413 }, { "epoch": 1.33, "grad_norm": 14.719008720401384, "learning_rate": 3.065520292796098e-07, "logps/chosen": -46.31703567504883, "logps/rejected": -87.81803894042969, "loss": 0.2086, "losses/dpo": 0.33490222692489624, "losses/sft": 1.4915435314178467, "losses/total": 0.33490222692489624, "ref_logps/chosen": -31.860668182373047, "ref_logps/rejected": -50.284820556640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4456369876861572, "rewards/margins": 2.3076844215393066, "rewards/rejected": -3.753321647644043, "step": 1414 }, { "epoch": 1.33, "grad_norm": 21.52553940133467, "learning_rate": 3.06303932583217e-07, "logps/chosen": -41.78518295288086, "logps/rejected": -56.071983337402344, "loss": 0.4845, "losses/dpo": 0.04287974163889885, "losses/sft": 1.4666032791137695, "losses/total": 0.04287974163889885, "ref_logps/chosen": -29.02096176147461, "ref_logps/rejected": -29.70186996459961, "rewards/accuracies": 0.8125, "rewards/chosen": -1.276422381401062, "rewards/margins": 1.3605891466140747, "rewards/rejected": -2.6370115280151367, "step": 1415 }, { "epoch": 1.34, "grad_norm": 14.913151478051288, "learning_rate": 3.0605577746034743e-07, "logps/chosen": -46.14606475830078, "logps/rejected": -79.3421401977539, "loss": 0.205, "losses/dpo": 1.0630391836166382, "losses/sft": 0.8568871021270752, "losses/total": 1.0630391836166382, "ref_logps/chosen": -31.086854934692383, "ref_logps/rejected": -42.21414566040039, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5059210062026978, "rewards/margins": 2.206878662109375, "rewards/rejected": -3.712799549102783, "step": 1416 }, { "epoch": 1.34, "grad_norm": 27.292329842419903, "learning_rate": 3.0580756416851113e-07, "logps/chosen": -44.78272247314453, "logps/rejected": -57.89014434814453, "loss": 0.5776, "losses/dpo": 0.031285084784030914, "losses/sft": 1.2270759344100952, "losses/total": 0.031285084784030914, "ref_logps/chosen": -26.72174835205078, "ref_logps/rejected": -29.020065307617188, "rewards/accuracies": 0.75, "rewards/chosen": -1.806097388267517, "rewards/margins": 1.0809104442596436, "rewards/rejected": -2.88700795173645, "step": 1417 }, { "epoch": 1.34, "grad_norm": 19.6818547774034, "learning_rate": 3.055592929652785e-07, "logps/chosen": -51.69029998779297, "logps/rejected": -74.67167663574219, "loss": 0.2852, "losses/dpo": 0.10434439778327942, "losses/sft": 1.6734610795974731, "losses/total": 0.10434439778327942, "ref_logps/chosen": -37.019065856933594, "ref_logps/rejected": -41.56062316894531, "rewards/accuracies": 0.9375, "rewards/chosen": -1.46712327003479, "rewards/margins": 1.8439818620681763, "rewards/rejected": -3.311105251312256, "step": 1418 }, { "epoch": 1.34, "grad_norm": 12.87768212579387, "learning_rate": 3.053109641082801e-07, "logps/chosen": -28.224382400512695, "logps/rejected": -58.55480194091797, "loss": 0.2019, "losses/dpo": 0.6830101609230042, "losses/sft": 0.23940163850784302, "losses/total": 0.6830101609230042, "ref_logps/chosen": -18.852127075195312, "ref_logps/rejected": -28.9815673828125, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9372255206108093, "rewards/margins": 2.0200984477996826, "rewards/rejected": -2.9573240280151367, "step": 1419 }, { "epoch": 1.34, "grad_norm": 16.263633420706682, "learning_rate": 3.050625778552062e-07, "logps/chosen": -40.60322570800781, "logps/rejected": -64.99244689941406, "loss": 0.2992, "losses/dpo": 0.8322396278381348, "losses/sft": 1.7243766784667969, "losses/total": 0.8322396278381348, "ref_logps/chosen": -24.9498348236084, "ref_logps/rejected": -31.678945541381836, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5653387308120728, "rewards/margins": 1.7660112380981445, "rewards/rejected": -3.331350088119507, "step": 1420 }, { "epoch": 1.34, "grad_norm": 25.06884981161078, "learning_rate": 3.0481413446380687e-07, "logps/chosen": -47.437232971191406, "logps/rejected": -61.722293853759766, "loss": 0.4, "losses/dpo": 0.28550541400909424, "losses/sft": 0.6486669182777405, "losses/total": 0.28550541400909424, "ref_logps/chosen": -30.1005916595459, "ref_logps/rejected": -32.188682556152344, "rewards/accuracies": 0.8125, "rewards/chosen": -1.73366379737854, "rewards/margins": 1.2196977138519287, "rewards/rejected": -2.9533615112304688, "step": 1421 }, { "epoch": 1.34, "grad_norm": 20.250520458384408, "learning_rate": 3.045656341918911e-07, "logps/chosen": -36.643497467041016, "logps/rejected": -49.31394958496094, "loss": 0.4049, "losses/dpo": 0.5186187624931335, "losses/sft": 1.0549520254135132, "losses/total": 0.5186187624931335, "ref_logps/chosen": -24.245473861694336, "ref_logps/rejected": -23.575559616088867, "rewards/accuracies": 0.8125, "rewards/chosen": -1.239802360534668, "rewards/margins": 1.3340365886688232, "rewards/rejected": -2.573838949203491, "step": 1422 }, { "epoch": 1.34, "grad_norm": 17.75983972603738, "learning_rate": 3.0431707729732736e-07, "logps/chosen": -38.185035705566406, "logps/rejected": -57.22908020019531, "loss": 0.3465, "losses/dpo": 0.1357872486114502, "losses/sft": 0.9193980097770691, "losses/total": 0.1357872486114502, "ref_logps/chosen": -23.940940856933594, "ref_logps/rejected": -27.931053161621094, "rewards/accuracies": 0.875, "rewards/chosen": -1.4244097471237183, "rewards/margins": 1.505393385887146, "rewards/rejected": -2.929802894592285, "step": 1423 }, { "epoch": 1.34, "grad_norm": 9.873471273128379, "learning_rate": 3.0406846403804256e-07, "logps/chosen": -40.76422119140625, "logps/rejected": -70.91935729980469, "loss": 0.1757, "losses/dpo": 0.22056858241558075, "losses/sft": 0.7822636961936951, "losses/total": 0.22056858241558075, "ref_logps/chosen": -28.01970863342285, "ref_logps/rejected": -36.072147369384766, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2744508981704712, "rewards/margins": 2.2102694511413574, "rewards/rejected": -3.484720468521118, "step": 1424 }, { "epoch": 1.34, "grad_norm": 22.453838082287106, "learning_rate": 3.0381979467202204e-07, "logps/chosen": -56.15535354614258, "logps/rejected": -79.8262710571289, "loss": 0.366, "losses/dpo": 0.064254529774189, "losses/sft": 2.253131866455078, "losses/total": 0.064254529774189, "ref_logps/chosen": -37.82724380493164, "ref_logps/rejected": -44.056339263916016, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8328111171722412, "rewards/margins": 1.7441822290420532, "rewards/rejected": -3.576993227005005, "step": 1425 }, { "epoch": 1.35, "grad_norm": 13.669809095450503, "learning_rate": 3.0357106945730967e-07, "logps/chosen": -46.508785247802734, "logps/rejected": -77.78749084472656, "loss": 0.1852, "losses/dpo": 0.4811297059059143, "losses/sft": 2.0597198009490967, "losses/total": 0.4811297059059143, "ref_logps/chosen": -31.942691802978516, "ref_logps/rejected": -38.777523040771484, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4566093683242798, "rewards/margins": 2.444387674331665, "rewards/rejected": -3.9009971618652344, "step": 1426 }, { "epoch": 1.35, "grad_norm": 9.616253979219994, "learning_rate": 3.03322288652007e-07, "logps/chosen": -35.215457916259766, "logps/rejected": -55.13160705566406, "loss": 0.1851, "losses/dpo": 0.2037929892539978, "losses/sft": 0.1460641771554947, "losses/total": 0.2037929892539978, "ref_logps/chosen": -25.066864013671875, "ref_logps/rejected": -25.196407318115234, "rewards/accuracies": 1.0, "rewards/chosen": -1.0148591995239258, "rewards/margins": 1.9786607027053833, "rewards/rejected": -2.9935197830200195, "step": 1427 }, { "epoch": 1.35, "grad_norm": 17.27966022731949, "learning_rate": 3.030734525142734e-07, "logps/chosen": -44.618743896484375, "logps/rejected": -72.2547836303711, "loss": 0.2405, "losses/dpo": 0.5735760927200317, "losses/sft": 0.4119182527065277, "losses/total": 0.5735760927200317, "ref_logps/chosen": -32.32391357421875, "ref_logps/rejected": -38.12977600097656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2294831275939941, "rewards/margins": 2.1830177307128906, "rewards/rejected": -3.4125008583068848, "step": 1428 }, { "epoch": 1.35, "grad_norm": 15.603010971167901, "learning_rate": 3.028245613023256e-07, "logps/chosen": -37.3242073059082, "logps/rejected": -65.66627502441406, "loss": 0.3092, "losses/dpo": 0.7389892339706421, "losses/sft": 0.9806507229804993, "losses/total": 0.7389892339706421, "ref_logps/chosen": -24.354408264160156, "ref_logps/rejected": -34.359771728515625, "rewards/accuracies": 0.75, "rewards/chosen": -1.2969800233840942, "rewards/margins": 1.8336701393127441, "rewards/rejected": -3.130650043487549, "step": 1429 }, { "epoch": 1.35, "grad_norm": 38.43562079890046, "learning_rate": 3.0257561527443754e-07, "logps/chosen": -47.58129119873047, "logps/rejected": -68.39112854003906, "loss": 0.5474, "losses/dpo": 0.008123328909277916, "losses/sft": 1.7599931955337524, "losses/total": 0.008123328909277916, "ref_logps/chosen": -30.78308868408203, "ref_logps/rejected": -34.629886627197266, "rewards/accuracies": 0.875, "rewards/chosen": -1.6798206567764282, "rewards/margins": 1.6963038444519043, "rewards/rejected": -3.376124382019043, "step": 1430 }, { "epoch": 1.35, "grad_norm": 24.217320899644758, "learning_rate": 3.0232661468893995e-07, "logps/chosen": -41.09233093261719, "logps/rejected": -69.59478759765625, "loss": 0.4019, "losses/dpo": 0.007654478307813406, "losses/sft": 0.332536518573761, "losses/total": 0.007654478307813406, "ref_logps/chosen": -25.90593719482422, "ref_logps/rejected": -34.614051818847656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.518639087677002, "rewards/margins": 1.9794347286224365, "rewards/rejected": -3.4980740547180176, "step": 1431 }, { "epoch": 1.35, "grad_norm": 16.838629668842366, "learning_rate": 3.0207755980422036e-07, "logps/chosen": -44.336143493652344, "logps/rejected": -68.55915832519531, "loss": 0.2553, "losses/dpo": 0.05405145138502121, "losses/sft": 0.4604761004447937, "losses/total": 0.05405145138502121, "ref_logps/chosen": -30.6013240814209, "ref_logps/rejected": -33.7476692199707, "rewards/accuracies": 0.875, "rewards/chosen": -1.3734819889068604, "rewards/margins": 2.1076667308807373, "rewards/rejected": -3.4811487197875977, "step": 1432 }, { "epoch": 1.35, "grad_norm": 21.553386905221384, "learning_rate": 3.018284508787224e-07, "logps/chosen": -43.48333740234375, "logps/rejected": -73.49456024169922, "loss": 0.3449, "losses/dpo": 0.8298816680908203, "losses/sft": 1.4330990314483643, "losses/total": 0.8298816680908203, "ref_logps/chosen": -25.39370346069336, "ref_logps/rejected": -38.354270935058594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8089635372161865, "rewards/margins": 1.7050654888153076, "rewards/rejected": -3.514029026031494, "step": 1433 }, { "epoch": 1.35, "grad_norm": 21.743309871139893, "learning_rate": 3.015792881709459e-07, "logps/chosen": -42.24188995361328, "logps/rejected": -53.59918975830078, "loss": 0.4106, "losses/dpo": 0.25340214371681213, "losses/sft": 0.15852412581443787, "losses/total": 0.25340214371681213, "ref_logps/chosen": -25.47177505493164, "ref_logps/rejected": -25.131362915039062, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6770116090774536, "rewards/margins": 1.1697711944580078, "rewards/rejected": -2.846782684326172, "step": 1434 }, { "epoch": 1.35, "grad_norm": 13.3652690838592, "learning_rate": 3.013300719394466e-07, "logps/chosen": -36.2518310546875, "logps/rejected": -66.32289123535156, "loss": 0.2316, "losses/dpo": 1.658157467842102, "losses/sft": 1.6939048767089844, "losses/total": 1.658157467842102, "ref_logps/chosen": -24.382644653320312, "ref_logps/rejected": -33.32195281982422, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1869187355041504, "rewards/margins": 2.113175630569458, "rewards/rejected": -3.3000943660736084, "step": 1435 }, { "epoch": 1.35, "grad_norm": 26.340685243149874, "learning_rate": 3.010808024428356e-07, "logps/chosen": -40.367271423339844, "logps/rejected": -77.90351867675781, "loss": 0.2887, "losses/dpo": 0.04319733381271362, "losses/sft": 1.5139224529266357, "losses/total": 0.04319733381271362, "ref_logps/chosen": -23.87700080871582, "ref_logps/rejected": -38.513038635253906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6490272283554077, "rewards/margins": 2.29002046585083, "rewards/rejected": -3.9390475749969482, "step": 1436 }, { "epoch": 1.36, "grad_norm": 15.6765881403095, "learning_rate": 3.008314799397793e-07, "logps/chosen": -47.96514892578125, "logps/rejected": -66.68744659423828, "loss": 0.2573, "losses/dpo": 0.12287583202123642, "losses/sft": 1.1460134983062744, "losses/total": 0.12287583202123642, "ref_logps/chosen": -36.31563949584961, "ref_logps/rejected": -31.305810928344727, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1649506092071533, "rewards/margins": 2.3732128143310547, "rewards/rejected": -3.538163661956787, "step": 1437 }, { "epoch": 1.36, "grad_norm": 10.454472234093462, "learning_rate": 3.0058210468899933e-07, "logps/chosen": -32.65086364746094, "logps/rejected": -77.60858154296875, "loss": 0.1475, "losses/dpo": 0.051829878240823746, "losses/sft": 1.464661717414856, "losses/total": 0.051829878240823746, "ref_logps/chosen": -20.531330108642578, "ref_logps/rejected": -39.72455596923828, "rewards/accuracies": 1.0, "rewards/chosen": -1.2119536399841309, "rewards/margins": 2.576448917388916, "rewards/rejected": -3.788402795791626, "step": 1438 }, { "epoch": 1.36, "grad_norm": 19.080366573775635, "learning_rate": 3.0033267694927163e-07, "logps/chosen": -41.764259338378906, "logps/rejected": -62.72797393798828, "loss": 0.3311, "losses/dpo": 0.21394817531108856, "losses/sft": 1.96067214012146, "losses/total": 0.21394817531108856, "ref_logps/chosen": -27.60601806640625, "ref_logps/rejected": -33.34519958496094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.415824294090271, "rewards/margins": 1.5224528312683105, "rewards/rejected": -2.938277244567871, "step": 1439 }, { "epoch": 1.36, "grad_norm": 8.702545377682467, "learning_rate": 3.000831969794271e-07, "logps/chosen": -38.88002014160156, "logps/rejected": -74.43126678466797, "loss": 0.1554, "losses/dpo": 0.3757380247116089, "losses/sft": 1.1915669441223145, "losses/total": 0.3757380247116089, "ref_logps/chosen": -25.054325103759766, "ref_logps/rejected": -36.55120849609375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3825691938400269, "rewards/margins": 2.4054367542266846, "rewards/rejected": -3.78800630569458, "step": 1440 }, { "epoch": 1.36, "grad_norm": 10.67635549468961, "learning_rate": 2.9983366503835043e-07, "logps/chosen": -45.056880950927734, "logps/rejected": -76.99773406982422, "loss": 0.1929, "losses/dpo": 0.06900396943092346, "losses/sft": 2.3243253231048584, "losses/total": 0.06900396943092346, "ref_logps/chosen": -30.845775604248047, "ref_logps/rejected": -40.71488952636719, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4211108684539795, "rewards/margins": 2.2071733474731445, "rewards/rejected": -3.628283977508545, "step": 1441 }, { "epoch": 1.36, "grad_norm": 11.48306925693141, "learning_rate": 2.9958408138498057e-07, "logps/chosen": -47.72575378417969, "logps/rejected": -77.99559020996094, "loss": 0.1436, "losses/dpo": 0.03755783662199974, "losses/sft": 1.4729794263839722, "losses/total": 0.03755783662199974, "ref_logps/chosen": -30.187776565551758, "ref_logps/rejected": -34.915748596191406, "rewards/accuracies": 1.0, "rewards/chosen": -1.7537975311279297, "rewards/margins": 2.5541868209838867, "rewards/rejected": -4.307984352111816, "step": 1442 }, { "epoch": 1.36, "grad_norm": 15.484328722173283, "learning_rate": 2.9933444627830983e-07, "logps/chosen": -44.27562713623047, "logps/rejected": -68.8431396484375, "loss": 0.3034, "losses/dpo": 0.02692590467631817, "losses/sft": 0.5123031735420227, "losses/total": 0.02692590467631817, "ref_logps/chosen": -29.69750213623047, "ref_logps/rejected": -32.562686920166016, "rewards/accuracies": 0.875, "rewards/chosen": -1.457812786102295, "rewards/margins": 2.1702325344085693, "rewards/rejected": -3.6280455589294434, "step": 1443 }, { "epoch": 1.36, "grad_norm": 12.167334715672224, "learning_rate": 2.990847599773841e-07, "logps/chosen": -41.0130615234375, "logps/rejected": -69.667724609375, "loss": 0.175, "losses/dpo": 0.044620051980018616, "losses/sft": 0.643551766872406, "losses/total": 0.044620051980018616, "ref_logps/chosen": -27.140731811523438, "ref_logps/rejected": -33.486873626708984, "rewards/accuracies": 1.0, "rewards/chosen": -1.387233018875122, "rewards/margins": 2.230851650238037, "rewards/rejected": -3.6180849075317383, "step": 1444 }, { "epoch": 1.36, "grad_norm": 13.580451891725499, "learning_rate": 2.9883502274130247e-07, "logps/chosen": -36.72465515136719, "logps/rejected": -53.91063690185547, "loss": 0.3011, "losses/dpo": 0.6815512180328369, "losses/sft": 0.4006335437297821, "losses/total": 0.6815512180328369, "ref_logps/chosen": -25.183635711669922, "ref_logps/rejected": -25.61595916748047, "rewards/accuracies": 0.875, "rewards/chosen": -1.1541019678115845, "rewards/margins": 1.675365924835205, "rewards/rejected": -2.8294677734375, "step": 1445 }, { "epoch": 1.36, "grad_norm": 12.157626134594098, "learning_rate": 2.9858523482921656e-07, "logps/chosen": -49.077301025390625, "logps/rejected": -76.68414306640625, "loss": 0.1637, "losses/dpo": 0.24800290167331696, "losses/sft": 1.1493597030639648, "losses/total": 0.24800290167331696, "ref_logps/chosen": -31.108768463134766, "ref_logps/rejected": -37.292633056640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7968533039093018, "rewards/margins": 2.1422975063323975, "rewards/rejected": -3.939150810241699, "step": 1446 }, { "epoch": 1.37, "grad_norm": 16.10761146984006, "learning_rate": 2.9833539650033086e-07, "logps/chosen": -36.47691345214844, "logps/rejected": -65.06837463378906, "loss": 0.2487, "losses/dpo": 0.015907280147075653, "losses/sft": 1.801396369934082, "losses/total": 0.015907280147075653, "ref_logps/chosen": -21.308757781982422, "ref_logps/rejected": -32.70999526977539, "rewards/accuracies": 1.0, "rewards/chosen": -1.516815185546875, "rewards/margins": 1.7190229892730713, "rewards/rejected": -3.2358384132385254, "step": 1447 }, { "epoch": 1.37, "grad_norm": 16.84668393597248, "learning_rate": 2.980855080139021e-07, "logps/chosen": -39.89360809326172, "logps/rejected": -53.07173156738281, "loss": 0.357, "losses/dpo": 0.24004988372325897, "losses/sft": 0.769109845161438, "losses/total": 0.24004988372325897, "ref_logps/chosen": -26.59369659423828, "ref_logps/rejected": -25.47890281677246, "rewards/accuracies": 0.8125, "rewards/chosen": -1.329991102218628, "rewards/margins": 1.4292919635772705, "rewards/rejected": -2.7592830657958984, "step": 1448 }, { "epoch": 1.37, "grad_norm": 13.49342809284198, "learning_rate": 2.9783556962923906e-07, "logps/chosen": -39.7247428894043, "logps/rejected": -64.53295135498047, "loss": 0.2559, "losses/dpo": 0.7641324996948242, "losses/sft": 0.7763729691505432, "losses/total": 0.7641324996948242, "ref_logps/chosen": -27.6272029876709, "ref_logps/rejected": -33.94409942626953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2097541093826294, "rewards/margins": 1.8491311073303223, "rewards/rejected": -3.058885097503662, "step": 1449 }, { "epoch": 1.37, "grad_norm": 15.177890501850355, "learning_rate": 2.9758558160570237e-07, "logps/chosen": -39.257835388183594, "logps/rejected": -56.148963928222656, "loss": 0.2677, "losses/dpo": 0.22040356695652008, "losses/sft": 0.35040050745010376, "losses/total": 0.22040356695652008, "ref_logps/chosen": -26.866546630859375, "ref_logps/rejected": -28.436309814453125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2391290664672852, "rewards/margins": 1.5321362018585205, "rewards/rejected": -2.7712652683258057, "step": 1450 }, { "epoch": 1.37, "grad_norm": 14.306110503805368, "learning_rate": 2.9733554420270395e-07, "logps/chosen": -38.83451843261719, "logps/rejected": -67.75174713134766, "loss": 0.2055, "losses/dpo": 0.09323230385780334, "losses/sft": 1.461445927619934, "losses/total": 0.09323230385780334, "ref_logps/chosen": -27.112140655517578, "ref_logps/rejected": -33.676307678222656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1722378730773926, "rewards/margins": 2.2353062629699707, "rewards/rejected": -3.4075441360473633, "step": 1451 }, { "epoch": 1.37, "grad_norm": 25.71512021522747, "learning_rate": 2.9708545767970726e-07, "logps/chosen": -46.32401657104492, "logps/rejected": -74.04649353027344, "loss": 0.3624, "losses/dpo": 0.003507654182612896, "losses/sft": 1.0846240520477295, "losses/total": 0.003507654182612896, "ref_logps/chosen": -32.94955825805664, "ref_logps/rejected": -36.96346664428711, "rewards/accuracies": 0.875, "rewards/chosen": -1.3374457359313965, "rewards/margins": 2.3708572387695312, "rewards/rejected": -3.7083029747009277, "step": 1452 }, { "epoch": 1.37, "grad_norm": 21.69086118194796, "learning_rate": 2.968353222962265e-07, "logps/chosen": -43.533531188964844, "logps/rejected": -59.046791076660156, "loss": 0.3599, "losses/dpo": 0.04370349645614624, "losses/sft": 1.0122779607772827, "losses/total": 0.04370349645614624, "ref_logps/chosen": -29.663578033447266, "ref_logps/rejected": -30.229351043701172, "rewards/accuracies": 0.75, "rewards/chosen": -1.3869950771331787, "rewards/margins": 1.4947490692138672, "rewards/rejected": -2.881744384765625, "step": 1453 }, { "epoch": 1.37, "grad_norm": 21.398647459585433, "learning_rate": 2.965851383118266e-07, "logps/chosen": -61.35820770263672, "logps/rejected": -90.23042297363281, "loss": 0.2049, "losses/dpo": 0.04979977011680603, "losses/sft": 1.3459398746490479, "losses/total": 0.04979977011680603, "ref_logps/chosen": -42.82500457763672, "ref_logps/rejected": -48.30196762084961, "rewards/accuracies": 0.875, "rewards/chosen": -1.8533202409744263, "rewards/margins": 2.3395254611968994, "rewards/rejected": -4.192845344543457, "step": 1454 }, { "epoch": 1.37, "grad_norm": 17.707610439149143, "learning_rate": 2.96334905986123e-07, "logps/chosen": -29.607940673828125, "logps/rejected": -48.206573486328125, "loss": 0.3453, "losses/dpo": 0.37312161922454834, "losses/sft": 0.5129525661468506, "losses/total": 0.37312161922454834, "ref_logps/chosen": -19.936931610107422, "ref_logps/rejected": -25.01300621032715, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9671008586883545, "rewards/margins": 1.3522558212280273, "rewards/rejected": -2.319356679916382, "step": 1455 }, { "epoch": 1.37, "grad_norm": 23.088839394303836, "learning_rate": 2.9608462557878136e-07, "logps/chosen": -45.07745361328125, "logps/rejected": -81.71855926513672, "loss": 0.3171, "losses/dpo": 0.8255636692047119, "losses/sft": 1.4302542209625244, "losses/total": 0.8255636692047119, "ref_logps/chosen": -27.78646469116211, "ref_logps/rejected": -44.148380279541016, "rewards/accuracies": 0.875, "rewards/chosen": -1.7290986776351929, "rewards/margins": 2.0279195308685303, "rewards/rejected": -3.757018566131592, "step": 1456 }, { "epoch": 1.37, "grad_norm": 12.517545238857009, "learning_rate": 2.958342973495171e-07, "logps/chosen": -41.115440368652344, "logps/rejected": -91.32968139648438, "loss": 0.1495, "losses/dpo": 0.006359227932989597, "losses/sft": 1.503999948501587, "losses/total": 0.006359227932989597, "ref_logps/chosen": -27.62248420715332, "ref_logps/rejected": -46.7417106628418, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3492956161499023, "rewards/margins": 3.109501361846924, "rewards/rejected": -4.458796977996826, "step": 1457 }, { "epoch": 1.38, "grad_norm": 24.384522673335866, "learning_rate": 2.9558392155809533e-07, "logps/chosen": -38.94096374511719, "logps/rejected": -57.40300750732422, "loss": 0.3637, "losses/dpo": 0.5718526840209961, "losses/sft": 1.4582452774047852, "losses/total": 0.5718526840209961, "ref_logps/chosen": -25.122879028320312, "ref_logps/rejected": -27.247961044311523, "rewards/accuracies": 0.875, "rewards/chosen": -1.3818085193634033, "rewards/margins": 1.6336960792541504, "rewards/rejected": -3.0155045986175537, "step": 1458 }, { "epoch": 1.38, "grad_norm": 19.916648046755483, "learning_rate": 2.953334984643304e-07, "logps/chosen": -43.65755844116211, "logps/rejected": -59.18495559692383, "loss": 0.3737, "losses/dpo": 0.2794215679168701, "losses/sft": 1.0150725841522217, "losses/total": 0.2794215679168701, "ref_logps/chosen": -27.090145111083984, "ref_logps/rejected": -28.839431762695312, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6567412614822388, "rewards/margins": 1.3778111934661865, "rewards/rejected": -3.034552574157715, "step": 1459 }, { "epoch": 1.38, "grad_norm": 16.080581087014494, "learning_rate": 2.9508302832808603e-07, "logps/chosen": -44.774662017822266, "logps/rejected": -65.32532501220703, "loss": 0.2353, "losses/dpo": 0.0961315706372261, "losses/sft": 1.2163171768188477, "losses/total": 0.0961315706372261, "ref_logps/chosen": -33.11354064941406, "ref_logps/rejected": -31.555835723876953, "rewards/accuracies": 1.0, "rewards/chosen": -1.1661118268966675, "rewards/margins": 2.210836887359619, "rewards/rejected": -3.376948833465576, "step": 1460 }, { "epoch": 1.38, "grad_norm": 20.64700267870011, "learning_rate": 2.948325114092744e-07, "logps/chosen": -39.288490295410156, "logps/rejected": -57.806644439697266, "loss": 0.4462, "losses/dpo": 0.011603395454585552, "losses/sft": 1.2010611295700073, "losses/total": 0.011603395454585552, "ref_logps/chosen": -24.776926040649414, "ref_logps/rejected": -29.463947296142578, "rewards/accuracies": 0.8125, "rewards/chosen": -1.451156735420227, "rewards/margins": 1.383113145828247, "rewards/rejected": -2.8342700004577637, "step": 1461 }, { "epoch": 1.38, "grad_norm": 23.791425610717198, "learning_rate": 2.9458194796785654e-07, "logps/chosen": -38.31641387939453, "logps/rejected": -62.26927947998047, "loss": 0.3945, "losses/dpo": 0.7313476800918579, "losses/sft": 0.9102062582969666, "losses/total": 0.7313476800918579, "ref_logps/chosen": -24.66943359375, "ref_logps/rejected": -34.92713928222656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.364698052406311, "rewards/margins": 1.3695154190063477, "rewards/rejected": -2.7342135906219482, "step": 1462 }, { "epoch": 1.38, "grad_norm": 12.304806624694093, "learning_rate": 2.9433133826384155e-07, "logps/chosen": -43.01666259765625, "logps/rejected": -77.86328125, "loss": 0.1811, "losses/dpo": 0.1409328430891037, "losses/sft": 1.1007572412490845, "losses/total": 0.1409328430891037, "ref_logps/chosen": -27.814260482788086, "ref_logps/rejected": -38.97038269042969, "rewards/accuracies": 0.9375, "rewards/chosen": -1.520240068435669, "rewards/margins": 2.3690497875213623, "rewards/rejected": -3.8892898559570312, "step": 1463 }, { "epoch": 1.38, "grad_norm": 15.63749895390377, "learning_rate": 2.9408068255728657e-07, "logps/chosen": -41.107025146484375, "logps/rejected": -64.88746643066406, "loss": 0.323, "losses/dpo": 0.00872580986469984, "losses/sft": 1.1189634799957275, "losses/total": 0.00872580986469984, "ref_logps/chosen": -27.336944580078125, "ref_logps/rejected": -33.46990966796875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3770084381103516, "rewards/margins": 1.764747142791748, "rewards/rejected": -3.1417553424835205, "step": 1464 }, { "epoch": 1.38, "grad_norm": 15.716404147992778, "learning_rate": 2.938299811082966e-07, "logps/chosen": -47.084617614746094, "logps/rejected": -75.12101745605469, "loss": 0.2412, "losses/dpo": 0.0026116613298654556, "losses/sft": 1.253553032875061, "losses/total": 0.0026116613298654556, "ref_logps/chosen": -34.25510787963867, "ref_logps/rejected": -39.4142951965332, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2829511165618896, "rewards/margins": 2.2877211570739746, "rewards/rejected": -3.5706725120544434, "step": 1465 }, { "epoch": 1.38, "grad_norm": 12.061683759768165, "learning_rate": 2.9357923417702405e-07, "logps/chosen": -48.528076171875, "logps/rejected": -78.13833618164062, "loss": 0.2563, "losses/dpo": 0.3230562210083008, "losses/sft": 2.370514154434204, "losses/total": 0.3230562210083008, "ref_logps/chosen": -34.50697326660156, "ref_logps/rejected": -43.404930114746094, "rewards/accuracies": 0.875, "rewards/chosen": -1.4021098613739014, "rewards/margins": 2.0712311267852783, "rewards/rejected": -3.4733409881591797, "step": 1466 }, { "epoch": 1.38, "grad_norm": 15.383984435414467, "learning_rate": 2.933284420236685e-07, "logps/chosen": -44.7817497253418, "logps/rejected": -71.11353302001953, "loss": 0.2176, "losses/dpo": 0.0716216117143631, "losses/sft": 1.0443071126937866, "losses/total": 0.0716216117143631, "ref_logps/chosen": -29.986309051513672, "ref_logps/rejected": -36.018707275390625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.479543685913086, "rewards/margins": 2.0299391746520996, "rewards/rejected": -3.5094828605651855, "step": 1467 }, { "epoch": 1.38, "grad_norm": 16.62154272161633, "learning_rate": 2.930776049084763e-07, "logps/chosen": -47.44051742553711, "logps/rejected": -75.04167175292969, "loss": 0.2473, "losses/dpo": 0.4674528241157532, "losses/sft": 0.4010137915611267, "losses/total": 0.4674528241157532, "ref_logps/chosen": -29.162521362304688, "ref_logps/rejected": -37.780784606933594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8277997970581055, "rewards/margins": 1.8982887268066406, "rewards/rejected": -3.726088523864746, "step": 1468 }, { "epoch": 1.39, "grad_norm": 14.616732352817952, "learning_rate": 2.9282672309174085e-07, "logps/chosen": -50.32554626464844, "logps/rejected": -87.01734924316406, "loss": 0.2011, "losses/dpo": 0.8387554287910461, "losses/sft": 1.999991774559021, "losses/total": 0.8387554287910461, "ref_logps/chosen": -33.304107666015625, "ref_logps/rejected": -47.67225646972656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7021441459655762, "rewards/margins": 2.232365131378174, "rewards/rejected": -3.934509515762329, "step": 1469 }, { "epoch": 1.39, "grad_norm": 25.29326953221265, "learning_rate": 2.9257579683380154e-07, "logps/chosen": -47.03556823730469, "logps/rejected": -89.46857452392578, "loss": 0.3187, "losses/dpo": 0.25417107343673706, "losses/sft": 1.567099928855896, "losses/total": 0.25417107343673706, "ref_logps/chosen": -26.924713134765625, "ref_logps/rejected": -48.321598052978516, "rewards/accuracies": 0.875, "rewards/chosen": -2.0110855102539062, "rewards/margins": 2.1036124229431152, "rewards/rejected": -4.1146979331970215, "step": 1470 }, { "epoch": 1.39, "grad_norm": 15.747039425000986, "learning_rate": 2.923248263950441e-07, "logps/chosen": -40.47208023071289, "logps/rejected": -70.05586242675781, "loss": 0.2751, "losses/dpo": 0.2918963134288788, "losses/sft": 1.4062659740447998, "losses/total": 0.2918963134288788, "ref_logps/chosen": -28.12968635559082, "ref_logps/rejected": -36.73493194580078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2342392206192017, "rewards/margins": 2.0978541374206543, "rewards/rejected": -3.3320932388305664, "step": 1471 }, { "epoch": 1.39, "grad_norm": 14.093152086075191, "learning_rate": 2.9207381203589996e-07, "logps/chosen": -40.235679626464844, "logps/rejected": -75.10343933105469, "loss": 0.2009, "losses/dpo": 0.05608630180358887, "losses/sft": 0.8335394859313965, "losses/total": 0.05608630180358887, "ref_logps/chosen": -23.981735229492188, "ref_logps/rejected": -37.16865539550781, "rewards/accuracies": 1.0, "rewards/chosen": -1.6253942251205444, "rewards/margins": 2.1680850982666016, "rewards/rejected": -3.7934789657592773, "step": 1472 }, { "epoch": 1.39, "grad_norm": 22.048076728506086, "learning_rate": 2.9182275401684644e-07, "logps/chosen": -37.677345275878906, "logps/rejected": -67.85891723632812, "loss": 0.3673, "losses/dpo": 0.37645190954208374, "losses/sft": 1.6482958793640137, "losses/total": 0.37645190954208374, "ref_logps/chosen": -24.60279083251953, "ref_logps/rejected": -38.02972412109375, "rewards/accuracies": 0.875, "rewards/chosen": -1.3074554204940796, "rewards/margins": 1.6754642724990845, "rewards/rejected": -2.982919692993164, "step": 1473 }, { "epoch": 1.39, "grad_norm": 28.184788309412063, "learning_rate": 2.9157165259840566e-07, "logps/chosen": -57.0305061340332, "logps/rejected": -73.3507080078125, "loss": 0.3568, "losses/dpo": 1.1823127269744873, "losses/sft": 2.4641098976135254, "losses/total": 1.1823127269744873, "ref_logps/chosen": -40.17429733276367, "ref_logps/rejected": -36.48909378051758, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6856212615966797, "rewards/margins": 2.000540256500244, "rewards/rejected": -3.6861612796783447, "step": 1474 }, { "epoch": 1.39, "grad_norm": 18.633600038494528, "learning_rate": 2.913205080411452e-07, "logps/chosen": -54.808250427246094, "logps/rejected": -78.76731872558594, "loss": 0.2258, "losses/dpo": 0.5114039778709412, "losses/sft": 2.0315799713134766, "losses/total": 0.5114039778709412, "ref_logps/chosen": -35.08723068237305, "ref_logps/rejected": -39.831180572509766, "rewards/accuracies": 1.0, "rewards/chosen": -1.972102165222168, "rewards/margins": 1.9215112924575806, "rewards/rejected": -3.893613338470459, "step": 1475 }, { "epoch": 1.39, "grad_norm": 13.820325305143331, "learning_rate": 2.9106932060567726e-07, "logps/chosen": -45.67437744140625, "logps/rejected": -69.07518005371094, "loss": 0.2596, "losses/dpo": 0.07433663308620453, "losses/sft": 0.1182662770152092, "losses/total": 0.07433663308620453, "ref_logps/chosen": -29.492040634155273, "ref_logps/rejected": -34.81360626220703, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6182336807250977, "rewards/margins": 1.8079237937927246, "rewards/rejected": -3.4261574745178223, "step": 1476 }, { "epoch": 1.39, "grad_norm": 17.0652343654667, "learning_rate": 2.908180905526584e-07, "logps/chosen": -55.353763580322266, "logps/rejected": -72.31548309326172, "loss": 0.2292, "losses/dpo": 0.10555914789438248, "losses/sft": 1.7653651237487793, "losses/total": 0.10555914789438248, "ref_logps/chosen": -37.52522277832031, "ref_logps/rejected": -35.72598648071289, "rewards/accuracies": 1.0, "rewards/chosen": -1.782853603363037, "rewards/margins": 1.876096487045288, "rewards/rejected": -3.6589503288269043, "step": 1477 }, { "epoch": 1.39, "grad_norm": 21.854284537797877, "learning_rate": 2.9056681814278984e-07, "logps/chosen": -42.65955352783203, "logps/rejected": -59.91982650756836, "loss": 0.3373, "losses/dpo": 0.3987390995025635, "losses/sft": 2.3252899646759033, "losses/total": 0.3987390995025635, "ref_logps/chosen": -25.81639862060547, "ref_logps/rejected": -28.612924575805664, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6843156814575195, "rewards/margins": 1.4463746547698975, "rewards/rejected": -3.130690336227417, "step": 1478 }, { "epoch": 1.4, "grad_norm": 19.58299138357521, "learning_rate": 2.903155036368161e-07, "logps/chosen": -48.480682373046875, "logps/rejected": -75.70742797851562, "loss": 0.279, "losses/dpo": 0.021907396614551544, "losses/sft": 1.1324025392532349, "losses/total": 0.021907396614551544, "ref_logps/chosen": -32.799957275390625, "ref_logps/rejected": -39.47787857055664, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5680723190307617, "rewards/margins": 2.0548830032348633, "rewards/rejected": -3.622955322265625, "step": 1479 }, { "epoch": 1.4, "grad_norm": 15.943040073604962, "learning_rate": 2.90064147295526e-07, "logps/chosen": -42.106834411621094, "logps/rejected": -66.48725891113281, "loss": 0.2717, "losses/dpo": 1.075848937034607, "losses/sft": 0.541961669921875, "losses/total": 1.075848937034607, "ref_logps/chosen": -29.399559020996094, "ref_logps/rejected": -32.4732666015625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2707276344299316, "rewards/margins": 2.130671739578247, "rewards/rejected": -3.4013993740081787, "step": 1480 }, { "epoch": 1.4, "grad_norm": 20.624514775464558, "learning_rate": 2.8981274937975133e-07, "logps/chosen": -46.856544494628906, "logps/rejected": -59.2685546875, "loss": 0.3278, "losses/dpo": 0.016813475638628006, "losses/sft": 1.2878328561782837, "losses/total": 0.016813475638628006, "ref_logps/chosen": -32.157371520996094, "ref_logps/rejected": -29.684722900390625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4699172973632812, "rewards/margins": 1.4884662628173828, "rewards/rejected": -2.958383560180664, "step": 1481 }, { "epoch": 1.4, "grad_norm": 19.853618624820857, "learning_rate": 2.895613101503673e-07, "logps/chosen": -46.966270446777344, "logps/rejected": -70.55146789550781, "loss": 0.2991, "losses/dpo": 0.6407129168510437, "losses/sft": 0.9887091517448425, "losses/total": 0.6407129168510437, "ref_logps/chosen": -29.566226959228516, "ref_logps/rejected": -35.05615997314453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.740004539489746, "rewards/margins": 1.8095265626907349, "rewards/rejected": -3.5495309829711914, "step": 1482 }, { "epoch": 1.4, "grad_norm": 18.582855999549178, "learning_rate": 2.8930982986829184e-07, "logps/chosen": -45.383338928222656, "logps/rejected": -74.62618255615234, "loss": 0.2388, "losses/dpo": 0.04880343750119209, "losses/sft": 1.208908200263977, "losses/total": 0.04880343750119209, "ref_logps/chosen": -29.72504234313965, "ref_logps/rejected": -37.22764205932617, "rewards/accuracies": 0.9375, "rewards/chosen": -1.565829873085022, "rewards/margins": 2.1740241050720215, "rewards/rejected": -3.739853858947754, "step": 1483 }, { "epoch": 1.4, "grad_norm": 18.99269573902686, "learning_rate": 2.890583087944855e-07, "logps/chosen": -38.661956787109375, "logps/rejected": -83.45037841796875, "loss": 0.2697, "losses/dpo": 1.342690348625183, "losses/sft": 2.0386931896209717, "losses/total": 1.342690348625183, "ref_logps/chosen": -23.93436050415039, "ref_logps/rejected": -41.683040618896484, "rewards/accuracies": 0.875, "rewards/chosen": -1.4727598428726196, "rewards/margins": 2.7039742469787598, "rewards/rejected": -4.17673397064209, "step": 1484 }, { "epoch": 1.4, "grad_norm": 29.47224407280236, "learning_rate": 2.888067471899512e-07, "logps/chosen": -43.053794860839844, "logps/rejected": -51.79656219482422, "loss": 0.5263, "losses/dpo": 0.30069395899772644, "losses/sft": 1.2043811082839966, "losses/total": 0.30069395899772644, "ref_logps/chosen": -29.840923309326172, "ref_logps/rejected": -28.240394592285156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3212876319885254, "rewards/margins": 1.0343289375305176, "rewards/rejected": -2.355616569519043, "step": 1485 }, { "epoch": 1.4, "grad_norm": 18.447108411520478, "learning_rate": 2.8855514531573413e-07, "logps/chosen": -41.24052429199219, "logps/rejected": -65.79774475097656, "loss": 0.3607, "losses/dpo": 0.6759271621704102, "losses/sft": 1.8188871145248413, "losses/total": 0.6759271621704102, "ref_logps/chosen": -27.37999153137207, "ref_logps/rejected": -35.18235778808594, "rewards/accuracies": 0.875, "rewards/chosen": -1.386053442955017, "rewards/margins": 1.675485372543335, "rewards/rejected": -3.0615386962890625, "step": 1486 }, { "epoch": 1.4, "grad_norm": 16.737062908835913, "learning_rate": 2.883035034329207e-07, "logps/chosen": -42.22106170654297, "logps/rejected": -66.63787841796875, "loss": 0.2503, "losses/dpo": 0.2233356237411499, "losses/sft": 0.40584397315979004, "losses/total": 0.2233356237411499, "ref_logps/chosen": -27.40723419189453, "ref_logps/rejected": -34.231510162353516, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4813823699951172, "rewards/margins": 1.7592543363571167, "rewards/rejected": -3.2406368255615234, "step": 1487 }, { "epoch": 1.4, "grad_norm": 12.462273253531407, "learning_rate": 2.880518218026394e-07, "logps/chosen": -43.00181579589844, "logps/rejected": -60.04022979736328, "loss": 0.1825, "losses/dpo": 0.1846960335969925, "losses/sft": 1.6335563659667969, "losses/total": 0.1846960335969925, "ref_logps/chosen": -30.776336669921875, "ref_logps/rejected": -25.988018035888672, "rewards/accuracies": 1.0, "rewards/chosen": -1.222548246383667, "rewards/margins": 2.182673454284668, "rewards/rejected": -3.405221939086914, "step": 1488 }, { "epoch": 1.4, "grad_norm": 19.824165382652552, "learning_rate": 2.8780010068605964e-07, "logps/chosen": -46.90850830078125, "logps/rejected": -65.91969299316406, "loss": 0.3143, "losses/dpo": 0.442234069108963, "losses/sft": 1.8401453495025635, "losses/total": 0.442234069108963, "ref_logps/chosen": -29.565046310424805, "ref_logps/rejected": -32.62403106689453, "rewards/accuracies": 0.875, "rewards/chosen": -1.7343462705612183, "rewards/margins": 1.595219612121582, "rewards/rejected": -3.32956600189209, "step": 1489 }, { "epoch": 1.41, "grad_norm": 26.5779264662845, "learning_rate": 2.87548340344392e-07, "logps/chosen": -28.84585952758789, "logps/rejected": -55.76186752319336, "loss": 0.4639, "losses/dpo": 0.010969307273626328, "losses/sft": 1.064958095550537, "losses/total": 0.010969307273626328, "ref_logps/chosen": -17.521148681640625, "ref_logps/rejected": -30.970918655395508, "rewards/accuracies": 0.75, "rewards/chosen": -1.1324713230133057, "rewards/margins": 1.3466236591339111, "rewards/rejected": -2.479095220565796, "step": 1490 }, { "epoch": 1.41, "grad_norm": 12.054373037927824, "learning_rate": 2.8729654103888764e-07, "logps/chosen": -35.16297149658203, "logps/rejected": -64.96625518798828, "loss": 0.1682, "losses/dpo": 0.5086395740509033, "losses/sft": 0.6616599559783936, "losses/total": 0.5086395740509033, "ref_logps/chosen": -23.537416458129883, "ref_logps/rejected": -30.331127166748047, "rewards/accuracies": 1.0, "rewards/chosen": -1.162555456161499, "rewards/margins": 2.300957202911377, "rewards/rejected": -3.463512659072876, "step": 1491 }, { "epoch": 1.41, "grad_norm": 23.693258221310582, "learning_rate": 2.8704470303083806e-07, "logps/chosen": -53.48826217651367, "logps/rejected": -72.68281555175781, "loss": 0.2444, "losses/dpo": 0.06315474957227707, "losses/sft": 1.0165811777114868, "losses/total": 0.06315474957227707, "ref_logps/chosen": -35.303985595703125, "ref_logps/rejected": -36.260643005371094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8184278011322021, "rewards/margins": 1.823789119720459, "rewards/rejected": -3.642216920852661, "step": 1492 }, { "epoch": 1.41, "grad_norm": 22.40106597440471, "learning_rate": 2.867928265815753e-07, "logps/chosen": -48.10880661010742, "logps/rejected": -60.51283645629883, "loss": 0.3898, "losses/dpo": 0.5340179800987244, "losses/sft": 2.078284978866577, "losses/total": 0.5340179800987244, "ref_logps/chosen": -28.08648681640625, "ref_logps/rejected": -27.36773681640625, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0022318363189697, "rewards/margins": 1.312277913093567, "rewards/rejected": -3.314509868621826, "step": 1493 }, { "epoch": 1.41, "grad_norm": 17.627364764961158, "learning_rate": 2.8654091195247086e-07, "logps/chosen": -45.09344482421875, "logps/rejected": -76.04299926757812, "loss": 0.3455, "losses/dpo": 0.1800275593996048, "losses/sft": 2.3666090965270996, "losses/total": 0.1800275593996048, "ref_logps/chosen": -23.700458526611328, "ref_logps/rejected": -37.45427703857422, "rewards/accuracies": 0.6875, "rewards/chosen": -2.139298915863037, "rewards/margins": 1.7195731401443481, "rewards/rejected": -3.8588719367980957, "step": 1494 }, { "epoch": 1.41, "grad_norm": 13.4317943181342, "learning_rate": 2.8628895940493616e-07, "logps/chosen": -31.562101364135742, "logps/rejected": -68.21430969238281, "loss": 0.2191, "losses/dpo": 0.633723258972168, "losses/sft": 0.6267958283424377, "losses/total": 0.633723258972168, "ref_logps/chosen": -20.11395263671875, "ref_logps/rejected": -36.930946350097656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1448148488998413, "rewards/margins": 1.9835211038589478, "rewards/rejected": -3.128335952758789, "step": 1495 }, { "epoch": 1.41, "grad_norm": 20.652969176685367, "learning_rate": 2.8603696920042176e-07, "logps/chosen": -41.582237243652344, "logps/rejected": -61.32974624633789, "loss": 0.341, "losses/dpo": 0.8957720994949341, "losses/sft": 2.5699946880340576, "losses/total": 0.8957720994949341, "ref_logps/chosen": -28.05765151977539, "ref_logps/rejected": -29.036022186279297, "rewards/accuracies": 0.75, "rewards/chosen": -1.3524587154388428, "rewards/margins": 1.8769140243530273, "rewards/rejected": -3.229372501373291, "step": 1496 }, { "epoch": 1.41, "grad_norm": 14.67909762136599, "learning_rate": 2.857849416004174e-07, "logps/chosen": -38.2585563659668, "logps/rejected": -62.442283630371094, "loss": 0.1951, "losses/dpo": 0.3756580650806427, "losses/sft": 1.4337725639343262, "losses/total": 0.3756580650806427, "ref_logps/chosen": -26.772151947021484, "ref_logps/rejected": -29.20330047607422, "rewards/accuracies": 1.0, "rewards/chosen": -1.148640513420105, "rewards/margins": 2.175258159637451, "rewards/rejected": -3.3238987922668457, "step": 1497 }, { "epoch": 1.41, "grad_norm": 18.746549517807836, "learning_rate": 2.855328768664516e-07, "logps/chosen": -57.99837875366211, "logps/rejected": -88.55046081542969, "loss": 0.2624, "losses/dpo": 0.0813995897769928, "losses/sft": 1.0964161157608032, "losses/total": 0.0813995897769928, "ref_logps/chosen": -39.57355499267578, "ref_logps/rejected": -50.566810607910156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8424824476242065, "rewards/margins": 1.9558823108673096, "rewards/rejected": -3.7983646392822266, "step": 1498 }, { "epoch": 1.41, "grad_norm": 18.395925270934352, "learning_rate": 2.852807752600915e-07, "logps/chosen": -40.04420852661133, "logps/rejected": -81.20155334472656, "loss": 0.2877, "losses/dpo": 0.011667287908494473, "losses/sft": 0.832777738571167, "losses/total": 0.011667287908494473, "ref_logps/chosen": -24.54023551940918, "ref_logps/rejected": -41.250328063964844, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5503973960876465, "rewards/margins": 2.444725275039673, "rewards/rejected": -3.9951229095458984, "step": 1499 }, { "epoch": 1.42, "grad_norm": 19.97278902014747, "learning_rate": 2.8502863704294233e-07, "logps/chosen": -49.978538513183594, "logps/rejected": -77.41592407226562, "loss": 0.2639, "losses/dpo": 0.07432214170694351, "losses/sft": 1.48811936378479, "losses/total": 0.07432214170694351, "ref_logps/chosen": -29.924800872802734, "ref_logps/rejected": -36.66884994506836, "rewards/accuracies": 0.9375, "rewards/chosen": -2.00537371635437, "rewards/margins": 2.069333553314209, "rewards/rejected": -4.07470703125, "step": 1500 }, { "epoch": 1.42, "grad_norm": 25.457562467022804, "learning_rate": 2.8477646247664737e-07, "logps/chosen": -46.088905334472656, "logps/rejected": -69.43608856201172, "loss": 0.4012, "losses/dpo": 0.14153599739074707, "losses/sft": 1.3135031461715698, "losses/total": 0.14153599739074707, "ref_logps/chosen": -27.803356170654297, "ref_logps/rejected": -35.329078674316406, "rewards/accuracies": 0.75, "rewards/chosen": -1.8285547494888306, "rewards/margins": 1.5821467638015747, "rewards/rejected": -3.410701274871826, "step": 1501 }, { "epoch": 1.42, "grad_norm": 16.400778292084805, "learning_rate": 2.845242518228879e-07, "logps/chosen": -38.005859375, "logps/rejected": -68.13429260253906, "loss": 0.2836, "losses/dpo": 0.4431219696998596, "losses/sft": 0.519609808921814, "losses/total": 0.4431219696998596, "ref_logps/chosen": -20.328615188598633, "ref_logps/rejected": -31.90057945251465, "rewards/accuracies": 0.875, "rewards/chosen": -1.767724633216858, "rewards/margins": 1.855647087097168, "rewards/rejected": -3.6233718395233154, "step": 1502 }, { "epoch": 1.42, "grad_norm": 19.182967742557846, "learning_rate": 2.842720053433821e-07, "logps/chosen": -43.29232406616211, "logps/rejected": -75.30523681640625, "loss": 0.2746, "losses/dpo": 0.0042997910641133785, "losses/sft": 0.5446252226829529, "losses/total": 0.0042997910641133785, "ref_logps/chosen": -26.800029754638672, "ref_logps/rejected": -39.189002990722656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6492295265197754, "rewards/margins": 1.962394118309021, "rewards/rejected": -3.611623764038086, "step": 1503 }, { "epoch": 1.42, "grad_norm": 17.197256558795388, "learning_rate": 2.8401972329988585e-07, "logps/chosen": -40.445899963378906, "logps/rejected": -74.7647476196289, "loss": 0.2324, "losses/dpo": 0.04821028187870979, "losses/sft": 1.0422616004943848, "losses/total": 0.04821028187870979, "ref_logps/chosen": -25.301782608032227, "ref_logps/rejected": -39.54978942871094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.514411449432373, "rewards/margins": 2.007084846496582, "rewards/rejected": -3.521496295928955, "step": 1504 }, { "epoch": 1.42, "grad_norm": 18.70383293816455, "learning_rate": 2.837674059541916e-07, "logps/chosen": -40.13631057739258, "logps/rejected": -57.40666580200195, "loss": 0.3585, "losses/dpo": 0.4113523066043854, "losses/sft": 1.1039223670959473, "losses/total": 0.4113523066043854, "ref_logps/chosen": -24.55382537841797, "ref_logps/rejected": -28.992633819580078, "rewards/accuracies": 0.8125, "rewards/chosen": -1.558248519897461, "rewards/margins": 1.2831543684005737, "rewards/rejected": -2.841403007507324, "step": 1505 }, { "epoch": 1.42, "grad_norm": 17.142050128901825, "learning_rate": 2.8351505356812854e-07, "logps/chosen": -40.42778015136719, "logps/rejected": -78.88175964355469, "loss": 0.2048, "losses/dpo": 0.16649383306503296, "losses/sft": 1.5780366659164429, "losses/total": 0.16649383306503296, "ref_logps/chosen": -26.23801612854004, "ref_logps/rejected": -38.62843322753906, "rewards/accuracies": 0.875, "rewards/chosen": -1.4189765453338623, "rewards/margins": 2.606355905532837, "rewards/rejected": -4.025332450866699, "step": 1506 }, { "epoch": 1.42, "grad_norm": 16.51415297690972, "learning_rate": 2.8326266640356233e-07, "logps/chosen": -44.14759826660156, "logps/rejected": -67.64875030517578, "loss": 0.2066, "losses/dpo": 0.12996353209018707, "losses/sft": 1.7882308959960938, "losses/total": 0.12996353209018707, "ref_logps/chosen": -28.72895050048828, "ref_logps/rejected": -31.842498779296875, "rewards/accuracies": 0.875, "rewards/chosen": -1.5418646335601807, "rewards/margins": 2.0387606620788574, "rewards/rejected": -3.580625295639038, "step": 1507 }, { "epoch": 1.42, "grad_norm": 14.839810863053382, "learning_rate": 2.830102447223946e-07, "logps/chosen": -56.45869445800781, "logps/rejected": -75.00613403320312, "loss": 0.2066, "losses/dpo": 0.029965778812766075, "losses/sft": 1.2207244634628296, "losses/total": 0.029965778812766075, "ref_logps/chosen": -37.782630920410156, "ref_logps/rejected": -35.75640106201172, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8676064014434814, "rewards/margins": 2.0573668479919434, "rewards/rejected": -3.924973487854004, "step": 1508 }, { "epoch": 1.42, "grad_norm": 22.792773330925126, "learning_rate": 2.827577887865626e-07, "logps/chosen": -43.6530876159668, "logps/rejected": -68.17218780517578, "loss": 0.2945, "losses/dpo": 0.09382007271051407, "losses/sft": 0.8857348561286926, "losses/total": 0.09382007271051407, "ref_logps/chosen": -27.94548797607422, "ref_logps/rejected": -35.37647247314453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5707600116729736, "rewards/margins": 1.7088117599487305, "rewards/rejected": -3.279571771621704, "step": 1509 }, { "epoch": 1.42, "grad_norm": 20.39289347439394, "learning_rate": 2.8250529885803977e-07, "logps/chosen": -38.27227783203125, "logps/rejected": -62.33458709716797, "loss": 0.344, "losses/dpo": 0.44378089904785156, "losses/sft": 0.5871397852897644, "losses/total": 0.44378089904785156, "ref_logps/chosen": -24.258590698242188, "ref_logps/rejected": -32.59885787963867, "rewards/accuracies": 0.9375, "rewards/chosen": -1.401368260383606, "rewards/margins": 1.572204351425171, "rewards/rejected": -2.9735727310180664, "step": 1510 }, { "epoch": 1.43, "grad_norm": 10.198770784977409, "learning_rate": 2.82252775198834e-07, "logps/chosen": -33.042686462402344, "logps/rejected": -73.49043273925781, "loss": 0.1344, "losses/dpo": 0.08297610282897949, "losses/sft": 1.0453860759735107, "losses/total": 0.08297610282897949, "ref_logps/chosen": -19.771440505981445, "ref_logps/rejected": -34.43911361694336, "rewards/accuracies": 1.0, "rewards/chosen": -1.3271245956420898, "rewards/margins": 2.578007221221924, "rewards/rejected": -3.9051318168640137, "step": 1511 }, { "epoch": 1.43, "grad_norm": 17.41514046900155, "learning_rate": 2.820002180709888e-07, "logps/chosen": -46.408382415771484, "logps/rejected": -73.6947021484375, "loss": 0.2877, "losses/dpo": 0.8679726719856262, "losses/sft": 1.4359886646270752, "losses/total": 0.8679726719856262, "ref_logps/chosen": -30.19162368774414, "ref_logps/rejected": -38.499900817871094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6216756105422974, "rewards/margins": 1.8978043794631958, "rewards/rejected": -3.5194802284240723, "step": 1512 }, { "epoch": 1.43, "grad_norm": 16.99756066194047, "learning_rate": 2.817476277365821e-07, "logps/chosen": -46.31840515136719, "logps/rejected": -76.31419372558594, "loss": 0.1932, "losses/dpo": 0.029824910685420036, "losses/sft": 0.8292525410652161, "losses/total": 0.029824910685420036, "ref_logps/chosen": -30.476451873779297, "ref_logps/rejected": -37.40619659423828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.584195613861084, "rewards/margins": 2.3066043853759766, "rewards/rejected": -3.8907997608184814, "step": 1513 }, { "epoch": 1.43, "grad_norm": 19.80328892188271, "learning_rate": 2.814950044577264e-07, "logps/chosen": -38.52341842651367, "logps/rejected": -59.79050064086914, "loss": 0.2646, "losses/dpo": 0.11846840381622314, "losses/sft": 0.964898943901062, "losses/total": 0.11846840381622314, "ref_logps/chosen": -25.11005210876465, "ref_logps/rejected": -27.16377067565918, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3413368463516235, "rewards/margins": 1.9213361740112305, "rewards/rejected": -3.2626729011535645, "step": 1514 }, { "epoch": 1.43, "grad_norm": 22.207066840771923, "learning_rate": 2.812423484965683e-07, "logps/chosen": -54.66124725341797, "logps/rejected": -74.20699310302734, "loss": 0.3509, "losses/dpo": 0.12696129083633423, "losses/sft": 2.4506373405456543, "losses/total": 0.12696129083633423, "ref_logps/chosen": -34.91069030761719, "ref_logps/rejected": -35.53656005859375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.97505521774292, "rewards/margins": 1.8919886350631714, "rewards/rejected": -3.867043972015381, "step": 1515 }, { "epoch": 1.43, "grad_norm": 10.449281443859544, "learning_rate": 2.809896601152885e-07, "logps/chosen": -48.33226013183594, "logps/rejected": -84.06820678710938, "loss": 0.1593, "losses/dpo": 0.3957901895046234, "losses/sft": 0.454482764005661, "losses/total": 0.3957901895046234, "ref_logps/chosen": -33.33330154418945, "ref_logps/rejected": -43.76318359375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4998953342437744, "rewards/margins": 2.5306077003479004, "rewards/rejected": -4.030502796173096, "step": 1516 }, { "epoch": 1.43, "grad_norm": 15.509292678950258, "learning_rate": 2.8073693957610117e-07, "logps/chosen": -49.04058074951172, "logps/rejected": -68.7525634765625, "loss": 0.2082, "losses/dpo": 0.07326573133468628, "losses/sft": 1.3021893501281738, "losses/total": 0.07326573133468628, "ref_logps/chosen": -31.938785552978516, "ref_logps/rejected": -32.956092834472656, "rewards/accuracies": 1.0, "rewards/chosen": -1.7101794481277466, "rewards/margins": 1.8694679737091064, "rewards/rejected": -3.5796475410461426, "step": 1517 }, { "epoch": 1.43, "grad_norm": 19.658999035285415, "learning_rate": 2.804841871412539e-07, "logps/chosen": -31.57876968383789, "logps/rejected": -61.403892517089844, "loss": 0.2729, "losses/dpo": 0.32509344816207886, "losses/sft": 0.3433818817138672, "losses/total": 0.32509344816207886, "ref_logps/chosen": -20.33191680908203, "ref_logps/rejected": -29.94161033630371, "rewards/accuracies": 0.875, "rewards/chosen": -1.124685287475586, "rewards/margins": 2.021543264389038, "rewards/rejected": -3.146228790283203, "step": 1518 }, { "epoch": 1.43, "grad_norm": 21.953588677301887, "learning_rate": 2.8023140307302734e-07, "logps/chosen": -52.227439880371094, "logps/rejected": -81.20362854003906, "loss": 0.372, "losses/dpo": 0.1808631271123886, "losses/sft": 1.849611520767212, "losses/total": 0.1808631271123886, "ref_logps/chosen": -32.617889404296875, "ref_logps/rejected": -42.323360443115234, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9609551429748535, "rewards/margins": 1.9270710945129395, "rewards/rejected": -3.888026237487793, "step": 1519 }, { "epoch": 1.43, "grad_norm": 20.11765716108283, "learning_rate": 2.79978587633735e-07, "logps/chosen": -42.674747467041016, "logps/rejected": -61.06516647338867, "loss": 0.343, "losses/dpo": 0.0716119259595871, "losses/sft": 1.5539891719818115, "losses/total": 0.0716119259595871, "ref_logps/chosen": -25.722759246826172, "ref_logps/rejected": -28.820470809936523, "rewards/accuracies": 0.8125, "rewards/chosen": -1.695198893547058, "rewards/margins": 1.5292707681655884, "rewards/rejected": -3.2244696617126465, "step": 1520 }, { "epoch": 1.43, "grad_norm": 28.39688456307541, "learning_rate": 2.7972574108572306e-07, "logps/chosen": -48.82621765136719, "logps/rejected": -66.89705657958984, "loss": 0.3709, "losses/dpo": 0.7029011845588684, "losses/sft": 1.5097267627716064, "losses/total": 0.7029011845588684, "ref_logps/chosen": -32.42400360107422, "ref_logps/rejected": -33.380332946777344, "rewards/accuracies": 0.8125, "rewards/chosen": -1.640221118927002, "rewards/margins": 1.7114516496658325, "rewards/rejected": -3.351672649383545, "step": 1521 }, { "epoch": 1.44, "grad_norm": 13.005589592346757, "learning_rate": 2.794728636913698e-07, "logps/chosen": -49.95878601074219, "logps/rejected": -81.48592376708984, "loss": 0.1428, "losses/dpo": 0.008120165206491947, "losses/sft": 1.6931148767471313, "losses/total": 0.008120165206491947, "ref_logps/chosen": -32.85780715942383, "ref_logps/rejected": -37.58015060424805, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7100977897644043, "rewards/margins": 2.6804795265197754, "rewards/rejected": -4.39057731628418, "step": 1522 }, { "epoch": 1.44, "grad_norm": 21.48433118047766, "learning_rate": 2.792199557130856e-07, "logps/chosen": -44.41685104370117, "logps/rejected": -71.19195556640625, "loss": 0.2923, "losses/dpo": 0.34061938524246216, "losses/sft": 0.49159297347068787, "losses/total": 0.34061938524246216, "ref_logps/chosen": -27.50825309753418, "ref_logps/rejected": -34.66374588012695, "rewards/accuracies": 0.875, "rewards/chosen": -1.6908596754074097, "rewards/margins": 1.9619618654251099, "rewards/rejected": -3.6528215408325195, "step": 1523 }, { "epoch": 1.44, "grad_norm": 21.259997490466436, "learning_rate": 2.7896701741331253e-07, "logps/chosen": -40.38349914550781, "logps/rejected": -63.30242919921875, "loss": 0.3281, "losses/dpo": 0.23562465608119965, "losses/sft": 2.0209245681762695, "losses/total": 0.23562465608119965, "ref_logps/chosen": -26.272991180419922, "ref_logps/rejected": -33.4117546081543, "rewards/accuracies": 0.875, "rewards/chosen": -1.411050796508789, "rewards/margins": 1.578016996383667, "rewards/rejected": -2.989068031311035, "step": 1524 }, { "epoch": 1.44, "grad_norm": 9.555182273019645, "learning_rate": 2.787140490545241e-07, "logps/chosen": -40.99742126464844, "logps/rejected": -79.0224380493164, "loss": 0.1033, "losses/dpo": 0.3463381826877594, "losses/sft": 1.6733416318893433, "losses/total": 0.3463381826877594, "ref_logps/chosen": -27.776233673095703, "ref_logps/rejected": -37.81031799316406, "rewards/accuracies": 1.0, "rewards/chosen": -1.3221185207366943, "rewards/margins": 2.799093246459961, "rewards/rejected": -4.121212005615234, "step": 1525 }, { "epoch": 1.44, "grad_norm": 22.79543712828212, "learning_rate": 2.7846105089922523e-07, "logps/chosen": -31.73844337463379, "logps/rejected": -62.28700637817383, "loss": 0.4303, "losses/dpo": 0.6311237812042236, "losses/sft": 1.3200770616531372, "losses/total": 0.6311237812042236, "ref_logps/chosen": -20.29097557067871, "ref_logps/rejected": -34.8529167175293, "rewards/accuracies": 0.8125, "rewards/chosen": -1.144747018814087, "rewards/margins": 1.5986618995666504, "rewards/rejected": -2.7434089183807373, "step": 1526 }, { "epoch": 1.44, "grad_norm": 19.273527964136687, "learning_rate": 2.782080232099514e-07, "logps/chosen": -47.08742141723633, "logps/rejected": -76.27474975585938, "loss": 0.2667, "losses/dpo": 0.48803502321243286, "losses/sft": 0.5440508127212524, "losses/total": 0.48803502321243286, "ref_logps/chosen": -29.486852645874023, "ref_logps/rejected": -41.178035736083984, "rewards/accuracies": 0.9375, "rewards/chosen": -1.760056495666504, "rewards/margins": 1.7496145963668823, "rewards/rejected": -3.509671211242676, "step": 1527 }, { "epoch": 1.44, "grad_norm": 17.28619363186906, "learning_rate": 2.77954966249269e-07, "logps/chosen": -47.56803894042969, "logps/rejected": -73.02439880371094, "loss": 0.1817, "losses/dpo": 0.5005092620849609, "losses/sft": 2.027005910873413, "losses/total": 0.5005092620849609, "ref_logps/chosen": -36.86183166503906, "ref_logps/rejected": -36.5128288269043, "rewards/accuracies": 1.0, "rewards/chosen": -1.070621132850647, "rewards/margins": 2.580535888671875, "rewards/rejected": -3.6511573791503906, "step": 1528 }, { "epoch": 1.44, "grad_norm": 16.043828638522637, "learning_rate": 2.777018802797748e-07, "logps/chosen": -48.065757751464844, "logps/rejected": -91.99061584472656, "loss": 0.1751, "losses/dpo": 0.0034981700591742992, "losses/sft": 0.49539676308631897, "losses/total": 0.0034981700591742992, "ref_logps/chosen": -32.03112030029297, "ref_logps/rejected": -48.99083709716797, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6034636497497559, "rewards/margins": 2.696514844894409, "rewards/rejected": -4.299978256225586, "step": 1529 }, { "epoch": 1.44, "grad_norm": 14.682310762697945, "learning_rate": 2.774487655640955e-07, "logps/chosen": -41.2689208984375, "logps/rejected": -64.03199768066406, "loss": 0.1972, "losses/dpo": 0.12043540924787521, "losses/sft": 2.2332746982574463, "losses/total": 0.12043540924787521, "ref_logps/chosen": -27.92266845703125, "ref_logps/rejected": -30.79859161376953, "rewards/accuracies": 1.0, "rewards/chosen": -1.3346248865127563, "rewards/margins": 1.988715648651123, "rewards/rejected": -3.32334041595459, "step": 1530 }, { "epoch": 1.44, "grad_norm": 21.17642942037912, "learning_rate": 2.771956223648878e-07, "logps/chosen": -48.07182312011719, "logps/rejected": -62.97510528564453, "loss": 0.2884, "losses/dpo": 0.043874699622392654, "losses/sft": 1.426417350769043, "losses/total": 0.043874699622392654, "ref_logps/chosen": -33.370182037353516, "ref_logps/rejected": -32.453861236572266, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4701642990112305, "rewards/margins": 1.5819602012634277, "rewards/rejected": -3.052124500274658, "step": 1531 }, { "epoch": 1.45, "grad_norm": 23.613657304296183, "learning_rate": 2.7694245094483787e-07, "logps/chosen": -50.67750930786133, "logps/rejected": -67.4464111328125, "loss": 0.3205, "losses/dpo": 0.5088014006614685, "losses/sft": 1.7572262287139893, "losses/total": 0.5088014006614685, "ref_logps/chosen": -31.838193893432617, "ref_logps/rejected": -31.034265518188477, "rewards/accuracies": 0.875, "rewards/chosen": -1.8839318752288818, "rewards/margins": 1.7572828531265259, "rewards/rejected": -3.641214609146118, "step": 1532 }, { "epoch": 1.45, "grad_norm": 19.228503251283026, "learning_rate": 2.766892515666612e-07, "logps/chosen": -37.42405700683594, "logps/rejected": -64.52375793457031, "loss": 0.3077, "losses/dpo": 0.7833768129348755, "losses/sft": 1.823652982711792, "losses/total": 0.7833768129348755, "ref_logps/chosen": -24.53429412841797, "ref_logps/rejected": -31.545045852661133, "rewards/accuracies": 0.875, "rewards/chosen": -1.2889759540557861, "rewards/margins": 2.0088951587677, "rewards/rejected": -3.2978711128234863, "step": 1533 }, { "epoch": 1.45, "grad_norm": 13.563110511796273, "learning_rate": 2.764360244931021e-07, "logps/chosen": -31.553726196289062, "logps/rejected": -71.39308166503906, "loss": 0.208, "losses/dpo": 0.013260392472147942, "losses/sft": 0.0481097437441349, "losses/total": 0.013260392472147942, "ref_logps/chosen": -22.558902740478516, "ref_logps/rejected": -36.98809051513672, "rewards/accuracies": 0.875, "rewards/chosen": -0.8994823694229126, "rewards/margins": 2.5410165786743164, "rewards/rejected": -3.4404988288879395, "step": 1534 }, { "epoch": 1.45, "grad_norm": 19.482972077626666, "learning_rate": 2.76182769986934e-07, "logps/chosen": -46.94768524169922, "logps/rejected": -64.03106689453125, "loss": 0.2684, "losses/dpo": 0.11530100554227829, "losses/sft": 0.48238179087638855, "losses/total": 0.11530100554227829, "ref_logps/chosen": -30.766162872314453, "ref_logps/rejected": -33.03657531738281, "rewards/accuracies": 0.9375, "rewards/chosen": -1.618152141571045, "rewards/margins": 1.4812973737716675, "rewards/rejected": -3.099449634552002, "step": 1535 }, { "epoch": 1.45, "grad_norm": 38.29708814638043, "learning_rate": 2.7592948831095856e-07, "logps/chosen": -52.177215576171875, "logps/rejected": -74.74415588378906, "loss": 0.5205, "losses/dpo": 0.09699534624814987, "losses/sft": 1.1788978576660156, "losses/total": 0.09699534624814987, "ref_logps/chosen": -30.939830780029297, "ref_logps/rejected": -37.5186653137207, "rewards/accuracies": 0.75, "rewards/chosen": -2.1237382888793945, "rewards/margins": 1.5988103151321411, "rewards/rejected": -3.7225489616394043, "step": 1536 }, { "epoch": 1.45, "grad_norm": 13.04750780721016, "learning_rate": 2.7567617972800555e-07, "logps/chosen": -36.395084381103516, "logps/rejected": -83.14082336425781, "loss": 0.1601, "losses/dpo": 0.1543453484773636, "losses/sft": 1.5526213645935059, "losses/total": 0.1543453484773636, "ref_logps/chosen": -19.66161346435547, "ref_logps/rejected": -38.608642578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6733472347259521, "rewards/margins": 2.7798709869384766, "rewards/rejected": -4.45321798324585, "step": 1537 }, { "epoch": 1.45, "grad_norm": 10.976191775689593, "learning_rate": 2.7542284450093286e-07, "logps/chosen": -47.551822662353516, "logps/rejected": -79.31442260742188, "loss": 0.1485, "losses/dpo": 0.04400336742401123, "losses/sft": 1.0605798959732056, "losses/total": 0.04400336742401123, "ref_logps/chosen": -29.449132919311523, "ref_logps/rejected": -34.460941314697266, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8102688789367676, "rewards/margins": 2.675079345703125, "rewards/rejected": -4.485348224639893, "step": 1538 }, { "epoch": 1.45, "grad_norm": 26.736203826541733, "learning_rate": 2.7516948289262595e-07, "logps/chosen": -64.73818969726562, "logps/rejected": -73.00676727294922, "loss": 0.4962, "losses/dpo": 0.07168720662593842, "losses/sft": 0.3946000337600708, "losses/total": 0.07168720662593842, "ref_logps/chosen": -41.80548858642578, "ref_logps/rejected": -35.22407531738281, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2932708263397217, "rewards/margins": 1.4849984645843506, "rewards/rejected": -3.7782692909240723, "step": 1539 }, { "epoch": 1.45, "grad_norm": 32.986885316407445, "learning_rate": 2.7491609516599755e-07, "logps/chosen": -56.64607620239258, "logps/rejected": -68.12968444824219, "loss": 0.4982, "losses/dpo": 0.12252765893936157, "losses/sft": 1.5054192543029785, "losses/total": 0.12252765893936157, "ref_logps/chosen": -35.67534637451172, "ref_logps/rejected": -35.230323791503906, "rewards/accuracies": 0.75, "rewards/chosen": -2.0970730781555176, "rewards/margins": 1.192862629890442, "rewards/rejected": -3.28993558883667, "step": 1540 }, { "epoch": 1.45, "grad_norm": 22.594697802358038, "learning_rate": 2.7466268158398775e-07, "logps/chosen": -43.95732116699219, "logps/rejected": -63.122459411621094, "loss": 0.35, "losses/dpo": 0.5483638644218445, "losses/sft": 1.3835371732711792, "losses/total": 0.5483638644218445, "ref_logps/chosen": -28.010093688964844, "ref_logps/rejected": -29.35342025756836, "rewards/accuracies": 0.75, "rewards/chosen": -1.5947225093841553, "rewards/margins": 1.7821818590164185, "rewards/rejected": -3.3769044876098633, "step": 1541 }, { "epoch": 1.45, "grad_norm": 18.205427077139234, "learning_rate": 2.744092424095632e-07, "logps/chosen": -45.92624282836914, "logps/rejected": -76.29288482666016, "loss": 0.3623, "losses/dpo": 0.0017839401261880994, "losses/sft": 0.2726411521434784, "losses/total": 0.0017839401261880994, "ref_logps/chosen": -31.496967315673828, "ref_logps/rejected": -41.12691879272461, "rewards/accuracies": 0.9375, "rewards/chosen": -1.442927598953247, "rewards/margins": 2.0736684799194336, "rewards/rejected": -3.5165960788726807, "step": 1542 }, { "epoch": 1.46, "grad_norm": 16.83592627727288, "learning_rate": 2.741557779057172e-07, "logps/chosen": -40.94697570800781, "logps/rejected": -73.12403106689453, "loss": 0.2148, "losses/dpo": 0.3111013174057007, "losses/sft": 2.0381414890289307, "losses/total": 0.3111013174057007, "ref_logps/chosen": -26.73687744140625, "ref_logps/rejected": -40.39875411987305, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4210097789764404, "rewards/margins": 1.8515182733535767, "rewards/rejected": -3.2725281715393066, "step": 1543 }, { "epoch": 1.46, "grad_norm": 20.484875717935104, "learning_rate": 2.7390228833546944e-07, "logps/chosen": -44.42792510986328, "logps/rejected": -61.692752838134766, "loss": 0.2904, "losses/dpo": 0.22472047805786133, "losses/sft": 0.34258130192756653, "losses/total": 0.22472047805786133, "ref_logps/chosen": -25.848575592041016, "ref_logps/rejected": -26.667949676513672, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8579349517822266, "rewards/margins": 1.6445451974868774, "rewards/rejected": -3.5024800300598145, "step": 1544 }, { "epoch": 1.46, "grad_norm": 19.037654838178643, "learning_rate": 2.7364877396186543e-07, "logps/chosen": -39.38399887084961, "logps/rejected": -65.11557006835938, "loss": 0.4304, "losses/dpo": 0.0743199810385704, "losses/sft": 0.9474436640739441, "losses/total": 0.0743199810385704, "ref_logps/chosen": -26.145206451416016, "ref_logps/rejected": -31.05997657775879, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3238792419433594, "rewards/margins": 2.0816807746887207, "rewards/rejected": -3.40556001663208, "step": 1545 }, { "epoch": 1.46, "grad_norm": 21.422852596284706, "learning_rate": 2.7339523504797655e-07, "logps/chosen": -45.115379333496094, "logps/rejected": -61.01797103881836, "loss": 0.4416, "losses/dpo": 0.18686546385288239, "losses/sft": 3.1092090606689453, "losses/total": 0.18686546385288239, "ref_logps/chosen": -26.924224853515625, "ref_logps/rejected": -29.692243576049805, "rewards/accuracies": 0.875, "rewards/chosen": -1.8191155195236206, "rewards/margins": 1.3134571313858032, "rewards/rejected": -3.132572650909424, "step": 1546 }, { "epoch": 1.46, "grad_norm": 17.176407370947057, "learning_rate": 2.731416718568997e-07, "logps/chosen": -53.90424346923828, "logps/rejected": -82.44709777832031, "loss": 0.2112, "losses/dpo": 0.008044109679758549, "losses/sft": 2.1742563247680664, "losses/total": 0.008044109679758549, "ref_logps/chosen": -34.582950592041016, "ref_logps/rejected": -40.31464385986328, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9321296215057373, "rewards/margins": 2.281116008758545, "rewards/rejected": -4.213245391845703, "step": 1547 }, { "epoch": 1.46, "grad_norm": 13.63338823731341, "learning_rate": 2.728880846517569e-07, "logps/chosen": -42.782310485839844, "logps/rejected": -76.95053100585938, "loss": 0.1976, "losses/dpo": 0.16140836477279663, "losses/sft": 1.5069584846496582, "losses/total": 0.16140836477279663, "ref_logps/chosen": -27.72067642211914, "ref_logps/rejected": -36.289710998535156, "rewards/accuracies": 0.875, "rewards/chosen": -1.5061633586883545, "rewards/margins": 2.559918165206909, "rewards/rejected": -4.066081523895264, "step": 1548 }, { "epoch": 1.46, "grad_norm": 28.26959784593708, "learning_rate": 2.7263447369569495e-07, "logps/chosen": -43.78807067871094, "logps/rejected": -60.345577239990234, "loss": 0.5639, "losses/dpo": 0.050933822989463806, "losses/sft": 1.5822184085845947, "losses/total": 0.050933822989463806, "ref_logps/chosen": -26.274675369262695, "ref_logps/rejected": -31.187049865722656, "rewards/accuracies": 0.875, "rewards/chosen": -1.7513396739959717, "rewards/margins": 1.164513111114502, "rewards/rejected": -2.9158527851104736, "step": 1549 }, { "epoch": 1.46, "grad_norm": 22.30924161094604, "learning_rate": 2.723808392518855e-07, "logps/chosen": -42.09339904785156, "logps/rejected": -68.98628234863281, "loss": 0.3183, "losses/dpo": 0.013090084306895733, "losses/sft": 1.2170708179473877, "losses/total": 0.013090084306895733, "ref_logps/chosen": -29.13673210144043, "ref_logps/rejected": -34.58540344238281, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2956664562225342, "rewards/margins": 2.144421339035034, "rewards/rejected": -3.4400877952575684, "step": 1550 }, { "epoch": 1.46, "grad_norm": 17.48260487106963, "learning_rate": 2.7212718158352447e-07, "logps/chosen": -45.57863998413086, "logps/rejected": -73.93955993652344, "loss": 0.2307, "losses/dpo": 0.46628570556640625, "losses/sft": 1.891338586807251, "losses/total": 0.46628570556640625, "ref_logps/chosen": -30.57067108154297, "ref_logps/rejected": -35.157196044921875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.500796914100647, "rewards/margins": 2.377438545227051, "rewards/rejected": -3.8782358169555664, "step": 1551 }, { "epoch": 1.46, "grad_norm": 24.284890612677287, "learning_rate": 2.7187350095383195e-07, "logps/chosen": -40.6180305480957, "logps/rejected": -67.65479278564453, "loss": 0.2692, "losses/dpo": 0.013830565847456455, "losses/sft": 2.226024627685547, "losses/total": 0.013830565847456455, "ref_logps/chosen": -24.90155029296875, "ref_logps/rejected": -32.15721130371094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5716482400894165, "rewards/margins": 1.9781100749969482, "rewards/rejected": -3.5497584342956543, "step": 1552 }, { "epoch": 1.47, "grad_norm": 27.18415915821773, "learning_rate": 2.7161979762605174e-07, "logps/chosen": -58.118934631347656, "logps/rejected": -75.29222869873047, "loss": 0.341, "losses/dpo": 0.018174082040786743, "losses/sft": 1.402090072631836, "losses/total": 0.018174082040786743, "ref_logps/chosen": -40.03765869140625, "ref_logps/rejected": -39.20370101928711, "rewards/accuracies": 0.875, "rewards/chosen": -1.8081270456314087, "rewards/margins": 1.8007256984710693, "rewards/rejected": -3.6088528633117676, "step": 1553 }, { "epoch": 1.47, "grad_norm": 9.066799666575331, "learning_rate": 2.713660718634513e-07, "logps/chosen": -43.500850677490234, "logps/rejected": -94.12228393554688, "loss": 0.0999, "losses/dpo": 0.057694997638463974, "losses/sft": 1.5986833572387695, "losses/total": 0.057694997638463974, "ref_logps/chosen": -28.71493148803711, "ref_logps/rejected": -47.538787841796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.478592038154602, "rewards/margins": 3.179757595062256, "rewards/rejected": -4.658349990844727, "step": 1554 }, { "epoch": 1.47, "grad_norm": 14.135618274757116, "learning_rate": 2.7111232392932146e-07, "logps/chosen": -39.0775260925293, "logps/rejected": -80.67762756347656, "loss": 0.1653, "losses/dpo": 0.18752793967723846, "losses/sft": 0.8103616833686829, "losses/total": 0.18752793967723846, "ref_logps/chosen": -21.485557556152344, "ref_logps/rejected": -35.44255828857422, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7591965198516846, "rewards/margins": 2.764310836791992, "rewards/rejected": -4.523507118225098, "step": 1555 }, { "epoch": 1.47, "grad_norm": 12.679208232973135, "learning_rate": 2.7085855408697586e-07, "logps/chosen": -39.218505859375, "logps/rejected": -65.55511474609375, "loss": 0.2264, "losses/dpo": 0.6380235552787781, "losses/sft": 0.5551038384437561, "losses/total": 0.6380235552787781, "ref_logps/chosen": -27.124967575073242, "ref_logps/rejected": -34.54956817626953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2093536853790283, "rewards/margins": 1.891200304031372, "rewards/rejected": -3.1005542278289795, "step": 1556 }, { "epoch": 1.47, "grad_norm": 20.150249787301725, "learning_rate": 2.7060476259975085e-07, "logps/chosen": -36.63771057128906, "logps/rejected": -59.63883972167969, "loss": 0.3122, "losses/dpo": 0.38703110814094543, "losses/sft": 0.3018331229686737, "losses/total": 0.38703110814094543, "ref_logps/chosen": -24.835277557373047, "ref_logps/rejected": -29.31821632385254, "rewards/accuracies": 0.875, "rewards/chosen": -1.1802432537078857, "rewards/margins": 1.8518190383911133, "rewards/rejected": -3.032062530517578, "step": 1557 }, { "epoch": 1.47, "grad_norm": 16.179190095743483, "learning_rate": 2.703509497310054e-07, "logps/chosen": -35.956321716308594, "logps/rejected": -56.115447998046875, "loss": 0.2342, "losses/dpo": 0.19010357558727264, "losses/sft": 1.424890398979187, "losses/total": 0.19010357558727264, "ref_logps/chosen": -25.084688186645508, "ref_logps/rejected": -27.000404357910156, "rewards/accuracies": 1.0, "rewards/chosen": -1.0871634483337402, "rewards/margins": 1.8243409395217896, "rewards/rejected": -2.9115045070648193, "step": 1558 }, { "epoch": 1.47, "grad_norm": 19.714071700593877, "learning_rate": 2.700971157441207e-07, "logps/chosen": -48.68989181518555, "logps/rejected": -85.72061920166016, "loss": 0.28, "losses/dpo": 0.04300021380186081, "losses/sft": 2.313699245452881, "losses/total": 0.04300021380186081, "ref_logps/chosen": -31.649791717529297, "ref_logps/rejected": -43.73984909057617, "rewards/accuracies": 0.875, "rewards/chosen": -1.7040101289749146, "rewards/margins": 2.4940671920776367, "rewards/rejected": -4.198077201843262, "step": 1559 }, { "epoch": 1.47, "grad_norm": 14.525605823719163, "learning_rate": 2.698432609024997e-07, "logps/chosen": -76.36503601074219, "logps/rejected": -91.32911682128906, "loss": 0.1342, "losses/dpo": 0.08568628877401352, "losses/sft": 1.329795241355896, "losses/total": 0.08568628877401352, "ref_logps/chosen": -56.63087463378906, "ref_logps/rejected": -42.94050979614258, "rewards/accuracies": 1.0, "rewards/chosen": -1.9734164476394653, "rewards/margins": 2.8654441833496094, "rewards/rejected": -4.838860988616943, "step": 1560 }, { "epoch": 1.47, "grad_norm": 9.27534352366827, "learning_rate": 2.695893854695671e-07, "logps/chosen": -43.33165740966797, "logps/rejected": -84.45143127441406, "loss": 0.1477, "losses/dpo": 0.506389319896698, "losses/sft": 0.4837765693664551, "losses/total": 0.506389319896698, "ref_logps/chosen": -29.533374786376953, "ref_logps/rejected": -44.51738357543945, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3798283338546753, "rewards/margins": 2.613576650619507, "rewards/rejected": -3.9934051036834717, "step": 1561 }, { "epoch": 1.47, "grad_norm": 14.84200323248066, "learning_rate": 2.693354897087687e-07, "logps/chosen": -35.01655578613281, "logps/rejected": -67.54527282714844, "loss": 0.2186, "losses/dpo": 0.021394183859229088, "losses/sft": 1.7451027631759644, "losses/total": 0.021394183859229088, "ref_logps/chosen": -25.069290161132812, "ref_logps/rejected": -36.19506072998047, "rewards/accuracies": 0.875, "rewards/chosen": -0.9947265386581421, "rewards/margins": 2.1402945518493652, "rewards/rejected": -3.135021209716797, "step": 1562 }, { "epoch": 1.47, "grad_norm": 25.417792094126803, "learning_rate": 2.690815738835719e-07, "logps/chosen": -41.99106216430664, "logps/rejected": -60.608333587646484, "loss": 0.3033, "losses/dpo": 0.1067599430680275, "losses/sft": 2.4985876083374023, "losses/total": 0.1067599430680275, "ref_logps/chosen": -26.10916519165039, "ref_logps/rejected": -29.27239227294922, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5881894826889038, "rewards/margins": 1.5454046726226807, "rewards/rejected": -3.133594036102295, "step": 1563 }, { "epoch": 1.48, "grad_norm": 17.96493489885012, "learning_rate": 2.688276382574645e-07, "logps/chosen": -65.01177978515625, "logps/rejected": -88.07069396972656, "loss": 0.2494, "losses/dpo": 0.9093460440635681, "losses/sft": 1.68545401096344, "losses/total": 0.9093460440635681, "ref_logps/chosen": -44.82599639892578, "ref_logps/rejected": -47.28948974609375, "rewards/accuracies": 0.875, "rewards/chosen": -2.0185787677764893, "rewards/margins": 2.0595414638519287, "rewards/rejected": -4.078120231628418, "step": 1564 }, { "epoch": 1.48, "grad_norm": 22.845775179087767, "learning_rate": 2.6857368309395506e-07, "logps/chosen": -50.724403381347656, "logps/rejected": -55.98576354980469, "loss": 0.3812, "losses/dpo": 0.6642271876335144, "losses/sft": 1.6332733631134033, "losses/total": 0.6642271876335144, "ref_logps/chosen": -34.87358093261719, "ref_logps/rejected": -25.70241355895996, "rewards/accuracies": 0.875, "rewards/chosen": -1.5850822925567627, "rewards/margins": 1.4432528018951416, "rewards/rejected": -3.0283350944519043, "step": 1565 }, { "epoch": 1.48, "grad_norm": 19.001297119340748, "learning_rate": 2.683197086565722e-07, "logps/chosen": -41.266815185546875, "logps/rejected": -73.48295593261719, "loss": 0.2667, "losses/dpo": 0.18336009979248047, "losses/sft": 1.1339466571807861, "losses/total": 0.18336009979248047, "ref_logps/chosen": -25.813098907470703, "ref_logps/rejected": -35.3677978515625, "rewards/accuracies": 0.875, "rewards/chosen": -1.5453717708587646, "rewards/margins": 2.266144037246704, "rewards/rejected": -3.8115158081054688, "step": 1566 }, { "epoch": 1.48, "grad_norm": 23.584278141664047, "learning_rate": 2.6806571520886484e-07, "logps/chosen": -38.665321350097656, "logps/rejected": -60.59961700439453, "loss": 0.3952, "losses/dpo": 0.019451556727290154, "losses/sft": 1.2427524328231812, "losses/total": 0.019451556727290154, "ref_logps/chosen": -23.489784240722656, "ref_logps/rejected": -30.18180274963379, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5175540447235107, "rewards/margins": 1.5242276191711426, "rewards/rejected": -3.0417819023132324, "step": 1567 }, { "epoch": 1.48, "grad_norm": 20.02409152613525, "learning_rate": 2.678117030144014e-07, "logps/chosen": -45.837120056152344, "logps/rejected": -69.75418090820312, "loss": 0.321, "losses/dpo": 1.005019187927246, "losses/sft": 1.9148921966552734, "losses/total": 1.005019187927246, "ref_logps/chosen": -29.787294387817383, "ref_logps/rejected": -35.76964569091797, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6049822568893433, "rewards/margins": 1.793471097946167, "rewards/rejected": -3.398453712463379, "step": 1568 }, { "epoch": 1.48, "grad_norm": 20.982082034278484, "learning_rate": 2.6755767233676984e-07, "logps/chosen": -42.849365234375, "logps/rejected": -66.4957504272461, "loss": 0.3478, "losses/dpo": 0.6789383888244629, "losses/sft": 0.6388615369796753, "losses/total": 0.6789383888244629, "ref_logps/chosen": -29.829315185546875, "ref_logps/rejected": -35.437843322753906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3020050525665283, "rewards/margins": 1.8037861585617065, "rewards/rejected": -3.1057913303375244, "step": 1569 }, { "epoch": 1.48, "grad_norm": 17.44811750799858, "learning_rate": 2.6730362343957726e-07, "logps/chosen": -52.84885787963867, "logps/rejected": -73.8907470703125, "loss": 0.2015, "losses/dpo": 0.08575388044118881, "losses/sft": 1.284766435623169, "losses/total": 0.08575388044118881, "ref_logps/chosen": -36.56277847290039, "ref_logps/rejected": -34.42839813232422, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6286077499389648, "rewards/margins": 2.317627191543579, "rewards/rejected": -3.946234941482544, "step": 1570 }, { "epoch": 1.48, "grad_norm": 23.978694984901257, "learning_rate": 2.6704955658644964e-07, "logps/chosen": -39.38630676269531, "logps/rejected": -53.821563720703125, "loss": 0.3822, "losses/dpo": 0.26816150546073914, "losses/sft": 1.5100555419921875, "losses/total": 0.26816150546073914, "ref_logps/chosen": -28.75908660888672, "ref_logps/rejected": -28.948837280273438, "rewards/accuracies": 0.875, "rewards/chosen": -1.0627217292785645, "rewards/margins": 1.4245505332946777, "rewards/rejected": -2.487272262573242, "step": 1571 }, { "epoch": 1.48, "grad_norm": 16.50303099970524, "learning_rate": 2.667954720410317e-07, "logps/chosen": -41.68964385986328, "logps/rejected": -59.42198181152344, "loss": 0.2649, "losses/dpo": 0.018192289397120476, "losses/sft": 0.8081929087638855, "losses/total": 0.018192289397120476, "ref_logps/chosen": -28.992923736572266, "ref_logps/rejected": -28.71475601196289, "rewards/accuracies": 1.0, "rewards/chosen": -1.26967191696167, "rewards/margins": 1.8010504245758057, "rewards/rejected": -3.0707223415374756, "step": 1572 }, { "epoch": 1.48, "grad_norm": 16.907128092234068, "learning_rate": 2.6654137006698633e-07, "logps/chosen": -45.19456481933594, "logps/rejected": -77.94294738769531, "loss": 0.1984, "losses/dpo": 0.16976429522037506, "losses/sft": 0.9936632513999939, "losses/total": 0.16976429522037506, "ref_logps/chosen": -27.16962432861328, "ref_logps/rejected": -38.901580810546875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8024942874908447, "rewards/margins": 2.101642608642578, "rewards/rejected": -3.9041366577148438, "step": 1573 }, { "epoch": 1.48, "grad_norm": 15.101167392162283, "learning_rate": 2.6628725092799486e-07, "logps/chosen": -40.39970397949219, "logps/rejected": -62.49283218383789, "loss": 0.3035, "losses/dpo": 0.9372571706771851, "losses/sft": 0.6482148766517639, "losses/total": 0.9372571706771851, "ref_logps/chosen": -24.24648666381836, "ref_logps/rejected": -27.283504486083984, "rewards/accuracies": 0.875, "rewards/chosen": -1.615322232246399, "rewards/margins": 1.9056105613708496, "rewards/rejected": -3.520932674407959, "step": 1574 }, { "epoch": 1.49, "grad_norm": 26.802552472921107, "learning_rate": 2.660331148877561e-07, "logps/chosen": -37.427703857421875, "logps/rejected": -51.03084945678711, "loss": 0.5399, "losses/dpo": 1.1152417659759521, "losses/sft": 0.693807065486908, "losses/total": 1.1152417659759521, "ref_logps/chosen": -21.923189163208008, "ref_logps/rejected": -24.57575225830078, "rewards/accuracies": 0.625, "rewards/chosen": -1.5504517555236816, "rewards/margins": 1.0950579643249512, "rewards/rejected": -2.645509958267212, "step": 1575 }, { "epoch": 1.49, "grad_norm": 14.863711585472576, "learning_rate": 2.6577896220998654e-07, "logps/chosen": -37.88897705078125, "logps/rejected": -65.79258728027344, "loss": 0.2175, "losses/dpo": 0.22653675079345703, "losses/sft": 1.328675389289856, "losses/total": 0.22653675079345703, "ref_logps/chosen": -25.029956817626953, "ref_logps/rejected": -30.6778564453125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2859022617340088, "rewards/margins": 2.2255706787109375, "rewards/rejected": -3.5114731788635254, "step": 1576 }, { "epoch": 1.49, "grad_norm": 12.048869535240241, "learning_rate": 2.6552479315841976e-07, "logps/chosen": -53.187049865722656, "logps/rejected": -82.003662109375, "loss": 0.1435, "losses/dpo": 0.20075976848602295, "losses/sft": 2.335404872894287, "losses/total": 0.20075976848602295, "ref_logps/chosen": -34.216243743896484, "ref_logps/rejected": -37.98564147949219, "rewards/accuracies": 1.0, "rewards/chosen": -1.897080659866333, "rewards/margins": 2.5047216415405273, "rewards/rejected": -4.401802062988281, "step": 1577 }, { "epoch": 1.49, "grad_norm": 21.970712327447835, "learning_rate": 2.652706079968066e-07, "logps/chosen": -41.413116455078125, "logps/rejected": -63.81810760498047, "loss": 0.3779, "losses/dpo": 0.2029433399438858, "losses/sft": 0.6946732401847839, "losses/total": 0.2029433399438858, "ref_logps/chosen": -22.981691360473633, "ref_logps/rejected": -27.819873809814453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8431427478790283, "rewards/margins": 1.7566803693771362, "rewards/rejected": -3.599823236465454, "step": 1578 }, { "epoch": 1.49, "grad_norm": 26.11886180181706, "learning_rate": 2.650164069889144e-07, "logps/chosen": -45.330352783203125, "logps/rejected": -68.95980834960938, "loss": 0.4316, "losses/dpo": 0.5370529890060425, "losses/sft": 2.4692633152008057, "losses/total": 0.5370529890060425, "ref_logps/chosen": -25.633052825927734, "ref_logps/rejected": -35.462677001953125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.969730019569397, "rewards/margins": 1.3799834251403809, "rewards/rejected": -3.3497133255004883, "step": 1579 }, { "epoch": 1.49, "grad_norm": 17.248027793930405, "learning_rate": 2.6476219039852715e-07, "logps/chosen": -48.25762176513672, "logps/rejected": -65.9638442993164, "loss": 0.2692, "losses/dpo": 0.20074206590652466, "losses/sft": 1.0203324556350708, "losses/total": 0.20074206590652466, "ref_logps/chosen": -32.247764587402344, "ref_logps/rejected": -32.290958404541016, "rewards/accuracies": 0.875, "rewards/chosen": -1.6009862422943115, "rewards/margins": 1.766302227973938, "rewards/rejected": -3.36728835105896, "step": 1580 }, { "epoch": 1.49, "grad_norm": 17.089079969278952, "learning_rate": 2.6450795848944474e-07, "logps/chosen": -51.7088508605957, "logps/rejected": -73.9566421508789, "loss": 0.2482, "losses/dpo": 0.06429414451122284, "losses/sft": 2.1181788444519043, "losses/total": 0.06429414451122284, "ref_logps/chosen": -33.42747497558594, "ref_logps/rejected": -36.944454193115234, "rewards/accuracies": 0.875, "rewards/chosen": -1.8281381130218506, "rewards/margins": 1.8730807304382324, "rewards/rejected": -3.701218843460083, "step": 1581 }, { "epoch": 1.49, "grad_norm": 15.360351922408631, "learning_rate": 2.6425371152548327e-07, "logps/chosen": -41.51422119140625, "logps/rejected": -66.02384185791016, "loss": 0.2808, "losses/dpo": 0.8977885246276855, "losses/sft": 0.7995364665985107, "losses/total": 0.8977885246276855, "ref_logps/chosen": -25.89014434814453, "ref_logps/rejected": -30.147613525390625, "rewards/accuracies": 0.875, "rewards/chosen": -1.5624074935913086, "rewards/margins": 2.0252151489257812, "rewards/rejected": -3.58762264251709, "step": 1582 }, { "epoch": 1.49, "grad_norm": 18.88711210808915, "learning_rate": 2.6399944977047424e-07, "logps/chosen": -41.04403305053711, "logps/rejected": -56.9444580078125, "loss": 0.3196, "losses/dpo": 0.17879390716552734, "losses/sft": 1.561668872833252, "losses/total": 0.17879390716552734, "ref_logps/chosen": -25.514110565185547, "ref_logps/rejected": -25.67087173461914, "rewards/accuracies": 0.9375, "rewards/chosen": -1.552992343902588, "rewards/margins": 1.5743666887283325, "rewards/rejected": -3.12735915184021, "step": 1583 }, { "epoch": 1.49, "grad_norm": 14.773159583995223, "learning_rate": 2.637451734882645e-07, "logps/chosen": -44.196746826171875, "logps/rejected": -74.59762573242188, "loss": 0.162, "losses/dpo": 0.1610415279865265, "losses/sft": 0.7867486476898193, "losses/total": 0.1610415279865265, "ref_logps/chosen": -28.88631820678711, "ref_logps/rejected": -35.574981689453125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5310429334640503, "rewards/margins": 2.3712215423583984, "rewards/rejected": -3.9022645950317383, "step": 1584 }, { "epoch": 1.5, "grad_norm": 19.97257991481444, "learning_rate": 2.63490882942716e-07, "logps/chosen": -41.33078384399414, "logps/rejected": -69.52881622314453, "loss": 0.3247, "losses/dpo": 0.2955249845981598, "losses/sft": 1.7636263370513916, "losses/total": 0.2955249845981598, "ref_logps/chosen": -27.28889274597168, "ref_logps/rejected": -33.87715148925781, "rewards/accuracies": 0.875, "rewards/chosen": -1.4041893482208252, "rewards/margins": 2.1609771251678467, "rewards/rejected": -3.565166473388672, "step": 1585 }, { "epoch": 1.5, "grad_norm": 18.987489630251364, "learning_rate": 2.632365783977057e-07, "logps/chosen": -55.50893783569336, "logps/rejected": -75.03429412841797, "loss": 0.2568, "losses/dpo": 0.003404749557375908, "losses/sft": 1.1838951110839844, "losses/total": 0.003404749557375908, "ref_logps/chosen": -35.21678161621094, "ref_logps/rejected": -33.38523864746094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0292158126831055, "rewards/margins": 2.1356897354125977, "rewards/rejected": -4.164905548095703, "step": 1586 }, { "epoch": 1.5, "grad_norm": 22.47626141671399, "learning_rate": 2.629822601171248e-07, "logps/chosen": -40.621971130371094, "logps/rejected": -64.77650451660156, "loss": 0.4215, "losses/dpo": 0.15444394946098328, "losses/sft": 0.7825025320053101, "losses/total": 0.15444394946098328, "ref_logps/chosen": -21.31365203857422, "ref_logps/rejected": -28.442222595214844, "rewards/accuracies": 0.875, "rewards/chosen": -1.9308321475982666, "rewards/margins": 1.7025964260101318, "rewards/rejected": -3.6334285736083984, "step": 1587 }, { "epoch": 1.5, "grad_norm": 21.44728308708794, "learning_rate": 2.627279283648788e-07, "logps/chosen": -46.24116134643555, "logps/rejected": -84.57968139648438, "loss": 0.3526, "losses/dpo": 0.10662383586168289, "losses/sft": 0.8619363903999329, "losses/total": 0.10662383586168289, "ref_logps/chosen": -29.948381423950195, "ref_logps/rejected": -46.286983489990234, "rewards/accuracies": 0.9375, "rewards/chosen": -1.629278302192688, "rewards/margins": 2.199991464614868, "rewards/rejected": -3.8292698860168457, "step": 1588 }, { "epoch": 1.5, "grad_norm": 20.89236916148667, "learning_rate": 2.6247358340488733e-07, "logps/chosen": -43.72514724731445, "logps/rejected": -68.51033020019531, "loss": 0.3084, "losses/dpo": 0.4610956311225891, "losses/sft": 0.9609649777412415, "losses/total": 0.4610956311225891, "ref_logps/chosen": -29.59490394592285, "ref_logps/rejected": -34.5286979675293, "rewards/accuracies": 0.875, "rewards/chosen": -1.4130244255065918, "rewards/margins": 1.985139012336731, "rewards/rejected": -3.3981637954711914, "step": 1589 }, { "epoch": 1.5, "grad_norm": 20.0303890503382, "learning_rate": 2.622192255010836e-07, "logps/chosen": -47.65338134765625, "logps/rejected": -69.31295776367188, "loss": 0.316, "losses/dpo": 0.7498297095298767, "losses/sft": 1.0049593448638916, "losses/total": 0.7498297095298767, "ref_logps/chosen": -29.65135955810547, "ref_logps/rejected": -33.26111602783203, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8002021312713623, "rewards/margins": 1.8049817085266113, "rewards/rejected": -3.6051838397979736, "step": 1590 }, { "epoch": 1.5, "grad_norm": 16.901285410157357, "learning_rate": 2.619648549174144e-07, "logps/chosen": -38.482242584228516, "logps/rejected": -62.098846435546875, "loss": 0.3299, "losses/dpo": 0.6195681691169739, "losses/sft": 0.39196908473968506, "losses/total": 0.6195681691169739, "ref_logps/chosen": -24.871623992919922, "ref_logps/rejected": -30.753299713134766, "rewards/accuracies": 0.875, "rewards/chosen": -1.3610618114471436, "rewards/margins": 1.7734928131103516, "rewards/rejected": -3.134554862976074, "step": 1591 }, { "epoch": 1.5, "grad_norm": 22.240438233379702, "learning_rate": 2.6171047191783944e-07, "logps/chosen": -34.379425048828125, "logps/rejected": -60.96382141113281, "loss": 0.3526, "losses/dpo": 0.3068383038043976, "losses/sft": 1.254927635192871, "losses/total": 0.3068383038043976, "ref_logps/chosen": -19.488555908203125, "ref_logps/rejected": -28.599218368530273, "rewards/accuracies": 0.8125, "rewards/chosen": -1.489086627960205, "rewards/margins": 1.7473737001419067, "rewards/rejected": -3.2364602088928223, "step": 1592 }, { "epoch": 1.5, "grad_norm": 26.52318224104616, "learning_rate": 2.6145607676633153e-07, "logps/chosen": -43.47270965576172, "logps/rejected": -57.617881774902344, "loss": 0.3816, "losses/dpo": 0.0830446183681488, "losses/sft": 1.145216464996338, "losses/total": 0.0830446183681488, "ref_logps/chosen": -27.640169143676758, "ref_logps/rejected": -26.910493850708008, "rewards/accuracies": 0.8125, "rewards/chosen": -1.583254337310791, "rewards/margins": 1.4874849319458008, "rewards/rejected": -3.070739269256592, "step": 1593 }, { "epoch": 1.5, "grad_norm": 27.56698469854055, "learning_rate": 2.612016697268759e-07, "logps/chosen": -40.3408088684082, "logps/rejected": -63.19961166381836, "loss": 0.4246, "losses/dpo": 0.0480157770216465, "losses/sft": 0.6151535511016846, "losses/total": 0.0480157770216465, "ref_logps/chosen": -24.687744140625, "ref_logps/rejected": -31.324987411499023, "rewards/accuracies": 0.75, "rewards/chosen": -1.5653064250946045, "rewards/margins": 1.622156023979187, "rewards/rejected": -3.187462329864502, "step": 1594 }, { "epoch": 1.5, "grad_norm": 20.15014522478157, "learning_rate": 2.609472510634703e-07, "logps/chosen": -42.91722869873047, "logps/rejected": -64.75651550292969, "loss": 0.3345, "losses/dpo": 0.10238239169120789, "losses/sft": 1.3477811813354492, "losses/total": 0.10238239169120789, "ref_logps/chosen": -26.019433975219727, "ref_logps/rejected": -31.94081687927246, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6897797584533691, "rewards/margins": 1.5917903184890747, "rewards/rejected": -3.281569719314575, "step": 1595 }, { "epoch": 1.51, "grad_norm": 23.539894601637158, "learning_rate": 2.606928210401245e-07, "logps/chosen": -43.95249557495117, "logps/rejected": -62.69418716430664, "loss": 0.4376, "losses/dpo": 2.444797992706299, "losses/sft": 2.551663637161255, "losses/total": 2.444797992706299, "ref_logps/chosen": -27.5424861907959, "ref_logps/rejected": -31.118751525878906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.641000747680664, "rewards/margins": 1.516542911529541, "rewards/rejected": -3.157543897628784, "step": 1596 }, { "epoch": 1.51, "grad_norm": 16.5661512234557, "learning_rate": 2.604383799208599e-07, "logps/chosen": -36.64223098754883, "logps/rejected": -64.04246520996094, "loss": 0.2112, "losses/dpo": 0.7700505256652832, "losses/sft": 0.8985521793365479, "losses/total": 0.7700505256652832, "ref_logps/chosen": -22.230369567871094, "ref_logps/rejected": -29.729473114013672, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4411861896514893, "rewards/margins": 1.9901131391525269, "rewards/rejected": -3.4312992095947266, "step": 1597 }, { "epoch": 1.51, "grad_norm": 18.16434393444345, "learning_rate": 2.6018392796970957e-07, "logps/chosen": -47.16395568847656, "logps/rejected": -89.41499328613281, "loss": 0.1706, "losses/dpo": 0.34175509214401245, "losses/sft": 1.5332673788070679, "losses/total": 0.34175509214401245, "ref_logps/chosen": -30.906261444091797, "ref_logps/rejected": -47.259056091308594, "rewards/accuracies": 1.0, "rewards/chosen": -1.6257696151733398, "rewards/margins": 2.589824676513672, "rewards/rejected": -4.215594291687012, "step": 1598 }, { "epoch": 1.51, "grad_norm": 15.592488688745227, "learning_rate": 2.5992946545071787e-07, "logps/chosen": -59.74093246459961, "logps/rejected": -84.63883209228516, "loss": 0.2156, "losses/dpo": 0.010624743066728115, "losses/sft": 0.8569409847259521, "losses/total": 0.010624743066728115, "ref_logps/chosen": -38.93655014038086, "ref_logps/rejected": -42.6060905456543, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0804386138916016, "rewards/margins": 2.122835636138916, "rewards/rejected": -4.203274250030518, "step": 1599 }, { "epoch": 1.51, "grad_norm": 15.059262666688246, "learning_rate": 2.5967499262794e-07, "logps/chosen": -44.28340148925781, "logps/rejected": -77.75691986083984, "loss": 0.2075, "losses/dpo": 0.08446957170963287, "losses/sft": 1.0516406297683716, "losses/total": 0.08446957170963287, "ref_logps/chosen": -26.803482055664062, "ref_logps/rejected": -36.44837951660156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7479920387268066, "rewards/margins": 2.382861614227295, "rewards/rejected": -4.130853652954102, "step": 1600 }, { "epoch": 1.51, "grad_norm": 19.89795087533425, "learning_rate": 2.5942050976544183e-07, "logps/chosen": -47.153995513916016, "logps/rejected": -67.58198547363281, "loss": 0.2642, "losses/dpo": 0.0618208572268486, "losses/sft": 0.6810871362686157, "losses/total": 0.0618208572268486, "ref_logps/chosen": -30.731739044189453, "ref_logps/rejected": -32.84632873535156, "rewards/accuracies": 0.875, "rewards/chosen": -1.642225980758667, "rewards/margins": 1.8313398361206055, "rewards/rejected": -3.4735658168792725, "step": 1601 }, { "epoch": 1.51, "grad_norm": 10.82166691771124, "learning_rate": 2.5916601712729975e-07, "logps/chosen": -40.000823974609375, "logps/rejected": -73.0838623046875, "loss": 0.1794, "losses/dpo": 0.014058173634111881, "losses/sft": 1.634528636932373, "losses/total": 0.014058173634111881, "ref_logps/chosen": -25.343116760253906, "ref_logps/rejected": -34.46684265136719, "rewards/accuracies": 1.0, "rewards/chosen": -1.465770959854126, "rewards/margins": 2.395930767059326, "rewards/rejected": -3.8617019653320312, "step": 1602 }, { "epoch": 1.51, "grad_norm": 16.990549896915656, "learning_rate": 2.589115149776003e-07, "logps/chosen": -42.00717544555664, "logps/rejected": -76.84578704833984, "loss": 0.2553, "losses/dpo": 0.0773918405175209, "losses/sft": 1.3400797843933105, "losses/total": 0.0773918405175209, "ref_logps/chosen": -24.132308959960938, "ref_logps/rejected": -36.72596740722656, "rewards/accuracies": 0.875, "rewards/chosen": -1.7874867916107178, "rewards/margins": 2.2244949340820312, "rewards/rejected": -4.01198148727417, "step": 1603 }, { "epoch": 1.51, "grad_norm": 16.375210276198995, "learning_rate": 2.586570035804398e-07, "logps/chosen": -52.67914962768555, "logps/rejected": -84.81504821777344, "loss": 0.1924, "losses/dpo": 0.03920631855726242, "losses/sft": 1.341700792312622, "losses/total": 0.03920631855726242, "ref_logps/chosen": -34.97237014770508, "ref_logps/rejected": -43.854034423828125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7706780433654785, "rewards/margins": 2.325422525405884, "rewards/rejected": -4.096100807189941, "step": 1604 }, { "epoch": 1.51, "grad_norm": 12.253763261723766, "learning_rate": 2.5840248319992417e-07, "logps/chosen": -31.446659088134766, "logps/rejected": -77.84005737304688, "loss": 0.1491, "losses/dpo": 0.03984105959534645, "losses/sft": 1.2879674434661865, "losses/total": 0.03984105959534645, "ref_logps/chosen": -14.9603271484375, "ref_logps/rejected": -33.919342041015625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6486332416534424, "rewards/margins": 2.743438482284546, "rewards/rejected": -4.392071723937988, "step": 1605 }, { "epoch": 1.52, "grad_norm": 12.737083942593209, "learning_rate": 2.581479541001688e-07, "logps/chosen": -45.48522186279297, "logps/rejected": -69.50028991699219, "loss": 0.1795, "losses/dpo": 0.08472014218568802, "losses/sft": 0.7797074317932129, "losses/total": 0.08472014218568802, "ref_logps/chosen": -29.546810150146484, "ref_logps/rejected": -34.55805206298828, "rewards/accuracies": 1.0, "rewards/chosen": -1.5938410758972168, "rewards/margins": 1.9003829956054688, "rewards/rejected": -3.4942240715026855, "step": 1606 }, { "epoch": 1.52, "grad_norm": 15.197826600645671, "learning_rate": 2.5789341654529794e-07, "logps/chosen": -41.95906448364258, "logps/rejected": -72.42234802246094, "loss": 0.1845, "losses/dpo": 0.19094327092170715, "losses/sft": 2.1642091274261475, "losses/total": 0.19094327092170715, "ref_logps/chosen": -28.169235229492188, "ref_logps/rejected": -35.83125305175781, "rewards/accuracies": 1.0, "rewards/chosen": -1.3789830207824707, "rewards/margins": 2.2801260948181152, "rewards/rejected": -3.659109115600586, "step": 1607 }, { "epoch": 1.52, "grad_norm": 31.911551328129043, "learning_rate": 2.576388707994447e-07, "logps/chosen": -45.81748962402344, "logps/rejected": -61.33546447753906, "loss": 0.6126, "losses/dpo": 0.03657618165016174, "losses/sft": 2.0519962310791016, "losses/total": 0.03657618165016174, "ref_logps/chosen": -24.531490325927734, "ref_logps/rejected": -27.781681060791016, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1286001205444336, "rewards/margins": 1.226778268814087, "rewards/rejected": -3.3553781509399414, "step": 1608 }, { "epoch": 1.52, "grad_norm": 16.095682654880378, "learning_rate": 2.5738431712675073e-07, "logps/chosen": -43.15835952758789, "logps/rejected": -71.51252746582031, "loss": 0.227, "losses/dpo": 0.014860324561595917, "losses/sft": 1.0250248908996582, "losses/total": 0.014860324561595917, "ref_logps/chosen": -27.884918212890625, "ref_logps/rejected": -35.41294479370117, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5273442268371582, "rewards/margins": 2.082613706588745, "rewards/rejected": -3.6099581718444824, "step": 1609 }, { "epoch": 1.52, "grad_norm": 12.808091974292921, "learning_rate": 2.571297557913659e-07, "logps/chosen": -40.415626525878906, "logps/rejected": -63.584861755371094, "loss": 0.1882, "losses/dpo": 0.15880560874938965, "losses/sft": 0.779414713382721, "losses/total": 0.15880560874938965, "ref_logps/chosen": -27.802852630615234, "ref_logps/rejected": -28.837810516357422, "rewards/accuracies": 1.0, "rewards/chosen": -1.261277437210083, "rewards/margins": 2.213428020477295, "rewards/rejected": -3.474705219268799, "step": 1610 }, { "epoch": 1.52, "grad_norm": 24.659500203276956, "learning_rate": 2.568751870574478e-07, "logps/chosen": -45.872291564941406, "logps/rejected": -63.48705291748047, "loss": 0.374, "losses/dpo": 1.440657377243042, "losses/sft": 2.6784629821777344, "losses/total": 1.440657377243042, "ref_logps/chosen": -25.880107879638672, "ref_logps/rejected": -28.308246612548828, "rewards/accuracies": 0.75, "rewards/chosen": -1.9992185831069946, "rewards/margins": 1.5186618566513062, "rewards/rejected": -3.517880439758301, "step": 1611 }, { "epoch": 1.52, "grad_norm": 21.660726413686504, "learning_rate": 2.5662061118916207e-07, "logps/chosen": -47.00584411621094, "logps/rejected": -70.78782653808594, "loss": 0.4046, "losses/dpo": 0.01784604787826538, "losses/sft": 0.44079169631004333, "losses/total": 0.01784604787826538, "ref_logps/chosen": -26.68485450744629, "ref_logps/rejected": -33.18986511230469, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0320990085601807, "rewards/margins": 1.7276972532272339, "rewards/rejected": -3.759796142578125, "step": 1612 }, { "epoch": 1.52, "grad_norm": 32.16453695347403, "learning_rate": 2.5636602845068156e-07, "logps/chosen": -52.609046936035156, "logps/rejected": -64.38192749023438, "loss": 0.6065, "losses/dpo": 0.07311403751373291, "losses/sft": 2.338721752166748, "losses/total": 0.07311403751373291, "ref_logps/chosen": -27.5692081451416, "ref_logps/rejected": -32.17863845825195, "rewards/accuracies": 0.6875, "rewards/chosen": -2.503983974456787, "rewards/margins": 0.7163453698158264, "rewards/rejected": -3.2203292846679688, "step": 1613 }, { "epoch": 1.52, "grad_norm": 17.181211315934025, "learning_rate": 2.561114391061861e-07, "logps/chosen": -41.07716751098633, "logps/rejected": -69.98804473876953, "loss": 0.2601, "losses/dpo": 0.2395356446504593, "losses/sft": 0.5962024331092834, "losses/total": 0.2395356446504593, "ref_logps/chosen": -26.406936645507812, "ref_logps/rejected": -33.20668029785156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4670228958129883, "rewards/margins": 2.211113452911377, "rewards/rejected": -3.6781363487243652, "step": 1614 }, { "epoch": 1.52, "grad_norm": 19.26918924752795, "learning_rate": 2.5585684341986274e-07, "logps/chosen": -37.05244827270508, "logps/rejected": -62.564483642578125, "loss": 0.2175, "losses/dpo": 0.04492151364684105, "losses/sft": 1.248550534248352, "losses/total": 0.04492151364684105, "ref_logps/chosen": -24.928539276123047, "ref_logps/rejected": -28.753677368164062, "rewards/accuracies": 0.875, "rewards/chosen": -1.2123911380767822, "rewards/margins": 2.168689250946045, "rewards/rejected": -3.3810806274414062, "step": 1615 }, { "epoch": 1.52, "grad_norm": 20.250796748902278, "learning_rate": 2.5560224165590485e-07, "logps/chosen": -40.07423400878906, "logps/rejected": -64.77552795410156, "loss": 0.2842, "losses/dpo": 0.01756967604160309, "losses/sft": 0.7942717671394348, "losses/total": 0.01756967604160309, "ref_logps/chosen": -25.593154907226562, "ref_logps/rejected": -32.0208740234375, "rewards/accuracies": 0.875, "rewards/chosen": -1.4481079578399658, "rewards/margins": 1.8273568153381348, "rewards/rejected": -3.2754650115966797, "step": 1616 }, { "epoch": 1.53, "grad_norm": 32.198984308871395, "learning_rate": 2.553476340785121e-07, "logps/chosen": -53.36528015136719, "logps/rejected": -82.64787292480469, "loss": 0.4285, "losses/dpo": 0.057173412293195724, "losses/sft": 2.1919682025909424, "losses/total": 0.057173412293195724, "ref_logps/chosen": -33.849021911621094, "ref_logps/rejected": -37.62543487548828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9516260623931885, "rewards/margins": 2.5506186485290527, "rewards/rejected": -4.50224494934082, "step": 1617 }, { "epoch": 1.53, "grad_norm": 24.037329600988215, "learning_rate": 2.5509302095189027e-07, "logps/chosen": -52.57476806640625, "logps/rejected": -63.8599739074707, "loss": 0.2868, "losses/dpo": 0.03461316600441933, "losses/sft": 1.4170067310333252, "losses/total": 0.03461316600441933, "ref_logps/chosen": -38.116783142089844, "ref_logps/rejected": -28.521873474121094, "rewards/accuracies": 0.875, "rewards/chosen": -1.445798397064209, "rewards/margins": 2.0880112648010254, "rewards/rejected": -3.5338094234466553, "step": 1618 }, { "epoch": 1.53, "grad_norm": 15.079094797123032, "learning_rate": 2.5483840254025095e-07, "logps/chosen": -40.38468551635742, "logps/rejected": -73.5125732421875, "loss": 0.2222, "losses/dpo": 0.06766606867313385, "losses/sft": 0.0885307714343071, "losses/total": 0.06766606867313385, "ref_logps/chosen": -26.792936325073242, "ref_logps/rejected": -37.85224914550781, "rewards/accuracies": 1.0, "rewards/chosen": -1.3591749668121338, "rewards/margins": 2.206857681274414, "rewards/rejected": -3.566032648086548, "step": 1619 }, { "epoch": 1.53, "grad_norm": 18.65643334822428, "learning_rate": 2.5458377910781114e-07, "logps/chosen": -39.17176818847656, "logps/rejected": -70.18649291992188, "loss": 0.2667, "losses/dpo": 0.2764016389846802, "losses/sft": 1.6462887525558472, "losses/total": 0.2764016389846802, "ref_logps/chosen": -24.985111236572266, "ref_logps/rejected": -34.73835754394531, "rewards/accuracies": 0.9375, "rewards/chosen": -1.418665885925293, "rewards/margins": 2.126147508621216, "rewards/rejected": -3.5448131561279297, "step": 1620 }, { "epoch": 1.53, "grad_norm": 19.46530288463303, "learning_rate": 2.54329150918793e-07, "logps/chosen": -46.81804656982422, "logps/rejected": -56.55433654785156, "loss": 0.3688, "losses/dpo": 0.028668329119682312, "losses/sft": 0.5436074137687683, "losses/total": 0.028668329119682312, "ref_logps/chosen": -29.994359970092773, "ref_logps/rejected": -26.376506805419922, "rewards/accuracies": 0.8125, "rewards/chosen": -1.682368516921997, "rewards/margins": 1.3354144096374512, "rewards/rejected": -3.0177829265594482, "step": 1621 }, { "epoch": 1.53, "grad_norm": 23.507929612448784, "learning_rate": 2.540745182374238e-07, "logps/chosen": -52.36756134033203, "logps/rejected": -76.7259292602539, "loss": 0.2792, "losses/dpo": 0.47872042655944824, "losses/sft": 1.8050898313522339, "losses/total": 0.47872042655944824, "ref_logps/chosen": -34.92975616455078, "ref_logps/rejected": -39.44233703613281, "rewards/accuracies": 0.875, "rewards/chosen": -1.7437803745269775, "rewards/margins": 1.9845784902572632, "rewards/rejected": -3.728358745574951, "step": 1622 }, { "epoch": 1.53, "grad_norm": 20.86916283673548, "learning_rate": 2.5381988132793513e-07, "logps/chosen": -57.716453552246094, "logps/rejected": -82.36442565917969, "loss": 0.2355, "losses/dpo": 0.002182162832468748, "losses/sft": 0.43940261006355286, "losses/total": 0.002182162832468748, "ref_logps/chosen": -42.852821350097656, "ref_logps/rejected": -42.70977783203125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.486363172531128, "rewards/margins": 2.47910213470459, "rewards/rejected": -3.9654653072357178, "step": 1623 }, { "epoch": 1.53, "grad_norm": 23.432153795517358, "learning_rate": 2.5356524045456347e-07, "logps/chosen": -36.68240737915039, "logps/rejected": -66.83920288085938, "loss": 0.3754, "losses/dpo": 0.07154897600412369, "losses/sft": 0.8949294090270996, "losses/total": 0.07154897600412369, "ref_logps/chosen": -23.85591697692871, "ref_logps/rejected": -34.313941955566406, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2826488018035889, "rewards/margins": 1.969877004623413, "rewards/rejected": -3.252525806427002, "step": 1624 }, { "epoch": 1.53, "grad_norm": 19.90831412271287, "learning_rate": 2.53310595881549e-07, "logps/chosen": -52.38392639160156, "logps/rejected": -69.86347961425781, "loss": 0.2197, "losses/dpo": 0.03911465406417847, "losses/sft": 2.361893653869629, "losses/total": 0.03911465406417847, "ref_logps/chosen": -36.01429748535156, "ref_logps/rejected": -35.06315612792969, "rewards/accuracies": 0.875, "rewards/chosen": -1.6369630098342896, "rewards/margins": 1.8430684804916382, "rewards/rejected": -3.4800314903259277, "step": 1625 }, { "epoch": 1.53, "grad_norm": 11.380090906218332, "learning_rate": 2.530559478731358e-07, "logps/chosen": -38.66001892089844, "logps/rejected": -71.62647247314453, "loss": 0.1443, "losses/dpo": 0.11572781950235367, "losses/sft": 1.169177770614624, "losses/total": 0.11572781950235367, "ref_logps/chosen": -24.038066864013672, "ref_logps/rejected": -31.262779235839844, "rewards/accuracies": 1.0, "rewards/chosen": -1.4621949195861816, "rewards/margins": 2.574174404144287, "rewards/rejected": -4.036369323730469, "step": 1626 }, { "epoch": 1.53, "grad_norm": 20.181230953827853, "learning_rate": 2.528012966935717e-07, "logps/chosen": -44.828887939453125, "logps/rejected": -75.15731811523438, "loss": 0.2989, "losses/dpo": 0.1599864661693573, "losses/sft": 0.47559455037117004, "losses/total": 0.1599864661693573, "ref_logps/chosen": -26.697925567626953, "ref_logps/rejected": -39.10159683227539, "rewards/accuracies": 0.875, "rewards/chosen": -1.8130958080291748, "rewards/margins": 1.7924771308898926, "rewards/rejected": -3.6055731773376465, "step": 1627 }, { "epoch": 1.54, "grad_norm": 20.617696405004356, "learning_rate": 2.525466426071077e-07, "logps/chosen": -40.99879455566406, "logps/rejected": -55.930076599121094, "loss": 0.3241, "losses/dpo": 0.9020158052444458, "losses/sft": 1.4882838726043701, "losses/total": 0.9020158052444458, "ref_logps/chosen": -26.21251678466797, "ref_logps/rejected": -26.095985412597656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4786278009414673, "rewards/margins": 1.5047814846038818, "rewards/rejected": -2.9834094047546387, "step": 1628 }, { "epoch": 1.54, "grad_norm": 23.757455498979574, "learning_rate": 2.5229198587799787e-07, "logps/chosen": -43.09331512451172, "logps/rejected": -69.52620697021484, "loss": 0.3129, "losses/dpo": 0.20814894139766693, "losses/sft": 0.17454871535301208, "losses/total": 0.20814894139766693, "ref_logps/chosen": -29.476036071777344, "ref_logps/rejected": -38.14772415161133, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3617277145385742, "rewards/margins": 1.7761204242706299, "rewards/rejected": -3.137848377227783, "step": 1629 }, { "epoch": 1.54, "grad_norm": 16.08627352092056, "learning_rate": 2.5203732677049893e-07, "logps/chosen": -41.84186553955078, "logps/rejected": -66.6622314453125, "loss": 0.2721, "losses/dpo": 0.14078356325626373, "losses/sft": 0.8515151739120483, "losses/total": 0.14078356325626373, "ref_logps/chosen": -23.158828735351562, "ref_logps/rejected": -30.31624984741211, "rewards/accuracies": 0.9375, "rewards/chosen": -1.868303656578064, "rewards/margins": 1.7662949562072754, "rewards/rejected": -3.634598731994629, "step": 1630 }, { "epoch": 1.54, "grad_norm": 22.455618020137184, "learning_rate": 2.517826655488701e-07, "logps/chosen": -49.49104690551758, "logps/rejected": -68.92185974121094, "loss": 0.3878, "losses/dpo": 0.1175113245844841, "losses/sft": 1.0011802911758423, "losses/total": 0.1175113245844841, "ref_logps/chosen": -31.042999267578125, "ref_logps/rejected": -34.8188591003418, "rewards/accuracies": 0.875, "rewards/chosen": -1.8448046445846558, "rewards/margins": 1.5654951333999634, "rewards/rejected": -3.41029953956604, "step": 1631 }, { "epoch": 1.54, "grad_norm": 14.758599550001486, "learning_rate": 2.515280024773729e-07, "logps/chosen": -53.05231475830078, "logps/rejected": -81.21009063720703, "loss": 0.2228, "losses/dpo": 0.7262633442878723, "losses/sft": 0.6761093139648438, "losses/total": 0.7262633442878723, "ref_logps/chosen": -34.278411865234375, "ref_logps/rejected": -43.84061813354492, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8773901462554932, "rewards/margins": 1.8595569133758545, "rewards/rejected": -3.7369470596313477, "step": 1632 }, { "epoch": 1.54, "grad_norm": 18.583219669254362, "learning_rate": 2.512733378202706e-07, "logps/chosen": -36.88666534423828, "logps/rejected": -57.62715148925781, "loss": 0.3277, "losses/dpo": 0.11730064451694489, "losses/sft": 1.0247364044189453, "losses/total": 0.11730064451694489, "ref_logps/chosen": -22.071212768554688, "ref_logps/rejected": -28.783100128173828, "rewards/accuracies": 0.875, "rewards/chosen": -1.4815454483032227, "rewards/margins": 1.4028594493865967, "rewards/rejected": -2.8844048976898193, "step": 1633 }, { "epoch": 1.54, "grad_norm": 20.884797120359938, "learning_rate": 2.510186718418281e-07, "logps/chosen": -37.74592971801758, "logps/rejected": -63.90220260620117, "loss": 0.3548, "losses/dpo": 0.2329302430152893, "losses/sft": 1.9083000421524048, "losses/total": 0.2329302430152893, "ref_logps/chosen": -23.950246810913086, "ref_logps/rejected": -33.08823776245117, "rewards/accuracies": 0.8125, "rewards/chosen": -1.379568338394165, "rewards/margins": 1.701828122138977, "rewards/rejected": -3.0813965797424316, "step": 1634 }, { "epoch": 1.54, "grad_norm": 17.37904435738535, "learning_rate": 2.5076400480631207e-07, "logps/chosen": -31.37360382080078, "logps/rejected": -82.2332992553711, "loss": 0.237, "losses/dpo": 0.04423757642507553, "losses/sft": 0.9599249362945557, "losses/total": 0.04423757642507553, "ref_logps/chosen": -17.772233963012695, "ref_logps/rejected": -44.14623260498047, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3601367473602295, "rewards/margins": 2.4485695362091064, "rewards/rejected": -3.808706283569336, "step": 1635 }, { "epoch": 1.54, "grad_norm": 18.652994893496068, "learning_rate": 2.505093369779897e-07, "logps/chosen": -38.8519401550293, "logps/rejected": -69.8973617553711, "loss": 0.2182, "losses/dpo": 0.04510349780321121, "losses/sft": 0.9173808693885803, "losses/total": 0.04510349780321121, "ref_logps/chosen": -22.516590118408203, "ref_logps/rejected": -32.106903076171875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6335346698760986, "rewards/margins": 2.1455113887786865, "rewards/rejected": -3.779046058654785, "step": 1636 }, { "epoch": 1.54, "grad_norm": 16.868907862043713, "learning_rate": 2.502546686211294e-07, "logps/chosen": -50.292144775390625, "logps/rejected": -70.64627838134766, "loss": 0.2767, "losses/dpo": 0.3433552086353302, "losses/sft": 1.5598151683807373, "losses/total": 0.3433552086353302, "ref_logps/chosen": -35.32904052734375, "ref_logps/rejected": -36.09330749511719, "rewards/accuracies": 0.875, "rewards/chosen": -1.4963104724884033, "rewards/margins": 1.9589871168136597, "rewards/rejected": -3.4552974700927734, "step": 1637 }, { "epoch": 1.55, "grad_norm": 14.399461425960983, "learning_rate": 2.5e-07, "logps/chosen": -36.41966247558594, "logps/rejected": -71.470703125, "loss": 0.2029, "losses/dpo": 0.01144376490265131, "losses/sft": 1.7596220970153809, "losses/total": 0.01144376490265131, "ref_logps/chosen": -23.790870666503906, "ref_logps/rejected": -32.248313903808594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.262879490852356, "rewards/margins": 2.6593594551086426, "rewards/rejected": -3.922238826751709, "step": 1638 }, { "epoch": 1.55, "grad_norm": 14.978475283114822, "learning_rate": 2.497453313788706e-07, "logps/chosen": -39.509029388427734, "logps/rejected": -65.8313980102539, "loss": 0.2698, "losses/dpo": 0.40737348794937134, "losses/sft": 1.3073850870132446, "losses/total": 0.40737348794937134, "ref_logps/chosen": -24.301372528076172, "ref_logps/rejected": -32.24395751953125, "rewards/accuracies": 0.875, "rewards/chosen": -1.5207655429840088, "rewards/margins": 1.8379790782928467, "rewards/rejected": -3.3587446212768555, "step": 1639 }, { "epoch": 1.55, "grad_norm": 11.591237942234342, "learning_rate": 2.494906630220103e-07, "logps/chosen": -40.56306457519531, "logps/rejected": -71.72065734863281, "loss": 0.2088, "losses/dpo": 0.05366961285471916, "losses/sft": 1.0727684497833252, "losses/total": 0.05366961285471916, "ref_logps/chosen": -25.232412338256836, "ref_logps/rejected": -31.564613342285156, "rewards/accuracies": 0.875, "rewards/chosen": -1.5330650806427002, "rewards/margins": 2.482539653778076, "rewards/rejected": -4.0156049728393555, "step": 1640 }, { "epoch": 1.55, "grad_norm": 23.903688190891845, "learning_rate": 2.49235995193688e-07, "logps/chosen": -47.555816650390625, "logps/rejected": -62.24616241455078, "loss": 0.4194, "losses/dpo": 0.015806520357728004, "losses/sft": 1.7234503030776978, "losses/total": 0.015806520357728004, "ref_logps/chosen": -29.53871726989746, "ref_logps/rejected": -30.647335052490234, "rewards/accuracies": 0.875, "rewards/chosen": -1.801709771156311, "rewards/margins": 1.3581726551055908, "rewards/rejected": -3.1598825454711914, "step": 1641 }, { "epoch": 1.55, "grad_norm": 17.91230005388707, "learning_rate": 2.4898132815817186e-07, "logps/chosen": -40.55141830444336, "logps/rejected": -76.145751953125, "loss": 0.2199, "losses/dpo": 0.19413436949253082, "losses/sft": 2.5089075565338135, "losses/total": 0.19413436949253082, "ref_logps/chosen": -25.124345779418945, "ref_logps/rejected": -38.979888916015625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5427074432373047, "rewards/margins": 2.1738784313201904, "rewards/rejected": -3.716585874557495, "step": 1642 }, { "epoch": 1.55, "grad_norm": 18.96508023407873, "learning_rate": 2.487266621797294e-07, "logps/chosen": -53.55635070800781, "logps/rejected": -78.3470458984375, "loss": 0.247, "losses/dpo": 0.033654581755399704, "losses/sft": 1.2039624452590942, "losses/total": 0.033654581755399704, "ref_logps/chosen": -33.49898147583008, "ref_logps/rejected": -36.49613952636719, "rewards/accuracies": 0.875, "rewards/chosen": -2.005736827850342, "rewards/margins": 2.179354190826416, "rewards/rejected": -4.185091018676758, "step": 1643 }, { "epoch": 1.55, "grad_norm": 24.437316089108943, "learning_rate": 2.484719975226271e-07, "logps/chosen": -39.166900634765625, "logps/rejected": -59.459312438964844, "loss": 0.3888, "losses/dpo": 0.21187573671340942, "losses/sft": 0.32384827733039856, "losses/total": 0.21187573671340942, "ref_logps/chosen": -26.852252960205078, "ref_logps/rejected": -32.899871826171875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2314642667770386, "rewards/margins": 1.4244797229766846, "rewards/rejected": -2.6559438705444336, "step": 1644 }, { "epoch": 1.55, "grad_norm": 22.003728093728405, "learning_rate": 2.482173344511299e-07, "logps/chosen": -37.4185676574707, "logps/rejected": -69.05416870117188, "loss": 0.2634, "losses/dpo": 0.3457527458667755, "losses/sft": 1.4768190383911133, "losses/total": 0.3457527458667755, "ref_logps/chosen": -19.41339874267578, "ref_logps/rejected": -30.85307502746582, "rewards/accuracies": 0.875, "rewards/chosen": -1.8005168437957764, "rewards/margins": 2.019592761993408, "rewards/rejected": -3.8201093673706055, "step": 1645 }, { "epoch": 1.55, "grad_norm": 16.159238589670768, "learning_rate": 2.4796267322950105e-07, "logps/chosen": -46.37367248535156, "logps/rejected": -78.90770721435547, "loss": 0.2124, "losses/dpo": 0.013547714799642563, "losses/sft": 1.0197057723999023, "losses/total": 0.013547714799642563, "ref_logps/chosen": -31.22461700439453, "ref_logps/rejected": -35.81779479980469, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5149056911468506, "rewards/margins": 2.794085741043091, "rewards/rejected": -4.308991432189941, "step": 1646 }, { "epoch": 1.55, "grad_norm": 16.971919860928683, "learning_rate": 2.477080141220021e-07, "logps/chosen": -41.441650390625, "logps/rejected": -69.05755615234375, "loss": 0.2184, "losses/dpo": 0.05334571376442909, "losses/sft": 1.049261212348938, "losses/total": 0.05334571376442909, "ref_logps/chosen": -24.90970802307129, "ref_logps/rejected": -33.152191162109375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6531941890716553, "rewards/margins": 1.9373422861099243, "rewards/rejected": -3.590536594390869, "step": 1647 }, { "epoch": 1.55, "grad_norm": 17.623050351590482, "learning_rate": 2.4745335739289227e-07, "logps/chosen": -46.28074645996094, "logps/rejected": -66.19282531738281, "loss": 0.3056, "losses/dpo": 0.5540752410888672, "losses/sft": 0.5671115517616272, "losses/total": 0.5540752410888672, "ref_logps/chosen": -26.516332626342773, "ref_logps/rejected": -30.244651794433594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9764413833618164, "rewards/margins": 1.6183761358261108, "rewards/rejected": -3.5948173999786377, "step": 1648 }, { "epoch": 1.56, "grad_norm": 20.53271785860662, "learning_rate": 2.471987033064283e-07, "logps/chosen": -46.57971954345703, "logps/rejected": -60.765724182128906, "loss": 0.3249, "losses/dpo": 0.05886421725153923, "losses/sft": 1.5163401365280151, "losses/total": 0.05886421725153923, "ref_logps/chosen": -28.088619232177734, "ref_logps/rejected": -26.841215133666992, "rewards/accuracies": 0.875, "rewards/chosen": -1.8491100072860718, "rewards/margins": 1.5433411598205566, "rewards/rejected": -3.392451286315918, "step": 1649 }, { "epoch": 1.56, "grad_norm": 17.925876072302728, "learning_rate": 2.4694405212686425e-07, "logps/chosen": -44.842369079589844, "logps/rejected": -65.35426330566406, "loss": 0.2898, "losses/dpo": 0.26999661326408386, "losses/sft": 1.3804237842559814, "losses/total": 0.26999661326408386, "ref_logps/chosen": -30.578752517700195, "ref_logps/rejected": -32.91206359863281, "rewards/accuracies": 0.875, "rewards/chosen": -1.4263615608215332, "rewards/margins": 1.8178582191467285, "rewards/rejected": -3.2442197799682617, "step": 1650 }, { "epoch": 1.56, "grad_norm": 15.393737175680332, "learning_rate": 2.4668940411845105e-07, "logps/chosen": -44.30866241455078, "logps/rejected": -60.07221984863281, "loss": 0.3008, "losses/dpo": 0.4313015341758728, "losses/sft": 0.703107476234436, "losses/total": 0.4313015341758728, "ref_logps/chosen": -27.533889770507812, "ref_logps/rejected": -25.745647430419922, "rewards/accuracies": 0.875, "rewards/chosen": -1.6774771213531494, "rewards/margins": 1.7551803588867188, "rewards/rejected": -3.432657241821289, "step": 1651 }, { "epoch": 1.56, "grad_norm": 20.284417039201813, "learning_rate": 2.4643475954543657e-07, "logps/chosen": -41.416107177734375, "logps/rejected": -56.72584533691406, "loss": 0.4399, "losses/dpo": 0.0549796000123024, "losses/sft": 1.5730695724487305, "losses/total": 0.0549796000123024, "ref_logps/chosen": -24.165786743164062, "ref_logps/rejected": -25.492847442626953, "rewards/accuracies": 0.8125, "rewards/chosen": -1.725031852722168, "rewards/margins": 1.3982677459716797, "rewards/rejected": -3.1232995986938477, "step": 1652 }, { "epoch": 1.56, "grad_norm": 16.185620058596722, "learning_rate": 2.4618011867206485e-07, "logps/chosen": -37.12403106689453, "logps/rejected": -52.96816635131836, "loss": 0.3415, "losses/dpo": 0.21513521671295166, "losses/sft": 1.8681983947753906, "losses/total": 0.21513521671295166, "ref_logps/chosen": -20.80508804321289, "ref_logps/rejected": -24.165346145629883, "rewards/accuracies": 0.875, "rewards/chosen": -1.6318939924240112, "rewards/margins": 1.2483878135681152, "rewards/rejected": -2.880281925201416, "step": 1653 }, { "epoch": 1.56, "grad_norm": 16.91125765910508, "learning_rate": 2.459254817625763e-07, "logps/chosen": -38.27928924560547, "logps/rejected": -69.81684875488281, "loss": 0.2744, "losses/dpo": 0.6797643899917603, "losses/sft": 1.4857711791992188, "losses/total": 0.6797643899917603, "ref_logps/chosen": -23.49102020263672, "ref_logps/rejected": -30.81230926513672, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4788267612457275, "rewards/margins": 2.4216277599334717, "rewards/rejected": -3.900454521179199, "step": 1654 }, { "epoch": 1.56, "grad_norm": 15.395035170820233, "learning_rate": 2.45670849081207e-07, "logps/chosen": -37.98332595825195, "logps/rejected": -70.23515319824219, "loss": 0.2106, "losses/dpo": 0.2786385715007782, "losses/sft": 1.250869870185852, "losses/total": 0.2786385715007782, "ref_logps/chosen": -25.97027587890625, "ref_logps/rejected": -35.46717071533203, "rewards/accuracies": 1.0, "rewards/chosen": -1.2013051509857178, "rewards/margins": 2.2754924297332764, "rewards/rejected": -3.476797580718994, "step": 1655 }, { "epoch": 1.56, "grad_norm": 26.42291886722004, "learning_rate": 2.454162208921889e-07, "logps/chosen": -45.686424255371094, "logps/rejected": -63.08049774169922, "loss": 0.3026, "losses/dpo": 0.18025730550289154, "losses/sft": 2.3047900199890137, "losses/total": 0.18025730550289154, "ref_logps/chosen": -29.122783660888672, "ref_logps/rejected": -29.231731414794922, "rewards/accuracies": 0.875, "rewards/chosen": -1.656363844871521, "rewards/margins": 1.7285125255584717, "rewards/rejected": -3.3848764896392822, "step": 1656 }, { "epoch": 1.56, "grad_norm": 18.057220073820464, "learning_rate": 2.45161597459749e-07, "logps/chosen": -37.35597229003906, "logps/rejected": -69.2220458984375, "loss": 0.205, "losses/dpo": 0.468471884727478, "losses/sft": 2.011033773422241, "losses/total": 0.468471884727478, "ref_logps/chosen": -25.559947967529297, "ref_logps/rejected": -30.037145614624023, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1796026229858398, "rewards/margins": 2.7388877868652344, "rewards/rejected": -3.918490409851074, "step": 1657 }, { "epoch": 1.56, "grad_norm": 11.66774893890416, "learning_rate": 2.4490697904810976e-07, "logps/chosen": -38.19312286376953, "logps/rejected": -66.5694808959961, "loss": 0.1513, "losses/dpo": 0.4729243218898773, "losses/sft": 1.0673952102661133, "losses/total": 0.4729243218898773, "ref_logps/chosen": -27.208330154418945, "ref_logps/rejected": -32.75291061401367, "rewards/accuracies": 1.0, "rewards/chosen": -1.0984790325164795, "rewards/margins": 2.2831780910491943, "rewards/rejected": -3.381657361984253, "step": 1658 }, { "epoch": 1.57, "grad_norm": 19.110257507866926, "learning_rate": 2.4465236592148795e-07, "logps/chosen": -38.72062683105469, "logps/rejected": -59.52893829345703, "loss": 0.342, "losses/dpo": 0.2696214020252228, "losses/sft": 1.675897240638733, "losses/total": 0.2696214020252228, "ref_logps/chosen": -20.59206771850586, "ref_logps/rejected": -26.209877014160156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.812855839729309, "rewards/margins": 1.5190505981445312, "rewards/rejected": -3.33190655708313, "step": 1659 }, { "epoch": 1.57, "grad_norm": 27.534163923307695, "learning_rate": 2.443977583440952e-07, "logps/chosen": -64.85089111328125, "logps/rejected": -76.97000122070312, "loss": 0.4887, "losses/dpo": 0.007114763837307692, "losses/sft": 1.0515692234039307, "losses/total": 0.007114763837307692, "ref_logps/chosen": -44.0017204284668, "ref_logps/rejected": -36.67057800292969, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0849175453186035, "rewards/margins": 1.9450244903564453, "rewards/rejected": -4.029942512512207, "step": 1660 }, { "epoch": 1.57, "grad_norm": 20.637809719734477, "learning_rate": 2.4414315658013723e-07, "logps/chosen": -53.12828826904297, "logps/rejected": -92.96969604492188, "loss": 0.2202, "losses/dpo": 0.7560958862304688, "losses/sft": 2.2737395763397217, "losses/total": 0.7560958862304688, "ref_logps/chosen": -32.094181060791016, "ref_logps/rejected": -46.03028869628906, "rewards/accuracies": 0.875, "rewards/chosen": -2.1034109592437744, "rewards/margins": 2.590529441833496, "rewards/rejected": -4.69394063949585, "step": 1661 }, { "epoch": 1.57, "grad_norm": 27.137811778496058, "learning_rate": 2.438885608938139e-07, "logps/chosen": -49.2429084777832, "logps/rejected": -79.29359436035156, "loss": 0.2819, "losses/dpo": 0.6845599412918091, "losses/sft": 0.919348418712616, "losses/total": 0.6845599412918091, "ref_logps/chosen": -31.374481201171875, "ref_logps/rejected": -38.304161071777344, "rewards/accuracies": 0.875, "rewards/chosen": -1.786842703819275, "rewards/margins": 2.312100887298584, "rewards/rejected": -4.098943710327148, "step": 1662 }, { "epoch": 1.57, "grad_norm": 21.53589883656993, "learning_rate": 2.436339715493185e-07, "logps/chosen": -42.41843795776367, "logps/rejected": -57.322776794433594, "loss": 0.3213, "losses/dpo": 0.06401684135198593, "losses/sft": 1.8808308839797974, "losses/total": 0.06401684135198593, "ref_logps/chosen": -27.696054458618164, "ref_logps/rejected": -26.307519912719727, "rewards/accuracies": 0.8125, "rewards/chosen": -1.472238540649414, "rewards/margins": 1.6292871236801147, "rewards/rejected": -3.1015257835388184, "step": 1663 }, { "epoch": 1.57, "grad_norm": 13.301151714430983, "learning_rate": 2.4337938881083796e-07, "logps/chosen": -44.49638748168945, "logps/rejected": -82.23444366455078, "loss": 0.1483, "losses/dpo": 0.014977573417127132, "losses/sft": 1.229832649230957, "losses/total": 0.014977573417127132, "ref_logps/chosen": -29.492019653320312, "ref_logps/rejected": -38.98964309692383, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5004370212554932, "rewards/margins": 2.824042797088623, "rewards/rejected": -4.324479579925537, "step": 1664 }, { "epoch": 1.57, "grad_norm": 14.951726148521361, "learning_rate": 2.431248129425522e-07, "logps/chosen": -45.03076171875, "logps/rejected": -84.23710632324219, "loss": 0.2464, "losses/dpo": 0.23011022806167603, "losses/sft": 2.107832908630371, "losses/total": 0.23011022806167603, "ref_logps/chosen": -26.621139526367188, "ref_logps/rejected": -41.16923522949219, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8409624099731445, "rewards/margins": 2.465824842453003, "rewards/rejected": -4.306787014007568, "step": 1665 }, { "epoch": 1.57, "grad_norm": 16.690086488176853, "learning_rate": 2.428702442086342e-07, "logps/chosen": -36.611778259277344, "logps/rejected": -66.39388275146484, "loss": 0.3103, "losses/dpo": 0.35603567957878113, "losses/sft": 0.6953798532485962, "losses/total": 0.35603567957878113, "ref_logps/chosen": -21.509145736694336, "ref_logps/rejected": -35.676734924316406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.510263442993164, "rewards/margins": 1.5614516735076904, "rewards/rejected": -3.0717151165008545, "step": 1666 }, { "epoch": 1.57, "grad_norm": 18.232240659521466, "learning_rate": 2.426156828732493e-07, "logps/chosen": -48.49578857421875, "logps/rejected": -67.29393005371094, "loss": 0.2454, "losses/dpo": 0.12819477915763855, "losses/sft": 2.1390302181243896, "losses/total": 0.12819477915763855, "ref_logps/chosen": -30.582368850708008, "ref_logps/rejected": -30.15245819091797, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7913422584533691, "rewards/margins": 1.9228050708770752, "rewards/rejected": -3.7141473293304443, "step": 1667 }, { "epoch": 1.57, "grad_norm": 16.28911255286829, "learning_rate": 2.423611292005553e-07, "logps/chosen": -45.20835876464844, "logps/rejected": -82.42007446289062, "loss": 0.1999, "losses/dpo": 0.12368675321340561, "losses/sft": 2.4664664268493652, "losses/total": 0.12368675321340561, "ref_logps/chosen": -25.029117584228516, "ref_logps/rejected": -41.533775329589844, "rewards/accuracies": 1.0, "rewards/chosen": -2.0179243087768555, "rewards/margins": 2.0707056522369385, "rewards/rejected": -4.088629722595215, "step": 1668 }, { "epoch": 1.57, "grad_norm": 14.986602847263633, "learning_rate": 2.421065834547021e-07, "logps/chosen": -50.6917724609375, "logps/rejected": -83.19256591796875, "loss": 0.179, "losses/dpo": 0.030564354732632637, "losses/sft": 0.8912684321403503, "losses/total": 0.030564354732632637, "ref_logps/chosen": -31.796079635620117, "ref_logps/rejected": -40.01795959472656, "rewards/accuracies": 1.0, "rewards/chosen": -1.8895692825317383, "rewards/margins": 2.427891254425049, "rewards/rejected": -4.317460060119629, "step": 1669 }, { "epoch": 1.58, "grad_norm": 13.007763776284872, "learning_rate": 2.4185204589983123e-07, "logps/chosen": -42.033531188964844, "logps/rejected": -79.95841979980469, "loss": 0.1464, "losses/dpo": 0.19340354204177856, "losses/sft": 0.8796550035476685, "losses/total": 0.19340354204177856, "ref_logps/chosen": -28.31271743774414, "ref_logps/rejected": -38.982765197753906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3720812797546387, "rewards/margins": 2.725484848022461, "rewards/rejected": -4.0975661277771, "step": 1670 }, { "epoch": 1.58, "grad_norm": 14.197964267116951, "learning_rate": 2.4159751680007586e-07, "logps/chosen": -39.01573181152344, "logps/rejected": -86.53192138671875, "loss": 0.1984, "losses/dpo": 0.0038860347121953964, "losses/sft": 2.6708929538726807, "losses/total": 0.0038860347121953964, "ref_logps/chosen": -26.806896209716797, "ref_logps/rejected": -43.06671905517578, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2208836078643799, "rewards/margins": 3.1256372928619385, "rewards/rejected": -4.346520900726318, "step": 1671 }, { "epoch": 1.58, "grad_norm": 21.438476943227375, "learning_rate": 2.413429964195603e-07, "logps/chosen": -46.189640045166016, "logps/rejected": -89.82420349121094, "loss": 0.2549, "losses/dpo": 0.402616024017334, "losses/sft": 2.5646395683288574, "losses/total": 0.402616024017334, "ref_logps/chosen": -23.735048294067383, "ref_logps/rejected": -42.25016784667969, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2454590797424316, "rewards/margins": 2.51194429397583, "rewards/rejected": -4.757403373718262, "step": 1672 }, { "epoch": 1.58, "grad_norm": 10.960621472216047, "learning_rate": 2.4108848502239974e-07, "logps/chosen": -41.1834716796875, "logps/rejected": -79.50554656982422, "loss": 0.1475, "losses/dpo": 0.19148385524749756, "losses/sft": 1.1129140853881836, "losses/total": 0.19148385524749756, "ref_logps/chosen": -25.623889923095703, "ref_logps/rejected": -34.35963821411133, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5559585094451904, "rewards/margins": 2.958632469177246, "rewards/rejected": -4.514591217041016, "step": 1673 }, { "epoch": 1.58, "grad_norm": 22.468236958442183, "learning_rate": 2.408339828727003e-07, "logps/chosen": -39.46678161621094, "logps/rejected": -60.567535400390625, "loss": 0.3546, "losses/dpo": 0.9035484790802002, "losses/sft": 0.4638422131538391, "losses/total": 0.9035484790802002, "ref_logps/chosen": -24.638561248779297, "ref_logps/rejected": -30.144630432128906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4828219413757324, "rewards/margins": 1.559468150138855, "rewards/rejected": -3.042290210723877, "step": 1674 }, { "epoch": 1.58, "grad_norm": 27.26349190966214, "learning_rate": 2.4057949023455825e-07, "logps/chosen": -50.123226165771484, "logps/rejected": -69.28247833251953, "loss": 0.3954, "losses/dpo": 0.02021792158484459, "losses/sft": 2.1722564697265625, "losses/total": 0.02021792158484459, "ref_logps/chosen": -35.26260757446289, "ref_logps/rejected": -37.48541259765625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4860620498657227, "rewards/margins": 1.6936441659927368, "rewards/rejected": -3.17970609664917, "step": 1675 }, { "epoch": 1.58, "grad_norm": 19.282787503382128, "learning_rate": 2.403250073720601e-07, "logps/chosen": -47.095916748046875, "logps/rejected": -63.84204864501953, "loss": 0.3639, "losses/dpo": 0.02952723018825054, "losses/sft": 1.0214409828186035, "losses/total": 0.02952723018825054, "ref_logps/chosen": -29.81865119934082, "ref_logps/rejected": -29.315372467041016, "rewards/accuracies": 0.75, "rewards/chosen": -1.727726697921753, "rewards/margins": 1.7249410152435303, "rewards/rejected": -3.452667713165283, "step": 1676 }, { "epoch": 1.58, "grad_norm": 11.96783061879777, "learning_rate": 2.4007053454928216e-07, "logps/chosen": -54.72465515136719, "logps/rejected": -115.13099670410156, "loss": 0.1666, "losses/dpo": 0.06580784916877747, "losses/sft": 2.614683151245117, "losses/total": 0.06580784916877747, "ref_logps/chosen": -33.28915786743164, "ref_logps/rejected": -61.06529998779297, "rewards/accuracies": 0.9375, "rewards/chosen": -2.143549680709839, "rewards/margins": 3.2630198001861572, "rewards/rejected": -5.406569480895996, "step": 1677 }, { "epoch": 1.58, "grad_norm": 16.07343933908007, "learning_rate": 2.3981607203029046e-07, "logps/chosen": -40.789695739746094, "logps/rejected": -61.94167709350586, "loss": 0.2515, "losses/dpo": 0.22440825402736664, "losses/sft": 0.4918345808982849, "losses/total": 0.22440825402736664, "ref_logps/chosen": -24.439048767089844, "ref_logps/rejected": -27.18928337097168, "rewards/accuracies": 0.875, "rewards/chosen": -1.635064959526062, "rewards/margins": 1.8401745557785034, "rewards/rejected": -3.4752395153045654, "step": 1678 }, { "epoch": 1.58, "grad_norm": 11.140236031063912, "learning_rate": 2.3956162007914015e-07, "logps/chosen": -52.897911071777344, "logps/rejected": -79.12753295898438, "loss": 0.109, "losses/dpo": 0.07206130027770996, "losses/sft": 1.792461633682251, "losses/total": 0.07206130027770996, "ref_logps/chosen": -34.116058349609375, "ref_logps/rejected": -34.45696258544922, "rewards/accuracies": 1.0, "rewards/chosen": -1.8781850337982178, "rewards/margins": 2.588872194290161, "rewards/rejected": -4.467057228088379, "step": 1679 }, { "epoch": 1.58, "grad_norm": 28.98301888576246, "learning_rate": 2.393071789598756e-07, "logps/chosen": -45.64189910888672, "logps/rejected": -80.43260955810547, "loss": 0.5076, "losses/dpo": 0.17711059749126434, "losses/sft": 1.497687578201294, "losses/total": 0.17711059749126434, "ref_logps/chosen": -26.17877197265625, "ref_logps/rejected": -41.15570068359375, "rewards/accuracies": 0.75, "rewards/chosen": -1.9463130235671997, "rewards/margins": 1.9813780784606934, "rewards/rejected": -3.9276909828186035, "step": 1680 }, { "epoch": 1.59, "grad_norm": 25.10161339383985, "learning_rate": 2.3905274893652974e-07, "logps/chosen": -50.45479202270508, "logps/rejected": -54.187705993652344, "loss": 0.3995, "losses/dpo": 0.33752650022506714, "losses/sft": 0.41660889983177185, "losses/total": 0.33752650022506714, "ref_logps/chosen": -34.56293869018555, "ref_logps/rejected": -27.071182250976562, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5891852378845215, "rewards/margins": 1.1224673986434937, "rewards/rejected": -2.7116525173187256, "step": 1681 }, { "epoch": 1.59, "grad_norm": 18.362532310280933, "learning_rate": 2.3879833027312416e-07, "logps/chosen": -48.20265579223633, "logps/rejected": -72.10372924804688, "loss": 0.2675, "losses/dpo": 0.06639623641967773, "losses/sft": 1.6195001602172852, "losses/total": 0.06639623641967773, "ref_logps/chosen": -28.192516326904297, "ref_logps/rejected": -32.77663803100586, "rewards/accuracies": 0.9375, "rewards/chosen": -2.001013994216919, "rewards/margins": 1.931694746017456, "rewards/rejected": -3.932708740234375, "step": 1682 }, { "epoch": 1.59, "grad_norm": 21.477299825721065, "learning_rate": 2.3854392323366855e-07, "logps/chosen": -47.61481857299805, "logps/rejected": -64.21627807617188, "loss": 0.2983, "losses/dpo": 0.20386886596679688, "losses/sft": 1.532460331916809, "losses/total": 0.20386886596679688, "ref_logps/chosen": -31.593215942382812, "ref_logps/rejected": -30.24395751953125, "rewards/accuracies": 0.875, "rewards/chosen": -1.6021602153778076, "rewards/margins": 1.7950718402862549, "rewards/rejected": -3.3972320556640625, "step": 1683 }, { "epoch": 1.59, "grad_norm": 14.614345381959334, "learning_rate": 2.3828952808216064e-07, "logps/chosen": -42.34394073486328, "logps/rejected": -79.73025512695312, "loss": 0.1881, "losses/dpo": 0.14819583296775818, "losses/sft": 2.711760997772217, "losses/total": 0.14819583296775818, "ref_logps/chosen": -25.58661651611328, "ref_logps/rejected": -37.150596618652344, "rewards/accuracies": 0.875, "rewards/chosen": -1.6757324934005737, "rewards/margins": 2.5822343826293945, "rewards/rejected": -4.2579665184021, "step": 1684 }, { "epoch": 1.59, "grad_norm": 13.31626939707339, "learning_rate": 2.3803514508258563e-07, "logps/chosen": -39.908470153808594, "logps/rejected": -75.38070678710938, "loss": 0.1649, "losses/dpo": 0.1580624282360077, "losses/sft": 1.1868855953216553, "losses/total": 0.1580624282360077, "ref_logps/chosen": -23.40515899658203, "ref_logps/rejected": -32.97417449951172, "rewards/accuracies": 1.0, "rewards/chosen": -1.6503307819366455, "rewards/margins": 2.590322494506836, "rewards/rejected": -4.240653038024902, "step": 1685 }, { "epoch": 1.59, "grad_norm": 17.224165922654915, "learning_rate": 2.3778077449891636e-07, "logps/chosen": -38.823333740234375, "logps/rejected": -60.21107864379883, "loss": 0.3087, "losses/dpo": 0.024707702919840813, "losses/sft": 0.10938769578933716, "losses/total": 0.024707702919840813, "ref_logps/chosen": -23.55056381225586, "ref_logps/rejected": -27.98306655883789, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5272769927978516, "rewards/margins": 1.6955243349075317, "rewards/rejected": -3.2228012084960938, "step": 1686 }, { "epoch": 1.59, "grad_norm": 15.211804351028082, "learning_rate": 2.3752641659511265e-07, "logps/chosen": -35.92481231689453, "logps/rejected": -67.84481048583984, "loss": 0.2251, "losses/dpo": 0.671671450138092, "losses/sft": 0.12681607902050018, "losses/total": 0.671671450138092, "ref_logps/chosen": -22.782238006591797, "ref_logps/rejected": -33.283172607421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3142576217651367, "rewards/margins": 2.141906261444092, "rewards/rejected": -3.4561636447906494, "step": 1687 }, { "epoch": 1.59, "grad_norm": 16.327384827227064, "learning_rate": 2.372720716351212e-07, "logps/chosen": -42.02134704589844, "logps/rejected": -72.34749603271484, "loss": 0.1976, "losses/dpo": 0.5031624436378479, "losses/sft": 2.0002970695495605, "losses/total": 0.5031624436378479, "ref_logps/chosen": -24.18085479736328, "ref_logps/rejected": -33.266212463378906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7840490341186523, "rewards/margins": 2.124079704284668, "rewards/rejected": -3.908128499984741, "step": 1688 }, { "epoch": 1.59, "grad_norm": 10.011105380600988, "learning_rate": 2.370177398828752e-07, "logps/chosen": -50.62308120727539, "logps/rejected": -96.93049621582031, "loss": 0.0973, "losses/dpo": 0.25948256254196167, "losses/sft": 1.3510468006134033, "losses/total": 0.25948256254196167, "ref_logps/chosen": -31.845962524414062, "ref_logps/rejected": -49.17698669433594, "rewards/accuracies": 1.0, "rewards/chosen": -1.8777118921279907, "rewards/margins": 2.897639751434326, "rewards/rejected": -4.775351524353027, "step": 1689 }, { "epoch": 1.59, "grad_norm": 8.877398463710529, "learning_rate": 2.3676342160229427e-07, "logps/chosen": -32.91072082519531, "logps/rejected": -70.79508209228516, "loss": 0.1034, "losses/dpo": 0.03665340319275856, "losses/sft": 0.32493677735328674, "losses/total": 0.03665340319275856, "ref_logps/chosen": -21.477649688720703, "ref_logps/rejected": -33.04235076904297, "rewards/accuracies": 1.0, "rewards/chosen": -1.143307089805603, "rewards/margins": 2.6319661140441895, "rewards/rejected": -3.775273323059082, "step": 1690 }, { "epoch": 1.6, "grad_norm": 15.94564237582972, "learning_rate": 2.365091170572839e-07, "logps/chosen": -45.90605926513672, "logps/rejected": -89.82598114013672, "loss": 0.2053, "losses/dpo": 0.32785364985466003, "losses/sft": 0.7003283500671387, "losses/total": 0.32785364985466003, "ref_logps/chosen": -26.9813289642334, "ref_logps/rejected": -43.997039794921875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8924729824066162, "rewards/margins": 2.6904213428497314, "rewards/rejected": -4.582894325256348, "step": 1691 }, { "epoch": 1.6, "grad_norm": 23.028923605909224, "learning_rate": 2.3625482651173548e-07, "logps/chosen": -44.240478515625, "logps/rejected": -67.47151184082031, "loss": 0.3721, "losses/dpo": 0.04503392428159714, "losses/sft": 1.748524785041809, "losses/total": 0.04503392428159714, "ref_logps/chosen": -26.463783264160156, "ref_logps/rejected": -32.88263702392578, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7776694297790527, "rewards/margins": 1.681218147277832, "rewards/rejected": -3.458887815475464, "step": 1692 }, { "epoch": 1.6, "grad_norm": 22.632787300455536, "learning_rate": 2.3600055022952576e-07, "logps/chosen": -44.93437957763672, "logps/rejected": -67.66806030273438, "loss": 0.3671, "losses/dpo": 1.7610491514205933, "losses/sft": 1.8419649600982666, "losses/total": 1.7610491514205933, "ref_logps/chosen": -27.859392166137695, "ref_logps/rejected": -33.916481018066406, "rewards/accuracies": 0.875, "rewards/chosen": -1.7074989080429077, "rewards/margins": 1.667658805847168, "rewards/rejected": -3.375157594680786, "step": 1693 }, { "epoch": 1.6, "grad_norm": 25.107229672109685, "learning_rate": 2.3574628847451666e-07, "logps/chosen": -44.69868087768555, "logps/rejected": -56.27399444580078, "loss": 0.4236, "losses/dpo": 1.2858922481536865, "losses/sft": 1.7307049036026, "losses/total": 1.2858922481536865, "ref_logps/chosen": -28.6993408203125, "ref_logps/rejected": -25.836170196533203, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5999339818954468, "rewards/margins": 1.4438486099243164, "rewards/rejected": -3.0437827110290527, "step": 1694 }, { "epoch": 1.6, "grad_norm": 17.36625097214321, "learning_rate": 2.354920415105552e-07, "logps/chosen": -33.608951568603516, "logps/rejected": -69.53680419921875, "loss": 0.2977, "losses/dpo": 0.25717273354530334, "losses/sft": 1.2488983869552612, "losses/total": 0.25717273354530334, "ref_logps/chosen": -18.154094696044922, "ref_logps/rejected": -33.626461029052734, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5454857349395752, "rewards/margins": 2.045548915863037, "rewards/rejected": -3.5910346508026123, "step": 1695 }, { "epoch": 1.6, "grad_norm": 15.074592091975694, "learning_rate": 2.3523780960147286e-07, "logps/chosen": -37.867218017578125, "logps/rejected": -71.19403839111328, "loss": 0.2431, "losses/dpo": 0.04698430374264717, "losses/sft": 0.9253247380256653, "losses/total": 0.04698430374264717, "ref_logps/chosen": -24.859575271606445, "ref_logps/rejected": -35.75669479370117, "rewards/accuracies": 0.875, "rewards/chosen": -1.300764560699463, "rewards/margins": 2.2429699897766113, "rewards/rejected": -3.543734550476074, "step": 1696 }, { "epoch": 1.6, "grad_norm": 29.041085080911774, "learning_rate": 2.3498359301108556e-07, "logps/chosen": -46.169525146484375, "logps/rejected": -67.35106658935547, "loss": 0.538, "losses/dpo": 0.005433796904981136, "losses/sft": 1.8857502937316895, "losses/total": 0.005433796904981136, "ref_logps/chosen": -23.585514068603516, "ref_logps/rejected": -30.787572860717773, "rewards/accuracies": 0.75, "rewards/chosen": -2.258401393890381, "rewards/margins": 1.3979482650756836, "rewards/rejected": -3.6563496589660645, "step": 1697 }, { "epoch": 1.6, "grad_norm": 15.6071216001308, "learning_rate": 2.3472939200319337e-07, "logps/chosen": -32.3151741027832, "logps/rejected": -59.29243087768555, "loss": 0.3234, "losses/dpo": 0.049366217106580734, "losses/sft": 2.7364602088928223, "losses/total": 0.049366217106580734, "ref_logps/chosen": -19.493614196777344, "ref_logps/rejected": -29.410655975341797, "rewards/accuracies": 0.875, "rewards/chosen": -1.282155990600586, "rewards/margins": 1.7060211896896362, "rewards/rejected": -2.9881770610809326, "step": 1698 }, { "epoch": 1.6, "grad_norm": 20.837523138510182, "learning_rate": 2.344752068415802e-07, "logps/chosen": -51.74908447265625, "logps/rejected": -75.65050506591797, "loss": 0.3869, "losses/dpo": 0.007434592116624117, "losses/sft": 1.1434495449066162, "losses/total": 0.007434592116624117, "ref_logps/chosen": -29.91616439819336, "ref_logps/rejected": -35.03895568847656, "rewards/accuracies": 0.9375, "rewards/chosen": -2.183291435241699, "rewards/margins": 1.8778635263442993, "rewards/rejected": -4.061154842376709, "step": 1699 }, { "epoch": 1.6, "grad_norm": 20.428589252942537, "learning_rate": 2.3422103779001346e-07, "logps/chosen": -44.682804107666016, "logps/rejected": -73.68740844726562, "loss": 0.3541, "losses/dpo": 0.022549590095877647, "losses/sft": 1.4112052917480469, "losses/total": 0.022549590095877647, "ref_logps/chosen": -30.268924713134766, "ref_logps/rejected": -39.74162673950195, "rewards/accuracies": 0.75, "rewards/chosen": -1.4413881301879883, "rewards/margins": 1.9531902074813843, "rewards/rejected": -3.394577980041504, "step": 1700 }, { "epoch": 1.6, "grad_norm": 12.740241055188513, "learning_rate": 2.3396688511224383e-07, "logps/chosen": -38.89617156982422, "logps/rejected": -71.14676666259766, "loss": 0.1255, "losses/dpo": 0.22997590899467468, "losses/sft": 1.6485611200332642, "losses/total": 0.22997590899467468, "ref_logps/chosen": -28.604007720947266, "ref_logps/rejected": -32.654361724853516, "rewards/accuracies": 1.0, "rewards/chosen": -1.029215931892395, "rewards/margins": 2.8200247287750244, "rewards/rejected": -3.849240779876709, "step": 1701 }, { "epoch": 1.61, "grad_norm": 23.678611104146572, "learning_rate": 2.3371274907200507e-07, "logps/chosen": -44.14984130859375, "logps/rejected": -60.287010192871094, "loss": 0.3982, "losses/dpo": 0.9128880500793457, "losses/sft": 2.062490463256836, "losses/total": 0.9128880500793457, "ref_logps/chosen": -26.220081329345703, "ref_logps/rejected": -28.29216957092285, "rewards/accuracies": 0.75, "rewards/chosen": -1.7929761409759521, "rewards/margins": 1.406507968902588, "rewards/rejected": -3.19948410987854, "step": 1702 }, { "epoch": 1.61, "grad_norm": 15.958979029115417, "learning_rate": 2.3345862993301357e-07, "logps/chosen": -37.7315673828125, "logps/rejected": -67.98019409179688, "loss": 0.222, "losses/dpo": 0.650470495223999, "losses/sft": 0.3567954897880554, "losses/total": 0.650470495223999, "ref_logps/chosen": -23.156160354614258, "ref_logps/rejected": -32.97951126098633, "rewards/accuracies": 0.9375, "rewards/chosen": -1.45754075050354, "rewards/margins": 2.0425267219543457, "rewards/rejected": -3.5000674724578857, "step": 1703 }, { "epoch": 1.61, "grad_norm": 17.40001401766195, "learning_rate": 2.3320452795896833e-07, "logps/chosen": -43.115692138671875, "logps/rejected": -73.30187225341797, "loss": 0.2418, "losses/dpo": 0.04280790686607361, "losses/sft": 2.0237717628479004, "losses/total": 0.04280790686607361, "ref_logps/chosen": -27.020404815673828, "ref_logps/rejected": -35.221370697021484, "rewards/accuracies": 0.875, "rewards/chosen": -1.609528660774231, "rewards/margins": 2.198521614074707, "rewards/rejected": -3.8080503940582275, "step": 1704 }, { "epoch": 1.61, "grad_norm": 29.968343011871823, "learning_rate": 2.329504434135504e-07, "logps/chosen": -45.637447357177734, "logps/rejected": -65.19456481933594, "loss": 0.4525, "losses/dpo": 0.05385809764266014, "losses/sft": 0.7293650507926941, "losses/total": 0.05385809764266014, "ref_logps/chosen": -29.025461196899414, "ref_logps/rejected": -31.652908325195312, "rewards/accuracies": 0.875, "rewards/chosen": -1.6611987352371216, "rewards/margins": 1.6929666996002197, "rewards/rejected": -3.354165554046631, "step": 1705 }, { "epoch": 1.61, "grad_norm": 10.05907327471056, "learning_rate": 2.3269637656042275e-07, "logps/chosen": -39.026493072509766, "logps/rejected": -66.3045654296875, "loss": 0.1472, "losses/dpo": 0.4103826582431793, "losses/sft": 1.707318902015686, "losses/total": 0.4103826582431793, "ref_logps/chosen": -24.47677230834961, "ref_logps/rejected": -29.07670021057129, "rewards/accuracies": 1.0, "rewards/chosen": -1.4549721479415894, "rewards/margins": 2.267815113067627, "rewards/rejected": -3.722787380218506, "step": 1706 }, { "epoch": 1.61, "grad_norm": 18.096156940290616, "learning_rate": 2.3244232766323014e-07, "logps/chosen": -52.20814514160156, "logps/rejected": -79.70476531982422, "loss": 0.2162, "losses/dpo": 0.30133533477783203, "losses/sft": 1.8692384958267212, "losses/total": 0.30133533477783203, "ref_logps/chosen": -35.92131042480469, "ref_logps/rejected": -38.70689392089844, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6286835670471191, "rewards/margins": 2.4711039066314697, "rewards/rejected": -4.099787712097168, "step": 1707 }, { "epoch": 1.61, "grad_norm": 23.67001644622368, "learning_rate": 2.3218829698559854e-07, "logps/chosen": -38.32965087890625, "logps/rejected": -86.44395446777344, "loss": 0.2063, "losses/dpo": 0.003787600202485919, "losses/sft": 0.8463283777236938, "losses/total": 0.003787600202485919, "ref_logps/chosen": -22.07469940185547, "ref_logps/rejected": -42.55935287475586, "rewards/accuracies": 0.9375, "rewards/chosen": -1.625495195388794, "rewards/margins": 2.7629647254943848, "rewards/rejected": -4.388460159301758, "step": 1708 }, { "epoch": 1.61, "grad_norm": 29.87143950947719, "learning_rate": 2.3193428479113511e-07, "logps/chosen": -50.219093322753906, "logps/rejected": -71.7376937866211, "loss": 0.5306, "losses/dpo": 0.011915607377886772, "losses/sft": 1.3270386457443237, "losses/total": 0.011915607377886772, "ref_logps/chosen": -26.372295379638672, "ref_logps/rejected": -34.39012908935547, "rewards/accuracies": 0.75, "rewards/chosen": -2.3846795558929443, "rewards/margins": 1.350077509880066, "rewards/rejected": -3.7347569465637207, "step": 1709 }, { "epoch": 1.61, "grad_norm": 27.999897086732446, "learning_rate": 2.3168029134342775e-07, "logps/chosen": -45.978759765625, "logps/rejected": -76.87596130371094, "loss": 0.3027, "losses/dpo": 0.4562780261039734, "losses/sft": 1.8752609491348267, "losses/total": 0.4562780261039734, "ref_logps/chosen": -28.416967391967773, "ref_logps/rejected": -37.111690521240234, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7561795711517334, "rewards/margins": 2.220247745513916, "rewards/rejected": -3.9764270782470703, "step": 1710 }, { "epoch": 1.61, "grad_norm": 23.74131517880989, "learning_rate": 2.3142631690604497e-07, "logps/chosen": -48.61737823486328, "logps/rejected": -70.41845703125, "loss": 0.3163, "losses/dpo": 0.005153219681233168, "losses/sft": 0.6397639513015747, "losses/total": 0.005153219681233168, "ref_logps/chosen": -31.67184066772461, "ref_logps/rejected": -34.25102996826172, "rewards/accuracies": 0.875, "rewards/chosen": -1.6945533752441406, "rewards/margins": 1.9221892356872559, "rewards/rejected": -3.6167426109313965, "step": 1711 }, { "epoch": 1.62, "grad_norm": 16.547426437102615, "learning_rate": 2.311723617425355e-07, "logps/chosen": -46.66661834716797, "logps/rejected": -70.6142349243164, "loss": 0.2095, "losses/dpo": 0.46696752309799194, "losses/sft": 1.0233979225158691, "losses/total": 0.46696752309799194, "ref_logps/chosen": -27.589038848876953, "ref_logps/rejected": -33.949031829833984, "rewards/accuracies": 1.0, "rewards/chosen": -1.9077579975128174, "rewards/margins": 1.7587625980377197, "rewards/rejected": -3.666520595550537, "step": 1712 }, { "epoch": 1.62, "grad_norm": 16.54803036625123, "learning_rate": 2.309184261164281e-07, "logps/chosen": -54.89860534667969, "logps/rejected": -67.34324645996094, "loss": 0.2423, "losses/dpo": 0.3209412693977356, "losses/sft": 1.5477352142333984, "losses/total": 0.3209412693977356, "ref_logps/chosen": -37.39726638793945, "ref_logps/rejected": -31.473791122436523, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7501344680786133, "rewards/margins": 1.8368111848831177, "rewards/rejected": -3.5869455337524414, "step": 1713 }, { "epoch": 1.62, "grad_norm": 25.842409465201392, "learning_rate": 2.3066451029123132e-07, "logps/chosen": -35.710670471191406, "logps/rejected": -48.55517578125, "loss": 0.5846, "losses/dpo": 1.7665302753448486, "losses/sft": 2.421818256378174, "losses/total": 1.7665302753448486, "ref_logps/chosen": -20.77214813232422, "ref_logps/rejected": -24.07038116455078, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4938523769378662, "rewards/margins": 0.954627275466919, "rewards/rejected": -2.448479652404785, "step": 1714 }, { "epoch": 1.62, "grad_norm": 15.215726012418655, "learning_rate": 2.3041061453043298e-07, "logps/chosen": -34.4462890625, "logps/rejected": -66.47616577148438, "loss": 0.234, "losses/dpo": 0.003653117222711444, "losses/sft": 0.32324427366256714, "losses/total": 0.003653117222711444, "ref_logps/chosen": -21.14645767211914, "ref_logps/rejected": -31.479663848876953, "rewards/accuracies": 1.0, "rewards/chosen": -1.3299833536148071, "rewards/margins": 2.1696667671203613, "rewards/rejected": -3.499650239944458, "step": 1715 }, { "epoch": 1.62, "grad_norm": 17.282421472982296, "learning_rate": 2.3015673909750032e-07, "logps/chosen": -36.98992919921875, "logps/rejected": -70.398193359375, "loss": 0.2391, "losses/dpo": 0.3306486904621124, "losses/sft": 1.2252355813980103, "losses/total": 0.3306486904621124, "ref_logps/chosen": -25.111736297607422, "ref_logps/rejected": -34.8628044128418, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1878191232681274, "rewards/margins": 2.3657193183898926, "rewards/rejected": -3.5535385608673096, "step": 1716 }, { "epoch": 1.62, "grad_norm": 22.749754686549327, "learning_rate": 2.2990288425587928e-07, "logps/chosen": -40.71396255493164, "logps/rejected": -71.16204833984375, "loss": 0.4098, "losses/dpo": 2.2006442546844482, "losses/sft": 1.8549578189849854, "losses/total": 2.2006442546844482, "ref_logps/chosen": -24.261157989501953, "ref_logps/rejected": -38.724266052246094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.645280361175537, "rewards/margins": 1.5984984636306763, "rewards/rejected": -3.243778944015503, "step": 1717 }, { "epoch": 1.62, "grad_norm": 27.689591397985783, "learning_rate": 2.2964905026899457e-07, "logps/chosen": -59.34758758544922, "logps/rejected": -82.22174072265625, "loss": 0.3875, "losses/dpo": 0.03512691706418991, "losses/sft": 0.8824968338012695, "losses/total": 0.03512691706418991, "ref_logps/chosen": -36.52250671386719, "ref_logps/rejected": -44.63568878173828, "rewards/accuracies": 0.75, "rewards/chosen": -2.28250789642334, "rewards/margins": 1.4760974645614624, "rewards/rejected": -3.7586052417755127, "step": 1718 }, { "epoch": 1.62, "grad_norm": 23.4149516027573, "learning_rate": 2.2939523740024918e-07, "logps/chosen": -44.038719177246094, "logps/rejected": -62.486083984375, "loss": 0.3702, "losses/dpo": 0.012539311312139034, "losses/sft": 1.568535327911377, "losses/total": 0.012539311312139034, "ref_logps/chosen": -30.815250396728516, "ref_logps/rejected": -32.63030242919922, "rewards/accuracies": 0.8125, "rewards/chosen": -1.322346806526184, "rewards/margins": 1.6632311344146729, "rewards/rejected": -2.9855780601501465, "step": 1719 }, { "epoch": 1.62, "grad_norm": 21.59955113740723, "learning_rate": 2.291414459130242e-07, "logps/chosen": -51.317806243896484, "logps/rejected": -81.5999984741211, "loss": 0.2274, "losses/dpo": 0.36022788286209106, "losses/sft": 1.3925288915634155, "losses/total": 0.36022788286209106, "ref_logps/chosen": -33.044700622558594, "ref_logps/rejected": -38.279022216796875, "rewards/accuracies": 0.875, "rewards/chosen": -1.827310562133789, "rewards/margins": 2.504786729812622, "rewards/rejected": -4.332097053527832, "step": 1720 }, { "epoch": 1.62, "grad_norm": 17.170955656590838, "learning_rate": 2.2888767607067854e-07, "logps/chosen": -42.028324127197266, "logps/rejected": -73.1629409790039, "loss": 0.2481, "losses/dpo": 0.33343201875686646, "losses/sft": 1.0779716968536377, "losses/total": 0.33343201875686646, "ref_logps/chosen": -28.47946548461914, "ref_logps/rejected": -36.365840911865234, "rewards/accuracies": 0.875, "rewards/chosen": -1.3548860549926758, "rewards/margins": 2.324824333190918, "rewards/rejected": -3.6797103881835938, "step": 1721 }, { "epoch": 1.62, "grad_norm": 14.600777585256491, "learning_rate": 2.2863392813654866e-07, "logps/chosen": -41.447147369384766, "logps/rejected": -64.73214721679688, "loss": 0.2411, "losses/dpo": 0.35107719898223877, "losses/sft": 0.5182114243507385, "losses/total": 0.35107719898223877, "ref_logps/chosen": -27.77071189880371, "ref_logps/rejected": -32.452903747558594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3676435947418213, "rewards/margins": 1.8602805137634277, "rewards/rejected": -3.22792387008667, "step": 1722 }, { "epoch": 1.63, "grad_norm": 20.332518437400818, "learning_rate": 2.2838020237394824e-07, "logps/chosen": -54.186378479003906, "logps/rejected": -67.65484619140625, "loss": 0.3064, "losses/dpo": 0.023900248110294342, "losses/sft": 1.494602084159851, "losses/total": 0.023900248110294342, "ref_logps/chosen": -37.526676177978516, "ref_logps/rejected": -30.60269546508789, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6659703254699707, "rewards/margins": 2.039245128631592, "rewards/rejected": -3.7052154541015625, "step": 1723 }, { "epoch": 1.63, "grad_norm": 23.48749140310327, "learning_rate": 2.2812649904616806e-07, "logps/chosen": -47.26039123535156, "logps/rejected": -67.93679809570312, "loss": 0.3412, "losses/dpo": 0.5768740177154541, "losses/sft": 1.6357088088989258, "losses/total": 0.5768740177154541, "ref_logps/chosen": -29.119415283203125, "ref_logps/rejected": -32.98191833496094, "rewards/accuracies": 0.875, "rewards/chosen": -1.8140977621078491, "rewards/margins": 1.6813905239105225, "rewards/rejected": -3.495488405227661, "step": 1724 }, { "epoch": 1.63, "grad_norm": 15.971972789359677, "learning_rate": 2.2787281841647553e-07, "logps/chosen": -35.01715850830078, "logps/rejected": -72.493896484375, "loss": 0.2701, "losses/dpo": 0.025473125278949738, "losses/sft": 0.5762603282928467, "losses/total": 0.025473125278949738, "ref_logps/chosen": -20.980548858642578, "ref_logps/rejected": -34.19956970214844, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4036610126495361, "rewards/margins": 2.425772190093994, "rewards/rejected": -3.829432964324951, "step": 1725 }, { "epoch": 1.63, "grad_norm": 23.255759354835586, "learning_rate": 2.2761916074811453e-07, "logps/chosen": -42.286766052246094, "logps/rejected": -70.3701400756836, "loss": 0.2998, "losses/dpo": 0.010960161685943604, "losses/sft": 1.0901007652282715, "losses/total": 0.010960161685943604, "ref_logps/chosen": -27.121810913085938, "ref_logps/rejected": -36.154476165771484, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5164955854415894, "rewards/margins": 1.9050703048706055, "rewards/rejected": -3.4215660095214844, "step": 1726 }, { "epoch": 1.63, "grad_norm": 17.94099345753167, "learning_rate": 2.2736552630430506e-07, "logps/chosen": -45.38172912597656, "logps/rejected": -68.96247863769531, "loss": 0.2516, "losses/dpo": 0.004267010372132063, "losses/sft": 1.329399585723877, "losses/total": 0.004267010372132063, "ref_logps/chosen": -28.454662322998047, "ref_logps/rejected": -30.9820556640625, "rewards/accuracies": 0.875, "rewards/chosen": -1.6927069425582886, "rewards/margins": 2.1053357124328613, "rewards/rejected": -3.7980427742004395, "step": 1727 }, { "epoch": 1.63, "grad_norm": 12.15845777009303, "learning_rate": 2.2711191534824313e-07, "logps/chosen": -49.86335372924805, "logps/rejected": -73.06263732910156, "loss": 0.1635, "losses/dpo": 0.015484340488910675, "losses/sft": 1.9626342058181763, "losses/total": 0.015484340488910675, "ref_logps/chosen": -34.637638092041016, "ref_logps/rejected": -33.98653030395508, "rewards/accuracies": 1.0, "rewards/chosen": -1.5225716829299927, "rewards/margins": 2.385038375854492, "rewards/rejected": -3.9076104164123535, "step": 1728 }, { "epoch": 1.63, "grad_norm": 22.70930245963185, "learning_rate": 2.2685832814310026e-07, "logps/chosen": -49.929718017578125, "logps/rejected": -63.960906982421875, "loss": 0.3918, "losses/dpo": 0.205226331949234, "losses/sft": 2.7275352478027344, "losses/total": 0.205226331949234, "ref_logps/chosen": -32.22853088378906, "ref_logps/rejected": -29.233768463134766, "rewards/accuracies": 0.75, "rewards/chosen": -1.7701184749603271, "rewards/margins": 1.7025949954986572, "rewards/rejected": -3.4727132320404053, "step": 1729 }, { "epoch": 1.63, "grad_norm": 22.861394041477453, "learning_rate": 2.2660476495202342e-07, "logps/chosen": -48.27467727661133, "logps/rejected": -62.7203483581543, "loss": 0.3548, "losses/dpo": 0.1375332623720169, "losses/sft": 0.6341996788978577, "losses/total": 0.1375332623720169, "ref_logps/chosen": -31.364337921142578, "ref_logps/rejected": -31.66867446899414, "rewards/accuracies": 0.8125, "rewards/chosen": -1.691034197807312, "rewards/margins": 1.4141336679458618, "rewards/rejected": -3.105167865753174, "step": 1730 }, { "epoch": 1.63, "grad_norm": 11.874216884049646, "learning_rate": 2.263512260381346e-07, "logps/chosen": -51.721656799316406, "logps/rejected": -93.09309387207031, "loss": 0.1243, "losses/dpo": 0.23599624633789062, "losses/sft": 1.0393520593643188, "losses/total": 0.23599624633789062, "ref_logps/chosen": -34.7602653503418, "ref_logps/rejected": -43.16780471801758, "rewards/accuracies": 1.0, "rewards/chosen": -1.6961393356323242, "rewards/margins": 3.2963900566101074, "rewards/rejected": -4.992529392242432, "step": 1731 }, { "epoch": 1.63, "grad_norm": 15.406659638198686, "learning_rate": 2.2609771166453057e-07, "logps/chosen": -55.9107666015625, "logps/rejected": -86.73988342285156, "loss": 0.176, "losses/dpo": 0.0020050685852766037, "losses/sft": 2.361013174057007, "losses/total": 0.0020050685852766037, "ref_logps/chosen": -36.002532958984375, "ref_logps/rejected": -44.10152816772461, "rewards/accuracies": 1.0, "rewards/chosen": -1.990823745727539, "rewards/margins": 2.273012161254883, "rewards/rejected": -4.263835906982422, "step": 1732 }, { "epoch": 1.63, "grad_norm": 15.10735672523065, "learning_rate": 2.258442220942828e-07, "logps/chosen": -41.89453887939453, "logps/rejected": -63.67473602294922, "loss": 0.205, "losses/dpo": 0.003982866648584604, "losses/sft": 1.6985623836517334, "losses/total": 0.003982866648584604, "ref_logps/chosen": -26.88031005859375, "ref_logps/rejected": -28.89159393310547, "rewards/accuracies": 1.0, "rewards/chosen": -1.5014222860336304, "rewards/margins": 1.9768917560577393, "rewards/rejected": -3.47831392288208, "step": 1733 }, { "epoch": 1.64, "grad_norm": 17.270299009684646, "learning_rate": 2.2559075759043683e-07, "logps/chosen": -56.979156494140625, "logps/rejected": -74.9915771484375, "loss": 0.2161, "losses/dpo": 0.59991455078125, "losses/sft": 0.9862111806869507, "losses/total": 0.59991455078125, "ref_logps/chosen": -37.091064453125, "ref_logps/rejected": -34.851295471191406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9888099431991577, "rewards/margins": 2.025217056274414, "rewards/rejected": -4.014027118682861, "step": 1734 }, { "epoch": 1.64, "grad_norm": 11.67460060951356, "learning_rate": 2.2533731841601225e-07, "logps/chosen": -42.027523040771484, "logps/rejected": -75.72975158691406, "loss": 0.1441, "losses/dpo": 0.4725152254104614, "losses/sft": 0.922329306602478, "losses/total": 0.4725152254104614, "ref_logps/chosen": -28.47024917602539, "ref_logps/rejected": -37.306114196777344, "rewards/accuracies": 1.0, "rewards/chosen": -1.3557275533676147, "rewards/margins": 2.4866366386413574, "rewards/rejected": -3.8423643112182617, "step": 1735 }, { "epoch": 1.64, "grad_norm": 21.047119894849043, "learning_rate": 2.250839048340024e-07, "logps/chosen": -42.130435943603516, "logps/rejected": -70.7964859008789, "loss": 0.3105, "losses/dpo": 0.0022130149882286787, "losses/sft": 2.1634275913238525, "losses/total": 0.0022130149882286787, "ref_logps/chosen": -29.118406295776367, "ref_logps/rejected": -32.71942138671875, "rewards/accuracies": 0.875, "rewards/chosen": -1.3012031316757202, "rewards/margins": 2.506503105163574, "rewards/rejected": -3.807706356048584, "step": 1736 }, { "epoch": 1.64, "grad_norm": 22.643513768566343, "learning_rate": 2.2483051710737408e-07, "logps/chosen": -51.18182373046875, "logps/rejected": -71.82383728027344, "loss": 0.3155, "losses/dpo": 0.054268985986709595, "losses/sft": 1.7406115531921387, "losses/total": 0.054268985986709595, "ref_logps/chosen": -31.464214324951172, "ref_logps/rejected": -34.42535400390625, "rewards/accuracies": 0.875, "rewards/chosen": -1.9717607498168945, "rewards/margins": 1.7680882215499878, "rewards/rejected": -3.739849090576172, "step": 1737 }, { "epoch": 1.64, "grad_norm": 29.078442444666, "learning_rate": 2.2457715549906717e-07, "logps/chosen": -62.112117767333984, "logps/rejected": -85.78421783447266, "loss": 0.267, "losses/dpo": 0.10794480890035629, "losses/sft": 1.7501901388168335, "losses/total": 0.10794480890035629, "ref_logps/chosen": -39.58230972290039, "ref_logps/rejected": -38.59516143798828, "rewards/accuracies": 0.9375, "rewards/chosen": -2.252980947494507, "rewards/margins": 2.465925931930542, "rewards/rejected": -4.718906879425049, "step": 1738 }, { "epoch": 1.64, "grad_norm": 17.808564176931014, "learning_rate": 2.2432382027199448e-07, "logps/chosen": -51.414710998535156, "logps/rejected": -78.41193389892578, "loss": 0.1533, "losses/dpo": 0.1376917064189911, "losses/sft": 1.1449768543243408, "losses/total": 0.1376917064189911, "ref_logps/chosen": -30.928234100341797, "ref_logps/rejected": -32.005615234375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.048647880554199, "rewards/margins": 2.591984272003174, "rewards/rejected": -4.640632152557373, "step": 1739 }, { "epoch": 1.64, "grad_norm": 18.17147244199787, "learning_rate": 2.2407051168904145e-07, "logps/chosen": -38.209285736083984, "logps/rejected": -63.75984191894531, "loss": 0.3136, "losses/dpo": 0.08934782445430756, "losses/sft": 1.4261010885238647, "losses/total": 0.08934782445430756, "ref_logps/chosen": -25.27802848815918, "ref_logps/rejected": -30.906173706054688, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2931257486343384, "rewards/margins": 1.9922412633895874, "rewards/rejected": -3.285367012023926, "step": 1740 }, { "epoch": 1.64, "grad_norm": 18.87681424996315, "learning_rate": 2.2381723001306597e-07, "logps/chosen": -47.54729461669922, "logps/rejected": -69.74908447265625, "loss": 0.2323, "losses/dpo": 0.16679830849170685, "losses/sft": 0.45346924662590027, "losses/total": 0.16679830849170685, "ref_logps/chosen": -27.59787940979004, "ref_logps/rejected": -32.62733459472656, "rewards/accuracies": 1.0, "rewards/chosen": -1.9949417114257812, "rewards/margins": 1.7172340154647827, "rewards/rejected": -3.7121758460998535, "step": 1741 }, { "epoch": 1.64, "grad_norm": 15.517577392650894, "learning_rate": 2.2356397550689789e-07, "logps/chosen": -40.587303161621094, "logps/rejected": -81.11550903320312, "loss": 0.1726, "losses/dpo": 0.0025778624694794416, "losses/sft": 0.8531028032302856, "losses/total": 0.0025778624694794416, "ref_logps/chosen": -26.765300750732422, "ref_logps/rejected": -36.914337158203125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3822002410888672, "rewards/margins": 3.037916660308838, "rewards/rejected": -4.420116901397705, "step": 1742 }, { "epoch": 1.64, "grad_norm": 16.535190365552033, "learning_rate": 2.2331074843333885e-07, "logps/chosen": -45.063385009765625, "logps/rejected": -79.79359436035156, "loss": 0.2315, "losses/dpo": 0.03805039823055267, "losses/sft": 1.9183592796325684, "losses/total": 0.03805039823055267, "ref_logps/chosen": -26.94108009338379, "ref_logps/rejected": -41.94749450683594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8122303485870361, "rewards/margins": 1.9723793268203735, "rewards/rejected": -3.784609794616699, "step": 1743 }, { "epoch": 1.65, "grad_norm": 13.412722874362487, "learning_rate": 2.2305754905516214e-07, "logps/chosen": -46.94789123535156, "logps/rejected": -81.44883728027344, "loss": 0.12, "losses/dpo": 0.06309617310762405, "losses/sft": 1.5649709701538086, "losses/total": 0.06309617310762405, "ref_logps/chosen": -31.273021697998047, "ref_logps/rejected": -37.68297576904297, "rewards/accuracies": 1.0, "rewards/chosen": -1.567487359046936, "rewards/margins": 2.8090991973876953, "rewards/rejected": -4.376586437225342, "step": 1744 }, { "epoch": 1.65, "grad_norm": 24.20935801534682, "learning_rate": 2.228043776351122e-07, "logps/chosen": -50.94013595581055, "logps/rejected": -76.10325622558594, "loss": 0.2682, "losses/dpo": 0.05018678307533264, "losses/sft": 1.5612701177597046, "losses/total": 0.05018678307533264, "ref_logps/chosen": -31.992725372314453, "ref_logps/rejected": -36.7916145324707, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8947409391403198, "rewards/margins": 2.036423683166504, "rewards/rejected": -3.931164503097534, "step": 1745 }, { "epoch": 1.65, "grad_norm": 12.661829065130473, "learning_rate": 2.2255123443590448e-07, "logps/chosen": -40.923133850097656, "logps/rejected": -71.95945739746094, "loss": 0.1761, "losses/dpo": 0.05235496163368225, "losses/sft": 0.9655537605285645, "losses/total": 0.05235496163368225, "ref_logps/chosen": -25.004432678222656, "ref_logps/rejected": -32.91352081298828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.591869592666626, "rewards/margins": 2.3127238750457764, "rewards/rejected": -3.9045934677124023, "step": 1746 }, { "epoch": 1.65, "grad_norm": 18.538598120031402, "learning_rate": 2.222981197202252e-07, "logps/chosen": -57.998260498046875, "logps/rejected": -70.47949981689453, "loss": 0.2822, "losses/dpo": 0.048808299005031586, "losses/sft": 0.8126518726348877, "losses/total": 0.048808299005031586, "ref_logps/chosen": -37.078182220458984, "ref_logps/rejected": -29.8776912689209, "rewards/accuracies": 0.9375, "rewards/chosen": -2.092008113861084, "rewards/margins": 1.968172311782837, "rewards/rejected": -4.060180187225342, "step": 1747 }, { "epoch": 1.65, "grad_norm": 14.189756479139364, "learning_rate": 2.2204503375073103e-07, "logps/chosen": -34.118465423583984, "logps/rejected": -59.49951171875, "loss": 0.2314, "losses/dpo": 0.6617621183395386, "losses/sft": 1.0950748920440674, "losses/total": 0.6617621183395386, "ref_logps/chosen": -20.566843032836914, "ref_logps/rejected": -26.352008819580078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3551623821258545, "rewards/margins": 1.9595880508422852, "rewards/rejected": -3.3147504329681396, "step": 1748 }, { "epoch": 1.65, "grad_norm": 25.241377842051275, "learning_rate": 2.217919767900486e-07, "logps/chosen": -51.66968536376953, "logps/rejected": -82.72459411621094, "loss": 0.371, "losses/dpo": 1.0646506547927856, "losses/sft": 1.630411982536316, "losses/total": 1.0646506547927856, "ref_logps/chosen": -34.89293670654297, "ref_logps/rejected": -46.2094612121582, "rewards/accuracies": 0.875, "rewards/chosen": -1.6776752471923828, "rewards/margins": 1.973838210105896, "rewards/rejected": -3.6515133380889893, "step": 1749 }, { "epoch": 1.65, "grad_norm": 19.654164662536814, "learning_rate": 2.215389491007748e-07, "logps/chosen": -34.35638427734375, "logps/rejected": -69.17939758300781, "loss": 0.2708, "losses/dpo": 0.07105179131031036, "losses/sft": 1.6983479261398315, "losses/total": 0.07105179131031036, "ref_logps/chosen": -19.284393310546875, "ref_logps/rejected": -30.664648056030273, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5071990489959717, "rewards/margins": 2.344275951385498, "rewards/rejected": -3.8514747619628906, "step": 1750 }, { "epoch": 1.65, "grad_norm": 9.83494915124206, "learning_rate": 2.2128595094547588e-07, "logps/chosen": -39.5886116027832, "logps/rejected": -87.26260375976562, "loss": 0.1377, "losses/dpo": 0.0024544629268348217, "losses/sft": 1.247136116027832, "losses/total": 0.0024544629268348217, "ref_logps/chosen": -23.392410278320312, "ref_logps/rejected": -41.961463928222656, "rewards/accuracies": 1.0, "rewards/chosen": -1.6196198463439941, "rewards/margins": 2.910494089126587, "rewards/rejected": -4.53011417388916, "step": 1751 }, { "epoch": 1.65, "grad_norm": 25.962997929482437, "learning_rate": 2.210329825866875e-07, "logps/chosen": -43.30696105957031, "logps/rejected": -67.1041488647461, "loss": 0.5132, "losses/dpo": 0.052901677787303925, "losses/sft": 0.9774656295776367, "losses/total": 0.052901677787303925, "ref_logps/chosen": -22.685047149658203, "ref_logps/rejected": -29.44293212890625, "rewards/accuracies": 0.75, "rewards/chosen": -2.0621910095214844, "rewards/margins": 1.7039307355880737, "rewards/rejected": -3.7661213874816895, "step": 1752 }, { "epoch": 1.65, "grad_norm": 13.176394009081589, "learning_rate": 2.2078004428691443e-07, "logps/chosen": -47.62222671508789, "logps/rejected": -74.6699447631836, "loss": 0.2123, "losses/dpo": 0.23476065695285797, "losses/sft": 1.1593106985092163, "losses/total": 0.23476065695285797, "ref_logps/chosen": -31.69056510925293, "ref_logps/rejected": -38.15045166015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5931659936904907, "rewards/margins": 2.0587828159332275, "rewards/rejected": -3.651948928833008, "step": 1753 }, { "epoch": 1.65, "grad_norm": 15.170466360962173, "learning_rate": 2.205271363086302e-07, "logps/chosen": -43.81224822998047, "logps/rejected": -78.68208312988281, "loss": 0.2691, "losses/dpo": 0.010670578107237816, "losses/sft": 0.7652752995491028, "losses/total": 0.010670578107237816, "ref_logps/chosen": -25.965173721313477, "ref_logps/rejected": -38.17053985595703, "rewards/accuracies": 0.875, "rewards/chosen": -1.7847071886062622, "rewards/margins": 2.266447067260742, "rewards/rejected": -4.051154136657715, "step": 1754 }, { "epoch": 1.66, "grad_norm": 28.560440934640003, "learning_rate": 2.2027425891427695e-07, "logps/chosen": -49.50389862060547, "logps/rejected": -74.63764953613281, "loss": 0.298, "losses/dpo": 0.06625477224588394, "losses/sft": 1.1976375579833984, "losses/total": 0.06625477224588394, "ref_logps/chosen": -29.048532485961914, "ref_logps/rejected": -36.71393585205078, "rewards/accuracies": 0.9375, "rewards/chosen": -2.045536518096924, "rewards/margins": 1.7468348741531372, "rewards/rejected": -3.7923712730407715, "step": 1755 }, { "epoch": 1.66, "grad_norm": 17.14328676775454, "learning_rate": 2.20021412366265e-07, "logps/chosen": -40.62357711791992, "logps/rejected": -56.09864044189453, "loss": 0.2238, "losses/dpo": 0.15126171708106995, "losses/sft": 0.7038993239402771, "losses/total": 0.15126171708106995, "ref_logps/chosen": -28.672943115234375, "ref_logps/rejected": -25.139751434326172, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1950633525848389, "rewards/margins": 1.9008255004882812, "rewards/rejected": -3.09588885307312, "step": 1756 }, { "epoch": 1.66, "grad_norm": 17.956222041531763, "learning_rate": 2.197685969269727e-07, "logps/chosen": -43.211273193359375, "logps/rejected": -55.95500946044922, "loss": 0.324, "losses/dpo": 0.027726413682103157, "losses/sft": 1.8842332363128662, "losses/total": 0.027726413682103157, "ref_logps/chosen": -26.61529541015625, "ref_logps/rejected": -27.41497802734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.659597635269165, "rewards/margins": 1.1944057941436768, "rewards/rejected": -2.854003429412842, "step": 1757 }, { "epoch": 1.66, "grad_norm": 14.290995607823692, "learning_rate": 2.1951581285874614e-07, "logps/chosen": -42.646080017089844, "logps/rejected": -60.61488723754883, "loss": 0.1848, "losses/dpo": 0.016802897676825523, "losses/sft": 1.7180569171905518, "losses/total": 0.016802897676825523, "ref_logps/chosen": -30.17223358154297, "ref_logps/rejected": -25.342266082763672, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2473849058151245, "rewards/margins": 2.279877185821533, "rewards/rejected": -3.5272622108459473, "step": 1758 }, { "epoch": 1.66, "grad_norm": 17.138446018674614, "learning_rate": 2.1926306042389886e-07, "logps/chosen": -44.47677230834961, "logps/rejected": -62.821807861328125, "loss": 0.2563, "losses/dpo": 0.3769366145133972, "losses/sft": 1.3666332960128784, "losses/total": 0.3769366145133972, "ref_logps/chosen": -29.142024993896484, "ref_logps/rejected": -29.185298919677734, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5334746837615967, "rewards/margins": 1.8301761150360107, "rewards/rejected": -3.3636507987976074, "step": 1759 }, { "epoch": 1.66, "grad_norm": 9.225711519335208, "learning_rate": 2.1901033988471155e-07, "logps/chosen": -43.13740539550781, "logps/rejected": -78.08718872070312, "loss": 0.1059, "losses/dpo": 0.3157990574836731, "losses/sft": 1.0664818286895752, "losses/total": 0.3157990574836731, "ref_logps/chosen": -29.101531982421875, "ref_logps/rejected": -38.40013885498047, "rewards/accuracies": 1.0, "rewards/chosen": -1.4035874605178833, "rewards/margins": 2.5651164054870605, "rewards/rejected": -3.9687042236328125, "step": 1760 }, { "epoch": 1.66, "grad_norm": 22.31172173835255, "learning_rate": 2.1875765150343174e-07, "logps/chosen": -59.10236740112305, "logps/rejected": -76.85816192626953, "loss": 0.2852, "losses/dpo": 0.7495242953300476, "losses/sft": 1.7451924085617065, "losses/total": 0.7495242953300476, "ref_logps/chosen": -39.00106430053711, "ref_logps/rejected": -35.9586296081543, "rewards/accuracies": 0.875, "rewards/chosen": -2.0101301670074463, "rewards/margins": 2.0798230171203613, "rewards/rejected": -4.0899529457092285, "step": 1761 }, { "epoch": 1.66, "grad_norm": 8.396051736159606, "learning_rate": 2.185049955422737e-07, "logps/chosen": -43.503761291503906, "logps/rejected": -88.2507095336914, "loss": 0.0782, "losses/dpo": 0.027292076498270035, "losses/sft": 0.08890163153409958, "losses/total": 0.027292076498270035, "ref_logps/chosen": -27.287952423095703, "ref_logps/rejected": -42.68241882324219, "rewards/accuracies": 1.0, "rewards/chosen": -1.6215808391571045, "rewards/margins": 2.935248613357544, "rewards/rejected": -4.556829452514648, "step": 1762 }, { "epoch": 1.66, "grad_norm": 17.556068300578772, "learning_rate": 2.1825237226341797e-07, "logps/chosen": -50.34785079956055, "logps/rejected": -69.86410522460938, "loss": 0.2401, "losses/dpo": 1.045094609260559, "losses/sft": 1.3174591064453125, "losses/total": 1.045094609260559, "ref_logps/chosen": -33.51023864746094, "ref_logps/rejected": -32.570587158203125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6837611198425293, "rewards/margins": 2.045590877532959, "rewards/rejected": -3.7293519973754883, "step": 1763 }, { "epoch": 1.66, "grad_norm": 21.029484556579828, "learning_rate": 2.1799978192901128e-07, "logps/chosen": -52.116310119628906, "logps/rejected": -81.25984191894531, "loss": 0.2579, "losses/dpo": 0.1754544973373413, "losses/sft": 2.5854132175445557, "losses/total": 0.1754544973373413, "ref_logps/chosen": -33.540626525878906, "ref_logps/rejected": -39.60148620605469, "rewards/accuracies": 0.9375, "rewards/chosen": -1.857568383216858, "rewards/margins": 2.308267593383789, "rewards/rejected": -4.165836334228516, "step": 1764 }, { "epoch": 1.67, "grad_norm": 13.109246659379451, "learning_rate": 2.1774722480116605e-07, "logps/chosen": -47.201656341552734, "logps/rejected": -90.9590072631836, "loss": 0.1637, "losses/dpo": 0.02625391259789467, "losses/sft": 2.623748540878296, "losses/total": 0.02625391259789467, "ref_logps/chosen": -27.57964324951172, "ref_logps/rejected": -47.03551483154297, "rewards/accuracies": 1.0, "rewards/chosen": -1.9622013568878174, "rewards/margins": 2.430148124694824, "rewards/rejected": -4.3923492431640625, "step": 1765 }, { "epoch": 1.67, "grad_norm": 16.445910197039865, "learning_rate": 2.174947011419603e-07, "logps/chosen": -36.57024383544922, "logps/rejected": -52.85034942626953, "loss": 0.2927, "losses/dpo": 0.1897663176059723, "losses/sft": 1.696777105331421, "losses/total": 0.1897663176059723, "ref_logps/chosen": -22.548809051513672, "ref_logps/rejected": -20.867305755615234, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4021435976028442, "rewards/margins": 1.7961604595184326, "rewards/rejected": -3.1983039379119873, "step": 1766 }, { "epoch": 1.67, "grad_norm": 16.67526571829083, "learning_rate": 2.1724221121343733e-07, "logps/chosen": -37.51417541503906, "logps/rejected": -68.23750305175781, "loss": 0.2125, "losses/dpo": 0.14197784662246704, "losses/sft": 1.1496909856796265, "losses/total": 0.14197784662246704, "ref_logps/chosen": -22.73615837097168, "ref_logps/rejected": -30.87883758544922, "rewards/accuracies": 0.9375, "rewards/chosen": -1.477802038192749, "rewards/margins": 2.2580647468566895, "rewards/rejected": -3.7358670234680176, "step": 1767 }, { "epoch": 1.67, "grad_norm": 18.23866642770414, "learning_rate": 2.169897552776055e-07, "logps/chosen": -39.370201110839844, "logps/rejected": -63.54962158203125, "loss": 0.2474, "losses/dpo": 0.037069808691740036, "losses/sft": 1.1828033924102783, "losses/total": 0.037069808691740036, "ref_logps/chosen": -24.242834091186523, "ref_logps/rejected": -30.191936492919922, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5127366781234741, "rewards/margins": 1.8230319023132324, "rewards/rejected": -3.335768699645996, "step": 1768 }, { "epoch": 1.67, "grad_norm": 18.713444591641295, "learning_rate": 2.167373335964377e-07, "logps/chosen": -50.64347457885742, "logps/rejected": -89.81675720214844, "loss": 0.2254, "losses/dpo": 0.46372562646865845, "losses/sft": 2.3001177310943604, "losses/total": 0.46372562646865845, "ref_logps/chosen": -33.42962646484375, "ref_logps/rejected": -40.022377014160156, "rewards/accuracies": 0.875, "rewards/chosen": -1.721384882926941, "rewards/margins": 3.2580530643463135, "rewards/rejected": -4.979438304901123, "step": 1769 }, { "epoch": 1.67, "grad_norm": 13.255850678459842, "learning_rate": 2.1648494643187149e-07, "logps/chosen": -37.02099609375, "logps/rejected": -67.46208190917969, "loss": 0.1592, "losses/dpo": 0.024601858109235764, "losses/sft": 0.6508243083953857, "losses/total": 0.024601858109235764, "ref_logps/chosen": -27.000059127807617, "ref_logps/rejected": -33.02073669433594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.00209379196167, "rewards/margins": 2.44204044342041, "rewards/rejected": -3.44413423538208, "step": 1770 }, { "epoch": 1.67, "grad_norm": 22.145580796250062, "learning_rate": 2.1623259404580848e-07, "logps/chosen": -39.66795349121094, "logps/rejected": -79.8292236328125, "loss": 0.2843, "losses/dpo": 0.10447347909212112, "losses/sft": 1.5967960357666016, "losses/total": 0.10447347909212112, "ref_logps/chosen": -23.52006721496582, "ref_logps/rejected": -38.51547622680664, "rewards/accuracies": 0.875, "rewards/chosen": -1.6147890090942383, "rewards/margins": 2.5165865421295166, "rewards/rejected": -4.131375312805176, "step": 1771 }, { "epoch": 1.67, "grad_norm": 24.198915967595745, "learning_rate": 2.1598027670011426e-07, "logps/chosen": -41.283443450927734, "logps/rejected": -74.08280944824219, "loss": 0.3434, "losses/dpo": 0.37855443358421326, "losses/sft": 1.210308313369751, "losses/total": 0.37855443358421326, "ref_logps/chosen": -22.216217041015625, "ref_logps/rejected": -35.998661041259766, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9067227840423584, "rewards/margins": 1.9016923904418945, "rewards/rejected": -3.808414936065674, "step": 1772 }, { "epoch": 1.67, "grad_norm": 19.284227690221755, "learning_rate": 2.1572799465661797e-07, "logps/chosen": -52.281494140625, "logps/rejected": -71.97247314453125, "loss": 0.2755, "losses/dpo": 0.04582522064447403, "losses/sft": 0.6528494358062744, "losses/total": 0.04582522064447403, "ref_logps/chosen": -34.441932678222656, "ref_logps/rejected": -33.428550720214844, "rewards/accuracies": 0.875, "rewards/chosen": -1.7839564085006714, "rewards/margins": 2.070436477661133, "rewards/rejected": -3.8543925285339355, "step": 1773 }, { "epoch": 1.67, "grad_norm": 25.268704700007746, "learning_rate": 2.1547574817711218e-07, "logps/chosen": -46.43940734863281, "logps/rejected": -56.82270812988281, "loss": 0.3979, "losses/dpo": 0.07077234238386154, "losses/sft": 0.8996348977088928, "losses/total": 0.07077234238386154, "ref_logps/chosen": -30.47357940673828, "ref_logps/rejected": -25.425687789916992, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5965826511383057, "rewards/margins": 1.5431196689605713, "rewards/rejected": -3.139702320098877, "step": 1774 }, { "epoch": 1.67, "grad_norm": 23.376309197232477, "learning_rate": 2.1522353752335263e-07, "logps/chosen": -44.37974548339844, "logps/rejected": -76.22279357910156, "loss": 0.351, "losses/dpo": 0.10927741974592209, "losses/sft": 1.5267983675003052, "losses/total": 0.10927741974592209, "ref_logps/chosen": -26.651138305664062, "ref_logps/rejected": -37.27848815917969, "rewards/accuracies": 0.75, "rewards/chosen": -1.7728607654571533, "rewards/margins": 2.121570110321045, "rewards/rejected": -3.8944311141967773, "step": 1775 }, { "epoch": 1.68, "grad_norm": 26.665759211447188, "learning_rate": 2.1497136295705775e-07, "logps/chosen": -54.299041748046875, "logps/rejected": -69.39103698730469, "loss": 0.4378, "losses/dpo": 0.34912896156311035, "losses/sft": 1.377199411392212, "losses/total": 0.34912896156311035, "ref_logps/chosen": -30.46295738220215, "ref_logps/rejected": -33.01316452026367, "rewards/accuracies": 0.75, "rewards/chosen": -2.383608341217041, "rewards/margins": 1.254178524017334, "rewards/rejected": -3.637786865234375, "step": 1776 }, { "epoch": 1.68, "grad_norm": 24.81250657131154, "learning_rate": 2.1471922473990858e-07, "logps/chosen": -35.801918029785156, "logps/rejected": -67.19467163085938, "loss": 0.5382, "losses/dpo": 0.3732699155807495, "losses/sft": 0.29573971033096313, "losses/total": 0.3732699155807495, "ref_logps/chosen": -17.701759338378906, "ref_logps/rejected": -31.424718856811523, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8100160360336304, "rewards/margins": 1.766979694366455, "rewards/rejected": -3.576995611190796, "step": 1777 }, { "epoch": 1.68, "grad_norm": 12.011838464562805, "learning_rate": 2.1446712313354846e-07, "logps/chosen": -27.428661346435547, "logps/rejected": -63.88801193237305, "loss": 0.1917, "losses/dpo": 0.06874135881662369, "losses/sft": 0.17946715652942657, "losses/total": 0.06874135881662369, "ref_logps/chosen": -15.66684341430664, "ref_logps/rejected": -25.870521545410156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1761817932128906, "rewards/margins": 2.6255674362182617, "rewards/rejected": -3.8017497062683105, "step": 1778 }, { "epoch": 1.68, "grad_norm": 17.468866394375578, "learning_rate": 2.1421505839958267e-07, "logps/chosen": -41.98310089111328, "logps/rejected": -77.69672393798828, "loss": 0.207, "losses/dpo": 0.0018899569986388087, "losses/sft": 1.4945491552352905, "losses/total": 0.0018899569986388087, "ref_logps/chosen": -25.413841247558594, "ref_logps/rejected": -38.601890563964844, "rewards/accuracies": 1.0, "rewards/chosen": -1.6569263935089111, "rewards/margins": 2.2525572776794434, "rewards/rejected": -3.9094839096069336, "step": 1779 }, { "epoch": 1.68, "grad_norm": 19.34991329613161, "learning_rate": 2.1396303079957832e-07, "logps/chosen": -46.878177642822266, "logps/rejected": -90.0653076171875, "loss": 0.1928, "losses/dpo": 0.01873818412423134, "losses/sft": 0.8061766028404236, "losses/total": 0.01873818412423134, "ref_logps/chosen": -25.885326385498047, "ref_logps/rejected": -44.05261993408203, "rewards/accuracies": 0.9375, "rewards/chosen": -2.099285364151001, "rewards/margins": 2.501983404159546, "rewards/rejected": -4.601268768310547, "step": 1780 }, { "epoch": 1.68, "grad_norm": 26.766792125339027, "learning_rate": 2.137110405950639e-07, "logps/chosen": -45.6655158996582, "logps/rejected": -68.66731262207031, "loss": 0.3956, "losses/dpo": 0.7398976683616638, "losses/sft": 1.3031642436981201, "losses/total": 0.7398976683616638, "ref_logps/chosen": -25.23446273803711, "ref_logps/rejected": -31.485782623291016, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0431056022644043, "rewards/margins": 1.6750472784042358, "rewards/rejected": -3.7181527614593506, "step": 1781 }, { "epoch": 1.68, "grad_norm": 20.975185262510287, "learning_rate": 2.134590880475292e-07, "logps/chosen": -48.12535858154297, "logps/rejected": -66.54090881347656, "loss": 0.2934, "losses/dpo": 0.14219461381435394, "losses/sft": 1.9335103034973145, "losses/total": 0.14219461381435394, "ref_logps/chosen": -33.19419860839844, "ref_logps/rejected": -32.799903869628906, "rewards/accuracies": 0.875, "rewards/chosen": -1.4931161403656006, "rewards/margins": 1.8809840679168701, "rewards/rejected": -3.3741002082824707, "step": 1782 }, { "epoch": 1.68, "grad_norm": 19.79174962550712, "learning_rate": 2.1320717341842464e-07, "logps/chosen": -43.705657958984375, "logps/rejected": -86.04524230957031, "loss": 0.2362, "losses/dpo": 0.03380502015352249, "losses/sft": 1.1764084100723267, "losses/total": 0.03380502015352249, "ref_logps/chosen": -24.895896911621094, "ref_logps/rejected": -41.6482048034668, "rewards/accuracies": 0.875, "rewards/chosen": -1.8809763193130493, "rewards/margins": 2.5587267875671387, "rewards/rejected": -4.439702987670898, "step": 1783 }, { "epoch": 1.68, "grad_norm": 14.559151971515368, "learning_rate": 2.1295529696916187e-07, "logps/chosen": -50.65794372558594, "logps/rejected": -68.44568634033203, "loss": 0.2026, "losses/dpo": 0.057032760232686996, "losses/sft": 2.3918263912200928, "losses/total": 0.057032760232686996, "ref_logps/chosen": -31.020877838134766, "ref_logps/rejected": -26.920429229736328, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9637064933776855, "rewards/margins": 2.188819646835327, "rewards/rejected": -4.152525901794434, "step": 1784 }, { "epoch": 1.68, "grad_norm": 21.13704980166872, "learning_rate": 2.1270345896111242e-07, "logps/chosen": -46.038578033447266, "logps/rejected": -77.45632934570312, "loss": 0.233, "losses/dpo": 0.03573845699429512, "losses/sft": 2.524688482284546, "losses/total": 0.03573845699429512, "ref_logps/chosen": -24.72777557373047, "ref_logps/rejected": -36.87851333618164, "rewards/accuracies": 0.875, "rewards/chosen": -2.131080150604248, "rewards/margins": 1.926701545715332, "rewards/rejected": -4.057781219482422, "step": 1785 }, { "epoch": 1.68, "grad_norm": 19.247116835536186, "learning_rate": 2.1245165965560796e-07, "logps/chosen": -48.13048553466797, "logps/rejected": -78.6148910522461, "loss": 0.2657, "losses/dpo": 0.037247300148010254, "losses/sft": 1.6079072952270508, "losses/total": 0.037247300148010254, "ref_logps/chosen": -29.542795181274414, "ref_logps/rejected": -36.816650390625, "rewards/accuracies": 0.875, "rewards/chosen": -1.8587690591812134, "rewards/margins": 2.3210551738739014, "rewards/rejected": -4.179824352264404, "step": 1786 }, { "epoch": 1.69, "grad_norm": 22.762527909293933, "learning_rate": 2.121998993139403e-07, "logps/chosen": -43.87397003173828, "logps/rejected": -64.79363250732422, "loss": 0.3513, "losses/dpo": 0.45527327060699463, "losses/sft": 0.5017714500427246, "losses/total": 0.45527327060699463, "ref_logps/chosen": -28.085979461669922, "ref_logps/rejected": -28.269424438476562, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5787992477416992, "rewards/margins": 2.0736217498779297, "rewards/rejected": -3.652420997619629, "step": 1787 }, { "epoch": 1.69, "grad_norm": 17.36537383551203, "learning_rate": 2.1194817819736058e-07, "logps/chosen": -47.45977020263672, "logps/rejected": -71.7544937133789, "loss": 0.2344, "losses/dpo": 0.04596106335520744, "losses/sft": 0.8755618333816528, "losses/total": 0.04596106335520744, "ref_logps/chosen": -27.885570526123047, "ref_logps/rejected": -33.51491928100586, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9574203491210938, "rewards/margins": 1.8665375709533691, "rewards/rejected": -3.823957920074463, "step": 1788 }, { "epoch": 1.69, "grad_norm": 28.313808914428755, "learning_rate": 2.1169649656707926e-07, "logps/chosen": -45.60970687866211, "logps/rejected": -87.31629943847656, "loss": 0.287, "losses/dpo": 0.3910108804702759, "losses/sft": 1.7402527332305908, "losses/total": 0.3910108804702759, "ref_logps/chosen": -27.662254333496094, "ref_logps/rejected": -43.894317626953125, "rewards/accuracies": 0.875, "rewards/chosen": -1.794745683670044, "rewards/margins": 2.5474531650543213, "rewards/rejected": -4.342198848724365, "step": 1789 }, { "epoch": 1.69, "grad_norm": 21.38281111692428, "learning_rate": 2.1144485468426588e-07, "logps/chosen": -33.09284210205078, "logps/rejected": -69.37309265136719, "loss": 0.2701, "losses/dpo": 0.007628267165273428, "losses/sft": 0.9183155298233032, "losses/total": 0.007628267165273428, "ref_logps/chosen": -17.618867874145508, "ref_logps/rejected": -30.860301971435547, "rewards/accuracies": 0.875, "rewards/chosen": -1.547397255897522, "rewards/margins": 2.3038816452026367, "rewards/rejected": -3.851278781890869, "step": 1790 }, { "epoch": 1.69, "grad_norm": 21.11798950465996, "learning_rate": 2.1119325281004867e-07, "logps/chosen": -50.65167999267578, "logps/rejected": -73.29559326171875, "loss": 0.3569, "losses/dpo": 0.006955782417207956, "losses/sft": 0.9906240701675415, "losses/total": 0.006955782417207956, "ref_logps/chosen": -34.48381042480469, "ref_logps/rejected": -37.15162658691406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6167871952056885, "rewards/margins": 1.997609257698059, "rewards/rejected": -3.614396333694458, "step": 1791 }, { "epoch": 1.69, "grad_norm": 13.146133697232644, "learning_rate": 2.1094169120551447e-07, "logps/chosen": -46.19035339355469, "logps/rejected": -82.50373077392578, "loss": 0.1256, "losses/dpo": 0.11490020155906677, "losses/sft": 0.8786349296569824, "losses/total": 0.11490020155906677, "ref_logps/chosen": -30.72202491760254, "ref_logps/rejected": -37.175594329833984, "rewards/accuracies": 1.0, "rewards/chosen": -1.54683256149292, "rewards/margins": 2.985980987548828, "rewards/rejected": -4.532813549041748, "step": 1792 }, { "epoch": 1.69, "grad_norm": 19.366950682538064, "learning_rate": 2.1069017013170817e-07, "logps/chosen": -44.05651092529297, "logps/rejected": -76.12604522705078, "loss": 0.2844, "losses/dpo": 0.046371664851903915, "losses/sft": 1.6748237609863281, "losses/total": 0.046371664851903915, "ref_logps/chosen": -26.512371063232422, "ref_logps/rejected": -38.91546630859375, "rewards/accuracies": 0.875, "rewards/chosen": -1.7544142007827759, "rewards/margins": 1.9666435718536377, "rewards/rejected": -3.721057891845703, "step": 1793 }, { "epoch": 1.69, "grad_norm": 30.161703444152312, "learning_rate": 2.104386898496327e-07, "logps/chosen": -57.9234619140625, "logps/rejected": -70.68209838867188, "loss": 0.4095, "losses/dpo": 0.19059693813323975, "losses/sft": 1.4028639793395996, "losses/total": 0.19059693813323975, "ref_logps/chosen": -36.16295623779297, "ref_logps/rejected": -33.61328887939453, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1760506629943848, "rewards/margins": 1.53083074092865, "rewards/rejected": -3.706881284713745, "step": 1794 }, { "epoch": 1.69, "grad_norm": 11.985606327546606, "learning_rate": 2.1018725062024862e-07, "logps/chosen": -43.608978271484375, "logps/rejected": -73.87721252441406, "loss": 0.1732, "losses/dpo": 0.24049639701843262, "losses/sft": 0.629115879535675, "losses/total": 0.24049639701843262, "ref_logps/chosen": -27.739473342895508, "ref_logps/rejected": -34.353294372558594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5869507789611816, "rewards/margins": 2.36544132232666, "rewards/rejected": -3.952392101287842, "step": 1795 }, { "epoch": 1.69, "grad_norm": 18.636783940526374, "learning_rate": 2.0993585270447397e-07, "logps/chosen": -51.37494659423828, "logps/rejected": -79.81936645507812, "loss": 0.1894, "losses/dpo": 0.07937640696763992, "losses/sft": 1.6454341411590576, "losses/total": 0.07937640696763992, "ref_logps/chosen": -31.388835906982422, "ref_logps/rejected": -38.663246154785156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9986112117767334, "rewards/margins": 2.117001533508301, "rewards/rejected": -4.115612983703613, "step": 1796 }, { "epoch": 1.7, "grad_norm": 22.90888783100189, "learning_rate": 2.0968449636318384e-07, "logps/chosen": -43.505516052246094, "logps/rejected": -65.30705261230469, "loss": 0.3236, "losses/dpo": 1.2198777198791504, "losses/sft": 2.449301242828369, "losses/total": 1.2198777198791504, "ref_logps/chosen": -29.870267868041992, "ref_logps/rejected": -32.116695404052734, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3635250329971313, "rewards/margins": 1.955510139465332, "rewards/rejected": -3.319035053253174, "step": 1797 }, { "epoch": 1.7, "grad_norm": 27.96635355755532, "learning_rate": 2.0943318185721014e-07, "logps/chosen": -52.08725357055664, "logps/rejected": -80.71739196777344, "loss": 0.3622, "losses/dpo": 0.05387360230088234, "losses/sft": 0.7177534699440002, "losses/total": 0.05387360230088234, "ref_logps/chosen": -31.107532501220703, "ref_logps/rejected": -36.368804931640625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0979721546173096, "rewards/margins": 2.3368871212005615, "rewards/rejected": -4.434859275817871, "step": 1798 }, { "epoch": 1.7, "grad_norm": 18.930731134322418, "learning_rate": 2.0918190944734149e-07, "logps/chosen": -42.743568420410156, "logps/rejected": -70.82667541503906, "loss": 0.2964, "losses/dpo": 1.3302721977233887, "losses/sft": 1.5556471347808838, "losses/total": 1.3302721977233887, "ref_logps/chosen": -24.763195037841797, "ref_logps/rejected": -33.75483322143555, "rewards/accuracies": 0.875, "rewards/chosen": -1.7980376482009888, "rewards/margins": 1.9091475009918213, "rewards/rejected": -3.7071850299835205, "step": 1799 }, { "epoch": 1.7, "grad_norm": 20.363904545490765, "learning_rate": 2.0893067939432275e-07, "logps/chosen": -32.39381408691406, "logps/rejected": -50.47711944580078, "loss": 0.3507, "losses/dpo": 0.5780834555625916, "losses/sft": 0.48088109493255615, "losses/total": 0.5780834555625916, "ref_logps/chosen": -20.86392593383789, "ref_logps/rejected": -23.669729232788086, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1529886722564697, "rewards/margins": 1.5277506113052368, "rewards/rejected": -2.680739164352417, "step": 1800 }, { "epoch": 1.7, "grad_norm": 21.6405861517679, "learning_rate": 2.0867949195885482e-07, "logps/chosen": -38.61616516113281, "logps/rejected": -61.36317825317383, "loss": 0.311, "losses/dpo": 0.5724496245384216, "losses/sft": 2.911712408065796, "losses/total": 0.5724496245384216, "ref_logps/chosen": -22.23430824279785, "ref_logps/rejected": -27.80313491821289, "rewards/accuracies": 0.875, "rewards/chosen": -1.638185739517212, "rewards/margins": 1.7178187370300293, "rewards/rejected": -3.356004476547241, "step": 1801 }, { "epoch": 1.7, "grad_norm": 35.66880933039272, "learning_rate": 2.0842834740159437e-07, "logps/chosen": -46.05561828613281, "logps/rejected": -70.44792175292969, "loss": 0.5426, "losses/dpo": 0.218224436044693, "losses/sft": 1.7719978094100952, "losses/total": 0.218224436044693, "ref_logps/chosen": -22.205257415771484, "ref_logps/rejected": -34.70069885253906, "rewards/accuracies": 0.75, "rewards/chosen": -2.385035991668701, "rewards/margins": 1.1896860599517822, "rewards/rejected": -3.5747222900390625, "step": 1802 }, { "epoch": 1.7, "grad_norm": 16.36399708375672, "learning_rate": 2.0817724598315362e-07, "logps/chosen": -36.867149353027344, "logps/rejected": -70.3560562133789, "loss": 0.1718, "losses/dpo": 0.31389492750167847, "losses/sft": 1.9854469299316406, "losses/total": 0.31389492750167847, "ref_logps/chosen": -22.887454986572266, "ref_logps/rejected": -34.786827087402344, "rewards/accuracies": 1.0, "rewards/chosen": -1.3979694843292236, "rewards/margins": 2.1589529514312744, "rewards/rejected": -3.556922435760498, "step": 1803 }, { "epoch": 1.7, "grad_norm": 16.175873314501516, "learning_rate": 2.0792618796409996e-07, "logps/chosen": -33.86431884765625, "logps/rejected": -68.1125717163086, "loss": 0.2089, "losses/dpo": 0.050446610897779465, "losses/sft": 1.5376802682876587, "losses/total": 0.050446610897779465, "ref_logps/chosen": -20.397266387939453, "ref_logps/rejected": -32.767478942871094, "rewards/accuracies": 0.875, "rewards/chosen": -1.346705436706543, "rewards/margins": 2.1878037452697754, "rewards/rejected": -3.5345091819763184, "step": 1804 }, { "epoch": 1.7, "grad_norm": 18.487402429342914, "learning_rate": 2.076751736049559e-07, "logps/chosen": -51.47166061401367, "logps/rejected": -67.09324645996094, "loss": 0.2247, "losses/dpo": 0.07120442390441895, "losses/sft": 1.5376020669937134, "losses/total": 0.07120442390441895, "ref_logps/chosen": -33.62562942504883, "ref_logps/rejected": -30.81966781616211, "rewards/accuracies": 1.0, "rewards/chosen": -1.7846031188964844, "rewards/margins": 1.8427549600601196, "rewards/rejected": -3.6273581981658936, "step": 1805 }, { "epoch": 1.7, "grad_norm": 29.05707607941422, "learning_rate": 2.0742420316619847e-07, "logps/chosen": -50.50535583496094, "logps/rejected": -63.297122955322266, "loss": 0.3708, "losses/dpo": 0.6356156468391418, "losses/sft": 1.6818277835845947, "losses/total": 0.6356156468391418, "ref_logps/chosen": -32.554927825927734, "ref_logps/rejected": -26.833446502685547, "rewards/accuracies": 0.875, "rewards/chosen": -1.795042634010315, "rewards/margins": 1.851325511932373, "rewards/rejected": -3.6463680267333984, "step": 1806 }, { "epoch": 1.7, "grad_norm": 16.55383471934535, "learning_rate": 2.0717327690825913e-07, "logps/chosen": -41.15163040161133, "logps/rejected": -63.42743682861328, "loss": 0.2302, "losses/dpo": 0.01166528556495905, "losses/sft": 1.1059327125549316, "losses/total": 0.01166528556495905, "ref_logps/chosen": -26.145458221435547, "ref_logps/rejected": -26.055561065673828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5006171464920044, "rewards/margins": 2.236570358276367, "rewards/rejected": -3.737187623977661, "step": 1807 }, { "epoch": 1.71, "grad_norm": 21.031560355971706, "learning_rate": 2.0692239509152364e-07, "logps/chosen": -45.037445068359375, "logps/rejected": -78.97473907470703, "loss": 0.2619, "losses/dpo": 0.26356762647628784, "losses/sft": 1.542590618133545, "losses/total": 0.26356762647628784, "ref_logps/chosen": -26.644424438476562, "ref_logps/rejected": -39.567256927490234, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8393017053604126, "rewards/margins": 2.1014463901519775, "rewards/rejected": -3.9407482147216797, "step": 1808 }, { "epoch": 1.71, "grad_norm": 12.527446236622273, "learning_rate": 2.0667155797633154e-07, "logps/chosen": -56.970489501953125, "logps/rejected": -86.10353088378906, "loss": 0.1329, "losses/dpo": 0.2757355272769928, "losses/sft": 0.5218445658683777, "losses/total": 0.2757355272769928, "ref_logps/chosen": -36.40309143066406, "ref_logps/rejected": -38.125633239746094, "rewards/accuracies": 1.0, "rewards/chosen": -2.0567398071289062, "rewards/margins": 2.7410504817962646, "rewards/rejected": -4.797790050506592, "step": 1809 }, { "epoch": 1.71, "grad_norm": 14.91030555774806, "learning_rate": 2.0642076582297593e-07, "logps/chosen": -51.06340408325195, "logps/rejected": -77.5318603515625, "loss": 0.1747, "losses/dpo": 0.01620815508067608, "losses/sft": 1.6951223611831665, "losses/total": 0.01620815508067608, "ref_logps/chosen": -29.486427307128906, "ref_logps/rejected": -33.40199279785156, "rewards/accuracies": 1.0, "rewards/chosen": -2.1576976776123047, "rewards/margins": 2.25528883934021, "rewards/rejected": -4.412986755371094, "step": 1810 }, { "epoch": 1.71, "grad_norm": 21.013384872560355, "learning_rate": 2.0617001889170339e-07, "logps/chosen": -55.07940673828125, "logps/rejected": -86.43697357177734, "loss": 0.2123, "losses/dpo": 0.032964255660772324, "losses/sft": 1.4102756977081299, "losses/total": 0.032964255660772324, "ref_logps/chosen": -35.4669189453125, "ref_logps/rejected": -42.088294982910156, "rewards/accuracies": 0.875, "rewards/chosen": -1.9612489938735962, "rewards/margins": 2.473618268966675, "rewards/rejected": -4.434866905212402, "step": 1811 }, { "epoch": 1.71, "grad_norm": 21.73312540296459, "learning_rate": 2.059193174427134e-07, "logps/chosen": -59.653839111328125, "logps/rejected": -73.35647583007812, "loss": 0.3514, "losses/dpo": 0.026715297251939774, "losses/sft": 1.5661389827728271, "losses/total": 0.026715297251939774, "ref_logps/chosen": -38.1933708190918, "ref_logps/rejected": -34.72644805908203, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1460471153259277, "rewards/margins": 1.7169560194015503, "rewards/rejected": -3.8630027770996094, "step": 1812 }, { "epoch": 1.71, "grad_norm": 18.571150736459686, "learning_rate": 2.0566866173615848e-07, "logps/chosen": -39.99833679199219, "logps/rejected": -62.01145553588867, "loss": 0.298, "losses/dpo": 0.4505768418312073, "losses/sft": 1.639939785003662, "losses/total": 0.4505768418312073, "ref_logps/chosen": -23.32213020324707, "ref_logps/rejected": -27.99996566772461, "rewards/accuracies": 0.875, "rewards/chosen": -1.6676208972930908, "rewards/margins": 1.7335281372070312, "rewards/rejected": -3.401149272918701, "step": 1813 }, { "epoch": 1.71, "grad_norm": 13.143871272587296, "learning_rate": 2.054180520321435e-07, "logps/chosen": -44.06698989868164, "logps/rejected": -95.64669799804688, "loss": 0.114, "losses/dpo": 0.04227929562330246, "losses/sft": 1.3230316638946533, "losses/total": 0.04227929562330246, "ref_logps/chosen": -26.078901290893555, "ref_logps/rejected": -47.744850158691406, "rewards/accuracies": 1.0, "rewards/chosen": -1.7988089323043823, "rewards/margins": 2.99137544631958, "rewards/rejected": -4.790184020996094, "step": 1814 }, { "epoch": 1.71, "grad_norm": 12.153315704881853, "learning_rate": 2.0516748859072562e-07, "logps/chosen": -41.25010681152344, "logps/rejected": -78.50422668457031, "loss": 0.1913, "losses/dpo": 0.6319067478179932, "losses/sft": 1.4156001806259155, "losses/total": 0.6319067478179932, "ref_logps/chosen": -25.443798065185547, "ref_logps/rejected": -38.99017333984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5806310176849365, "rewards/margins": 2.370774269104004, "rewards/rejected": -3.9514050483703613, "step": 1815 }, { "epoch": 1.71, "grad_norm": 19.507783717906165, "learning_rate": 2.0491697167191403e-07, "logps/chosen": -39.2997932434082, "logps/rejected": -70.5454330444336, "loss": 0.2177, "losses/dpo": 0.02870015613734722, "losses/sft": 2.6548235416412354, "losses/total": 0.02870015613734722, "ref_logps/chosen": -23.42422866821289, "ref_logps/rejected": -30.037126541137695, "rewards/accuracies": 0.875, "rewards/chosen": -1.5875563621520996, "rewards/margins": 2.4632742404937744, "rewards/rejected": -4.050830841064453, "step": 1816 }, { "epoch": 1.71, "grad_norm": 15.20899650614926, "learning_rate": 2.0466650153566962e-07, "logps/chosen": -40.3053092956543, "logps/rejected": -77.52742004394531, "loss": 0.1821, "losses/dpo": 0.005029942374676466, "losses/sft": 2.0523531436920166, "losses/total": 0.005029942374676466, "ref_logps/chosen": -23.034950256347656, "ref_logps/rejected": -31.485084533691406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7270358800888062, "rewards/margins": 2.877197027206421, "rewards/rejected": -4.6042327880859375, "step": 1817 }, { "epoch": 1.72, "grad_norm": 22.083991255615402, "learning_rate": 2.0441607844190473e-07, "logps/chosen": -36.094993591308594, "logps/rejected": -55.70845031738281, "loss": 0.3694, "losses/dpo": 0.14297796785831451, "losses/sft": 0.6670050621032715, "losses/total": 0.14297796785831451, "ref_logps/chosen": -21.208538055419922, "ref_logps/rejected": -24.708730697631836, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4886457920074463, "rewards/margins": 1.6113260984420776, "rewards/rejected": -3.0999720096588135, "step": 1818 }, { "epoch": 1.72, "grad_norm": 18.563423659888887, "learning_rate": 2.0416570265048293e-07, "logps/chosen": -43.178462982177734, "logps/rejected": -63.98070526123047, "loss": 0.3008, "losses/dpo": 0.3419981002807617, "losses/sft": 0.3094834089279175, "losses/total": 0.3419981002807617, "ref_logps/chosen": -27.939884185791016, "ref_logps/rejected": -30.182811737060547, "rewards/accuracies": 0.875, "rewards/chosen": -1.5238580703735352, "rewards/margins": 1.8559308052062988, "rewards/rejected": -3.379788875579834, "step": 1819 }, { "epoch": 1.72, "grad_norm": 21.97334534394629, "learning_rate": 2.0391537442121868e-07, "logps/chosen": -50.27375030517578, "logps/rejected": -81.39229583740234, "loss": 0.2805, "losses/dpo": 0.01550209615379572, "losses/sft": 1.6526716947555542, "losses/total": 0.01550209615379572, "ref_logps/chosen": -28.360881805419922, "ref_logps/rejected": -37.87116241455078, "rewards/accuracies": 1.0, "rewards/chosen": -2.1912872791290283, "rewards/margins": 2.1608262062072754, "rewards/rejected": -4.352113723754883, "step": 1820 }, { "epoch": 1.72, "grad_norm": 17.346316733859496, "learning_rate": 2.0366509401387696e-07, "logps/chosen": -45.912574768066406, "logps/rejected": -78.01548767089844, "loss": 0.2136, "losses/dpo": 0.5077234506607056, "losses/sft": 1.1709036827087402, "losses/total": 0.5077234506607056, "ref_logps/chosen": -29.24095916748047, "ref_logps/rejected": -38.93292236328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6671617031097412, "rewards/margins": 2.2410948276519775, "rewards/rejected": -3.9082565307617188, "step": 1821 }, { "epoch": 1.72, "grad_norm": 17.976865544587948, "learning_rate": 2.0341486168817342e-07, "logps/chosen": -46.080665588378906, "logps/rejected": -67.69998168945312, "loss": 0.2724, "losses/dpo": 0.7106265425682068, "losses/sft": 1.7463035583496094, "losses/total": 0.7106265425682068, "ref_logps/chosen": -30.44420623779297, "ref_logps/rejected": -31.890535354614258, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5636460781097412, "rewards/margins": 2.017298698425293, "rewards/rejected": -3.580944776535034, "step": 1822 }, { "epoch": 1.72, "grad_norm": 15.724419178966016, "learning_rate": 2.0316467770377355e-07, "logps/chosen": -48.42083740234375, "logps/rejected": -79.8192138671875, "loss": 0.1948, "losses/dpo": 0.018691806122660637, "losses/sft": 1.2703622579574585, "losses/total": 0.018691806122660637, "ref_logps/chosen": -29.56385040283203, "ref_logps/rejected": -37.714900970458984, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8856981992721558, "rewards/margins": 2.3247337341308594, "rewards/rejected": -4.210432052612305, "step": 1823 }, { "epoch": 1.72, "grad_norm": 31.099750343829072, "learning_rate": 2.0291454232029277e-07, "logps/chosen": -48.34716796875, "logps/rejected": -62.84562683105469, "loss": 0.5444, "losses/dpo": 0.18469709157943726, "losses/sft": 1.0794670581817627, "losses/total": 0.18469709157943726, "ref_logps/chosen": -31.27045249938965, "ref_logps/rejected": -26.890159606933594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.707671880722046, "rewards/margins": 1.8878753185272217, "rewards/rejected": -3.5955471992492676, "step": 1824 }, { "epoch": 1.72, "grad_norm": 13.11968671640192, "learning_rate": 2.0266445579729602e-07, "logps/chosen": -42.915550231933594, "logps/rejected": -73.49390411376953, "loss": 0.2134, "losses/dpo": 0.055094826966524124, "losses/sft": 1.7131587266921997, "losses/total": 0.055094826966524124, "ref_logps/chosen": -27.70074462890625, "ref_logps/rejected": -33.59857940673828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.521480679512024, "rewards/margins": 2.4680521488189697, "rewards/rejected": -3.989532709121704, "step": 1825 }, { "epoch": 1.72, "grad_norm": 21.69386301453207, "learning_rate": 2.0241441839429766e-07, "logps/chosen": -44.419395446777344, "logps/rejected": -66.16650390625, "loss": 0.3216, "losses/dpo": 0.060948196798563004, "losses/sft": 1.291718602180481, "losses/total": 0.060948196798563004, "ref_logps/chosen": -30.447189331054688, "ref_logps/rejected": -32.6075325012207, "rewards/accuracies": 0.875, "rewards/chosen": -1.3972209692001343, "rewards/margins": 1.9586759805679321, "rewards/rejected": -3.3558969497680664, "step": 1826 }, { "epoch": 1.72, "grad_norm": 12.740817613374139, "learning_rate": 2.0216443037076091e-07, "logps/chosen": -32.03684616088867, "logps/rejected": -62.60367965698242, "loss": 0.1865, "losses/dpo": 0.08476562798023224, "losses/sft": 1.0266791582107544, "losses/total": 0.08476562798023224, "ref_logps/chosen": -20.05780792236328, "ref_logps/rejected": -25.958101272583008, "rewards/accuracies": 1.0, "rewards/chosen": -1.1979036331176758, "rewards/margins": 2.4666543006896973, "rewards/rejected": -3.664557933807373, "step": 1827 }, { "epoch": 1.72, "grad_norm": 18.87586300028938, "learning_rate": 2.0191449198609794e-07, "logps/chosen": -49.482574462890625, "logps/rejected": -90.0111083984375, "loss": 0.1542, "losses/dpo": 0.03856416419148445, "losses/sft": 1.3147094249725342, "losses/total": 0.03856416419148445, "ref_logps/chosen": -34.48843002319336, "ref_logps/rejected": -43.158164978027344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4994146823883057, "rewards/margins": 3.185879945755005, "rewards/rejected": -4.6852946281433105, "step": 1828 }, { "epoch": 1.73, "grad_norm": 24.048987695064053, "learning_rate": 2.0166460349966914e-07, "logps/chosen": -39.773075103759766, "logps/rejected": -68.77421569824219, "loss": 0.3579, "losses/dpo": 0.6493552923202515, "losses/sft": 2.445167303085327, "losses/total": 0.6493552923202515, "ref_logps/chosen": -21.694490432739258, "ref_logps/rejected": -31.394365310668945, "rewards/accuracies": 0.875, "rewards/chosen": -1.8078584671020508, "rewards/margins": 1.930126428604126, "rewards/rejected": -3.737985134124756, "step": 1829 }, { "epoch": 1.73, "grad_norm": 16.15310295805119, "learning_rate": 2.014147651707835e-07, "logps/chosen": -47.683631896972656, "logps/rejected": -81.38330078125, "loss": 0.245, "losses/dpo": 0.018391238525509834, "losses/sft": 1.4220452308654785, "losses/total": 0.018391238525509834, "ref_logps/chosen": -30.13445281982422, "ref_logps/rejected": -39.03313446044922, "rewards/accuracies": 0.875, "rewards/chosen": -1.754918098449707, "rewards/margins": 2.4800987243652344, "rewards/rejected": -4.235016822814941, "step": 1830 }, { "epoch": 1.73, "grad_norm": 17.381634317863806, "learning_rate": 2.0116497725869758e-07, "logps/chosen": -32.834075927734375, "logps/rejected": -67.70655822753906, "loss": 0.261, "losses/dpo": 0.020084364339709282, "losses/sft": 1.9574815034866333, "losses/total": 0.020084364339709282, "ref_logps/chosen": -18.094640731811523, "ref_logps/rejected": -33.207618713378906, "rewards/accuracies": 0.875, "rewards/chosen": -1.4739433526992798, "rewards/margins": 1.9759504795074463, "rewards/rejected": -3.4498937129974365, "step": 1831 }, { "epoch": 1.73, "grad_norm": 19.813940672932045, "learning_rate": 2.0091524002261588e-07, "logps/chosen": -47.20208740234375, "logps/rejected": -75.14486694335938, "loss": 0.3136, "losses/dpo": 0.0006112066330388188, "losses/sft": 1.471347451210022, "losses/total": 0.0006112066330388188, "ref_logps/chosen": -28.91836929321289, "ref_logps/rejected": -34.129241943359375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8283721208572388, "rewards/margins": 2.273190498352051, "rewards/rejected": -4.1015625, "step": 1832 }, { "epoch": 1.73, "grad_norm": 17.900325664118714, "learning_rate": 2.0066555372169017e-07, "logps/chosen": -43.83747100830078, "logps/rejected": -74.59333801269531, "loss": 0.2326, "losses/dpo": 0.2836589217185974, "losses/sft": 2.1104252338409424, "losses/total": 0.2836589217185974, "ref_logps/chosen": -25.613162994384766, "ref_logps/rejected": -35.66532897949219, "rewards/accuracies": 1.0, "rewards/chosen": -1.8224304914474487, "rewards/margins": 2.0703697204589844, "rewards/rejected": -3.8928000926971436, "step": 1833 }, { "epoch": 1.73, "grad_norm": 14.671588126843584, "learning_rate": 2.0041591861501943e-07, "logps/chosen": -34.43102264404297, "logps/rejected": -68.67103576660156, "loss": 0.2589, "losses/dpo": 0.005979393143206835, "losses/sft": 2.3365511894226074, "losses/total": 0.005979393143206835, "ref_logps/chosen": -19.920238494873047, "ref_logps/rejected": -31.990312576293945, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4510787725448608, "rewards/margins": 2.216993808746338, "rewards/rejected": -3.6680727005004883, "step": 1834 }, { "epoch": 1.73, "grad_norm": 14.631547478510754, "learning_rate": 2.0016633496164957e-07, "logps/chosen": -37.5110969543457, "logps/rejected": -76.40032958984375, "loss": 0.2326, "losses/dpo": 0.510457456111908, "losses/sft": 0.7423537969589233, "losses/total": 0.510457456111908, "ref_logps/chosen": -24.150089263916016, "ref_logps/rejected": -34.67899703979492, "rewards/accuracies": 0.875, "rewards/chosen": -1.3361008167266846, "rewards/margins": 2.8360323905944824, "rewards/rejected": -4.172133445739746, "step": 1835 }, { "epoch": 1.73, "grad_norm": 34.36254685527735, "learning_rate": 1.9991680302057292e-07, "logps/chosen": -51.18808364868164, "logps/rejected": -65.38687896728516, "loss": 0.5987, "losses/dpo": 0.23903042078018188, "losses/sft": 0.24395246803760529, "losses/total": 0.23903042078018188, "ref_logps/chosen": -29.875274658203125, "ref_logps/rejected": -29.814414978027344, "rewards/accuracies": 0.75, "rewards/chosen": -2.1312808990478516, "rewards/margins": 1.4259655475616455, "rewards/rejected": -3.557246685028076, "step": 1836 }, { "epoch": 1.73, "grad_norm": 22.988224542153276, "learning_rate": 1.996673230507284e-07, "logps/chosen": -45.33837127685547, "logps/rejected": -60.02286911010742, "loss": 0.3262, "losses/dpo": 1.108525276184082, "losses/sft": 2.2815122604370117, "losses/total": 1.108525276184082, "ref_logps/chosen": -30.983116149902344, "ref_logps/rejected": -27.300888061523438, "rewards/accuracies": 0.875, "rewards/chosen": -1.4355255365371704, "rewards/margins": 1.8366725444793701, "rewards/rejected": -3.27219820022583, "step": 1837 }, { "epoch": 1.73, "grad_norm": 17.408551874586774, "learning_rate": 1.9941789531100073e-07, "logps/chosen": -39.698333740234375, "logps/rejected": -63.729244232177734, "loss": 0.2934, "losses/dpo": 0.12830758094787598, "losses/sft": 1.527014136314392, "losses/total": 0.12830758094787598, "ref_logps/chosen": -26.152713775634766, "ref_logps/rejected": -32.96944808959961, "rewards/accuracies": 0.875, "rewards/chosen": -1.3545622825622559, "rewards/margins": 1.7214171886444092, "rewards/rejected": -3.075979232788086, "step": 1838 }, { "epoch": 1.73, "grad_norm": 25.380777345912705, "learning_rate": 1.991685200602207e-07, "logps/chosen": -43.06573486328125, "logps/rejected": -63.57730484008789, "loss": 0.3472, "losses/dpo": 0.25939542055130005, "losses/sft": 0.7399962544441223, "losses/total": 0.25939542055130005, "ref_logps/chosen": -24.97681427001953, "ref_logps/rejected": -29.1684513092041, "rewards/accuracies": 0.75, "rewards/chosen": -1.808892011642456, "rewards/margins": 1.6319937705993652, "rewards/rejected": -3.440885543823242, "step": 1839 }, { "epoch": 1.74, "grad_norm": 13.308987169959796, "learning_rate": 1.9891919755716444e-07, "logps/chosen": -51.825618743896484, "logps/rejected": -85.09367370605469, "loss": 0.1262, "losses/dpo": 0.09059280157089233, "losses/sft": 1.6608505249023438, "losses/total": 0.09059280157089233, "ref_logps/chosen": -37.439697265625, "ref_logps/rejected": -41.449642181396484, "rewards/accuracies": 1.0, "rewards/chosen": -1.4385920763015747, "rewards/margins": 2.9258110523223877, "rewards/rejected": -4.364402770996094, "step": 1840 }, { "epoch": 1.74, "grad_norm": 25.202446192108773, "learning_rate": 1.9866992806055343e-07, "logps/chosen": -46.19084930419922, "logps/rejected": -79.61540222167969, "loss": 0.3851, "losses/dpo": 0.999357283115387, "losses/sft": 2.230675458908081, "losses/total": 0.999357283115387, "ref_logps/chosen": -28.08599853515625, "ref_logps/rejected": -35.80476760864258, "rewards/accuracies": 0.75, "rewards/chosen": -1.8104851245880127, "rewards/margins": 2.57057785987854, "rewards/rejected": -4.381062984466553, "step": 1841 }, { "epoch": 1.74, "grad_norm": 20.93711040692169, "learning_rate": 1.984207118290541e-07, "logps/chosen": -37.41297912597656, "logps/rejected": -60.0831298828125, "loss": 0.3162, "losses/dpo": 0.20217742025852203, "losses/sft": 1.6433061361312866, "losses/total": 0.20217742025852203, "ref_logps/chosen": -21.341110229492188, "ref_logps/rejected": -25.033557891845703, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6071867942810059, "rewards/margins": 1.8977704048156738, "rewards/rejected": -3.504957437515259, "step": 1842 }, { "epoch": 1.74, "grad_norm": 15.689756456569992, "learning_rate": 1.9817154912127764e-07, "logps/chosen": -48.26507568359375, "logps/rejected": -69.02723693847656, "loss": 0.2445, "losses/dpo": 0.0034005220513790846, "losses/sft": 1.0414621829986572, "losses/total": 0.0034005220513790846, "ref_logps/chosen": -30.750404357910156, "ref_logps/rejected": -30.01174545288086, "rewards/accuracies": 0.9375, "rewards/chosen": -1.751467227935791, "rewards/margins": 2.1500821113586426, "rewards/rejected": -3.9015493392944336, "step": 1843 }, { "epoch": 1.74, "grad_norm": 17.058836357476352, "learning_rate": 1.979224401957797e-07, "logps/chosen": -45.0399169921875, "logps/rejected": -65.59105682373047, "loss": 0.232, "losses/dpo": 0.18535350263118744, "losses/sft": 1.8230832815170288, "losses/total": 0.18535350263118744, "ref_logps/chosen": -27.75438690185547, "ref_logps/rejected": -30.18304443359375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.728553295135498, "rewards/margins": 1.8122479915618896, "rewards/rejected": -3.5408010482788086, "step": 1844 }, { "epoch": 1.74, "grad_norm": 23.646747600104735, "learning_rate": 1.9767338531106006e-07, "logps/chosen": -46.34044647216797, "logps/rejected": -83.38282775878906, "loss": 0.3053, "losses/dpo": 0.0003143893845845014, "losses/sft": 1.985962986946106, "losses/total": 0.0003143893845845014, "ref_logps/chosen": -26.767990112304688, "ref_logps/rejected": -41.92462921142578, "rewards/accuracies": 0.875, "rewards/chosen": -1.9572460651397705, "rewards/margins": 2.1885733604431152, "rewards/rejected": -4.145819664001465, "step": 1845 }, { "epoch": 1.74, "grad_norm": 25.322249393466805, "learning_rate": 1.9742438472556244e-07, "logps/chosen": -53.752628326416016, "logps/rejected": -66.82719421386719, "loss": 0.4006, "losses/dpo": 0.5706372261047363, "losses/sft": 1.3567687273025513, "losses/total": 0.5706372261047363, "ref_logps/chosen": -35.96656799316406, "ref_logps/rejected": -36.067962646484375, "rewards/accuracies": 0.75, "rewards/chosen": -1.7786060571670532, "rewards/margins": 1.2973178625106812, "rewards/rejected": -3.0759239196777344, "step": 1846 }, { "epoch": 1.74, "grad_norm": 13.82147254904727, "learning_rate": 1.971754386976744e-07, "logps/chosen": -42.915977478027344, "logps/rejected": -74.99737548828125, "loss": 0.1481, "losses/dpo": 0.08014640212059021, "losses/sft": 1.0199511051177979, "losses/total": 0.08014640212059021, "ref_logps/chosen": -29.84493064880371, "ref_logps/rejected": -34.611392974853516, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3071048259735107, "rewards/margins": 2.7314929962158203, "rewards/rejected": -4.03859806060791, "step": 1847 }, { "epoch": 1.74, "grad_norm": 13.62907177478638, "learning_rate": 1.9692654748572662e-07, "logps/chosen": -43.93564224243164, "logps/rejected": -70.44972229003906, "loss": 0.2445, "losses/dpo": 0.09978022426366806, "losses/sft": 1.0878430604934692, "losses/total": 0.09978022426366806, "ref_logps/chosen": -27.891149520874023, "ref_logps/rejected": -32.174171447753906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6044492721557617, "rewards/margins": 2.2231056690216064, "rewards/rejected": -3.8275551795959473, "step": 1848 }, { "epoch": 1.74, "grad_norm": 19.330482204785703, "learning_rate": 1.96677711347993e-07, "logps/chosen": -37.13566207885742, "logps/rejected": -69.46117401123047, "loss": 0.2044, "losses/dpo": 0.00665627745911479, "losses/sft": 0.452237069606781, "losses/total": 0.00665627745911479, "ref_logps/chosen": -23.715776443481445, "ref_logps/rejected": -30.87993049621582, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3419886827468872, "rewards/margins": 2.5161356925964355, "rewards/rejected": -3.858124256134033, "step": 1849 }, { "epoch": 1.75, "grad_norm": 20.810626384985653, "learning_rate": 1.9642893054269033e-07, "logps/chosen": -41.88091278076172, "logps/rejected": -66.37342071533203, "loss": 0.3875, "losses/dpo": 0.17872491478919983, "losses/sft": 0.9100199341773987, "losses/total": 0.17872491478919983, "ref_logps/chosen": -23.394390106201172, "ref_logps/rejected": -33.69822692871094, "rewards/accuracies": 0.75, "rewards/chosen": -1.8486524820327759, "rewards/margins": 1.4188666343688965, "rewards/rejected": -3.267518997192383, "step": 1850 }, { "epoch": 1.75, "grad_norm": 29.247571647941356, "learning_rate": 1.9618020532797794e-07, "logps/chosen": -46.62902069091797, "logps/rejected": -59.73168182373047, "loss": 0.4057, "losses/dpo": 1.3806648254394531, "losses/sft": 1.563230037689209, "losses/total": 1.3806648254394531, "ref_logps/chosen": -24.756031036376953, "ref_logps/rejected": -25.52804183959961, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1872987747192383, "rewards/margins": 1.233064889907837, "rewards/rejected": -3.420363664627075, "step": 1851 }, { "epoch": 1.75, "grad_norm": 25.017754972685818, "learning_rate": 1.9593153596195747e-07, "logps/chosen": -43.173797607421875, "logps/rejected": -67.07186126708984, "loss": 0.4563, "losses/dpo": 2.8293004035949707, "losses/sft": 2.477250337600708, "losses/total": 2.8293004035949707, "ref_logps/chosen": -26.338300704956055, "ref_logps/rejected": -32.314823150634766, "rewards/accuracies": 0.875, "rewards/chosen": -1.6835497617721558, "rewards/margins": 1.79215407371521, "rewards/rejected": -3.475703716278076, "step": 1852 }, { "epoch": 1.75, "grad_norm": 23.195633732401046, "learning_rate": 1.9568292270267262e-07, "logps/chosen": -52.421897888183594, "logps/rejected": -82.89418029785156, "loss": 0.2885, "losses/dpo": 0.09886397421360016, "losses/sft": 1.3621423244476318, "losses/total": 0.09886397421360016, "ref_logps/chosen": -32.083614349365234, "ref_logps/rejected": -38.01274108886719, "rewards/accuracies": 0.875, "rewards/chosen": -2.0338282585144043, "rewards/margins": 2.4543161392211914, "rewards/rejected": -4.4881439208984375, "step": 1853 }, { "epoch": 1.75, "grad_norm": 19.98907065607733, "learning_rate": 1.9543436580810887e-07, "logps/chosen": -49.1458854675293, "logps/rejected": -59.88701629638672, "loss": 0.3205, "losses/dpo": 0.277382493019104, "losses/sft": 2.1388769149780273, "losses/total": 0.277382493019104, "ref_logps/chosen": -32.74658203125, "ref_logps/rejected": -28.908245086669922, "rewards/accuracies": 0.875, "rewards/chosen": -1.6399304866790771, "rewards/margins": 1.45794677734375, "rewards/rejected": -3.0978775024414062, "step": 1854 }, { "epoch": 1.75, "grad_norm": 10.751656246057177, "learning_rate": 1.9518586553619316e-07, "logps/chosen": -39.535247802734375, "logps/rejected": -60.872825622558594, "loss": 0.1597, "losses/dpo": 0.016439104452729225, "losses/sft": 1.5635181665420532, "losses/total": 0.016439104452729225, "ref_logps/chosen": -26.617206573486328, "ref_logps/rejected": -26.176063537597656, "rewards/accuracies": 1.0, "rewards/chosen": -1.2918040752410889, "rewards/margins": 2.1778717041015625, "rewards/rejected": -3.4696757793426514, "step": 1855 }, { "epoch": 1.75, "grad_norm": 19.766930105990856, "learning_rate": 1.949374221447938e-07, "logps/chosen": -56.001991271972656, "logps/rejected": -77.51371765136719, "loss": 0.2954, "losses/dpo": 0.030849698930978775, "losses/sft": 1.862028956413269, "losses/total": 0.030849698930978775, "ref_logps/chosen": -35.77962112426758, "ref_logps/rejected": -32.97008514404297, "rewards/accuracies": 0.875, "rewards/chosen": -2.0222373008728027, "rewards/margins": 2.432126522064209, "rewards/rejected": -4.454363822937012, "step": 1856 }, { "epoch": 1.75, "grad_norm": 24.619186835840253, "learning_rate": 1.9468903589171997e-07, "logps/chosen": -37.051536560058594, "logps/rejected": -58.20011901855469, "loss": 0.3563, "losses/dpo": 0.7277657389640808, "losses/sft": 1.4955745935440063, "losses/total": 0.7277657389640808, "ref_logps/chosen": -23.49554443359375, "ref_logps/rejected": -26.574926376342773, "rewards/accuracies": 0.75, "rewards/chosen": -1.3555991649627686, "rewards/margins": 1.8069202899932861, "rewards/rejected": -3.1625194549560547, "step": 1857 }, { "epoch": 1.75, "grad_norm": 27.508631402154766, "learning_rate": 1.9444070703472153e-07, "logps/chosen": -44.648155212402344, "logps/rejected": -70.8765869140625, "loss": 0.3929, "losses/dpo": 0.012264568358659744, "losses/sft": 0.899861752986908, "losses/total": 0.012264568358659744, "ref_logps/chosen": -26.03215789794922, "ref_logps/rejected": -31.840211868286133, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8615996837615967, "rewards/margins": 2.0420377254486084, "rewards/rejected": -3.903637409210205, "step": 1858 }, { "epoch": 1.75, "grad_norm": 17.076226790959897, "learning_rate": 1.9419243583148893e-07, "logps/chosen": -36.705116271972656, "logps/rejected": -60.45794677734375, "loss": 0.2923, "losses/dpo": 0.10868658870458603, "losses/sft": 1.2082163095474243, "losses/total": 0.10868658870458603, "ref_logps/chosen": -24.95285415649414, "ref_logps/rejected": -27.82168960571289, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1752262115478516, "rewards/margins": 2.0883991718292236, "rewards/rejected": -3.263625144958496, "step": 1859 }, { "epoch": 1.75, "grad_norm": 16.039574219889264, "learning_rate": 1.9394422253965262e-07, "logps/chosen": -45.56523132324219, "logps/rejected": -81.2020263671875, "loss": 0.1918, "losses/dpo": 0.023469509556889534, "losses/sft": 1.9750854969024658, "losses/total": 0.023469509556889534, "ref_logps/chosen": -30.218421936035156, "ref_logps/rejected": -39.84780502319336, "rewards/accuracies": 1.0, "rewards/chosen": -1.5346810817718506, "rewards/margins": 2.6007416248321533, "rewards/rejected": -4.135422706604004, "step": 1860 }, { "epoch": 1.76, "grad_norm": 27.105004610799632, "learning_rate": 1.93696067416783e-07, "logps/chosen": -55.46660614013672, "logps/rejected": -82.129638671875, "loss": 0.282, "losses/dpo": 0.005760112311691046, "losses/sft": 0.3249993920326233, "losses/total": 0.005760112311691046, "ref_logps/chosen": -33.07682800292969, "ref_logps/rejected": -39.09412384033203, "rewards/accuracies": 0.875, "rewards/chosen": -2.2389779090881348, "rewards/margins": 2.0645737648010254, "rewards/rejected": -4.30355167388916, "step": 1861 }, { "epoch": 1.76, "grad_norm": 16.33805859744638, "learning_rate": 1.9344797072039024e-07, "logps/chosen": -34.19944763183594, "logps/rejected": -68.99260711669922, "loss": 0.2463, "losses/dpo": 0.6223173141479492, "losses/sft": 0.8870053887367249, "losses/total": 0.6223173141479492, "ref_logps/chosen": -24.68155288696289, "ref_logps/rejected": -36.00293731689453, "rewards/accuracies": 0.9375, "rewards/chosen": -0.951789379119873, "rewards/margins": 2.347177028656006, "rewards/rejected": -3.298966407775879, "step": 1862 }, { "epoch": 1.76, "grad_norm": 18.365500029462563, "learning_rate": 1.9319993270792364e-07, "logps/chosen": -38.33457946777344, "logps/rejected": -63.21196746826172, "loss": 0.3104, "losses/dpo": 0.0704483836889267, "losses/sft": 1.1172504425048828, "losses/total": 0.0704483836889267, "ref_logps/chosen": -23.449443817138672, "ref_logps/rejected": -27.167728424072266, "rewards/accuracies": 0.8125, "rewards/chosen": -1.488513469696045, "rewards/margins": 2.115910530090332, "rewards/rejected": -3.604423999786377, "step": 1863 }, { "epoch": 1.76, "grad_norm": 21.267462938529945, "learning_rate": 1.929519536367719e-07, "logps/chosen": -35.58766174316406, "logps/rejected": -58.27900314331055, "loss": 0.3627, "losses/dpo": 0.48868200182914734, "losses/sft": 0.7182791829109192, "losses/total": 0.48868200182914734, "ref_logps/chosen": -20.229389190673828, "ref_logps/rejected": -28.19063949584961, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5358272790908813, "rewards/margins": 1.4730088710784912, "rewards/rejected": -3.008836269378662, "step": 1864 }, { "epoch": 1.76, "grad_norm": 12.825296290261578, "learning_rate": 1.927040337642623e-07, "logps/chosen": -38.153053283691406, "logps/rejected": -71.34842681884766, "loss": 0.2261, "losses/dpo": 0.025218456983566284, "losses/sft": 1.1507976055145264, "losses/total": 0.025218456983566284, "ref_logps/chosen": -25.511049270629883, "ref_logps/rejected": -34.90440368652344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2642004489898682, "rewards/margins": 2.380201816558838, "rewards/rejected": -3.644402503967285, "step": 1865 }, { "epoch": 1.76, "grad_norm": 25.25755625132531, "learning_rate": 1.9245617334766084e-07, "logps/chosen": -38.18147277832031, "logps/rejected": -63.67568588256836, "loss": 0.4747, "losses/dpo": 0.003162599168717861, "losses/sft": 0.78315269947052, "losses/total": 0.003162599168717861, "ref_logps/chosen": -23.779417037963867, "ref_logps/rejected": -32.93841552734375, "rewards/accuracies": 0.75, "rewards/chosen": -1.4402055740356445, "rewards/margins": 1.633521318435669, "rewards/rejected": -3.0737268924713135, "step": 1866 }, { "epoch": 1.76, "grad_norm": 19.063263898669295, "learning_rate": 1.9220837264417176e-07, "logps/chosen": -35.33039093017578, "logps/rejected": -60.849464416503906, "loss": 0.3692, "losses/dpo": 0.45280614495277405, "losses/sft": 1.2007455825805664, "losses/total": 0.45280614495277405, "ref_logps/chosen": -21.81812286376953, "ref_logps/rejected": -33.9069938659668, "rewards/accuracies": 0.875, "rewards/chosen": -1.351227045059204, "rewards/margins": 1.343019962310791, "rewards/rejected": -2.694247007369995, "step": 1867 }, { "epoch": 1.76, "grad_norm": 18.28367460287466, "learning_rate": 1.9196063191093731e-07, "logps/chosen": -55.27790069580078, "logps/rejected": -82.73681640625, "loss": 0.1684, "losses/dpo": 0.015016703866422176, "losses/sft": 2.20687198638916, "losses/total": 0.015016703866422176, "ref_logps/chosen": -34.91338348388672, "ref_logps/rejected": -37.972930908203125, "rewards/accuracies": 1.0, "rewards/chosen": -2.036451816558838, "rewards/margins": 2.439936876296997, "rewards/rejected": -4.476388454437256, "step": 1868 }, { "epoch": 1.76, "grad_norm": 19.74646940725796, "learning_rate": 1.917129514050376e-07, "logps/chosen": -46.45881652832031, "logps/rejected": -80.83940124511719, "loss": 0.2163, "losses/dpo": 1.2055273056030273, "losses/sft": 1.1969276666641235, "losses/total": 1.2055273056030273, "ref_logps/chosen": -28.590747833251953, "ref_logps/rejected": -34.066070556640625, "rewards/accuracies": 0.875, "rewards/chosen": -1.7868070602416992, "rewards/margins": 2.890526294708252, "rewards/rejected": -4.677332878112793, "step": 1869 }, { "epoch": 1.76, "grad_norm": 10.109624917638747, "learning_rate": 1.9146533138349013e-07, "logps/chosen": -39.03748321533203, "logps/rejected": -71.05303955078125, "loss": 0.1383, "losses/dpo": 0.062476176768541336, "losses/sft": 0.3909705579280853, "losses/total": 0.062476176768541336, "ref_logps/chosen": -28.45667266845703, "ref_logps/rejected": -33.89583206176758, "rewards/accuracies": 1.0, "rewards/chosen": -1.0580809116363525, "rewards/margins": 2.657639980316162, "rewards/rejected": -3.7157208919525146, "step": 1870 }, { "epoch": 1.77, "grad_norm": 28.61655568108199, "learning_rate": 1.9121777210324975e-07, "logps/chosen": -42.2423095703125, "logps/rejected": -57.76631164550781, "loss": 0.4911, "losses/dpo": 0.2240193635225296, "losses/sft": 1.5953660011291504, "losses/total": 0.2240193635225296, "ref_logps/chosen": -26.43741226196289, "ref_logps/rejected": -25.830432891845703, "rewards/accuracies": 0.875, "rewards/chosen": -1.5804893970489502, "rewards/margins": 1.613098382949829, "rewards/rejected": -3.1935877799987793, "step": 1871 }, { "epoch": 1.77, "grad_norm": 12.14360877624651, "learning_rate": 1.9097027382120814e-07, "logps/chosen": -39.41382598876953, "logps/rejected": -71.89685821533203, "loss": 0.171, "losses/dpo": 0.026782190427184105, "losses/sft": 0.9211391806602478, "losses/total": 0.026782190427184105, "ref_logps/chosen": -24.648563385009766, "ref_logps/rejected": -34.17087173461914, "rewards/accuracies": 1.0, "rewards/chosen": -1.476525902748108, "rewards/margins": 2.296072483062744, "rewards/rejected": -3.7725982666015625, "step": 1872 }, { "epoch": 1.77, "grad_norm": 18.680406379730357, "learning_rate": 1.907228367941939e-07, "logps/chosen": -26.362646102905273, "logps/rejected": -68.19088745117188, "loss": 0.3318, "losses/dpo": 0.0024624504148960114, "losses/sft": 1.6600722074508667, "losses/total": 0.0024624504148960114, "ref_logps/chosen": -13.34976577758789, "ref_logps/rejected": -30.93532943725586, "rewards/accuracies": 0.75, "rewards/chosen": -1.3012878894805908, "rewards/margins": 2.424267530441284, "rewards/rejected": -3.725555419921875, "step": 1873 }, { "epoch": 1.77, "grad_norm": 18.332914173177514, "learning_rate": 1.9047546127897183e-07, "logps/chosen": -40.800079345703125, "logps/rejected": -78.02334594726562, "loss": 0.28, "losses/dpo": 0.012895755469799042, "losses/sft": 1.1053762435913086, "losses/total": 0.012895755469799042, "ref_logps/chosen": -24.245729446411133, "ref_logps/rejected": -38.06733322143555, "rewards/accuracies": 0.9375, "rewards/chosen": -1.65543532371521, "rewards/margins": 2.3401663303375244, "rewards/rejected": -3.9956016540527344, "step": 1874 }, { "epoch": 1.77, "grad_norm": 10.28113099138493, "learning_rate": 1.9022814753224303e-07, "logps/chosen": -35.014434814453125, "logps/rejected": -76.98099517822266, "loss": 0.1138, "losses/dpo": 0.2159355878829956, "losses/sft": 0.9146163463592529, "losses/total": 0.2159355878829956, "ref_logps/chosen": -22.98752784729004, "ref_logps/rejected": -37.08372497558594, "rewards/accuracies": 1.0, "rewards/chosen": -1.2026904821395874, "rewards/margins": 2.787036418914795, "rewards/rejected": -3.989727020263672, "step": 1875 }, { "epoch": 1.77, "grad_norm": 14.262345649783821, "learning_rate": 1.899808958106445e-07, "logps/chosen": -41.36073303222656, "logps/rejected": -62.95549774169922, "loss": 0.1917, "losses/dpo": 0.5902440547943115, "losses/sft": 0.6980459690093994, "losses/total": 0.5902440547943115, "ref_logps/chosen": -25.61549186706543, "ref_logps/rejected": -25.090438842773438, "rewards/accuracies": 1.0, "rewards/chosen": -1.5745240449905396, "rewards/margins": 2.211982250213623, "rewards/rejected": -3.786506175994873, "step": 1876 }, { "epoch": 1.77, "grad_norm": 10.73371687338947, "learning_rate": 1.8973370637074882e-07, "logps/chosen": -39.073612213134766, "logps/rejected": -77.12162017822266, "loss": 0.1424, "losses/dpo": 0.1140456348657608, "losses/sft": 2.346010208129883, "losses/total": 0.1140456348657608, "ref_logps/chosen": -24.470779418945312, "ref_logps/rejected": -36.518428802490234, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4602833986282349, "rewards/margins": 2.6000354290008545, "rewards/rejected": -4.060318946838379, "step": 1877 }, { "epoch": 1.77, "grad_norm": 15.335361699803052, "learning_rate": 1.8948657946906397e-07, "logps/chosen": -44.26184844970703, "logps/rejected": -77.1544189453125, "loss": 0.1764, "losses/dpo": 0.03511284664273262, "losses/sft": 1.0068389177322388, "losses/total": 0.03511284664273262, "ref_logps/chosen": -26.444608688354492, "ref_logps/rejected": -34.24185562133789, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7817237377166748, "rewards/margins": 2.5095319747924805, "rewards/rejected": -4.291255950927734, "step": 1878 }, { "epoch": 1.77, "grad_norm": 23.597403368935826, "learning_rate": 1.8923951536203293e-07, "logps/chosen": -44.75363540649414, "logps/rejected": -74.20936584472656, "loss": 0.3154, "losses/dpo": 0.0072501408867537975, "losses/sft": 1.9473834037780762, "losses/total": 0.0072501408867537975, "ref_logps/chosen": -26.27121925354004, "ref_logps/rejected": -34.797340393066406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8482418060302734, "rewards/margins": 2.0929603576660156, "rewards/rejected": -3.941202163696289, "step": 1879 }, { "epoch": 1.77, "grad_norm": 20.78412155335397, "learning_rate": 1.8899251430603384e-07, "logps/chosen": -45.836341857910156, "logps/rejected": -66.04273223876953, "loss": 0.2622, "losses/dpo": 0.0015450877835974097, "losses/sft": 0.40001383423805237, "losses/total": 0.0015450877835974097, "ref_logps/chosen": -26.72789192199707, "ref_logps/rejected": -27.650400161743164, "rewards/accuracies": 0.875, "rewards/chosen": -1.9108448028564453, "rewards/margins": 1.9283888339996338, "rewards/rejected": -3.839233636856079, "step": 1880 }, { "epoch": 1.77, "grad_norm": 21.36640298765676, "learning_rate": 1.8874557655737908e-07, "logps/chosen": -38.16846466064453, "logps/rejected": -73.15679931640625, "loss": 0.2228, "losses/dpo": 0.029199227690696716, "losses/sft": 1.5697134733200073, "losses/total": 0.029199227690696716, "ref_logps/chosen": -22.131526947021484, "ref_logps/rejected": -33.34413146972656, "rewards/accuracies": 0.875, "rewards/chosen": -1.6036937236785889, "rewards/margins": 2.377573251724243, "rewards/rejected": -3.981266975402832, "step": 1881 }, { "epoch": 1.78, "grad_norm": 26.351464596521733, "learning_rate": 1.884987023723155e-07, "logps/chosen": -48.28015899658203, "logps/rejected": -65.18205261230469, "loss": 0.5321, "losses/dpo": 0.06929390132427216, "losses/sft": 1.8008413314819336, "losses/total": 0.06929390132427216, "ref_logps/chosen": -25.647546768188477, "ref_logps/rejected": -28.928924560546875, "rewards/accuracies": 0.75, "rewards/chosen": -2.263261079788208, "rewards/margins": 1.3620516061782837, "rewards/rejected": -3.6253128051757812, "step": 1882 }, { "epoch": 1.78, "grad_norm": 23.96763910182793, "learning_rate": 1.8825189200702388e-07, "logps/chosen": -57.06287384033203, "logps/rejected": -68.1351318359375, "loss": 0.421, "losses/dpo": 0.29148635268211365, "losses/sft": 2.7936079502105713, "losses/total": 0.29148635268211365, "ref_logps/chosen": -36.96672058105469, "ref_logps/rejected": -34.235328674316406, "rewards/accuracies": 0.875, "rewards/chosen": -2.009615659713745, "rewards/margins": 1.3803644180297852, "rewards/rejected": -3.3899803161621094, "step": 1883 }, { "epoch": 1.78, "grad_norm": 24.342167007090012, "learning_rate": 1.880051457176188e-07, "logps/chosen": -40.357696533203125, "logps/rejected": -73.41621398925781, "loss": 0.3203, "losses/dpo": 0.009876866824924946, "losses/sft": 0.9529080390930176, "losses/total": 0.009876866824924946, "ref_logps/chosen": -22.9329776763916, "ref_logps/rejected": -33.8691291809082, "rewards/accuracies": 0.875, "rewards/chosen": -1.7424719333648682, "rewards/margins": 2.2122368812561035, "rewards/rejected": -3.954709053039551, "step": 1884 }, { "epoch": 1.78, "grad_norm": 15.957531310242395, "learning_rate": 1.877584637601485e-07, "logps/chosen": -41.818119049072266, "logps/rejected": -58.09650421142578, "loss": 0.2987, "losses/dpo": 0.09308353811502457, "losses/sft": 1.973541259765625, "losses/total": 0.09308353811502457, "ref_logps/chosen": -25.82727813720703, "ref_logps/rejected": -24.412654876708984, "rewards/accuracies": 0.875, "rewards/chosen": -1.5990841388702393, "rewards/margins": 1.7693004608154297, "rewards/rejected": -3.368384599685669, "step": 1885 }, { "epoch": 1.78, "grad_norm": 16.120975683311467, "learning_rate": 1.875118463905943e-07, "logps/chosen": -45.22954559326172, "logps/rejected": -68.23180389404297, "loss": 0.2459, "losses/dpo": 0.2929215133190155, "losses/sft": 2.6145362854003906, "losses/total": 0.2929215133190155, "ref_logps/chosen": -27.590396881103516, "ref_logps/rejected": -28.597209930419922, "rewards/accuracies": 1.0, "rewards/chosen": -1.7639145851135254, "rewards/margins": 2.199544906616211, "rewards/rejected": -3.9634594917297363, "step": 1886 }, { "epoch": 1.78, "grad_norm": 13.38070016553984, "learning_rate": 1.8726529386487054e-07, "logps/chosen": -37.08674621582031, "logps/rejected": -73.19036865234375, "loss": 0.2144, "losses/dpo": 0.21417401731014252, "losses/sft": 0.01678481511771679, "losses/total": 0.21417401731014252, "ref_logps/chosen": -21.26760482788086, "ref_logps/rejected": -34.77747344970703, "rewards/accuracies": 0.875, "rewards/chosen": -1.581913948059082, "rewards/margins": 2.25937557220459, "rewards/rejected": -3.841289520263672, "step": 1887 }, { "epoch": 1.78, "grad_norm": 14.771155653792494, "learning_rate": 1.870188064388243e-07, "logps/chosen": -57.34480285644531, "logps/rejected": -97.54756164550781, "loss": 0.1768, "losses/dpo": 0.0010768732754513621, "losses/sft": 2.1023662090301514, "losses/total": 0.0010768732754513621, "ref_logps/chosen": -37.57083511352539, "ref_logps/rejected": -46.247215270996094, "rewards/accuracies": 1.0, "rewards/chosen": -1.977397084236145, "rewards/margins": 3.1526379585266113, "rewards/rejected": -5.130034923553467, "step": 1888 }, { "epoch": 1.78, "grad_norm": 20.893926469748045, "learning_rate": 1.86772384368235e-07, "logps/chosen": -42.793357849121094, "logps/rejected": -61.11437225341797, "loss": 0.2775, "losses/dpo": 0.0659540444612503, "losses/sft": 0.9296409487724304, "losses/total": 0.0659540444612503, "ref_logps/chosen": -25.90182113647461, "ref_logps/rejected": -27.905059814453125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6891536712646484, "rewards/margins": 1.6317777633666992, "rewards/rejected": -3.3209314346313477, "step": 1889 }, { "epoch": 1.78, "grad_norm": 20.918751365873682, "learning_rate": 1.8652602790881445e-07, "logps/chosen": -46.948509216308594, "logps/rejected": -72.42996215820312, "loss": 0.3566, "losses/dpo": 0.20767411589622498, "losses/sft": 0.7821439504623413, "losses/total": 0.20767411589622498, "ref_logps/chosen": -27.169692993164062, "ref_logps/rejected": -35.14916229248047, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9778817892074585, "rewards/margins": 1.7501981258392334, "rewards/rejected": -3.7280800342559814, "step": 1890 }, { "epoch": 1.78, "grad_norm": 21.147524505729002, "learning_rate": 1.862797373162061e-07, "logps/chosen": -50.94414138793945, "logps/rejected": -82.21531677246094, "loss": 0.3687, "losses/dpo": 0.49493932723999023, "losses/sft": 0.5002094507217407, "losses/total": 0.49493932723999023, "ref_logps/chosen": -29.314022064208984, "ref_logps/rejected": -40.015106201171875, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1630120277404785, "rewards/margins": 2.057008981704712, "rewards/rejected": -4.220020771026611, "step": 1891 }, { "epoch": 1.78, "grad_norm": 19.00595071347731, "learning_rate": 1.8603351284598529e-07, "logps/chosen": -49.86937713623047, "logps/rejected": -63.33832550048828, "loss": 0.2515, "losses/dpo": 0.42154982686042786, "losses/sft": 2.71988582611084, "losses/total": 0.42154982686042786, "ref_logps/chosen": -33.406673431396484, "ref_logps/rejected": -26.063573837280273, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6462700366973877, "rewards/margins": 2.0812058448791504, "rewards/rejected": -3.727476119995117, "step": 1892 }, { "epoch": 1.79, "grad_norm": 15.522056830860715, "learning_rate": 1.8578735475365864e-07, "logps/chosen": -49.45536422729492, "logps/rejected": -74.65966796875, "loss": 0.2051, "losses/dpo": 0.006282164715230465, "losses/sft": 1.5863734483718872, "losses/total": 0.006282164715230465, "ref_logps/chosen": -28.853031158447266, "ref_logps/rejected": -32.36133575439453, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0602333545684814, "rewards/margins": 2.1695995330810547, "rewards/rejected": -4.229832649230957, "step": 1893 }, { "epoch": 1.79, "grad_norm": 14.044021881399074, "learning_rate": 1.8554126329466395e-07, "logps/chosen": -47.271209716796875, "logps/rejected": -66.22584533691406, "loss": 0.2381, "losses/dpo": 0.12707866728305817, "losses/sft": 0.7846873998641968, "losses/total": 0.12707866728305817, "ref_logps/chosen": -30.08302116394043, "ref_logps/rejected": -29.63513946533203, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7188191413879395, "rewards/margins": 1.9402523040771484, "rewards/rejected": -3.659071445465088, "step": 1894 }, { "epoch": 1.79, "grad_norm": 25.450285765735288, "learning_rate": 1.8529523872436977e-07, "logps/chosen": -39.41581344604492, "logps/rejected": -50.82054138183594, "loss": 0.4076, "losses/dpo": 0.040029097348451614, "losses/sft": 0.6712386608123779, "losses/total": 0.040029097348451614, "ref_logps/chosen": -20.61463737487793, "ref_logps/rejected": -20.538860321044922, "rewards/accuracies": 0.75, "rewards/chosen": -1.8801177740097046, "rewards/margins": 1.1480501890182495, "rewards/rejected": -3.028167963027954, "step": 1895 }, { "epoch": 1.79, "grad_norm": 15.081052863069925, "learning_rate": 1.8504928129807533e-07, "logps/chosen": -36.74241638183594, "logps/rejected": -66.04499816894531, "loss": 0.2137, "losses/dpo": 0.1763160228729248, "losses/sft": 1.798322081565857, "losses/total": 0.1763160228729248, "ref_logps/chosen": -22.560955047607422, "ref_logps/rejected": -32.130584716796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4181463718414307, "rewards/margins": 1.973294734954834, "rewards/rejected": -3.3914411067962646, "step": 1896 }, { "epoch": 1.79, "grad_norm": 16.96822874837197, "learning_rate": 1.8480339127101013e-07, "logps/chosen": -52.927284240722656, "logps/rejected": -88.89242553710938, "loss": 0.2207, "losses/dpo": 0.186062291264534, "losses/sft": 2.7903826236724854, "losses/total": 0.186062291264534, "ref_logps/chosen": -32.533042907714844, "ref_logps/rejected": -42.193355560302734, "rewards/accuracies": 0.875, "rewards/chosen": -2.039423942565918, "rewards/margins": 2.6304826736450195, "rewards/rejected": -4.6699066162109375, "step": 1897 }, { "epoch": 1.79, "grad_norm": 12.896355867011987, "learning_rate": 1.8455756889833373e-07, "logps/chosen": -42.453392028808594, "logps/rejected": -66.22993469238281, "loss": 0.1618, "losses/dpo": 0.326546311378479, "losses/sft": 0.034300364553928375, "losses/total": 0.326546311378479, "ref_logps/chosen": -28.56289291381836, "ref_logps/rejected": -29.619279861450195, "rewards/accuracies": 1.0, "rewards/chosen": -1.389050006866455, "rewards/margins": 2.27201509475708, "rewards/rejected": -3.661065101623535, "step": 1898 }, { "epoch": 1.79, "grad_norm": 25.816071001144348, "learning_rate": 1.8431181443513557e-07, "logps/chosen": -54.1541862487793, "logps/rejected": -78.17366027832031, "loss": 0.3374, "losses/dpo": 0.4403950273990631, "losses/sft": 2.026698350906372, "losses/total": 0.4403950273990631, "ref_logps/chosen": -35.85503387451172, "ref_logps/rejected": -39.42515563964844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8299152851104736, "rewards/margins": 2.0449342727661133, "rewards/rejected": -3.874849796295166, "step": 1899 }, { "epoch": 1.79, "grad_norm": 32.43757350616235, "learning_rate": 1.8406612813643452e-07, "logps/chosen": -39.60973358154297, "logps/rejected": -61.755577087402344, "loss": 0.4952, "losses/dpo": 0.17128509283065796, "losses/sft": 0.2599928379058838, "losses/total": 0.17128509283065796, "ref_logps/chosen": -24.634172439575195, "ref_logps/rejected": -32.38593292236328, "rewards/accuracies": 0.875, "rewards/chosen": -1.4975559711456299, "rewards/margins": 1.439408302307129, "rewards/rejected": -2.936964511871338, "step": 1900 }, { "epoch": 1.79, "grad_norm": 12.34839938118662, "learning_rate": 1.838205102571787e-07, "logps/chosen": -50.32011795043945, "logps/rejected": -92.68389892578125, "loss": 0.1225, "losses/dpo": 0.09575699269771576, "losses/sft": 1.4693450927734375, "losses/total": 0.09575699269771576, "ref_logps/chosen": -28.35059356689453, "ref_logps/rejected": -44.02552795410156, "rewards/accuracies": 1.0, "rewards/chosen": -2.1969523429870605, "rewards/margins": 2.6688849925994873, "rewards/rejected": -4.865837574005127, "step": 1901 }, { "epoch": 1.79, "grad_norm": 11.921366334046233, "learning_rate": 1.835749610522454e-07, "logps/chosen": -39.279823303222656, "logps/rejected": -82.25537109375, "loss": 0.1372, "losses/dpo": 0.4077632427215576, "losses/sft": 2.185009002685547, "losses/total": 0.4077632427215576, "ref_logps/chosen": -24.837188720703125, "ref_logps/rejected": -38.37119674682617, "rewards/accuracies": 0.9375, "rewards/chosen": -1.444263219833374, "rewards/margins": 2.9441545009613037, "rewards/rejected": -4.388417720794678, "step": 1902 }, { "epoch": 1.8, "grad_norm": 16.448750015795266, "learning_rate": 1.8332948077644051e-07, "logps/chosen": -48.41465377807617, "logps/rejected": -72.5574722290039, "loss": 0.2477, "losses/dpo": 0.00441342405974865, "losses/sft": 1.0316789150238037, "losses/total": 0.00441342405974865, "ref_logps/chosen": -29.75638198852539, "ref_logps/rejected": -30.461889266967773, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8658270835876465, "rewards/margins": 2.34373140335083, "rewards/rejected": -4.209558486938477, "step": 1903 }, { "epoch": 1.8, "grad_norm": 24.601400797560693, "learning_rate": 1.8308406968449835e-07, "logps/chosen": -47.19146728515625, "logps/rejected": -73.276123046875, "loss": 0.318, "losses/dpo": 0.10063426196575165, "losses/sft": 1.8223878145217896, "losses/total": 0.10063426196575165, "ref_logps/chosen": -27.523265838623047, "ref_logps/rejected": -31.66738510131836, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9668200016021729, "rewards/margins": 2.194054365158081, "rewards/rejected": -4.160874366760254, "step": 1904 }, { "epoch": 1.8, "grad_norm": 20.46439500021129, "learning_rate": 1.828387280310816e-07, "logps/chosen": -44.63185501098633, "logps/rejected": -82.56573486328125, "loss": 0.2737, "losses/dpo": 1.0077770948410034, "losses/sft": 2.8291943073272705, "losses/total": 1.0077770948410034, "ref_logps/chosen": -23.557838439941406, "ref_logps/rejected": -36.41875457763672, "rewards/accuracies": 0.875, "rewards/chosen": -2.1074013710021973, "rewards/margins": 2.5072972774505615, "rewards/rejected": -4.61469841003418, "step": 1905 }, { "epoch": 1.8, "grad_norm": 14.81272178723359, "learning_rate": 1.8259345607078065e-07, "logps/chosen": -48.12702941894531, "logps/rejected": -68.65149688720703, "loss": 0.2143, "losses/dpo": 0.04292614012956619, "losses/sft": 2.2025270462036133, "losses/total": 0.04292614012956619, "ref_logps/chosen": -31.016983032226562, "ref_logps/rejected": -32.22064208984375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7110042572021484, "rewards/margins": 1.9320809841156006, "rewards/rejected": -3.643085241317749, "step": 1906 }, { "epoch": 1.8, "grad_norm": 21.029120470114368, "learning_rate": 1.8234825405811388e-07, "logps/chosen": -46.6914176940918, "logps/rejected": -82.06936645507812, "loss": 0.2769, "losses/dpo": 0.0012479448923841119, "losses/sft": 1.5602765083312988, "losses/total": 0.0012479448923841119, "ref_logps/chosen": -29.376773834228516, "ref_logps/rejected": -37.708351135253906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7314646244049072, "rewards/margins": 2.704636812210083, "rewards/rejected": -4.43610143661499, "step": 1907 }, { "epoch": 1.8, "grad_norm": 14.44214523119945, "learning_rate": 1.8210312224752684e-07, "logps/chosen": -36.72011947631836, "logps/rejected": -62.01432418823242, "loss": 0.2372, "losses/dpo": 1.0533651113510132, "losses/sft": 1.9498562812805176, "losses/total": 1.0533651113510132, "ref_logps/chosen": -25.644115447998047, "ref_logps/rejected": -28.460378646850586, "rewards/accuracies": 0.875, "rewards/chosen": -1.107600212097168, "rewards/margins": 2.2477948665618896, "rewards/rejected": -3.3553950786590576, "step": 1908 }, { "epoch": 1.8, "grad_norm": 18.029707512476975, "learning_rate": 1.8185806089339232e-07, "logps/chosen": -54.130130767822266, "logps/rejected": -70.30267333984375, "loss": 0.2483, "losses/dpo": 0.13004879653453827, "losses/sft": 1.8547086715698242, "losses/total": 0.13004879653453827, "ref_logps/chosen": -37.244163513183594, "ref_logps/rejected": -33.20351791381836, "rewards/accuracies": 0.875, "rewards/chosen": -1.6885969638824463, "rewards/margins": 2.0213184356689453, "rewards/rejected": -3.7099151611328125, "step": 1909 }, { "epoch": 1.8, "grad_norm": 22.756517142005904, "learning_rate": 1.8161307025000994e-07, "logps/chosen": -50.87480926513672, "logps/rejected": -71.28863525390625, "loss": 0.2802, "losses/dpo": 0.8834264278411865, "losses/sft": 1.544842004776001, "losses/total": 0.8834264278411865, "ref_logps/chosen": -34.351097106933594, "ref_logps/rejected": -33.61806106567383, "rewards/accuracies": 0.875, "rewards/chosen": -1.652370810508728, "rewards/margins": 2.114686965942383, "rewards/rejected": -3.7670578956604004, "step": 1910 }, { "epoch": 1.8, "grad_norm": 12.368038917420561, "learning_rate": 1.813681505716061e-07, "logps/chosen": -38.18334197998047, "logps/rejected": -77.30561065673828, "loss": 0.1722, "losses/dpo": 0.05835147202014923, "losses/sft": 1.9509977102279663, "losses/total": 0.05835147202014923, "ref_logps/chosen": -23.304920196533203, "ref_logps/rejected": -31.89995765686035, "rewards/accuracies": 0.9375, "rewards/chosen": -1.487842082977295, "rewards/margins": 3.052722930908203, "rewards/rejected": -4.540565490722656, "step": 1911 }, { "epoch": 1.8, "grad_norm": 12.46224080947359, "learning_rate": 1.8112330211233341e-07, "logps/chosen": -48.98664855957031, "logps/rejected": -69.10411071777344, "loss": 0.1646, "losses/dpo": 0.01691328175365925, "losses/sft": 1.7419958114624023, "losses/total": 0.01691328175365925, "ref_logps/chosen": -32.27710723876953, "ref_logps/rejected": -31.920684814453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6709541082382202, "rewards/margins": 2.0473883152008057, "rewards/rejected": -3.7183423042297363, "step": 1912 }, { "epoch": 1.8, "grad_norm": 25.765304583980754, "learning_rate": 1.8087852512627063e-07, "logps/chosen": -41.87187194824219, "logps/rejected": -65.92528533935547, "loss": 0.4298, "losses/dpo": 0.14608797430992126, "losses/sft": 1.235650897026062, "losses/total": 0.14608797430992126, "ref_logps/chosen": -25.87765884399414, "ref_logps/rejected": -33.85479736328125, "rewards/accuracies": 0.875, "rewards/chosen": -1.5994210243225098, "rewards/margins": 1.607628345489502, "rewards/rejected": -3.2070493698120117, "step": 1913 }, { "epoch": 1.81, "grad_norm": 13.153637576834027, "learning_rate": 1.8063381986742233e-07, "logps/chosen": -40.616783142089844, "logps/rejected": -71.42878723144531, "loss": 0.1984, "losses/dpo": 0.036718156188726425, "losses/sft": 1.6293299198150635, "losses/total": 0.036718156188726425, "ref_logps/chosen": -25.533432006835938, "ref_logps/rejected": -33.74925231933594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5083348751068115, "rewards/margins": 2.2596187591552734, "rewards/rejected": -3.767953872680664, "step": 1914 }, { "epoch": 1.81, "grad_norm": 13.53335189619395, "learning_rate": 1.8038918658971863e-07, "logps/chosen": -44.99298095703125, "logps/rejected": -79.14630889892578, "loss": 0.2033, "losses/dpo": 0.2985191345214844, "losses/sft": 0.9186640381813049, "losses/total": 0.2985191345214844, "ref_logps/chosen": -27.174779891967773, "ref_logps/rejected": -36.85523223876953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.78182053565979, "rewards/margins": 2.4472880363464355, "rewards/rejected": -4.2291083335876465, "step": 1915 }, { "epoch": 1.81, "grad_norm": 12.2735799085476, "learning_rate": 1.8014462554701503e-07, "logps/chosen": -40.30080032348633, "logps/rejected": -81.59831237792969, "loss": 0.143, "losses/dpo": 0.02132265642285347, "losses/sft": 2.135216236114502, "losses/total": 0.02132265642285347, "ref_logps/chosen": -23.895048141479492, "ref_logps/rejected": -36.60157012939453, "rewards/accuracies": 1.0, "rewards/chosen": -1.6405754089355469, "rewards/margins": 2.8590989112854004, "rewards/rejected": -4.499673843383789, "step": 1916 }, { "epoch": 1.81, "grad_norm": 19.398178864605317, "learning_rate": 1.7990013699309203e-07, "logps/chosen": -46.44587326049805, "logps/rejected": -72.26748657226562, "loss": 0.2143, "losses/dpo": 0.17608454823493958, "losses/sft": 1.526909589767456, "losses/total": 0.17608454823493958, "ref_logps/chosen": -29.167083740234375, "ref_logps/rejected": -31.433853149414062, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7278789281845093, "rewards/margins": 2.3554840087890625, "rewards/rejected": -4.083363056182861, "step": 1917 }, { "epoch": 1.81, "grad_norm": 25.412508429289826, "learning_rate": 1.7965572118165484e-07, "logps/chosen": -56.30256652832031, "logps/rejected": -70.48951721191406, "loss": 0.2852, "losses/dpo": 0.03836922347545624, "losses/sft": 1.5193389654159546, "losses/total": 0.03836922347545624, "ref_logps/chosen": -32.69793701171875, "ref_logps/rejected": -28.84698486328125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3604626655578613, "rewards/margins": 1.8037906885147095, "rewards/rejected": -4.164253234863281, "step": 1918 }, { "epoch": 1.81, "grad_norm": 17.726421391415442, "learning_rate": 1.7941137836633334e-07, "logps/chosen": -47.48576354980469, "logps/rejected": -71.14376068115234, "loss": 0.2374, "losses/dpo": 0.24031661450862885, "losses/sft": 0.1887766569852829, "losses/total": 0.24031661450862885, "ref_logps/chosen": -33.34783172607422, "ref_logps/rejected": -33.86571502685547, "rewards/accuracies": 0.875, "rewards/chosen": -1.4137935638427734, "rewards/margins": 2.3140110969543457, "rewards/rejected": -3.727804660797119, "step": 1919 }, { "epoch": 1.81, "grad_norm": 16.909613890723072, "learning_rate": 1.7916710880068158e-07, "logps/chosen": -34.92132568359375, "logps/rejected": -61.467533111572266, "loss": 0.2037, "losses/dpo": 0.21771323680877686, "losses/sft": 0.7981751561164856, "losses/total": 0.21771323680877686, "ref_logps/chosen": -23.846355438232422, "ref_logps/rejected": -24.646406173706055, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1074970960617065, "rewards/margins": 2.574615716934204, "rewards/rejected": -3.682112693786621, "step": 1920 }, { "epoch": 1.81, "grad_norm": 6.372176085342766, "learning_rate": 1.789229127381775e-07, "logps/chosen": -37.278411865234375, "logps/rejected": -86.42433166503906, "loss": 0.0671, "losses/dpo": 0.09990810602903366, "losses/sft": 2.3857500553131104, "losses/total": 0.09990810602903366, "ref_logps/chosen": -21.898151397705078, "ref_logps/rejected": -39.50790786743164, "rewards/accuracies": 1.0, "rewards/chosen": -1.5380263328552246, "rewards/margins": 3.153616428375244, "rewards/rejected": -4.691642761230469, "step": 1921 }, { "epoch": 1.81, "grad_norm": 23.097164023569672, "learning_rate": 1.7867879043222295e-07, "logps/chosen": -42.393699645996094, "logps/rejected": -50.62892532348633, "loss": 0.4189, "losses/dpo": 0.5771991610527039, "losses/sft": 1.8448460102081299, "losses/total": 0.5771991610527039, "ref_logps/chosen": -26.788110733032227, "ref_logps/rejected": -25.810110092163086, "rewards/accuracies": 0.875, "rewards/chosen": -1.560558795928955, "rewards/margins": 0.921322762966156, "rewards/rejected": -2.481881618499756, "step": 1922 }, { "epoch": 1.81, "grad_norm": 22.35306501563303, "learning_rate": 1.7843474213614312e-07, "logps/chosen": -46.03312301635742, "logps/rejected": -92.048095703125, "loss": 0.2686, "losses/dpo": 0.001605596044100821, "losses/sft": 0.751574695110321, "losses/total": 0.001605596044100821, "ref_logps/chosen": -29.492103576660156, "ref_logps/rejected": -48.041282653808594, "rewards/accuracies": 0.875, "rewards/chosen": -1.654101848602295, "rewards/margins": 2.7465786933898926, "rewards/rejected": -4.4006805419921875, "step": 1923 }, { "epoch": 1.82, "grad_norm": 21.013656756747654, "learning_rate": 1.781907681031864e-07, "logps/chosen": -45.040802001953125, "logps/rejected": -66.67268371582031, "loss": 0.3322, "losses/dpo": 0.4485863447189331, "losses/sft": 0.9176396727561951, "losses/total": 0.4485863447189331, "ref_logps/chosen": -25.985546112060547, "ref_logps/rejected": -30.735321044921875, "rewards/accuracies": 0.875, "rewards/chosen": -1.9055252075195312, "rewards/margins": 1.6882108449935913, "rewards/rejected": -3.593736410140991, "step": 1924 }, { "epoch": 1.82, "grad_norm": 28.765208414002508, "learning_rate": 1.779468685865242e-07, "logps/chosen": -50.90229415893555, "logps/rejected": -66.0791015625, "loss": 0.4059, "losses/dpo": 0.8028591871261597, "losses/sft": 1.0577070713043213, "losses/total": 0.8028591871261597, "ref_logps/chosen": -30.443737030029297, "ref_logps/rejected": -29.821958541870117, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0458555221557617, "rewards/margins": 1.5798585414886475, "rewards/rejected": -3.6257143020629883, "step": 1925 }, { "epoch": 1.82, "grad_norm": 20.84844991994807, "learning_rate": 1.777030438392505e-07, "logps/chosen": -41.052833557128906, "logps/rejected": -68.25338745117188, "loss": 0.2846, "losses/dpo": 0.4137612581253052, "losses/sft": 2.3251636028289795, "losses/total": 0.4137612581253052, "ref_logps/chosen": -22.645076751708984, "ref_logps/rejected": -30.29153823852539, "rewards/accuracies": 0.875, "rewards/chosen": -1.840775728225708, "rewards/margins": 1.955409049987793, "rewards/rejected": -3.796184778213501, "step": 1926 }, { "epoch": 1.82, "grad_norm": 20.970566898584988, "learning_rate": 1.7745929411438178e-07, "logps/chosen": -38.500892639160156, "logps/rejected": -58.07754898071289, "loss": 0.361, "losses/dpo": 0.22148171067237854, "losses/sft": 1.3784884214401245, "losses/total": 0.22148171067237854, "ref_logps/chosen": -22.734882354736328, "ref_logps/rejected": -28.11418914794922, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5766010284423828, "rewards/margins": 1.4197347164154053, "rewards/rejected": -2.996335983276367, "step": 1927 }, { "epoch": 1.82, "grad_norm": 17.281896370190925, "learning_rate": 1.7721561966485663e-07, "logps/chosen": -53.772315979003906, "logps/rejected": -84.39387512207031, "loss": 0.2312, "losses/dpo": 0.08442972600460052, "losses/sft": 1.463821530342102, "losses/total": 0.08442972600460052, "ref_logps/chosen": -35.11360549926758, "ref_logps/rejected": -41.59605407714844, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8658711910247803, "rewards/margins": 2.413910150527954, "rewards/rejected": -4.279781341552734, "step": 1928 }, { "epoch": 1.82, "grad_norm": 21.81088043871449, "learning_rate": 1.7697202074353558e-07, "logps/chosen": -49.895896911621094, "logps/rejected": -81.35272216796875, "loss": 0.2171, "losses/dpo": 0.004298172891139984, "losses/sft": 1.9891738891601562, "losses/total": 0.004298172891139984, "ref_logps/chosen": -29.752685546875, "ref_logps/rejected": -39.40964889526367, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0143213272094727, "rewards/margins": 2.179985761642456, "rewards/rejected": -4.194307327270508, "step": 1929 }, { "epoch": 1.82, "grad_norm": 9.478497576986916, "learning_rate": 1.7672849760320067e-07, "logps/chosen": -48.33134460449219, "logps/rejected": -87.41835021972656, "loss": 0.0892, "losses/dpo": 0.26320040225982666, "losses/sft": 1.4251810312271118, "losses/total": 0.26320040225982666, "ref_logps/chosen": -31.181001663208008, "ref_logps/rejected": -39.35926055908203, "rewards/accuracies": 1.0, "rewards/chosen": -1.7150342464447021, "rewards/margins": 3.090874433517456, "rewards/rejected": -4.805909156799316, "step": 1930 }, { "epoch": 1.82, "grad_norm": 23.550567749957818, "learning_rate": 1.764850504965554e-07, "logps/chosen": -53.86132049560547, "logps/rejected": -82.71774291992188, "loss": 0.3656, "losses/dpo": 0.01210867054760456, "losses/sft": 1.4879150390625, "losses/total": 0.01210867054760456, "ref_logps/chosen": -34.09670639038086, "ref_logps/rejected": -39.5772705078125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.97646164894104, "rewards/margins": 2.337585926055908, "rewards/rejected": -4.314047813415527, "step": 1931 }, { "epoch": 1.82, "grad_norm": 20.16483476785838, "learning_rate": 1.7624167967622433e-07, "logps/chosen": -44.382781982421875, "logps/rejected": -58.34480667114258, "loss": 0.334, "losses/dpo": 0.026039963588118553, "losses/sft": 2.4628450870513916, "losses/total": 0.026039963588118553, "ref_logps/chosen": -26.50543975830078, "ref_logps/rejected": -27.734416961669922, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7877342700958252, "rewards/margins": 1.2733044624328613, "rewards/rejected": -3.0610387325286865, "step": 1932 }, { "epoch": 1.82, "grad_norm": 22.308666803391265, "learning_rate": 1.7599838539475288e-07, "logps/chosen": -48.074501037597656, "logps/rejected": -68.99952697753906, "loss": 0.3799, "losses/dpo": 0.6473108530044556, "losses/sft": 1.130273699760437, "losses/total": 0.6473108530044556, "ref_logps/chosen": -23.992359161376953, "ref_logps/rejected": -28.282554626464844, "rewards/accuracies": 0.875, "rewards/chosen": -2.4082143306732178, "rewards/margins": 1.663482904434204, "rewards/rejected": -4.071697235107422, "step": 1933 }, { "epoch": 1.82, "grad_norm": 16.665359188980094, "learning_rate": 1.75755167904607e-07, "logps/chosen": -30.82843017578125, "logps/rejected": -64.88978576660156, "loss": 0.269, "losses/dpo": 0.2092418521642685, "losses/sft": 1.7959539890289307, "losses/total": 0.2092418521642685, "ref_logps/chosen": -16.539363861083984, "ref_logps/rejected": -30.292402267456055, "rewards/accuracies": 0.875, "rewards/chosen": -1.4289065599441528, "rewards/margins": 2.0308313369750977, "rewards/rejected": -3.45973801612854, "step": 1934 }, { "epoch": 1.83, "grad_norm": 23.751701381044036, "learning_rate": 1.7551202745817297e-07, "logps/chosen": -41.43653106689453, "logps/rejected": -70.59455871582031, "loss": 0.3697, "losses/dpo": 0.1430562436580658, "losses/sft": 1.8378825187683105, "losses/total": 0.1430562436580658, "ref_logps/chosen": -23.743356704711914, "ref_logps/rejected": -32.724510192871094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.769317388534546, "rewards/margins": 2.0176877975463867, "rewards/rejected": -3.7870051860809326, "step": 1935 }, { "epoch": 1.83, "grad_norm": 23.997446337776196, "learning_rate": 1.7526896430775723e-07, "logps/chosen": -42.26692199707031, "logps/rejected": -62.11564636230469, "loss": 0.4256, "losses/dpo": 0.06704232841730118, "losses/sft": 1.4058691263198853, "losses/total": 0.06704232841730118, "ref_logps/chosen": -24.626379013061523, "ref_logps/rejected": -26.250324249267578, "rewards/accuracies": 0.9375, "rewards/chosen": -1.764054298400879, "rewards/margins": 1.822478175163269, "rewards/rejected": -3.5865325927734375, "step": 1936 }, { "epoch": 1.83, "grad_norm": 23.984558366013648, "learning_rate": 1.7502597870558588e-07, "logps/chosen": -43.5571174621582, "logps/rejected": -80.66322326660156, "loss": 0.2836, "losses/dpo": 0.6525413990020752, "losses/sft": 1.4777387380599976, "losses/total": 0.6525413990020752, "ref_logps/chosen": -26.530563354492188, "ref_logps/rejected": -40.42990493774414, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7026556730270386, "rewards/margins": 2.3206756114959717, "rewards/rejected": -4.023331642150879, "step": 1937 }, { "epoch": 1.83, "grad_norm": 24.233188917450978, "learning_rate": 1.7478307090380456e-07, "logps/chosen": -40.9268798828125, "logps/rejected": -71.84115600585938, "loss": 0.3541, "losses/dpo": 0.020341500639915466, "losses/sft": 1.444818377494812, "losses/total": 0.020341500639915466, "ref_logps/chosen": -22.421703338623047, "ref_logps/rejected": -35.411800384521484, "rewards/accuracies": 0.8125, "rewards/chosen": -1.850517749786377, "rewards/margins": 1.7924176454544067, "rewards/rejected": -3.642935276031494, "step": 1938 }, { "epoch": 1.83, "grad_norm": 38.212855546093564, "learning_rate": 1.7454024115447824e-07, "logps/chosen": -53.702152252197266, "logps/rejected": -73.04073333740234, "loss": 0.6058, "losses/dpo": 0.03801440820097923, "losses/sft": 1.9510667324066162, "losses/total": 0.03801440820097923, "ref_logps/chosen": -32.12208938598633, "ref_logps/rejected": -34.047664642333984, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1580066680908203, "rewards/margins": 1.7413005828857422, "rewards/rejected": -3.8993072509765625, "step": 1939 }, { "epoch": 1.83, "grad_norm": 30.937679415947667, "learning_rate": 1.7429748970959086e-07, "logps/chosen": -55.13325500488281, "logps/rejected": -100.07205963134766, "loss": 0.3029, "losses/dpo": 0.006097561214119196, "losses/sft": 0.9617800116539001, "losses/total": 0.006097561214119196, "ref_logps/chosen": -33.743072509765625, "ref_logps/rejected": -47.774017333984375, "rewards/accuracies": 0.875, "rewards/chosen": -2.1390180587768555, "rewards/margins": 3.0907859802246094, "rewards/rejected": -5.229804039001465, "step": 1940 }, { "epoch": 1.83, "grad_norm": 8.056386458086859, "learning_rate": 1.7405481682104502e-07, "logps/chosen": -29.514022827148438, "logps/rejected": -75.72084045410156, "loss": 0.0973, "losses/dpo": 0.025219816714525223, "losses/sft": 1.7561702728271484, "losses/total": 0.025219816714525223, "ref_logps/chosen": -15.794366836547852, "ref_logps/rejected": -33.80937194824219, "rewards/accuracies": 1.0, "rewards/chosen": -1.3719656467437744, "rewards/margins": 2.819181442260742, "rewards/rejected": -4.1911468505859375, "step": 1941 }, { "epoch": 1.83, "grad_norm": 21.216738529396952, "learning_rate": 1.7381222274066197e-07, "logps/chosen": -41.92566680908203, "logps/rejected": -66.96142578125, "loss": 0.3893, "losses/dpo": 0.5025523900985718, "losses/sft": 2.6521341800689697, "losses/total": 0.5025523900985718, "ref_logps/chosen": -20.569223403930664, "ref_logps/rejected": -30.149547576904297, "rewards/accuracies": 0.875, "rewards/chosen": -2.13564395904541, "rewards/margins": 1.545544147491455, "rewards/rejected": -3.6811881065368652, "step": 1942 }, { "epoch": 1.83, "grad_norm": 18.631087438292017, "learning_rate": 1.735697077201811e-07, "logps/chosen": -41.94259262084961, "logps/rejected": -73.93215942382812, "loss": 0.2405, "losses/dpo": 0.08470824360847473, "losses/sft": 2.1794261932373047, "losses/total": 0.08470824360847473, "ref_logps/chosen": -23.813777923583984, "ref_logps/rejected": -32.71188735961914, "rewards/accuracies": 0.9375, "rewards/chosen": -1.812881588935852, "rewards/margins": 2.309145927429199, "rewards/rejected": -4.122027397155762, "step": 1943 }, { "epoch": 1.83, "grad_norm": 24.383712546361544, "learning_rate": 1.7332727201125964e-07, "logps/chosen": -45.344242095947266, "logps/rejected": -57.76859664916992, "loss": 0.3819, "losses/dpo": 0.008829002268612385, "losses/sft": 2.1089043617248535, "losses/total": 0.008829002268612385, "ref_logps/chosen": -28.246685028076172, "ref_logps/rejected": -23.780933380126953, "rewards/accuracies": 0.75, "rewards/chosen": -1.7097556591033936, "rewards/margins": 1.689010739326477, "rewards/rejected": -3.398766279220581, "step": 1944 }, { "epoch": 1.83, "grad_norm": 27.52121449605299, "learning_rate": 1.730849158654728e-07, "logps/chosen": -46.4013671875, "logps/rejected": -63.86181640625, "loss": 0.4469, "losses/dpo": 0.5620337724685669, "losses/sft": 1.9574856758117676, "losses/total": 0.5620337724685669, "ref_logps/chosen": -24.987869262695312, "ref_logps/rejected": -29.77008628845215, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1413495540618896, "rewards/margins": 1.2678232192993164, "rewards/rejected": -3.409173011779785, "step": 1945 }, { "epoch": 1.84, "grad_norm": 14.081868348769339, "learning_rate": 1.7284263953431298e-07, "logps/chosen": -41.708160400390625, "logps/rejected": -62.80833435058594, "loss": 0.1944, "losses/dpo": 0.2400686889886856, "losses/sft": 1.0967164039611816, "losses/total": 0.2400686889886856, "ref_logps/chosen": -26.03165626525879, "ref_logps/rejected": -29.706850051879883, "rewards/accuracies": 1.0, "rewards/chosen": -1.567650318145752, "rewards/margins": 1.7424982786178589, "rewards/rejected": -3.3101484775543213, "step": 1946 }, { "epoch": 1.84, "grad_norm": 13.415117630040161, "learning_rate": 1.726004432691898e-07, "logps/chosen": -48.74122619628906, "logps/rejected": -83.03141784667969, "loss": 0.1557, "losses/dpo": 0.01792912930250168, "losses/sft": 1.5956730842590332, "losses/total": 0.01792912930250168, "ref_logps/chosen": -28.72989273071289, "ref_logps/rejected": -37.77899932861328, "rewards/accuracies": 1.0, "rewards/chosen": -2.001133441925049, "rewards/margins": 2.5241079330444336, "rewards/rejected": -4.525241374969482, "step": 1947 }, { "epoch": 1.84, "grad_norm": 9.518311014103707, "learning_rate": 1.7235832732142993e-07, "logps/chosen": -45.90635299682617, "logps/rejected": -91.28358459472656, "loss": 0.0995, "losses/dpo": 0.01577501744031906, "losses/sft": 0.8305711150169373, "losses/total": 0.01577501744031906, "ref_logps/chosen": -27.257740020751953, "ref_logps/rejected": -41.626869201660156, "rewards/accuracies": 1.0, "rewards/chosen": -1.8648611307144165, "rewards/margins": 3.100811004638672, "rewards/rejected": -4.965672016143799, "step": 1948 }, { "epoch": 1.84, "grad_norm": 14.375803183340508, "learning_rate": 1.7211629194227654e-07, "logps/chosen": -36.40850830078125, "logps/rejected": -78.49505615234375, "loss": 0.1973, "losses/dpo": 0.028041653335094452, "losses/sft": 0.8443722724914551, "losses/total": 0.028041653335094452, "ref_logps/chosen": -22.061634063720703, "ref_logps/rejected": -40.46807861328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4346871376037598, "rewards/margins": 2.3680105209350586, "rewards/rejected": -3.8026978969573975, "step": 1949 }, { "epoch": 1.84, "grad_norm": 25.782705525526143, "learning_rate": 1.7187433738288923e-07, "logps/chosen": -50.86469268798828, "logps/rejected": -68.51762390136719, "loss": 0.4003, "losses/dpo": 1.96294105052948, "losses/sft": 2.243185043334961, "losses/total": 1.96294105052948, "ref_logps/chosen": -33.329673767089844, "ref_logps/rejected": -33.2955322265625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7535018920898438, "rewards/margins": 1.7687069177627563, "rewards/rejected": -3.5222086906433105, "step": 1950 }, { "epoch": 1.84, "grad_norm": 12.634062833375884, "learning_rate": 1.716324638943438e-07, "logps/chosen": -44.117855072021484, "logps/rejected": -87.87020874023438, "loss": 0.1586, "losses/dpo": 0.07932086288928986, "losses/sft": 2.9833409786224365, "losses/total": 0.07932086288928986, "ref_logps/chosen": -24.04553985595703, "ref_logps/rejected": -40.20326232910156, "rewards/accuracies": 1.0, "rewards/chosen": -2.0072317123413086, "rewards/margins": 2.759462833404541, "rewards/rejected": -4.76669454574585, "step": 1951 }, { "epoch": 1.84, "grad_norm": 20.052846303808018, "learning_rate": 1.713906717276318e-07, "logps/chosen": -58.85535430908203, "logps/rejected": -75.09369659423828, "loss": 0.2585, "losses/dpo": 0.28419607877731323, "losses/sft": 2.4226677417755127, "losses/total": 0.28419607877731323, "ref_logps/chosen": -38.46784210205078, "ref_logps/rejected": -35.64442825317383, "rewards/accuracies": 0.9375, "rewards/chosen": -2.038750648498535, "rewards/margins": 1.9061765670776367, "rewards/rejected": -3.944927215576172, "step": 1952 }, { "epoch": 1.84, "grad_norm": 21.23557476899644, "learning_rate": 1.711489611336606e-07, "logps/chosen": -36.22601318359375, "logps/rejected": -59.4320068359375, "loss": 0.3076, "losses/dpo": 0.5652883052825928, "losses/sft": 2.03004789352417, "losses/total": 0.5652883052825928, "ref_logps/chosen": -22.115671157836914, "ref_logps/rejected": -27.092124938964844, "rewards/accuracies": 0.875, "rewards/chosen": -1.4110339879989624, "rewards/margins": 1.8229541778564453, "rewards/rejected": -3.233988046646118, "step": 1953 }, { "epoch": 1.84, "grad_norm": 18.985343079100534, "learning_rate": 1.7090733236325272e-07, "logps/chosen": -42.863067626953125, "logps/rejected": -61.904449462890625, "loss": 0.2561, "losses/dpo": 0.05403871461749077, "losses/sft": 1.4688228368759155, "losses/total": 0.05403871461749077, "ref_logps/chosen": -26.955978393554688, "ref_logps/rejected": -28.846895217895508, "rewards/accuracies": 0.875, "rewards/chosen": -1.5907092094421387, "rewards/margins": 1.7150460481643677, "rewards/rejected": -3.305755138397217, "step": 1954 }, { "epoch": 1.84, "grad_norm": 16.026763377060135, "learning_rate": 1.7066578566714585e-07, "logps/chosen": -43.3271484375, "logps/rejected": -87.4383316040039, "loss": 0.2012, "losses/dpo": 0.07257739454507828, "losses/sft": 0.5352790951728821, "losses/total": 0.07257739454507828, "ref_logps/chosen": -27.358184814453125, "ref_logps/rejected": -46.00355529785156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5968966484069824, "rewards/margins": 2.5465803146362305, "rewards/rejected": -4.143476963043213, "step": 1955 }, { "epoch": 1.85, "grad_norm": 20.46101679605452, "learning_rate": 1.7042432129599248e-07, "logps/chosen": -39.755367279052734, "logps/rejected": -70.59049987792969, "loss": 0.2809, "losses/dpo": 0.8831392526626587, "losses/sft": 1.7261605262756348, "losses/total": 0.8831392526626587, "ref_logps/chosen": -21.97357749938965, "ref_logps/rejected": -31.053987503051758, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7781789302825928, "rewards/margins": 2.1754720211029053, "rewards/rejected": -3.953650951385498, "step": 1956 }, { "epoch": 1.85, "grad_norm": 25.904132326418114, "learning_rate": 1.7018293950035978e-07, "logps/chosen": -39.221832275390625, "logps/rejected": -82.27742004394531, "loss": 0.3095, "losses/dpo": 0.005782434716820717, "losses/sft": 1.4015170335769653, "losses/total": 0.005782434716820717, "ref_logps/chosen": -23.94043731689453, "ref_logps/rejected": -41.68681335449219, "rewards/accuracies": 0.875, "rewards/chosen": -1.528139352798462, "rewards/margins": 2.530921459197998, "rewards/rejected": -4.059061050415039, "step": 1957 }, { "epoch": 1.85, "grad_norm": 29.572345137137447, "learning_rate": 1.6994164053072908e-07, "logps/chosen": -44.23220443725586, "logps/rejected": -60.239219665527344, "loss": 0.4599, "losses/dpo": 0.039463263005018234, "losses/sft": 1.9403280019760132, "losses/total": 0.039463263005018234, "ref_logps/chosen": -25.926837921142578, "ref_logps/rejected": -26.187103271484375, "rewards/accuracies": 0.875, "rewards/chosen": -1.8305364847183228, "rewards/margins": 1.5746755599975586, "rewards/rejected": -3.405211925506592, "step": 1958 }, { "epoch": 1.85, "grad_norm": 15.487041760129884, "learning_rate": 1.6970042463749584e-07, "logps/chosen": -52.151893615722656, "logps/rejected": -88.29170227050781, "loss": 0.1987, "losses/dpo": 0.07037341594696045, "losses/sft": 1.8090219497680664, "losses/total": 0.07037341594696045, "ref_logps/chosen": -32.40631866455078, "ref_logps/rejected": -43.095638275146484, "rewards/accuracies": 0.9375, "rewards/chosen": -1.974557638168335, "rewards/margins": 2.5450491905212402, "rewards/rejected": -4.519607067108154, "step": 1959 }, { "epoch": 1.85, "grad_norm": 11.655996941199502, "learning_rate": 1.6945929207096935e-07, "logps/chosen": -38.821815490722656, "logps/rejected": -97.675537109375, "loss": 0.1134, "losses/dpo": 0.0684918537735939, "losses/sft": 0.6762935519218445, "losses/total": 0.0684918537735939, "ref_logps/chosen": -22.686622619628906, "ref_logps/rejected": -42.090599060058594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6135189533233643, "rewards/margins": 3.9449753761291504, "rewards/rejected": -5.5584940910339355, "step": 1960 }, { "epoch": 1.85, "grad_norm": 21.777163004309166, "learning_rate": 1.692182430813723e-07, "logps/chosen": -56.371559143066406, "logps/rejected": -76.00325775146484, "loss": 0.3499, "losses/dpo": 0.21874606609344482, "losses/sft": 2.5959925651550293, "losses/total": 0.21874606609344482, "ref_logps/chosen": -31.846473693847656, "ref_logps/rejected": -35.06661605834961, "rewards/accuracies": 0.875, "rewards/chosen": -2.4525089263916016, "rewards/margins": 1.6411547660827637, "rewards/rejected": -4.093663692474365, "step": 1961 }, { "epoch": 1.85, "grad_norm": 17.04473019520901, "learning_rate": 1.689772779188408e-07, "logps/chosen": -36.79838562011719, "logps/rejected": -61.6599006652832, "loss": 0.204, "losses/dpo": 0.3649807870388031, "losses/sft": 1.3126417398452759, "losses/total": 0.3649807870388031, "ref_logps/chosen": -25.069820404052734, "ref_logps/rejected": -26.786977767944336, "rewards/accuracies": 1.0, "rewards/chosen": -1.172856330871582, "rewards/margins": 2.3144359588623047, "rewards/rejected": -3.4872922897338867, "step": 1962 }, { "epoch": 1.85, "grad_norm": 20.69182921586025, "learning_rate": 1.687363968334239e-07, "logps/chosen": -38.91547775268555, "logps/rejected": -64.28431701660156, "loss": 0.2546, "losses/dpo": 0.09875140339136124, "losses/sft": 0.03401421755552292, "losses/total": 0.09875140339136124, "ref_logps/chosen": -24.312847137451172, "ref_logps/rejected": -25.54791259765625, "rewards/accuracies": 0.875, "rewards/chosen": -1.4602630138397217, "rewards/margins": 2.4133777618408203, "rewards/rejected": -3.873641014099121, "step": 1963 }, { "epoch": 1.85, "grad_norm": 12.278841144439443, "learning_rate": 1.6849560007508344e-07, "logps/chosen": -33.88705825805664, "logps/rejected": -65.26016235351562, "loss": 0.1652, "losses/dpo": 0.055741701275110245, "losses/sft": 0.9231744408607483, "losses/total": 0.055741701275110245, "ref_logps/chosen": -22.294353485107422, "ref_logps/rejected": -31.20195198059082, "rewards/accuracies": 1.0, "rewards/chosen": -1.1592705249786377, "rewards/margins": 2.246551036834717, "rewards/rejected": -3.4058218002319336, "step": 1964 }, { "epoch": 1.85, "grad_norm": 23.974685726724125, "learning_rate": 1.682548878936937e-07, "logps/chosen": -44.11328125, "logps/rejected": -70.53290557861328, "loss": 0.3044, "losses/dpo": 0.06091180443763733, "losses/sft": 1.8997395038604736, "losses/total": 0.06091180443763733, "ref_logps/chosen": -27.956560134887695, "ref_logps/rejected": -36.019500732421875, "rewards/accuracies": 0.875, "rewards/chosen": -1.6156723499298096, "rewards/margins": 1.8356685638427734, "rewards/rejected": -3.451341152191162, "step": 1965 }, { "epoch": 1.85, "grad_norm": 11.490454077509916, "learning_rate": 1.680142605390412e-07, "logps/chosen": -40.325897216796875, "logps/rejected": -80.85000610351562, "loss": 0.1008, "losses/dpo": 0.014552763663232327, "losses/sft": 1.839357614517212, "losses/total": 0.014552763663232327, "ref_logps/chosen": -25.91600799560547, "ref_logps/rejected": -32.92162322998047, "rewards/accuracies": 1.0, "rewards/chosen": -1.4409891366958618, "rewards/margins": 3.35184907913208, "rewards/rejected": -4.792838096618652, "step": 1966 }, { "epoch": 1.86, "grad_norm": 16.907498980514365, "learning_rate": 1.6777371826082454e-07, "logps/chosen": -42.548954010009766, "logps/rejected": -63.84481430053711, "loss": 0.2383, "losses/dpo": 0.009789708070456982, "losses/sft": 0.31894099712371826, "losses/total": 0.009789708070456982, "ref_logps/chosen": -27.35976791381836, "ref_logps/rejected": -30.402812957763672, "rewards/accuracies": 1.0, "rewards/chosen": -1.518918514251709, "rewards/margins": 1.8252817392349243, "rewards/rejected": -3.344200372695923, "step": 1967 }, { "epoch": 1.86, "grad_norm": 7.019245219042717, "learning_rate": 1.6753326130865387e-07, "logps/chosen": -47.52423858642578, "logps/rejected": -92.04293060302734, "loss": 0.0857, "losses/dpo": 0.24306653439998627, "losses/sft": 0.7671452164649963, "losses/total": 0.24306653439998627, "ref_logps/chosen": -28.54889488220215, "ref_logps/rejected": -42.52732467651367, "rewards/accuracies": 1.0, "rewards/chosen": -1.8975348472595215, "rewards/margins": 3.054025650024414, "rewards/rejected": -4.9515604972839355, "step": 1968 }, { "epoch": 1.86, "grad_norm": 16.615352394573936, "learning_rate": 1.6729288993205087e-07, "logps/chosen": -40.801753997802734, "logps/rejected": -58.00297927856445, "loss": 0.2336, "losses/dpo": 0.15804064273834229, "losses/sft": 1.5629889965057373, "losses/total": 0.15804064273834229, "ref_logps/chosen": -27.923465728759766, "ref_logps/rejected": -26.499879837036133, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2878289222717285, "rewards/margins": 1.862480878829956, "rewards/rejected": -3.1503100395202637, "step": 1969 }, { "epoch": 1.86, "grad_norm": 21.754014939441284, "learning_rate": 1.6705260438044853e-07, "logps/chosen": -40.02710723876953, "logps/rejected": -65.10873413085938, "loss": 0.2849, "losses/dpo": 0.07684829086065292, "losses/sft": 0.8087054491043091, "losses/total": 0.07684829086065292, "ref_logps/chosen": -23.832489013671875, "ref_logps/rejected": -29.88188362121582, "rewards/accuracies": 0.9375, "rewards/chosen": -1.619462013244629, "rewards/margins": 1.9032231569290161, "rewards/rejected": -3.5226852893829346, "step": 1970 }, { "epoch": 1.86, "grad_norm": 20.145203069987918, "learning_rate": 1.668124049031906e-07, "logps/chosen": -45.67802429199219, "logps/rejected": -74.75636291503906, "loss": 0.253, "losses/dpo": 0.7035473585128784, "losses/sft": 1.8513095378875732, "losses/total": 0.7035473585128784, "ref_logps/chosen": -27.71307373046875, "ref_logps/rejected": -32.92552947998047, "rewards/accuracies": 0.875, "rewards/chosen": -1.796494960784912, "rewards/margins": 2.3865883350372314, "rewards/rejected": -4.183083534240723, "step": 1971 }, { "epoch": 1.86, "grad_norm": 21.137688440867446, "learning_rate": 1.6657229174953163e-07, "logps/chosen": -54.91517639160156, "logps/rejected": -75.32865142822266, "loss": 0.2265, "losses/dpo": 0.082246333360672, "losses/sft": 1.9186564683914185, "losses/total": 0.082246333360672, "ref_logps/chosen": -36.11769104003906, "ref_logps/rejected": -33.20167541503906, "rewards/accuracies": 0.875, "rewards/chosen": -1.879748821258545, "rewards/margins": 2.3329484462738037, "rewards/rejected": -4.2126970291137695, "step": 1972 }, { "epoch": 1.86, "grad_norm": 23.142050741260284, "learning_rate": 1.6633226516863655e-07, "logps/chosen": -54.898475646972656, "logps/rejected": -83.28624725341797, "loss": 0.3277, "losses/dpo": 0.2630813717842102, "losses/sft": 1.9650304317474365, "losses/total": 0.2630813717842102, "ref_logps/chosen": -34.184139251708984, "ref_logps/rejected": -43.08917999267578, "rewards/accuracies": 0.875, "rewards/chosen": -2.0714335441589355, "rewards/margins": 1.9482730627059937, "rewards/rejected": -4.019706726074219, "step": 1973 }, { "epoch": 1.86, "grad_norm": 18.115339632420625, "learning_rate": 1.6609232540958044e-07, "logps/chosen": -43.95806121826172, "logps/rejected": -67.36223602294922, "loss": 0.2813, "losses/dpo": 0.1623481810092926, "losses/sft": 0.537017822265625, "losses/total": 0.1623481810092926, "ref_logps/chosen": -29.027103424072266, "ref_logps/rejected": -34.530948638916016, "rewards/accuracies": 0.875, "rewards/chosen": -1.493095874786377, "rewards/margins": 1.7900328636169434, "rewards/rejected": -3.2831287384033203, "step": 1974 }, { "epoch": 1.86, "grad_norm": 15.119712357667028, "learning_rate": 1.6585247272134828e-07, "logps/chosen": -48.02715301513672, "logps/rejected": -80.53358459472656, "loss": 0.1868, "losses/dpo": 0.0821787491440773, "losses/sft": 1.157912015914917, "losses/total": 0.0821787491440773, "ref_logps/chosen": -31.854915618896484, "ref_logps/rejected": -40.22533416748047, "rewards/accuracies": 1.0, "rewards/chosen": -1.6172235012054443, "rewards/margins": 2.413601875305176, "rewards/rejected": -4.030825138092041, "step": 1975 }, { "epoch": 1.86, "grad_norm": 17.526551948997163, "learning_rate": 1.6561270735283466e-07, "logps/chosen": -40.3074951171875, "logps/rejected": -71.66091918945312, "loss": 0.2577, "losses/dpo": 0.005564086139202118, "losses/sft": 0.5699872374534607, "losses/total": 0.005564086139202118, "ref_logps/chosen": -26.89061737060547, "ref_logps/rejected": -33.65739822387695, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3416880369186401, "rewards/margins": 2.4586644172668457, "rewards/rejected": -3.8003525733947754, "step": 1976 }, { "epoch": 1.87, "grad_norm": 30.482009295377523, "learning_rate": 1.6537302955284383e-07, "logps/chosen": -57.02873992919922, "logps/rejected": -81.28375244140625, "loss": 0.4553, "losses/dpo": 0.04349144548177719, "losses/sft": 1.2588069438934326, "losses/total": 0.04349144548177719, "ref_logps/chosen": -35.318729400634766, "ref_logps/rejected": -37.40796661376953, "rewards/accuracies": 0.75, "rewards/chosen": -2.1710009574890137, "rewards/margins": 2.2165772914886475, "rewards/rejected": -4.38757848739624, "step": 1977 }, { "epoch": 1.87, "grad_norm": 31.852696076885763, "learning_rate": 1.651334395700888e-07, "logps/chosen": -51.30027770996094, "logps/rejected": -65.42240905761719, "loss": 0.3578, "losses/dpo": 0.5641307234764099, "losses/sft": 0.8340333104133606, "losses/total": 0.5641307234764099, "ref_logps/chosen": -35.197235107421875, "ref_logps/rejected": -31.924909591674805, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6103038787841797, "rewards/margins": 1.739445686340332, "rewards/rejected": -3.3497495651245117, "step": 1978 }, { "epoch": 1.87, "grad_norm": 22.58986742730435, "learning_rate": 1.6489393765319166e-07, "logps/chosen": -41.06809997558594, "logps/rejected": -67.49652862548828, "loss": 0.268, "losses/dpo": 1.2209234237670898, "losses/sft": 2.4079253673553467, "losses/total": 1.2209234237670898, "ref_logps/chosen": -24.777645111083984, "ref_logps/rejected": -31.572290420532227, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6290453672409058, "rewards/margins": 1.963378667831421, "rewards/rejected": -3.592424154281616, "step": 1979 }, { "epoch": 1.87, "grad_norm": 23.386590266669884, "learning_rate": 1.6465452405068302e-07, "logps/chosen": -39.292762756347656, "logps/rejected": -64.79240417480469, "loss": 0.3419, "losses/dpo": 0.5005916357040405, "losses/sft": 1.655900239944458, "losses/total": 0.5005916357040405, "ref_logps/chosen": -24.972126007080078, "ref_logps/rejected": -32.29897689819336, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4320636987686157, "rewards/margins": 1.81727933883667, "rewards/rejected": -3.249342918395996, "step": 1980 }, { "epoch": 1.87, "grad_norm": 14.808325232355417, "learning_rate": 1.6441519901100186e-07, "logps/chosen": -44.593482971191406, "logps/rejected": -81.48065185546875, "loss": 0.2074, "losses/dpo": 1.8711470365524292, "losses/sft": 1.7626228332519531, "losses/total": 1.8711470365524292, "ref_logps/chosen": -27.228633880615234, "ref_logps/rejected": -38.02665710449219, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7364850044250488, "rewards/margins": 2.608914852142334, "rewards/rejected": -4.345399856567383, "step": 1981 }, { "epoch": 1.87, "grad_norm": 15.477664311265835, "learning_rate": 1.641759627824954e-07, "logps/chosen": -43.56343078613281, "logps/rejected": -72.2051773071289, "loss": 0.175, "losses/dpo": 0.008270912803709507, "losses/sft": 0.7272599935531616, "losses/total": 0.008270912803709507, "ref_logps/chosen": -28.07147979736328, "ref_logps/rejected": -32.378700256347656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5491955280303955, "rewards/margins": 2.4334518909454346, "rewards/rejected": -3.98264741897583, "step": 1982 }, { "epoch": 1.87, "grad_norm": 14.073464396716455, "learning_rate": 1.6393681561341855e-07, "logps/chosen": -43.29547882080078, "logps/rejected": -65.52647399902344, "loss": 0.1974, "losses/dpo": 0.13323654234409332, "losses/sft": 2.1076536178588867, "losses/total": 0.13323654234409332, "ref_logps/chosen": -26.129688262939453, "ref_logps/rejected": -27.808517456054688, "rewards/accuracies": 1.0, "rewards/chosen": -1.7165789604187012, "rewards/margins": 2.0552167892456055, "rewards/rejected": -3.7717955112457275, "step": 1983 }, { "epoch": 1.87, "grad_norm": 27.0066130806619, "learning_rate": 1.6369775775193384e-07, "logps/chosen": -47.91738510131836, "logps/rejected": -72.31123352050781, "loss": 0.3571, "losses/dpo": 0.17499633133411407, "losses/sft": 0.8251637816429138, "losses/total": 0.17499633133411407, "ref_logps/chosen": -28.82666778564453, "ref_logps/rejected": -31.62845230102539, "rewards/accuracies": 0.875, "rewards/chosen": -1.9090718030929565, "rewards/margins": 2.1592068672180176, "rewards/rejected": -4.068278789520264, "step": 1984 }, { "epoch": 1.87, "grad_norm": 18.35374021450797, "learning_rate": 1.634587894461111e-07, "logps/chosen": -46.814353942871094, "logps/rejected": -90.33724975585938, "loss": 0.2123, "losses/dpo": 0.03461301326751709, "losses/sft": 1.1019930839538574, "losses/total": 0.03461301326751709, "ref_logps/chosen": -27.650224685668945, "ref_logps/rejected": -43.31744384765625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9164130687713623, "rewards/margins": 2.7855682373046875, "rewards/rejected": -4.701981067657471, "step": 1985 }, { "epoch": 1.87, "grad_norm": 30.00665687489347, "learning_rate": 1.6321991094392726e-07, "logps/chosen": -50.991695404052734, "logps/rejected": -68.68099975585938, "loss": 0.4022, "losses/dpo": 1.2930673360824585, "losses/sft": 1.7419801950454712, "losses/total": 1.2930673360824585, "ref_logps/chosen": -30.57494354248047, "ref_logps/rejected": -33.38149642944336, "rewards/accuracies": 0.8125, "rewards/chosen": -2.041675090789795, "rewards/margins": 1.4882746934890747, "rewards/rejected": -3.5299501419067383, "step": 1986 }, { "epoch": 1.87, "grad_norm": 22.863815511838744, "learning_rate": 1.6298112249326608e-07, "logps/chosen": -54.306034088134766, "logps/rejected": -69.23176574707031, "loss": 0.3389, "losses/dpo": 0.3699338436126709, "losses/sft": 1.8781682252883911, "losses/total": 0.3699338436126709, "ref_logps/chosen": -34.45021057128906, "ref_logps/rejected": -31.536758422851562, "rewards/accuracies": 0.875, "rewards/chosen": -1.9855823516845703, "rewards/margins": 1.7839176654815674, "rewards/rejected": -3.769500255584717, "step": 1987 }, { "epoch": 1.88, "grad_norm": 26.397249843231027, "learning_rate": 1.627424243419178e-07, "logps/chosen": -40.03387451171875, "logps/rejected": -61.60554504394531, "loss": 0.595, "losses/dpo": 0.6190805435180664, "losses/sft": 1.6847203969955444, "losses/total": 0.6190805435180664, "ref_logps/chosen": -22.384441375732422, "ref_logps/rejected": -27.862232208251953, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7649433612823486, "rewards/margins": 1.6093883514404297, "rewards/rejected": -3.3743317127227783, "step": 1988 }, { "epoch": 1.88, "grad_norm": 29.736698867544867, "learning_rate": 1.6250381673757912e-07, "logps/chosen": -45.23607635498047, "logps/rejected": -62.396785736083984, "loss": 0.5263, "losses/dpo": 0.05627313256263733, "losses/sft": 0.18432050943374634, "losses/total": 0.05627313256263733, "ref_logps/chosen": -27.21544647216797, "ref_logps/rejected": -29.76076889038086, "rewards/accuracies": 0.75, "rewards/chosen": -1.8020634651184082, "rewards/margins": 1.4615387916564941, "rewards/rejected": -3.2636022567749023, "step": 1989 }, { "epoch": 1.88, "grad_norm": 31.75165472524659, "learning_rate": 1.6226529992785255e-07, "logps/chosen": -58.17169952392578, "logps/rejected": -88.0155029296875, "loss": 0.3619, "losses/dpo": 0.25012120604515076, "losses/sft": 2.327420949935913, "losses/total": 0.25012120604515076, "ref_logps/chosen": -37.68841552734375, "ref_logps/rejected": -45.557334899902344, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0483288764953613, "rewards/margins": 2.1974878311157227, "rewards/rejected": -4.245816230773926, "step": 1990 }, { "epoch": 1.88, "grad_norm": 19.389105913266974, "learning_rate": 1.6202687416024653e-07, "logps/chosen": -45.413509368896484, "logps/rejected": -83.91389465332031, "loss": 0.1763, "losses/dpo": 0.04089270904660225, "losses/sft": 1.0478520393371582, "losses/total": 0.04089270904660225, "ref_logps/chosen": -29.10716438293457, "ref_logps/rejected": -41.3154296875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6306343078613281, "rewards/margins": 2.62921142578125, "rewards/rejected": -4.259845733642578, "step": 1991 }, { "epoch": 1.88, "grad_norm": 26.151010917396217, "learning_rate": 1.6178853968217505e-07, "logps/chosen": -41.17626953125, "logps/rejected": -57.8343391418457, "loss": 0.4152, "losses/dpo": 0.5552520751953125, "losses/sft": 1.1133934259414673, "losses/total": 0.5552520751953125, "ref_logps/chosen": -26.188213348388672, "ref_logps/rejected": -23.476993560791016, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4988055229187012, "rewards/margins": 1.9369291067123413, "rewards/rejected": -3.435734748840332, "step": 1992 }, { "epoch": 1.88, "grad_norm": 22.867266180859268, "learning_rate": 1.6155029674095727e-07, "logps/chosen": -48.796119689941406, "logps/rejected": -75.97205352783203, "loss": 0.3301, "losses/dpo": 0.058057013899087906, "losses/sft": 1.637559413909912, "losses/total": 0.058057013899087906, "ref_logps/chosen": -32.505943298339844, "ref_logps/rejected": -39.40492630004883, "rewards/accuracies": 0.8125, "rewards/chosen": -1.629017949104309, "rewards/margins": 2.0276949405670166, "rewards/rejected": -3.6567130088806152, "step": 1993 }, { "epoch": 1.88, "grad_norm": 34.856451915436836, "learning_rate": 1.6131214558381744e-07, "logps/chosen": -53.29895782470703, "logps/rejected": -58.32591247558594, "loss": 0.6041, "losses/dpo": 0.6234641075134277, "losses/sft": 1.243798017501831, "losses/total": 0.6234641075134277, "ref_logps/chosen": -36.57881164550781, "ref_logps/rejected": -27.15228271484375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6720147132873535, "rewards/margins": 1.445347785949707, "rewards/rejected": -3.1173624992370605, "step": 1994 }, { "epoch": 1.88, "grad_norm": 14.320706611091385, "learning_rate": 1.6107408645788452e-07, "logps/chosen": -56.52404022216797, "logps/rejected": -79.8743667602539, "loss": 0.1878, "losses/dpo": 0.11953239887952805, "losses/sft": 1.849951982498169, "losses/total": 0.11953239887952805, "ref_logps/chosen": -37.535118103027344, "ref_logps/rejected": -38.624237060546875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8988925218582153, "rewards/margins": 2.2261204719543457, "rewards/rejected": -4.12501335144043, "step": 1995 }, { "epoch": 1.88, "grad_norm": 29.78376646110692, "learning_rate": 1.6083611961019195e-07, "logps/chosen": -47.458518981933594, "logps/rejected": -71.12018585205078, "loss": 0.5059, "losses/dpo": 0.036343902349472046, "losses/sft": 2.0232772827148438, "losses/total": 0.036343902349472046, "ref_logps/chosen": -27.14342498779297, "ref_logps/rejected": -34.798091888427734, "rewards/accuracies": 0.75, "rewards/chosen": -2.0315098762512207, "rewards/margins": 1.6006999015808105, "rewards/rejected": -3.6322097778320312, "step": 1996 }, { "epoch": 1.88, "grad_norm": 20.073481794389128, "learning_rate": 1.6059824528767746e-07, "logps/chosen": -43.73548126220703, "logps/rejected": -62.76807403564453, "loss": 0.3078, "losses/dpo": 0.02291620336472988, "losses/sft": 1.7513556480407715, "losses/total": 0.02291620336472988, "ref_logps/chosen": -26.068531036376953, "ref_logps/rejected": -29.185165405273438, "rewards/accuracies": 0.9375, "rewards/chosen": -1.766695261001587, "rewards/margins": 1.5915956497192383, "rewards/rejected": -3.358290910720825, "step": 1997 }, { "epoch": 1.88, "grad_norm": 19.269632152003663, "learning_rate": 1.603604637371827e-07, "logps/chosen": -49.75260925292969, "logps/rejected": -89.78598022460938, "loss": 0.214, "losses/dpo": 0.057665009051561356, "losses/sft": 1.7332943677902222, "losses/total": 0.057665009051561356, "ref_logps/chosen": -36.07408142089844, "ref_logps/rejected": -47.98749542236328, "rewards/accuracies": 0.875, "rewards/chosen": -1.3678529262542725, "rewards/margins": 2.8119964599609375, "rewards/rejected": -4.179849624633789, "step": 1998 }, { "epoch": 1.89, "grad_norm": 10.6464421425134, "learning_rate": 1.6012277520545326e-07, "logps/chosen": -41.8365478515625, "logps/rejected": -87.7524185180664, "loss": 0.1189, "losses/dpo": 0.514366865158081, "losses/sft": 1.8831422328948975, "losses/total": 0.514366865158081, "ref_logps/chosen": -25.856496810913086, "ref_logps/rejected": -42.62345504760742, "rewards/accuracies": 1.0, "rewards/chosen": -1.5980050563812256, "rewards/margins": 2.9148917198181152, "rewards/rejected": -4.512896537780762, "step": 1999 }, { "epoch": 1.89, "grad_norm": 19.553937475666142, "learning_rate": 1.5988517993913793e-07, "logps/chosen": -48.471763610839844, "logps/rejected": -86.09092712402344, "loss": 0.178, "losses/dpo": 0.008564875461161137, "losses/sft": 1.414162516593933, "losses/total": 0.008564875461161137, "ref_logps/chosen": -30.975313186645508, "ref_logps/rejected": -40.318450927734375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7496452331542969, "rewards/margins": 2.8276028633117676, "rewards/rejected": -4.5772480964660645, "step": 2000 }, { "epoch": 1.89, "grad_norm": 11.32639164165444, "learning_rate": 1.5964767818478886e-07, "logps/chosen": -56.16606521606445, "logps/rejected": -104.53165435791016, "loss": 0.1443, "losses/dpo": 0.0009719761437736452, "losses/sft": 1.7750164270401, "losses/total": 0.0009719761437736452, "ref_logps/chosen": -40.9307975769043, "ref_logps/rejected": -52.05905532836914, "rewards/accuracies": 0.9375, "rewards/chosen": -1.523526906967163, "rewards/margins": 3.7237329483032227, "rewards/rejected": -5.247260093688965, "step": 2001 }, { "epoch": 1.89, "grad_norm": 15.284754377530533, "learning_rate": 1.594102701888611e-07, "logps/chosen": -45.7064094543457, "logps/rejected": -63.914581298828125, "loss": 0.254, "losses/dpo": 0.1547994762659073, "losses/sft": 1.5940495729446411, "losses/total": 0.1547994762659073, "ref_logps/chosen": -28.470767974853516, "ref_logps/rejected": -27.954669952392578, "rewards/accuracies": 0.875, "rewards/chosen": -1.7235643863677979, "rewards/margins": 1.8724267482757568, "rewards/rejected": -3.5959911346435547, "step": 2002 }, { "epoch": 1.89, "grad_norm": 18.688194622531018, "learning_rate": 1.5917295619771254e-07, "logps/chosen": -38.362369537353516, "logps/rejected": -74.07752990722656, "loss": 0.2285, "losses/dpo": 0.05395171791315079, "losses/sft": 0.9161611795425415, "losses/total": 0.05395171791315079, "ref_logps/chosen": -23.189971923828125, "ref_logps/rejected": -33.839393615722656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5172396898269653, "rewards/margins": 2.5065739154815674, "rewards/rejected": -4.023813724517822, "step": 2003 }, { "epoch": 1.89, "grad_norm": 20.699241451643186, "learning_rate": 1.5893573645760337e-07, "logps/chosen": -29.464481353759766, "logps/rejected": -56.03434753417969, "loss": 0.3382, "losses/dpo": 0.027344144880771637, "losses/sft": 0.7373000979423523, "losses/total": 0.027344144880771637, "ref_logps/chosen": -17.061241149902344, "ref_logps/rejected": -24.415372848510742, "rewards/accuracies": 0.875, "rewards/chosen": -1.2403241395950317, "rewards/margins": 1.9215734004974365, "rewards/rejected": -3.1618974208831787, "step": 2004 }, { "epoch": 1.89, "grad_norm": 12.765071305763318, "learning_rate": 1.5869861121469601e-07, "logps/chosen": -41.33045196533203, "logps/rejected": -91.15283203125, "loss": 0.1746, "losses/dpo": 0.3530893325805664, "losses/sft": 1.051767110824585, "losses/total": 0.3530893325805664, "ref_logps/chosen": -28.19598960876465, "ref_logps/rejected": -45.06828308105469, "rewards/accuracies": 0.9375, "rewards/chosen": -1.313446283340454, "rewards/margins": 3.295008659362793, "rewards/rejected": -4.608454704284668, "step": 2005 }, { "epoch": 1.89, "grad_norm": 16.09856224705603, "learning_rate": 1.5846158071505488e-07, "logps/chosen": -46.62800979614258, "logps/rejected": -64.25411987304688, "loss": 0.2601, "losses/dpo": 0.06868055462837219, "losses/sft": 1.7465884685516357, "losses/total": 0.06868055462837219, "ref_logps/chosen": -29.991640090942383, "ref_logps/rejected": -30.4559326171875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6636371612548828, "rewards/margins": 1.716181755065918, "rewards/rejected": -3.379818916320801, "step": 2006 }, { "epoch": 1.89, "grad_norm": 25.445559916817274, "learning_rate": 1.58224645204646e-07, "logps/chosen": -40.09779357910156, "logps/rejected": -72.59436798095703, "loss": 0.3819, "losses/dpo": 4.838615859625861e-05, "losses/sft": 0.431045800447464, "losses/total": 4.838615859625861e-05, "ref_logps/chosen": -22.753496170043945, "ref_logps/rejected": -29.885913848876953, "rewards/accuracies": 0.8125, "rewards/chosen": -1.734429955482483, "rewards/margins": 2.5364153385162354, "rewards/rejected": -4.270845413208008, "step": 2007 }, { "epoch": 1.89, "grad_norm": 22.45781163245945, "learning_rate": 1.5798780492933694e-07, "logps/chosen": -49.08796310424805, "logps/rejected": -71.93032836914062, "loss": 0.211, "losses/dpo": 0.09610185027122498, "losses/sft": 1.957557201385498, "losses/total": 0.09610185027122498, "ref_logps/chosen": -29.88048553466797, "ref_logps/rejected": -32.22974395751953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9207477569580078, "rewards/margins": 2.0493106842041016, "rewards/rejected": -3.9700584411621094, "step": 2008 }, { "epoch": 1.9, "grad_norm": 26.484941997218566, "learning_rate": 1.5775106013489638e-07, "logps/chosen": -42.87336730957031, "logps/rejected": -65.04397583007812, "loss": 0.3524, "losses/dpo": 0.24398000538349152, "losses/sft": 1.9263685941696167, "losses/total": 0.24398000538349152, "ref_logps/chosen": -24.744625091552734, "ref_logps/rejected": -29.16967010498047, "rewards/accuracies": 0.875, "rewards/chosen": -1.8128741979599, "rewards/margins": 1.7745566368103027, "rewards/rejected": -3.587430477142334, "step": 2009 }, { "epoch": 1.9, "grad_norm": 18.95140615320338, "learning_rate": 1.5751441106699387e-07, "logps/chosen": -49.45009231567383, "logps/rejected": -70.81902313232422, "loss": 0.2488, "losses/dpo": 0.10524746775627136, "losses/sft": 1.5512540340423584, "losses/total": 0.10524746775627136, "ref_logps/chosen": -30.821069717407227, "ref_logps/rejected": -32.331295013427734, "rewards/accuracies": 0.9375, "rewards/chosen": -1.862902283668518, "rewards/margins": 1.9858707189559937, "rewards/rejected": -3.8487730026245117, "step": 2010 }, { "epoch": 1.9, "grad_norm": 18.517279244230537, "learning_rate": 1.5727785797119966e-07, "logps/chosen": -46.03853225708008, "logps/rejected": -83.97383117675781, "loss": 0.1951, "losses/dpo": 1.029597282409668, "losses/sft": 2.280531883239746, "losses/total": 1.029597282409668, "ref_logps/chosen": -28.3820743560791, "ref_logps/rejected": -38.3809928894043, "rewards/accuracies": 0.875, "rewards/chosen": -1.7656457424163818, "rewards/margins": 2.7936387062072754, "rewards/rejected": -4.5592851638793945, "step": 2011 }, { "epoch": 1.9, "grad_norm": 18.715662050734917, "learning_rate": 1.5704140109298445e-07, "logps/chosen": -52.61248779296875, "logps/rejected": -90.28121948242188, "loss": 0.2143, "losses/dpo": 0.050776515156030655, "losses/sft": 1.183617353439331, "losses/total": 0.050776515156030655, "ref_logps/chosen": -29.33259391784668, "ref_logps/rejected": -43.36280822753906, "rewards/accuracies": 0.875, "rewards/chosen": -2.327989101409912, "rewards/margins": 2.363851547241211, "rewards/rejected": -4.691841125488281, "step": 2012 }, { "epoch": 1.9, "grad_norm": 14.26279106996491, "learning_rate": 1.5680504067771905e-07, "logps/chosen": -40.75905227661133, "logps/rejected": -79.26350402832031, "loss": 0.1801, "losses/dpo": 0.09930313378572464, "losses/sft": 1.6016701459884644, "losses/total": 0.09930313378572464, "ref_logps/chosen": -26.103670120239258, "ref_logps/rejected": -37.21332931518555, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4655382633209229, "rewards/margins": 2.7394795417785645, "rewards/rejected": -4.205018043518066, "step": 2013 }, { "epoch": 1.9, "grad_norm": 24.29989483598445, "learning_rate": 1.5656877697067417e-07, "logps/chosen": -50.000732421875, "logps/rejected": -65.05966186523438, "loss": 0.4067, "losses/dpo": 0.4450518488883972, "losses/sft": 1.1336594820022583, "losses/total": 0.4450518488883972, "ref_logps/chosen": -33.29244613647461, "ref_logps/rejected": -29.38322639465332, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6708290576934814, "rewards/margins": 1.8968150615692139, "rewards/rejected": -3.5676441192626953, "step": 2014 }, { "epoch": 1.9, "grad_norm": 15.051111093878587, "learning_rate": 1.5633261021702013e-07, "logps/chosen": -44.32561492919922, "logps/rejected": -67.62644958496094, "loss": 0.2221, "losses/dpo": 0.4050869345664978, "losses/sft": 1.5230646133422852, "losses/total": 0.4050869345664978, "ref_logps/chosen": -23.433000564575195, "ref_logps/rejected": -27.8021240234375, "rewards/accuracies": 1.0, "rewards/chosen": -2.089261531829834, "rewards/margins": 1.893170952796936, "rewards/rejected": -3.9824326038360596, "step": 2015 }, { "epoch": 1.9, "grad_norm": 24.46666364835858, "learning_rate": 1.560965406618268e-07, "logps/chosen": -42.75065231323242, "logps/rejected": -72.89776611328125, "loss": 0.2735, "losses/dpo": 0.5494415163993835, "losses/sft": 1.1628432273864746, "losses/total": 0.5494415163993835, "ref_logps/chosen": -25.093109130859375, "ref_logps/rejected": -35.21197509765625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7657544612884521, "rewards/margins": 2.0028247833251953, "rewards/rejected": -3.7685794830322266, "step": 2016 }, { "epoch": 1.9, "grad_norm": 26.323626993157553, "learning_rate": 1.5586056855006308e-07, "logps/chosen": -45.37520980834961, "logps/rejected": -67.54296875, "loss": 0.3163, "losses/dpo": 0.07151717692613602, "losses/sft": 0.4426971971988678, "losses/total": 0.07151717692613602, "ref_logps/chosen": -27.320280075073242, "ref_logps/rejected": -32.439849853515625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8054931163787842, "rewards/margins": 1.7048184871673584, "rewards/rejected": -3.5103116035461426, "step": 2017 }, { "epoch": 1.9, "grad_norm": 15.412021602803502, "learning_rate": 1.556246941265967e-07, "logps/chosen": -41.11048889160156, "logps/rejected": -64.95111083984375, "loss": 0.2401, "losses/dpo": 0.019503260031342506, "losses/sft": 2.0723211765289307, "losses/total": 0.019503260031342506, "ref_logps/chosen": -24.51816177368164, "ref_logps/rejected": -29.899595260620117, "rewards/accuracies": 0.875, "rewards/chosen": -1.6592328548431396, "rewards/margins": 1.845919132232666, "rewards/rejected": -3.5051517486572266, "step": 2018 }, { "epoch": 1.9, "grad_norm": 14.795113692908703, "learning_rate": 1.55388917636194e-07, "logps/chosen": -51.95750427246094, "logps/rejected": -74.31239318847656, "loss": 0.2171, "losses/dpo": 0.3364966809749603, "losses/sft": 1.7880414724349976, "losses/total": 0.3364966809749603, "ref_logps/chosen": -35.217491149902344, "ref_logps/rejected": -32.500091552734375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6740014553070068, "rewards/margins": 2.5072293281555176, "rewards/rejected": -4.181230545043945, "step": 2019 }, { "epoch": 1.91, "grad_norm": 23.30632222796063, "learning_rate": 1.5515323932351993e-07, "logps/chosen": -51.039554595947266, "logps/rejected": -68.42231750488281, "loss": 0.4006, "losses/dpo": 0.16646626591682434, "losses/sft": 1.8105448484420776, "losses/total": 0.16646626591682434, "ref_logps/chosen": -30.157556533813477, "ref_logps/rejected": -31.209680557250977, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0881996154785156, "rewards/margins": 1.6330647468566895, "rewards/rejected": -3.721264362335205, "step": 2020 }, { "epoch": 1.91, "grad_norm": 18.303908446242104, "learning_rate": 1.5491765943313728e-07, "logps/chosen": -44.7973747253418, "logps/rejected": -69.817138671875, "loss": 0.2575, "losses/dpo": 0.08076605200767517, "losses/sft": 2.036306142807007, "losses/total": 0.08076605200767517, "ref_logps/chosen": -29.42617416381836, "ref_logps/rejected": -31.175779342651367, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5371203422546387, "rewards/margins": 2.3270153999328613, "rewards/rejected": -3.8641357421875, "step": 2021 }, { "epoch": 1.91, "grad_norm": 24.20842038328497, "learning_rate": 1.5468217820950687e-07, "logps/chosen": -49.21734619140625, "logps/rejected": -65.1997299194336, "loss": 0.3386, "losses/dpo": 0.1329592764377594, "losses/sft": 0.6883078217506409, "losses/total": 0.1329592764377594, "ref_logps/chosen": -29.899028778076172, "ref_logps/rejected": -31.335105895996094, "rewards/accuracies": 0.875, "rewards/chosen": -1.9318320751190186, "rewards/margins": 1.4546302556991577, "rewards/rejected": -3.386462450027466, "step": 2022 }, { "epoch": 1.91, "grad_norm": 24.790467765654547, "learning_rate": 1.5444679589698706e-07, "logps/chosen": -49.40947723388672, "logps/rejected": -86.73258972167969, "loss": 0.411, "losses/dpo": 0.0029598199762403965, "losses/sft": 0.6719064116477966, "losses/total": 0.0029598199762403965, "ref_logps/chosen": -33.655601501464844, "ref_logps/rejected": -43.06786346435547, "rewards/accuracies": 0.75, "rewards/chosen": -1.5753881931304932, "rewards/margins": 2.791084051132202, "rewards/rejected": -4.366472244262695, "step": 2023 }, { "epoch": 1.91, "grad_norm": 22.086936646221236, "learning_rate": 1.5421151273983358e-07, "logps/chosen": -40.319297790527344, "logps/rejected": -76.96699523925781, "loss": 0.3002, "losses/dpo": 0.0032815358135849237, "losses/sft": 4.080586910247803, "losses/total": 0.0032815358135849237, "ref_logps/chosen": -23.722251892089844, "ref_logps/rejected": -37.601810455322266, "rewards/accuracies": 0.875, "rewards/chosen": -1.6597044467926025, "rewards/margins": 2.2768139839172363, "rewards/rejected": -3.936518669128418, "step": 2024 }, { "epoch": 1.91, "grad_norm": 36.913649783702276, "learning_rate": 1.5397632898219937e-07, "logps/chosen": -53.35273742675781, "logps/rejected": -61.490379333496094, "loss": 0.6619, "losses/dpo": 0.01732526905834675, "losses/sft": 1.729949951171875, "losses/total": 0.01732526905834675, "ref_logps/chosen": -32.521202087402344, "ref_logps/rejected": -28.873327255249023, "rewards/accuracies": 0.5625, "rewards/chosen": -2.08315372467041, "rewards/margins": 1.178551435470581, "rewards/rejected": -3.261705160140991, "step": 2025 }, { "epoch": 1.91, "grad_norm": 19.724344821035025, "learning_rate": 1.537412448681341e-07, "logps/chosen": -47.29255676269531, "logps/rejected": -86.06138610839844, "loss": 0.248, "losses/dpo": 0.6243476271629333, "losses/sft": 1.2413771152496338, "losses/total": 0.6243476271629333, "ref_logps/chosen": -30.30844497680664, "ref_logps/rejected": -45.302978515625, "rewards/accuracies": 0.875, "rewards/chosen": -1.698411226272583, "rewards/margins": 2.377429485321045, "rewards/rejected": -4.075840950012207, "step": 2026 }, { "epoch": 1.91, "grad_norm": 13.02267137218712, "learning_rate": 1.5350626064158406e-07, "logps/chosen": -42.657249450683594, "logps/rejected": -75.00213623046875, "loss": 0.1806, "losses/dpo": 0.7024914026260376, "losses/sft": 0.5547789335250854, "losses/total": 0.7024914026260376, "ref_logps/chosen": -25.37103843688965, "ref_logps/rejected": -33.32603073120117, "rewards/accuracies": 0.875, "rewards/chosen": -1.728621482849121, "rewards/margins": 2.4389896392822266, "rewards/rejected": -4.167611122131348, "step": 2027 }, { "epoch": 1.91, "grad_norm": 28.95220516390695, "learning_rate": 1.5327137654639187e-07, "logps/chosen": -48.52608871459961, "logps/rejected": -61.718875885009766, "loss": 0.5031, "losses/dpo": 0.749407947063446, "losses/sft": 1.9931141138076782, "losses/total": 0.749407947063446, "ref_logps/chosen": -31.83425521850586, "ref_logps/rejected": -29.927505493164062, "rewards/accuracies": 0.625, "rewards/chosen": -1.669183611869812, "rewards/margins": 1.5099536180496216, "rewards/rejected": -3.1791372299194336, "step": 2028 }, { "epoch": 1.91, "grad_norm": 10.43636395409377, "learning_rate": 1.530365928262964e-07, "logps/chosen": -43.15977478027344, "logps/rejected": -78.22032165527344, "loss": 0.1438, "losses/dpo": 0.012706398963928223, "losses/sft": 1.241773247718811, "losses/total": 0.012706398963928223, "ref_logps/chosen": -28.21371841430664, "ref_logps/rejected": -36.786922454833984, "rewards/accuracies": 1.0, "rewards/chosen": -1.494605302810669, "rewards/margins": 2.6487343311309814, "rewards/rejected": -4.14333963394165, "step": 2029 }, { "epoch": 1.92, "grad_norm": 25.039130387642004, "learning_rate": 1.5280190972493212e-07, "logps/chosen": -50.923675537109375, "logps/rejected": -62.16432189941406, "loss": 0.3878, "losses/dpo": 0.3462701737880707, "losses/sft": 1.2446174621582031, "losses/total": 0.3462701737880707, "ref_logps/chosen": -28.85210609436035, "ref_logps/rejected": -26.62311553955078, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2071571350097656, "rewards/margins": 1.346963882446289, "rewards/rejected": -3.5541210174560547, "step": 2030 }, { "epoch": 1.92, "grad_norm": 17.99071614626062, "learning_rate": 1.525673274858293e-07, "logps/chosen": -55.714569091796875, "logps/rejected": -67.367431640625, "loss": 0.2205, "losses/dpo": 0.044063717126846313, "losses/sft": 1.195931077003479, "losses/total": 0.044063717126846313, "ref_logps/chosen": -37.35388946533203, "ref_logps/rejected": -29.0621280670166, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8360679149627686, "rewards/margins": 1.9944626092910767, "rewards/rejected": -3.8305304050445557, "step": 2031 }, { "epoch": 1.92, "grad_norm": 6.060405198405576, "learning_rate": 1.5233284635241334e-07, "logps/chosen": -51.898895263671875, "logps/rejected": -88.03495788574219, "loss": 0.0798, "losses/dpo": 0.5104496479034424, "losses/sft": 1.3727504014968872, "losses/total": 0.5104496479034424, "ref_logps/chosen": -34.18336486816406, "ref_logps/rejected": -38.48495101928711, "rewards/accuracies": 1.0, "rewards/chosen": -1.7715530395507812, "rewards/margins": 3.18344783782959, "rewards/rejected": -4.955000877380371, "step": 2032 }, { "epoch": 1.92, "grad_norm": 14.034299198665632, "learning_rate": 1.520984665680049e-07, "logps/chosen": -34.39985656738281, "logps/rejected": -55.73679733276367, "loss": 0.2207, "losses/dpo": 0.17298026382923126, "losses/sft": 1.7528618574142456, "losses/total": 0.17298026382923126, "ref_logps/chosen": -20.413951873779297, "ref_logps/rejected": -22.92814826965332, "rewards/accuracies": 1.0, "rewards/chosen": -1.3985905647277832, "rewards/margins": 1.8822743892669678, "rewards/rejected": -3.28086519241333, "step": 2033 }, { "epoch": 1.92, "grad_norm": 24.596718019955183, "learning_rate": 1.5186418837581942e-07, "logps/chosen": -53.02325439453125, "logps/rejected": -56.17415237426758, "loss": 0.3342, "losses/dpo": 0.35616743564605713, "losses/sft": 1.7251025438308716, "losses/total": 0.35616743564605713, "ref_logps/chosen": -37.56336975097656, "ref_logps/rejected": -23.907032012939453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5459885597229004, "rewards/margins": 1.6807239055633545, "rewards/rejected": -3.226712226867676, "step": 2034 }, { "epoch": 1.92, "grad_norm": 21.91827976914198, "learning_rate": 1.5163001201896698e-07, "logps/chosen": -56.51097869873047, "logps/rejected": -77.85005950927734, "loss": 0.3192, "losses/dpo": 0.004371207673102617, "losses/sft": 0.9571511149406433, "losses/total": 0.004371207673102617, "ref_logps/chosen": -38.79836654663086, "ref_logps/rejected": -38.36852264404297, "rewards/accuracies": 0.75, "rewards/chosen": -1.7712615728378296, "rewards/margins": 2.1768925189971924, "rewards/rejected": -3.9481542110443115, "step": 2035 }, { "epoch": 1.92, "grad_norm": 11.109632399657462, "learning_rate": 1.513959377404518e-07, "logps/chosen": -41.25252151489258, "logps/rejected": -83.13818359375, "loss": 0.1361, "losses/dpo": 0.17310231924057007, "losses/sft": 1.3081846237182617, "losses/total": 0.17310231924057007, "ref_logps/chosen": -27.750267028808594, "ref_logps/rejected": -40.264286041259766, "rewards/accuracies": 1.0, "rewards/chosen": -1.3502253293991089, "rewards/margins": 2.937164783477783, "rewards/rejected": -4.287390232086182, "step": 2036 }, { "epoch": 1.92, "grad_norm": 23.86468035571031, "learning_rate": 1.5116196578317224e-07, "logps/chosen": -52.18598556518555, "logps/rejected": -58.59422302246094, "loss": 0.3454, "losses/dpo": 0.8287590742111206, "losses/sft": 2.111309289932251, "losses/total": 0.8287590742111206, "ref_logps/chosen": -30.797447204589844, "ref_logps/rejected": -23.710208892822266, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1388540267944336, "rewards/margins": 1.3495469093322754, "rewards/rejected": -3.488400936126709, "step": 2037 }, { "epoch": 1.92, "grad_norm": 27.40917903331593, "learning_rate": 1.5092809638992068e-07, "logps/chosen": -48.78547668457031, "logps/rejected": -78.62060546875, "loss": 0.3562, "losses/dpo": 0.5910108089447021, "losses/sft": 2.2056498527526855, "losses/total": 0.5910108089447021, "ref_logps/chosen": -29.05968475341797, "ref_logps/rejected": -40.015403747558594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.972579002380371, "rewards/margins": 1.8879406452178955, "rewards/rejected": -3.8605196475982666, "step": 2038 }, { "epoch": 1.92, "grad_norm": 28.811867906712752, "learning_rate": 1.506943298033828e-07, "logps/chosen": -46.67124938964844, "logps/rejected": -76.5169448852539, "loss": 0.3535, "losses/dpo": 1.0646964311599731, "losses/sft": 2.4913930892944336, "losses/total": 1.0646964311599731, "ref_logps/chosen": -28.4122257232666, "ref_logps/rejected": -30.441837310791016, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8259022235870361, "rewards/margins": 2.781608819961548, "rewards/rejected": -4.607511520385742, "step": 2039 }, { "epoch": 1.92, "grad_norm": 18.76586502718462, "learning_rate": 1.504606662661378e-07, "logps/chosen": -46.97267150878906, "logps/rejected": -65.39550018310547, "loss": 0.2929, "losses/dpo": 0.21371899545192719, "losses/sft": 1.5303648710250854, "losses/total": 0.21371899545192719, "ref_logps/chosen": -29.012317657470703, "ref_logps/rejected": -31.02849006652832, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7960349321365356, "rewards/margins": 1.6406660079956055, "rewards/rejected": -3.4367008209228516, "step": 2040 }, { "epoch": 1.93, "grad_norm": 22.561390068849867, "learning_rate": 1.5022710602065773e-07, "logps/chosen": -43.94960021972656, "logps/rejected": -62.200103759765625, "loss": 0.3225, "losses/dpo": 0.3121010363101959, "losses/sft": 1.447585940361023, "losses/total": 0.3121010363101959, "ref_logps/chosen": -30.889301300048828, "ref_logps/rejected": -31.15484619140625, "rewards/accuracies": 0.875, "rewards/chosen": -1.3060302734375, "rewards/margins": 1.7984955310821533, "rewards/rejected": -3.104525566101074, "step": 2041 }, { "epoch": 1.93, "grad_norm": 16.04530581155448, "learning_rate": 1.499936493093077e-07, "logps/chosen": -40.90399932861328, "logps/rejected": -69.10250091552734, "loss": 0.2374, "losses/dpo": 0.08359568566083908, "losses/sft": 0.3659796416759491, "losses/total": 0.08359568566083908, "ref_logps/chosen": -26.890758514404297, "ref_logps/rejected": -33.66444778442383, "rewards/accuracies": 0.9375, "rewards/chosen": -1.401323914527893, "rewards/margins": 2.142481565475464, "rewards/rejected": -3.5438053607940674, "step": 2042 }, { "epoch": 1.93, "grad_norm": 26.24858429633127, "learning_rate": 1.4976029637434523e-07, "logps/chosen": -48.98802947998047, "logps/rejected": -71.66705322265625, "loss": 0.3968, "losses/dpo": 0.5897125005722046, "losses/sft": 1.7881121635437012, "losses/total": 0.5897125005722046, "ref_logps/chosen": -31.194629669189453, "ref_logps/rejected": -34.72300338745117, "rewards/accuracies": 0.875, "rewards/chosen": -1.7793400287628174, "rewards/margins": 1.9150649309158325, "rewards/rejected": -3.6944050788879395, "step": 2043 }, { "epoch": 1.93, "grad_norm": 17.579812720189256, "learning_rate": 1.4952704745792022e-07, "logps/chosen": -44.73838424682617, "logps/rejected": -72.42323303222656, "loss": 0.2379, "losses/dpo": 0.02086528390645981, "losses/sft": 1.6990315914154053, "losses/total": 0.02086528390645981, "ref_logps/chosen": -29.999895095825195, "ref_logps/rejected": -35.272457122802734, "rewards/accuracies": 0.875, "rewards/chosen": -1.473848819732666, "rewards/margins": 2.2412285804748535, "rewards/rejected": -3.7150774002075195, "step": 2044 }, { "epoch": 1.93, "grad_norm": 26.62711576049696, "learning_rate": 1.4929390280207456e-07, "logps/chosen": -51.51735305786133, "logps/rejected": -77.24776458740234, "loss": 0.4111, "losses/dpo": 0.39612677693367004, "losses/sft": 0.8930034637451172, "losses/total": 0.39612677693367004, "ref_logps/chosen": -30.619415283203125, "ref_logps/rejected": -38.61805725097656, "rewards/accuracies": 0.75, "rewards/chosen": -2.0897939205169678, "rewards/margins": 1.773177146911621, "rewards/rejected": -3.862971305847168, "step": 2045 }, { "epoch": 1.93, "grad_norm": 18.655484194362533, "learning_rate": 1.4906086264874197e-07, "logps/chosen": -42.427284240722656, "logps/rejected": -58.76319122314453, "loss": 0.3987, "losses/dpo": 0.07748696208000183, "losses/sft": 0.9273234009742737, "losses/total": 0.07748696208000183, "ref_logps/chosen": -21.669841766357422, "ref_logps/rejected": -25.254915237426758, "rewards/accuracies": 0.875, "rewards/chosen": -2.075744152069092, "rewards/margins": 1.275083303451538, "rewards/rejected": -3.350827693939209, "step": 2046 }, { "epoch": 1.93, "grad_norm": 14.444403788697652, "learning_rate": 1.4882792723974777e-07, "logps/chosen": -43.91203308105469, "logps/rejected": -73.49759674072266, "loss": 0.2121, "losses/dpo": 0.5780253410339355, "losses/sft": 1.8042914867401123, "losses/total": 0.5780253410339355, "ref_logps/chosen": -25.55653953552246, "ref_logps/rejected": -33.985897064208984, "rewards/accuracies": 0.9375, "rewards/chosen": -1.835549235343933, "rewards/margins": 2.1156210899353027, "rewards/rejected": -3.9511704444885254, "step": 2047 }, { "epoch": 1.93, "grad_norm": 20.55362234051078, "learning_rate": 1.4859509681680858e-07, "logps/chosen": -39.44248580932617, "logps/rejected": -64.29849243164062, "loss": 0.3127, "losses/dpo": 0.022216537967324257, "losses/sft": 1.501570224761963, "losses/total": 0.022216537967324257, "ref_logps/chosen": -23.628646850585938, "ref_logps/rejected": -27.835987091064453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5813837051391602, "rewards/margins": 2.064866542816162, "rewards/rejected": -3.6462502479553223, "step": 2048 }, { "epoch": 1.93, "grad_norm": 13.828093321652753, "learning_rate": 1.48362371621532e-07, "logps/chosen": -48.75645446777344, "logps/rejected": -77.08427429199219, "loss": 0.2175, "losses/dpo": 0.27954915165901184, "losses/sft": 3.214576482772827, "losses/total": 0.27954915165901184, "ref_logps/chosen": -28.660062789916992, "ref_logps/rejected": -34.442161560058594, "rewards/accuracies": 0.9375, "rewards/chosen": -2.009639263153076, "rewards/margins": 2.254572629928589, "rewards/rejected": -4.264211654663086, "step": 2049 }, { "epoch": 1.93, "grad_norm": 25.851060550541757, "learning_rate": 1.4812975189541652e-07, "logps/chosen": -45.37389373779297, "logps/rejected": -78.8575668334961, "loss": 0.4145, "losses/dpo": 0.3212563395500183, "losses/sft": 1.9797890186309814, "losses/total": 0.3212563395500183, "ref_logps/chosen": -27.667667388916016, "ref_logps/rejected": -40.26921844482422, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7706226110458374, "rewards/margins": 2.088212490081787, "rewards/rejected": -3.858835220336914, "step": 2050 }, { "epoch": 1.93, "grad_norm": 20.823199854070197, "learning_rate": 1.478972378798512e-07, "logps/chosen": -50.437530517578125, "logps/rejected": -84.15351867675781, "loss": 0.3358, "losses/dpo": 0.08847317844629288, "losses/sft": 1.5101954936981201, "losses/total": 0.08847317844629288, "ref_logps/chosen": -29.951679229736328, "ref_logps/rejected": -42.64971160888672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0485851764678955, "rewards/margins": 2.1017956733703613, "rewards/rejected": -4.150381088256836, "step": 2051 }, { "epoch": 1.94, "grad_norm": 12.40317835101405, "learning_rate": 1.4766482981611535e-07, "logps/chosen": -45.703147888183594, "logps/rejected": -77.82890319824219, "loss": 0.1321, "losses/dpo": 0.021521229296922684, "losses/sft": 1.2511379718780518, "losses/total": 0.021521229296922684, "ref_logps/chosen": -29.348228454589844, "ref_logps/rejected": -33.750885009765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6354918479919434, "rewards/margins": 2.7723093032836914, "rewards/rejected": -4.407801151275635, "step": 2052 }, { "epoch": 1.94, "grad_norm": 9.4392031722414, "learning_rate": 1.4743252794537834e-07, "logps/chosen": -42.19769287109375, "logps/rejected": -83.16270446777344, "loss": 0.1238, "losses/dpo": 0.12927737832069397, "losses/sft": 1.6858614683151245, "losses/total": 0.12927737832069397, "ref_logps/chosen": -27.23973846435547, "ref_logps/rejected": -39.18828201293945, "rewards/accuracies": 1.0, "rewards/chosen": -1.4957956075668335, "rewards/margins": 2.9016475677490234, "rewards/rejected": -4.397442817687988, "step": 2053 }, { "epoch": 1.94, "grad_norm": 20.09517665339893, "learning_rate": 1.472003325086993e-07, "logps/chosen": -56.434627532958984, "logps/rejected": -82.21961975097656, "loss": 0.2325, "losses/dpo": 0.3245788514614105, "losses/sft": 2.2555348873138428, "losses/total": 0.3245788514614105, "ref_logps/chosen": -35.54400634765625, "ref_logps/rejected": -40.344825744628906, "rewards/accuracies": 0.9375, "rewards/chosen": -2.089062213897705, "rewards/margins": 2.0984175205230713, "rewards/rejected": -4.1874799728393555, "step": 2054 }, { "epoch": 1.94, "grad_norm": 11.890770156614353, "learning_rate": 1.4696824374702705e-07, "logps/chosen": -63.59498596191406, "logps/rejected": -90.65484619140625, "loss": 0.1284, "losses/dpo": 0.41523340344429016, "losses/sft": 1.0143014192581177, "losses/total": 0.41523340344429016, "ref_logps/chosen": -42.14055252075195, "ref_logps/rejected": -43.60236358642578, "rewards/accuracies": 1.0, "rewards/chosen": -2.1454436779022217, "rewards/margins": 2.5598044395446777, "rewards/rejected": -4.70524787902832, "step": 2055 }, { "epoch": 1.94, "grad_norm": 14.605620556594532, "learning_rate": 1.4673626190119958e-07, "logps/chosen": -40.36414337158203, "logps/rejected": -64.96682739257812, "loss": 0.2279, "losses/dpo": 0.004747980739921331, "losses/sft": 1.0159443616867065, "losses/total": 0.004747980739921331, "ref_logps/chosen": -24.852802276611328, "ref_logps/rejected": -25.856977462768555, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5511342287063599, "rewards/margins": 2.359851360321045, "rewards/rejected": -3.9109854698181152, "step": 2056 }, { "epoch": 1.94, "grad_norm": 17.838014581136363, "learning_rate": 1.4650438721194398e-07, "logps/chosen": -42.50884246826172, "logps/rejected": -72.43061828613281, "loss": 0.2224, "losses/dpo": 0.23907868564128876, "losses/sft": 1.0704524517059326, "losses/total": 0.23907868564128876, "ref_logps/chosen": -24.847564697265625, "ref_logps/rejected": -34.3045539855957, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7661278247833252, "rewards/margins": 2.0464789867401123, "rewards/rejected": -3.8126068115234375, "step": 2057 }, { "epoch": 1.94, "grad_norm": 19.530120324014835, "learning_rate": 1.462726199198761e-07, "logps/chosen": -38.900596618652344, "logps/rejected": -65.41603088378906, "loss": 0.2556, "losses/dpo": 0.47113335132598877, "losses/sft": 1.2608217000961304, "losses/total": 0.47113335132598877, "ref_logps/chosen": -23.352920532226562, "ref_logps/rejected": -30.26290512084961, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5547676086425781, "rewards/margins": 1.960545539855957, "rewards/rejected": -3.515313148498535, "step": 2058 }, { "epoch": 1.94, "grad_norm": 14.017390382182269, "learning_rate": 1.4604096026550048e-07, "logps/chosen": -39.27085876464844, "logps/rejected": -81.84918212890625, "loss": 0.2086, "losses/dpo": 0.18727992475032806, "losses/sft": 0.3423667848110199, "losses/total": 0.18727992475032806, "ref_logps/chosen": -22.364612579345703, "ref_logps/rejected": -37.295021057128906, "rewards/accuracies": 0.875, "rewards/chosen": -1.6906245946884155, "rewards/margins": 2.76479172706604, "rewards/rejected": -4.455416679382324, "step": 2059 }, { "epoch": 1.94, "grad_norm": 13.74691224848712, "learning_rate": 1.4580940848920984e-07, "logps/chosen": -45.08488464355469, "logps/rejected": -75.09447479248047, "loss": 0.1623, "losses/dpo": 0.03652571141719818, "losses/sft": 1.4756524562835693, "losses/total": 0.03652571141719818, "ref_logps/chosen": -31.48784065246582, "ref_logps/rejected": -38.59649658203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3597042560577393, "rewards/margins": 2.290093421936035, "rewards/rejected": -3.6497979164123535, "step": 2060 }, { "epoch": 1.94, "grad_norm": 21.792805651248642, "learning_rate": 1.4557796483128503e-07, "logps/chosen": -38.02655029296875, "logps/rejected": -64.46863555908203, "loss": 0.3631, "losses/dpo": 0.06576859205961227, "losses/sft": 1.5559818744659424, "losses/total": 0.06576859205961227, "ref_logps/chosen": -22.155353546142578, "ref_logps/rejected": -32.51167297363281, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5871195793151855, "rewards/margins": 1.608577013015747, "rewards/rejected": -3.1956963539123535, "step": 2061 }, { "epoch": 1.95, "grad_norm": 15.5365109305878, "learning_rate": 1.4534662953189454e-07, "logps/chosen": -35.8204345703125, "logps/rejected": -73.85391235351562, "loss": 0.2066, "losses/dpo": 1.0993826389312744, "losses/sft": 2.0130460262298584, "losses/total": 1.0993826389312744, "ref_logps/chosen": -17.737289428710938, "ref_logps/rejected": -32.39873504638672, "rewards/accuracies": 0.875, "rewards/chosen": -1.808314323425293, "rewards/margins": 2.337202787399292, "rewards/rejected": -4.145517349243164, "step": 2062 }, { "epoch": 1.95, "grad_norm": 27.039608457975437, "learning_rate": 1.4511540283109458e-07, "logps/chosen": -63.32147979736328, "logps/rejected": -79.21160888671875, "loss": 0.3256, "losses/dpo": 0.061368927359580994, "losses/sft": 3.464548110961914, "losses/total": 0.061368927359580994, "ref_logps/chosen": -39.70154571533203, "ref_logps/rejected": -39.756011962890625, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3619935512542725, "rewards/margins": 1.583566665649414, "rewards/rejected": -3.9455597400665283, "step": 2063 }, { "epoch": 1.95, "grad_norm": 13.604078742329687, "learning_rate": 1.448842849688288e-07, "logps/chosen": -41.66691970825195, "logps/rejected": -84.77061462402344, "loss": 0.1438, "losses/dpo": 0.19281254708766937, "losses/sft": 2.3698973655700684, "losses/total": 0.19281254708766937, "ref_logps/chosen": -25.96609115600586, "ref_logps/rejected": -40.15013885498047, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5700827836990356, "rewards/margins": 2.891964912414551, "rewards/rejected": -4.462047576904297, "step": 2064 }, { "epoch": 1.95, "grad_norm": 34.05941242374739, "learning_rate": 1.446532761849275e-07, "logps/chosen": -56.802337646484375, "logps/rejected": -57.639442443847656, "loss": 0.6738, "losses/dpo": 0.1840570867061615, "losses/sft": 0.5586607456207275, "losses/total": 0.1840570867061615, "ref_logps/chosen": -31.92601203918457, "ref_logps/rejected": -24.784238815307617, "rewards/accuracies": 0.625, "rewards/chosen": -2.487633228302002, "rewards/margins": 0.7978873252868652, "rewards/rejected": -3.285520553588867, "step": 2065 }, { "epoch": 1.95, "grad_norm": 19.90436621792604, "learning_rate": 1.4442237671910827e-07, "logps/chosen": -42.93688201904297, "logps/rejected": -78.72466278076172, "loss": 0.2326, "losses/dpo": 0.7859821915626526, "losses/sft": 0.8373316526412964, "losses/total": 0.7859821915626526, "ref_logps/chosen": -25.14740753173828, "ref_logps/rejected": -36.20122146606445, "rewards/accuracies": 0.8125, "rewards/chosen": -1.778947114944458, "rewards/margins": 2.4733967781066895, "rewards/rejected": -4.252344131469727, "step": 2066 }, { "epoch": 1.95, "grad_norm": 23.281348331989687, "learning_rate": 1.4419158681097473e-07, "logps/chosen": -43.4530029296875, "logps/rejected": -75.31340026855469, "loss": 0.307, "losses/dpo": 0.031022630631923676, "losses/sft": 1.6277930736541748, "losses/total": 0.031022630631923676, "ref_logps/chosen": -22.787757873535156, "ref_logps/rejected": -31.54121208190918, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0665242671966553, "rewards/margins": 2.310694932937622, "rewards/rejected": -4.377219200134277, "step": 2067 }, { "epoch": 1.95, "grad_norm": 17.787159310495564, "learning_rate": 1.4396090670001736e-07, "logps/chosen": -33.79359436035156, "logps/rejected": -63.50110626220703, "loss": 0.2917, "losses/dpo": 0.017286768183112144, "losses/sft": 0.5724033117294312, "losses/total": 0.017286768183112144, "ref_logps/chosen": -19.819263458251953, "ref_logps/rejected": -29.9102840423584, "rewards/accuracies": 0.875, "rewards/chosen": -1.3974335193634033, "rewards/margins": 1.961648941040039, "rewards/rejected": -3.3590822219848633, "step": 2068 }, { "epoch": 1.95, "grad_norm": 15.62274993234051, "learning_rate": 1.437303366256123e-07, "logps/chosen": -37.35810852050781, "logps/rejected": -73.43032836914062, "loss": 0.2188, "losses/dpo": 0.15781453251838684, "losses/sft": 0.4676647186279297, "losses/total": 0.15781453251838684, "ref_logps/chosen": -22.674959182739258, "ref_logps/rejected": -36.45170211791992, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4683150053024292, "rewards/margins": 2.2295479774475098, "rewards/rejected": -3.6978628635406494, "step": 2069 }, { "epoch": 1.95, "grad_norm": 23.272905057177255, "learning_rate": 1.4349987682702172e-07, "logps/chosen": -44.83708190917969, "logps/rejected": -75.69230651855469, "loss": 0.3059, "losses/dpo": 0.0763520821928978, "losses/sft": 1.0000711679458618, "losses/total": 0.0763520821928978, "ref_logps/chosen": -25.96942138671875, "ref_logps/rejected": -38.4272575378418, "rewards/accuracies": 0.875, "rewards/chosen": -1.8867661952972412, "rewards/margins": 1.839739203453064, "rewards/rejected": -3.7265052795410156, "step": 2070 }, { "epoch": 1.95, "grad_norm": 28.534422481701196, "learning_rate": 1.4326952754339317e-07, "logps/chosen": -41.72052764892578, "logps/rejected": -66.62228393554688, "loss": 0.5271, "losses/dpo": 1.9424571990966797, "losses/sft": 2.188002109527588, "losses/total": 1.9424571990966797, "ref_logps/chosen": -25.44626808166504, "ref_logps/rejected": -34.02560806274414, "rewards/accuracies": 0.625, "rewards/chosen": -1.6274261474609375, "rewards/margins": 1.6322417259216309, "rewards/rejected": -3.2596678733825684, "step": 2071 }, { "epoch": 1.95, "grad_norm": 28.5279189242723, "learning_rate": 1.430392890137597e-07, "logps/chosen": -49.94081115722656, "logps/rejected": -76.9002456665039, "loss": 0.3467, "losses/dpo": 0.008533785119652748, "losses/sft": 0.40789350867271423, "losses/total": 0.008533785119652748, "ref_logps/chosen": -31.175033569335938, "ref_logps/rejected": -32.39687728881836, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8765778541564941, "rewards/margins": 2.573758602142334, "rewards/rejected": -4.450336456298828, "step": 2072 }, { "epoch": 1.96, "grad_norm": 22.679905108338545, "learning_rate": 1.4280916147703942e-07, "logps/chosen": -50.63893127441406, "logps/rejected": -66.46955871582031, "loss": 0.344, "losses/dpo": 0.05745638161897659, "losses/sft": 1.8513681888580322, "losses/total": 0.05745638161897659, "ref_logps/chosen": -28.62012481689453, "ref_logps/rejected": -29.766925811767578, "rewards/accuracies": 0.875, "rewards/chosen": -2.201880931854248, "rewards/margins": 1.4683820009231567, "rewards/rejected": -3.6702628135681152, "step": 2073 }, { "epoch": 1.96, "grad_norm": 11.926866585688572, "learning_rate": 1.4257914517203527e-07, "logps/chosen": -38.97184371948242, "logps/rejected": -72.68143463134766, "loss": 0.1403, "losses/dpo": 0.10723249614238739, "losses/sft": 2.231908082962036, "losses/total": 0.10723249614238739, "ref_logps/chosen": -24.914852142333984, "ref_logps/rejected": -32.398075103759766, "rewards/accuracies": 1.0, "rewards/chosen": -1.4056992530822754, "rewards/margins": 2.6226367950439453, "rewards/rejected": -4.0283355712890625, "step": 2074 }, { "epoch": 1.96, "grad_norm": 23.28753470109213, "learning_rate": 1.4234924033743447e-07, "logps/chosen": -44.368507385253906, "logps/rejected": -82.99105834960938, "loss": 0.4026, "losses/dpo": 0.037725772708654404, "losses/sft": 0.8486329317092896, "losses/total": 0.037725772708654404, "ref_logps/chosen": -23.43882942199707, "ref_logps/rejected": -41.46912384033203, "rewards/accuracies": 0.6875, "rewards/chosen": -2.092967987060547, "rewards/margins": 2.05922532081604, "rewards/rejected": -4.152193546295166, "step": 2075 }, { "epoch": 1.96, "grad_norm": 22.71329825880985, "learning_rate": 1.4211944721180896e-07, "logps/chosen": -46.08236312866211, "logps/rejected": -76.44186401367188, "loss": 0.4338, "losses/dpo": 0.32725802063941956, "losses/sft": 1.0728839635849, "losses/total": 0.32725802063941956, "ref_logps/chosen": -27.245004653930664, "ref_logps/rejected": -38.80976867675781, "rewards/accuracies": 0.875, "rewards/chosen": -1.8837358951568604, "rewards/margins": 1.8794735670089722, "rewards/rejected": -3.763209342956543, "step": 2076 }, { "epoch": 1.96, "grad_norm": 31.38783190522192, "learning_rate": 1.418897660336147e-07, "logps/chosen": -51.611724853515625, "logps/rejected": -80.55128479003906, "loss": 0.377, "losses/dpo": 0.009660054929554462, "losses/sft": 1.0221965312957764, "losses/total": 0.009660054929554462, "ref_logps/chosen": -33.250221252441406, "ref_logps/rejected": -39.242958068847656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8361504077911377, "rewards/margins": 2.2946817874908447, "rewards/rejected": -4.130832195281982, "step": 2077 }, { "epoch": 1.96, "grad_norm": 20.741886557533782, "learning_rate": 1.4166019704119115e-07, "logps/chosen": -49.964630126953125, "logps/rejected": -73.18247985839844, "loss": 0.2367, "losses/dpo": 0.42302557826042175, "losses/sft": 1.6337039470672607, "losses/total": 0.42302557826042175, "ref_logps/chosen": -28.298675537109375, "ref_logps/rejected": -31.182912826538086, "rewards/accuracies": 1.0, "rewards/chosen": -2.166595458984375, "rewards/margins": 2.0333614349365234, "rewards/rejected": -4.199956893920898, "step": 2078 }, { "epoch": 1.96, "grad_norm": 16.110003523456697, "learning_rate": 1.4143074047276178e-07, "logps/chosen": -51.44784927368164, "logps/rejected": -73.37322998046875, "loss": 0.3266, "losses/dpo": 0.38567274808883667, "losses/sft": 2.0130999088287354, "losses/total": 0.38567274808883667, "ref_logps/chosen": -31.56816864013672, "ref_logps/rejected": -29.985933303833008, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9879683256149292, "rewards/margins": 2.3507611751556396, "rewards/rejected": -4.338729381561279, "step": 2079 }, { "epoch": 1.96, "grad_norm": 27.715206118137502, "learning_rate": 1.41201396566433e-07, "logps/chosen": -49.283931732177734, "logps/rejected": -74.40038299560547, "loss": 0.2936, "losses/dpo": 0.06153390184044838, "losses/sft": 1.5688536167144775, "losses/total": 0.06153390184044838, "ref_logps/chosen": -30.95537567138672, "ref_logps/rejected": -32.24375915527344, "rewards/accuracies": 0.875, "rewards/chosen": -1.8328555822372437, "rewards/margins": 2.3828070163726807, "rewards/rejected": -4.215662479400635, "step": 2080 }, { "epoch": 1.96, "grad_norm": 15.768382429027351, "learning_rate": 1.409721655601948e-07, "logps/chosen": -35.910255432128906, "logps/rejected": -80.91766357421875, "loss": 0.1796, "losses/dpo": 0.189287930727005, "losses/sft": 1.1857213973999023, "losses/total": 0.189287930727005, "ref_logps/chosen": -20.789939880371094, "ref_logps/rejected": -41.43907165527344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5120313167572021, "rewards/margins": 2.4358270168304443, "rewards/rejected": -3.9478583335876465, "step": 2081 }, { "epoch": 1.96, "grad_norm": 18.326519049681924, "learning_rate": 1.4074304769191945e-07, "logps/chosen": -41.72169876098633, "logps/rejected": -67.41975402832031, "loss": 0.3174, "losses/dpo": 0.11483719944953918, "losses/sft": 1.3494800329208374, "losses/total": 0.11483719944953918, "ref_logps/chosen": -25.74144744873047, "ref_logps/rejected": -31.47875213623047, "rewards/accuracies": 0.875, "rewards/chosen": -1.5980252027511597, "rewards/margins": 1.99607515335083, "rewards/rejected": -3.5941002368927, "step": 2082 }, { "epoch": 1.97, "grad_norm": 13.86947171668845, "learning_rate": 1.405140431993622e-07, "logps/chosen": -46.606658935546875, "logps/rejected": -70.59525299072266, "loss": 0.1874, "losses/dpo": 0.6137624979019165, "losses/sft": 1.1449310779571533, "losses/total": 0.6137624979019165, "ref_logps/chosen": -33.345558166503906, "ref_logps/rejected": -33.84633255004883, "rewards/accuracies": 1.0, "rewards/chosen": -1.3261098861694336, "rewards/margins": 2.348782539367676, "rewards/rejected": -3.674891948699951, "step": 2083 }, { "epoch": 1.97, "grad_norm": 13.144280085405045, "learning_rate": 1.4028515232016073e-07, "logps/chosen": -61.721923828125, "logps/rejected": -78.83810424804688, "loss": 0.1652, "losses/dpo": 0.44759243726730347, "losses/sft": 2.15564227104187, "losses/total": 0.44759243726730347, "ref_logps/chosen": -38.945953369140625, "ref_logps/rejected": -31.539447784423828, "rewards/accuracies": 1.0, "rewards/chosen": -2.277596950531006, "rewards/margins": 2.452268600463867, "rewards/rejected": -4.729865550994873, "step": 2084 }, { "epoch": 1.97, "grad_norm": 17.307270915815906, "learning_rate": 1.4005637529183433e-07, "logps/chosen": -51.48005676269531, "logps/rejected": -77.33576965332031, "loss": 0.2481, "losses/dpo": 0.033066172152757645, "losses/sft": 2.0377891063690186, "losses/total": 0.033066172152757645, "ref_logps/chosen": -30.93832015991211, "ref_logps/rejected": -35.342899322509766, "rewards/accuracies": 0.875, "rewards/chosen": -2.054173469543457, "rewards/margins": 2.145113229751587, "rewards/rejected": -4.199286460876465, "step": 2085 }, { "epoch": 1.97, "grad_norm": 26.50029123240017, "learning_rate": 1.3982771235178476e-07, "logps/chosen": -54.69218063354492, "logps/rejected": -66.00839233398438, "loss": 0.4089, "losses/dpo": 0.1994527280330658, "losses/sft": 0.2975938618183136, "losses/total": 0.1994527280330658, "ref_logps/chosen": -37.08557891845703, "ref_logps/rejected": -31.055818557739258, "rewards/accuracies": 0.75, "rewards/chosen": -1.7606600522994995, "rewards/margins": 1.7345972061157227, "rewards/rejected": -3.4952573776245117, "step": 2086 }, { "epoch": 1.97, "grad_norm": 8.72541748519367, "learning_rate": 1.3959916373729481e-07, "logps/chosen": -48.113399505615234, "logps/rejected": -103.52149963378906, "loss": 0.0837, "losses/dpo": 0.0728427916765213, "losses/sft": 1.0893166065216064, "losses/total": 0.0728427916765213, "ref_logps/chosen": -30.267139434814453, "ref_logps/rejected": -49.7093505859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7846261262893677, "rewards/margins": 3.596588611602783, "rewards/rejected": -5.381214618682861, "step": 2087 }, { "epoch": 1.97, "grad_norm": 12.516058817830338, "learning_rate": 1.3937072968552913e-07, "logps/chosen": -49.004554748535156, "logps/rejected": -88.57037353515625, "loss": 0.1795, "losses/dpo": 0.19972194731235504, "losses/sft": 0.9268177151679993, "losses/total": 0.19972194731235504, "ref_logps/chosen": -26.527889251708984, "ref_logps/rejected": -41.256187438964844, "rewards/accuracies": 0.9375, "rewards/chosen": -2.247666597366333, "rewards/margins": 2.4837517738342285, "rewards/rejected": -4.731418609619141, "step": 2088 }, { "epoch": 1.97, "grad_norm": 19.241754269703133, "learning_rate": 1.3914241043353308e-07, "logps/chosen": -42.48604965209961, "logps/rejected": -65.3994140625, "loss": 0.2822, "losses/dpo": 0.16892078518867493, "losses/sft": 0.8218826651573181, "losses/total": 0.16892078518867493, "ref_logps/chosen": -24.966394424438477, "ref_logps/rejected": -28.835636138916016, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7519655227661133, "rewards/margins": 1.9044125080108643, "rewards/rejected": -3.6563780307769775, "step": 2089 }, { "epoch": 1.97, "grad_norm": 15.075198230309441, "learning_rate": 1.3891420621823333e-07, "logps/chosen": -40.084102630615234, "logps/rejected": -66.86100769042969, "loss": 0.2753, "losses/dpo": 0.07792926579713821, "losses/sft": 1.1507415771484375, "losses/total": 0.07792926579713821, "ref_logps/chosen": -26.032644271850586, "ref_logps/rejected": -34.39381408691406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4051456451416016, "rewards/margins": 1.8415741920471191, "rewards/rejected": -3.2467198371887207, "step": 2090 }, { "epoch": 1.97, "grad_norm": 22.882115194651245, "learning_rate": 1.386861172764367e-07, "logps/chosen": -54.82339859008789, "logps/rejected": -89.15133666992188, "loss": 0.4044, "losses/dpo": 3.4328174591064453, "losses/sft": 3.0974385738372803, "losses/total": 3.4328174591064453, "ref_logps/chosen": -31.616640090942383, "ref_logps/rejected": -44.29481506347656, "rewards/accuracies": 0.875, "rewards/chosen": -2.320675849914551, "rewards/margins": 2.1649763584136963, "rewards/rejected": -4.485652446746826, "step": 2091 }, { "epoch": 1.97, "grad_norm": 25.33894004748634, "learning_rate": 1.3845814384483069e-07, "logps/chosen": -49.2068977355957, "logps/rejected": -66.63981628417969, "loss": 0.4295, "losses/dpo": 0.050288084894418716, "losses/sft": 0.7470751404762268, "losses/total": 0.050288084894418716, "ref_logps/chosen": -29.02608299255371, "ref_logps/rejected": -30.33466911315918, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0180814266204834, "rewards/margins": 1.612433671951294, "rewards/rejected": -3.6305150985717773, "step": 2092 }, { "epoch": 1.97, "grad_norm": 19.435986677982463, "learning_rate": 1.38230286159983e-07, "logps/chosen": -38.45223617553711, "logps/rejected": -80.68557739257812, "loss": 0.2157, "losses/dpo": 0.17711137235164642, "losses/sft": 0.7229025959968567, "losses/total": 0.17711137235164642, "ref_logps/chosen": -22.623836517333984, "ref_logps/rejected": -33.44220733642578, "rewards/accuracies": 0.875, "rewards/chosen": -1.582840085029602, "rewards/margins": 3.141496419906616, "rewards/rejected": -4.72433614730835, "step": 2093 }, { "epoch": 1.98, "grad_norm": 16.87460553168807, "learning_rate": 1.380025444583409e-07, "logps/chosen": -54.44811248779297, "logps/rejected": -83.67984008789062, "loss": 0.219, "losses/dpo": 0.6931470632553101, "losses/sft": 0.9179199934005737, "losses/total": 0.6931470632553101, "ref_logps/chosen": -34.133514404296875, "ref_logps/rejected": -38.731502532958984, "rewards/accuracies": 0.875, "rewards/chosen": -2.0314598083496094, "rewards/margins": 2.4633736610412598, "rewards/rejected": -4.494833946228027, "step": 2094 }, { "epoch": 1.98, "grad_norm": 18.420669192252163, "learning_rate": 1.3777491897623167e-07, "logps/chosen": -46.371238708496094, "logps/rejected": -73.95362091064453, "loss": 0.235, "losses/dpo": 0.049831852316856384, "losses/sft": 2.006356716156006, "losses/total": 0.049831852316856384, "ref_logps/chosen": -28.736328125, "ref_logps/rejected": -33.04616928100586, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7634905576705933, "rewards/margins": 2.327254295349121, "rewards/rejected": -4.090744972229004, "step": 2095 }, { "epoch": 1.98, "grad_norm": 18.59621201333887, "learning_rate": 1.3754740994986162e-07, "logps/chosen": -55.105003356933594, "logps/rejected": -83.36325073242188, "loss": 0.2454, "losses/dpo": 0.021895868703722954, "losses/sft": 1.17546546459198, "losses/total": 0.021895868703722954, "ref_logps/chosen": -32.25080871582031, "ref_logps/rejected": -39.5238037109375, "rewards/accuracies": 0.875, "rewards/chosen": -2.2854199409484863, "rewards/margins": 2.098525047302246, "rewards/rejected": -4.383944988250732, "step": 2096 }, { "epoch": 1.98, "grad_norm": 18.6488535900852, "learning_rate": 1.3732001761531657e-07, "logps/chosen": -51.10932922363281, "logps/rejected": -91.54820251464844, "loss": 0.1581, "losses/dpo": 0.3782578408718109, "losses/sft": 2.22623610496521, "losses/total": 0.3782578408718109, "ref_logps/chosen": -31.69363021850586, "ref_logps/rejected": -42.766357421875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9415699243545532, "rewards/margins": 2.9366140365600586, "rewards/rejected": -4.8781843185424805, "step": 2097 }, { "epoch": 1.98, "grad_norm": 27.399549759391395, "learning_rate": 1.37092742208561e-07, "logps/chosen": -48.415870666503906, "logps/rejected": -59.8328971862793, "loss": 0.4877, "losses/dpo": 0.5442587733268738, "losses/sft": 0.34380820393562317, "losses/total": 0.5442587733268738, "ref_logps/chosen": -26.49850845336914, "ref_logps/rejected": -28.540159225463867, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1917362213134766, "rewards/margins": 0.9375378489494324, "rewards/rejected": -3.1292738914489746, "step": 2098 }, { "epoch": 1.98, "grad_norm": 19.545813901027582, "learning_rate": 1.3686558396543817e-07, "logps/chosen": -54.617366790771484, "logps/rejected": -74.04928588867188, "loss": 0.2405, "losses/dpo": 0.29790714383125305, "losses/sft": 2.107790946960449, "losses/total": 0.29790714383125305, "ref_logps/chosen": -40.36018371582031, "ref_logps/rejected": -37.88555145263672, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4257185459136963, "rewards/margins": 2.190654754638672, "rewards/rejected": -3.616373300552368, "step": 2099 }, { "epoch": 1.98, "grad_norm": 28.3185672933688, "learning_rate": 1.3663854312166966e-07, "logps/chosen": -51.74425506591797, "logps/rejected": -70.1558837890625, "loss": 0.4322, "losses/dpo": 0.3409332036972046, "losses/sft": 2.1342580318450928, "losses/total": 0.3409332036972046, "ref_logps/chosen": -34.79258728027344, "ref_logps/rejected": -32.761932373046875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6951665878295898, "rewards/margins": 2.0442287921905518, "rewards/rejected": -3.7393953800201416, "step": 2100 }, { "epoch": 1.98, "grad_norm": 19.650965166164575, "learning_rate": 1.3641161991285539e-07, "logps/chosen": -41.552330017089844, "logps/rejected": -67.34017181396484, "loss": 0.3042, "losses/dpo": 0.22190740704536438, "losses/sft": 1.1864089965820312, "losses/total": 0.22190740704536438, "ref_logps/chosen": -22.398117065429688, "ref_logps/rejected": -32.36592483520508, "rewards/accuracies": 0.875, "rewards/chosen": -1.915421485900879, "rewards/margins": 1.5820033550262451, "rewards/rejected": -3.497424602508545, "step": 2101 }, { "epoch": 1.98, "grad_norm": 15.199379831006867, "learning_rate": 1.361848145744731e-07, "logps/chosen": -34.93848419189453, "logps/rejected": -67.14397430419922, "loss": 0.2311, "losses/dpo": 0.026863375678658485, "losses/sft": 1.2742863893508911, "losses/total": 0.026863375678658485, "ref_logps/chosen": -23.660940170288086, "ref_logps/rejected": -30.90576171875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1277542114257812, "rewards/margins": 2.4960672855377197, "rewards/rejected": -3.623821258544922, "step": 2102 }, { "epoch": 1.98, "grad_norm": 26.586361889070105, "learning_rate": 1.3595812734187816e-07, "logps/chosen": -60.00035095214844, "logps/rejected": -85.48727416992188, "loss": 0.2634, "losses/dpo": 0.010729814879596233, "losses/sft": 1.0016249418258667, "losses/total": 0.010729814879596233, "ref_logps/chosen": -39.089698791503906, "ref_logps/rejected": -41.796085357666016, "rewards/accuracies": 0.875, "rewards/chosen": -2.0910654067993164, "rewards/margins": 2.2780535221099854, "rewards/rejected": -4.369118690490723, "step": 2103 }, { "epoch": 1.98, "grad_norm": 15.820913061888453, "learning_rate": 1.357315584503036e-07, "logps/chosen": -42.76533508300781, "logps/rejected": -70.954833984375, "loss": 0.2348, "losses/dpo": 0.17147104442119598, "losses/sft": 0.3480341136455536, "losses/total": 0.17147104442119598, "ref_logps/chosen": -28.01975440979004, "ref_logps/rejected": -33.47462463378906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.474557876586914, "rewards/margins": 2.2734639644622803, "rewards/rejected": -3.7480216026306152, "step": 2104 }, { "epoch": 1.99, "grad_norm": 21.89539416427333, "learning_rate": 1.3550510813485928e-07, "logps/chosen": -37.739341735839844, "logps/rejected": -69.69932556152344, "loss": 0.3015, "losses/dpo": 0.16504158079624176, "losses/sft": 0.460650771856308, "losses/total": 0.16504158079624176, "ref_logps/chosen": -22.30837059020996, "ref_logps/rejected": -34.78415298461914, "rewards/accuracies": 0.875, "rewards/chosen": -1.5430967807769775, "rewards/margins": 1.9484210014343262, "rewards/rejected": -3.4915177822113037, "step": 2105 }, { "epoch": 1.99, "grad_norm": 22.97939703863197, "learning_rate": 1.3527877663053245e-07, "logps/chosen": -45.350624084472656, "logps/rejected": -67.34498596191406, "loss": 0.3591, "losses/dpo": 0.21671488881111145, "losses/sft": 0.8348963260650635, "losses/total": 0.21671488881111145, "ref_logps/chosen": -26.358949661254883, "ref_logps/rejected": -24.378761291503906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8991674184799194, "rewards/margins": 2.3974552154541016, "rewards/rejected": -4.2966227531433105, "step": 2106 }, { "epoch": 1.99, "grad_norm": 37.601323576728966, "learning_rate": 1.3505256417218662e-07, "logps/chosen": -50.387428283691406, "logps/rejected": -73.51361083984375, "loss": 0.4746, "losses/dpo": 2.7550806999206543, "losses/sft": 2.171276569366455, "losses/total": 2.7550806999206543, "ref_logps/chosen": -26.95089340209961, "ref_logps/rejected": -33.884281158447266, "rewards/accuracies": 0.875, "rewards/chosen": -2.3436532020568848, "rewards/margins": 1.619280219078064, "rewards/rejected": -3.9629335403442383, "step": 2107 }, { "epoch": 1.99, "grad_norm": 20.16217091536419, "learning_rate": 1.348264709945623e-07, "logps/chosen": -36.86800765991211, "logps/rejected": -56.44780731201172, "loss": 0.3046, "losses/dpo": 0.15089738368988037, "losses/sft": 1.1212087869644165, "losses/total": 0.15089738368988037, "ref_logps/chosen": -20.716209411621094, "ref_logps/rejected": -26.158891677856445, "rewards/accuracies": 1.0, "rewards/chosen": -1.6151800155639648, "rewards/margins": 1.413711428642273, "rewards/rejected": -3.0288915634155273, "step": 2108 }, { "epoch": 1.99, "grad_norm": 16.357925014247186, "learning_rate": 1.3460049733227562e-07, "logps/chosen": -40.586219787597656, "logps/rejected": -73.857666015625, "loss": 0.2954, "losses/dpo": 0.0035554394125938416, "losses/sft": 1.209360122680664, "losses/total": 0.0035554394125938416, "ref_logps/chosen": -23.059900283813477, "ref_logps/rejected": -34.329925537109375, "rewards/accuracies": 0.75, "rewards/chosen": -1.7526321411132812, "rewards/margins": 2.200141429901123, "rewards/rejected": -3.9527735710144043, "step": 2109 }, { "epoch": 1.99, "grad_norm": 8.599000008954933, "learning_rate": 1.3437464341981913e-07, "logps/chosen": -37.458534240722656, "logps/rejected": -91.8648910522461, "loss": 0.0943, "losses/dpo": 0.16793079674243927, "losses/sft": 1.1745234727859497, "losses/total": 0.16793079674243927, "ref_logps/chosen": -20.38945198059082, "ref_logps/rejected": -43.91806411743164, "rewards/accuracies": 1.0, "rewards/chosen": -1.7069087028503418, "rewards/margins": 3.0877740383148193, "rewards/rejected": -4.794682502746582, "step": 2110 }, { "epoch": 1.99, "grad_norm": 17.887484604696887, "learning_rate": 1.341489094915611e-07, "logps/chosen": -42.297393798828125, "logps/rejected": -67.48494720458984, "loss": 0.259, "losses/dpo": 0.2110844850540161, "losses/sft": 1.9125442504882812, "losses/total": 0.2110844850540161, "ref_logps/chosen": -20.693449020385742, "ref_logps/rejected": -29.565340042114258, "rewards/accuracies": 0.875, "rewards/chosen": -2.1603941917419434, "rewards/margins": 1.6315662860870361, "rewards/rejected": -3.7919607162475586, "step": 2111 }, { "epoch": 1.99, "grad_norm": 26.277903048900637, "learning_rate": 1.3392329578174493e-07, "logps/chosen": -47.366458892822266, "logps/rejected": -66.5057601928711, "loss": 0.3109, "losses/dpo": 0.3612852990627289, "losses/sft": 1.1686618328094482, "losses/total": 0.3612852990627289, "ref_logps/chosen": -28.138498306274414, "ref_logps/rejected": -30.862468719482422, "rewards/accuracies": 0.875, "rewards/chosen": -1.9227960109710693, "rewards/margins": 1.6415331363677979, "rewards/rejected": -3.564329147338867, "step": 2112 }, { "epoch": 1.99, "grad_norm": 23.01185979186497, "learning_rate": 1.3369780252448972e-07, "logps/chosen": -44.696739196777344, "logps/rejected": -74.43563079833984, "loss": 0.3448, "losses/dpo": 0.3436526656150818, "losses/sft": 0.4120538532733917, "losses/total": 0.3436526656150818, "ref_logps/chosen": -26.514984130859375, "ref_logps/rejected": -35.635841369628906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8181755542755127, "rewards/margins": 2.0618033409118652, "rewards/rejected": -3.879978656768799, "step": 2113 }, { "epoch": 1.99, "grad_norm": 14.111048160506334, "learning_rate": 1.334724299537892e-07, "logps/chosen": -49.090240478515625, "logps/rejected": -96.52851104736328, "loss": 0.1875, "losses/dpo": 0.08779671788215637, "losses/sft": 2.266425371170044, "losses/total": 0.08779671788215637, "ref_logps/chosen": -28.04945945739746, "ref_logps/rejected": -47.4076042175293, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1040780544281006, "rewards/margins": 2.808013439178467, "rewards/rejected": -4.912091255187988, "step": 2114 }, { "epoch": 2.0, "grad_norm": 27.72900531283853, "learning_rate": 1.3324717830351217e-07, "logps/chosen": -48.51166915893555, "logps/rejected": -67.57904052734375, "loss": 0.4724, "losses/dpo": 0.03825949504971504, "losses/sft": 1.5020045042037964, "losses/total": 0.03825949504971504, "ref_logps/chosen": -29.44674301147461, "ref_logps/rejected": -33.59752655029297, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9064927101135254, "rewards/margins": 1.4916585683822632, "rewards/rejected": -3.398151397705078, "step": 2115 }, { "epoch": 2.0, "grad_norm": 14.099658183850636, "learning_rate": 1.3302204780740168e-07, "logps/chosen": -47.85990905761719, "logps/rejected": -68.15545654296875, "loss": 0.2052, "losses/dpo": 0.012769022025167942, "losses/sft": 2.074185848236084, "losses/total": 0.012769022025167942, "ref_logps/chosen": -31.988025665283203, "ref_logps/rejected": -30.781604766845703, "rewards/accuracies": 1.0, "rewards/chosen": -1.5871878862380981, "rewards/margins": 2.1501975059509277, "rewards/rejected": -3.7373852729797363, "step": 2116 }, { "epoch": 2.0, "grad_norm": 23.186990603181847, "learning_rate": 1.3279703869907522e-07, "logps/chosen": -52.18174743652344, "logps/rejected": -98.45364379882812, "loss": 0.1679, "losses/dpo": 0.08959019184112549, "losses/sft": 0.6208884716033936, "losses/total": 0.08959019184112549, "ref_logps/chosen": -33.3410758972168, "ref_logps/rejected": -47.74286651611328, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8840670585632324, "rewards/margins": 3.1870107650756836, "rewards/rejected": -5.071077346801758, "step": 2117 }, { "epoch": 2.0, "grad_norm": 16.405981548894527, "learning_rate": 1.3257215121202444e-07, "logps/chosen": -45.207427978515625, "logps/rejected": -81.97007751464844, "loss": 0.1818, "losses/dpo": 0.0510280504822731, "losses/sft": 1.9501192569732666, "losses/total": 0.0510280504822731, "ref_logps/chosen": -29.54006576538086, "ref_logps/rejected": -42.06023025512695, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5667362213134766, "rewards/margins": 2.424247980117798, "rewards/rejected": -3.9909842014312744, "step": 2118 }, { "epoch": 2.0, "grad_norm": 20.85175345734213, "learning_rate": 1.323473855796144e-07, "logps/chosen": -41.39508056640625, "logps/rejected": -67.12225341796875, "loss": 0.29, "losses/dpo": 0.4411839544773102, "losses/sft": 0.5688151121139526, "losses/total": 0.4411839544773102, "ref_logps/chosen": -21.61275863647461, "ref_logps/rejected": -30.261180877685547, "rewards/accuracies": 0.875, "rewards/chosen": -1.9782326221466064, "rewards/margins": 1.7078741788864136, "rewards/rejected": -3.6861066818237305, "step": 2119 }, { "epoch": 2.0, "grad_norm": 19.03145130028261, "learning_rate": 1.3212274203508417e-07, "logps/chosen": -45.482208251953125, "logps/rejected": -78.06450653076172, "loss": 0.2716, "losses/dpo": 0.013716379180550575, "losses/sft": 1.0461381673812866, "losses/total": 0.013716379180550575, "ref_logps/chosen": -25.168046951293945, "ref_logps/rejected": -34.685516357421875, "rewards/accuracies": 0.875, "rewards/chosen": -2.0314159393310547, "rewards/margins": 2.306483030319214, "rewards/rejected": -4.337899208068848, "step": 2120 }, { "epoch": 2.0, "grad_norm": 9.717395053355713, "learning_rate": 1.3189822081154567e-07, "logps/chosen": -41.134559631347656, "logps/rejected": -79.58096313476562, "loss": 0.13, "losses/dpo": 0.1387222409248352, "losses/sft": 1.0467373132705688, "losses/total": 0.1387222409248352, "ref_logps/chosen": -25.03965950012207, "ref_logps/rejected": -36.48158264160156, "rewards/accuracies": 1.0, "rewards/chosen": -1.6094897985458374, "rewards/margins": 2.7004475593566895, "rewards/rejected": -4.309937477111816, "step": 2121 }, { "epoch": 2.0, "grad_norm": 7.9157586825021795, "learning_rate": 1.3167382214198433e-07, "logps/chosen": -59.87278747558594, "logps/rejected": -91.809814453125, "loss": 0.0966, "losses/dpo": 0.04374464601278305, "losses/sft": 0.63117516040802, "losses/total": 0.04374464601278305, "ref_logps/chosen": -39.58152389526367, "ref_logps/rejected": -41.88600540161133, "rewards/accuracies": 1.0, "rewards/chosen": -2.0291264057159424, "rewards/margins": 2.963254451751709, "rewards/rejected": -4.9923810958862305, "step": 2122 }, { "epoch": 2.0, "grad_norm": 9.227519646518118, "learning_rate": 1.3144954625925801e-07, "logps/chosen": -39.980796813964844, "logps/rejected": -71.62884521484375, "loss": 0.1188, "losses/dpo": 0.5309237837791443, "losses/sft": 1.190915584564209, "losses/total": 0.5309237837791443, "ref_logps/chosen": -24.66818618774414, "ref_logps/rejected": -28.26958465576172, "rewards/accuracies": 1.0, "rewards/chosen": -1.5312608480453491, "rewards/margins": 2.8046650886535645, "rewards/rejected": -4.335926055908203, "step": 2123 }, { "epoch": 2.0, "grad_norm": 14.691337811574602, "learning_rate": 1.312253933960975e-07, "logps/chosen": -39.557273864746094, "logps/rejected": -66.23121643066406, "loss": 0.2046, "losses/dpo": 0.04320943355560303, "losses/sft": 1.0177515745162964, "losses/total": 0.04320943355560303, "ref_logps/chosen": -25.124887466430664, "ref_logps/rejected": -27.838272094726562, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4432388544082642, "rewards/margins": 2.3960561752319336, "rewards/rejected": -3.839294672012329, "step": 2124 }, { "epoch": 2.0, "grad_norm": 14.606634957845674, "learning_rate": 1.3100136378510562e-07, "logps/chosen": -46.955360412597656, "logps/rejected": -64.30792236328125, "loss": 0.2354, "losses/dpo": 0.3036614656448364, "losses/sft": 1.2834347486495972, "losses/total": 0.3036614656448364, "ref_logps/chosen": -31.292652130126953, "ref_logps/rejected": -27.39561653137207, "rewards/accuracies": 0.875, "rewards/chosen": -1.5662708282470703, "rewards/margins": 2.124960422515869, "rewards/rejected": -3.6912312507629395, "step": 2125 }, { "epoch": 2.01, "grad_norm": 8.129794439352672, "learning_rate": 1.3077745765875753e-07, "logps/chosen": -40.70022964477539, "logps/rejected": -83.81830596923828, "loss": 0.0975, "losses/dpo": 0.13190267980098724, "losses/sft": 2.061471939086914, "losses/total": 0.13190267980098724, "ref_logps/chosen": -24.48486328125, "ref_logps/rejected": -36.56159210205078, "rewards/accuracies": 1.0, "rewards/chosen": -1.621536374092102, "rewards/margins": 3.104135513305664, "rewards/rejected": -4.725671768188477, "step": 2126 }, { "epoch": 2.01, "grad_norm": 9.87263525996021, "learning_rate": 1.3055367524940025e-07, "logps/chosen": -44.94332504272461, "logps/rejected": -65.91205596923828, "loss": 0.1517, "losses/dpo": 0.27161911129951477, "losses/sft": 0.7543050646781921, "losses/total": 0.27161911129951477, "ref_logps/chosen": -29.885562896728516, "ref_logps/rejected": -28.29010772705078, "rewards/accuracies": 1.0, "rewards/chosen": -1.5057764053344727, "rewards/margins": 2.256418228149414, "rewards/rejected": -3.7621946334838867, "step": 2127 }, { "epoch": 2.01, "grad_norm": 19.27761078292901, "learning_rate": 1.3033001678925213e-07, "logps/chosen": -54.22815704345703, "logps/rejected": -81.50294494628906, "loss": 0.3058, "losses/dpo": 0.004411314148455858, "losses/sft": 3.7088780403137207, "losses/total": 0.004411314148455858, "ref_logps/chosen": -33.848541259765625, "ref_logps/rejected": -37.961204528808594, "rewards/accuracies": 0.9375, "rewards/chosen": -2.037961483001709, "rewards/margins": 2.3162121772766113, "rewards/rejected": -4.35417366027832, "step": 2128 }, { "epoch": 2.01, "grad_norm": 10.033628105351667, "learning_rate": 1.301064825104033e-07, "logps/chosen": -34.36042785644531, "logps/rejected": -64.30550384521484, "loss": 0.1471, "losses/dpo": 0.1951526701450348, "losses/sft": 1.6499661207199097, "losses/total": 0.1951526701450348, "ref_logps/chosen": -20.66451644897461, "ref_logps/rejected": -28.329853057861328, "rewards/accuracies": 1.0, "rewards/chosen": -1.369591236114502, "rewards/margins": 2.227973461151123, "rewards/rejected": -3.597564697265625, "step": 2129 }, { "epoch": 2.01, "grad_norm": 9.709810803053111, "learning_rate": 1.2988307264481463e-07, "logps/chosen": -58.2974853515625, "logps/rejected": -88.18870544433594, "loss": 0.1081, "losses/dpo": 0.045097894966602325, "losses/sft": 1.6745036840438843, "losses/total": 0.045097894966602325, "ref_logps/chosen": -40.36637878417969, "ref_logps/rejected": -40.004913330078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7931106090545654, "rewards/margins": 3.025268077850342, "rewards/rejected": -4.818378448486328, "step": 2130 }, { "epoch": 2.01, "grad_norm": 7.007860850458489, "learning_rate": 1.2965978742431833e-07, "logps/chosen": -36.906333923339844, "logps/rejected": -73.4402847290039, "loss": 0.0821, "losses/dpo": 0.026190344244241714, "losses/sft": 1.2684378623962402, "losses/total": 0.026190344244241714, "ref_logps/chosen": -25.282556533813477, "ref_logps/rejected": -32.34950637817383, "rewards/accuracies": 1.0, "rewards/chosen": -1.1623775959014893, "rewards/margins": 2.9467005729675293, "rewards/rejected": -4.109078407287598, "step": 2131 }, { "epoch": 2.01, "grad_norm": 20.033881606194413, "learning_rate": 1.2943662708061675e-07, "logps/chosen": -37.45870590209961, "logps/rejected": -74.85520935058594, "loss": 0.2631, "losses/dpo": 0.02004365809261799, "losses/sft": 2.3198678493499756, "losses/total": 0.02004365809261799, "ref_logps/chosen": -18.609785079956055, "ref_logps/rejected": -33.094154357910156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8848919868469238, "rewards/margins": 2.2912139892578125, "rewards/rejected": -4.1761064529418945, "step": 2132 }, { "epoch": 2.01, "grad_norm": 7.849074448273512, "learning_rate": 1.292135918452832e-07, "logps/chosen": -35.314735412597656, "logps/rejected": -66.85879516601562, "loss": 0.1069, "losses/dpo": 0.10029163956642151, "losses/sft": 1.0014828443527222, "losses/total": 0.10029163956642151, "ref_logps/chosen": -20.404205322265625, "ref_logps/rejected": -27.732961654663086, "rewards/accuracies": 1.0, "rewards/chosen": -1.4910528659820557, "rewards/margins": 2.4215312004089355, "rewards/rejected": -3.9125843048095703, "step": 2133 }, { "epoch": 2.01, "grad_norm": 9.235196174438444, "learning_rate": 1.2899068194976063e-07, "logps/chosen": -58.16923522949219, "logps/rejected": -85.94020080566406, "loss": 0.0969, "losses/dpo": 0.030445683747529984, "losses/sft": 1.9062426090240479, "losses/total": 0.030445683747529984, "ref_logps/chosen": -40.697364807128906, "ref_logps/rejected": -38.28364562988281, "rewards/accuracies": 1.0, "rewards/chosen": -1.7471870183944702, "rewards/margins": 3.0184683799743652, "rewards/rejected": -4.765655517578125, "step": 2134 }, { "epoch": 2.01, "grad_norm": 13.222676641612027, "learning_rate": 1.2876789762536228e-07, "logps/chosen": -53.37830352783203, "logps/rejected": -86.85287475585938, "loss": 0.1669, "losses/dpo": 0.015611725859344006, "losses/sft": 1.8511207103729248, "losses/total": 0.015611725859344006, "ref_logps/chosen": -33.00648880004883, "ref_logps/rejected": -40.860755920410156, "rewards/accuracies": 1.0, "rewards/chosen": -2.0371816158294678, "rewards/margins": 2.56203031539917, "rewards/rejected": -4.599211692810059, "step": 2135 }, { "epoch": 2.02, "grad_norm": 9.333978790437545, "learning_rate": 1.2854523910327117e-07, "logps/chosen": -36.988765716552734, "logps/rejected": -69.90717315673828, "loss": 0.1594, "losses/dpo": 0.053248077630996704, "losses/sft": 2.6356418132781982, "losses/total": 0.053248077630996704, "ref_logps/chosen": -22.959861755371094, "ref_logps/rejected": -31.451770782470703, "rewards/accuracies": 0.875, "rewards/chosen": -1.4028905630111694, "rewards/margins": 2.4426493644714355, "rewards/rejected": -3.8455398082733154, "step": 2136 }, { "epoch": 2.02, "grad_norm": 12.750942702445434, "learning_rate": 1.2832270661453931e-07, "logps/chosen": -46.93973159790039, "logps/rejected": -82.69477844238281, "loss": 0.1597, "losses/dpo": 0.3478687107563019, "losses/sft": 1.0953787565231323, "losses/total": 0.3478687107563019, "ref_logps/chosen": -28.489276885986328, "ref_logps/rejected": -39.5272216796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8450459241867065, "rewards/margins": 2.4717087745666504, "rewards/rejected": -4.3167548179626465, "step": 2137 }, { "epoch": 2.02, "grad_norm": 14.651465084593294, "learning_rate": 1.281003003900885e-07, "logps/chosen": -44.15983581542969, "logps/rejected": -71.3687744140625, "loss": 0.186, "losses/dpo": 0.0026068759616464376, "losses/sft": 0.6707552671432495, "losses/total": 0.0026068759616464376, "ref_logps/chosen": -28.319000244140625, "ref_logps/rejected": -31.743520736694336, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5840837955474854, "rewards/margins": 2.37844181060791, "rewards/rejected": -3.9625258445739746, "step": 2138 }, { "epoch": 2.02, "grad_norm": 8.369672918479056, "learning_rate": 1.2787802066070893e-07, "logps/chosen": -46.96303939819336, "logps/rejected": -86.58552551269531, "loss": 0.0952, "losses/dpo": 0.20771180093288422, "losses/sft": 1.2939633131027222, "losses/total": 0.20771180093288422, "ref_logps/chosen": -31.62124252319336, "ref_logps/rejected": -41.929588317871094, "rewards/accuracies": 1.0, "rewards/chosen": -1.5341798067092896, "rewards/margins": 2.9314141273498535, "rewards/rejected": -4.465594291687012, "step": 2139 }, { "epoch": 2.02, "grad_norm": 10.749987636746138, "learning_rate": 1.276558676570601e-07, "logps/chosen": -54.01003646850586, "logps/rejected": -81.36859130859375, "loss": 0.1165, "losses/dpo": 0.25780439376831055, "losses/sft": 2.148787498474121, "losses/total": 0.25780439376831055, "ref_logps/chosen": -34.89239501953125, "ref_logps/rejected": -34.44438171386719, "rewards/accuracies": 1.0, "rewards/chosen": -1.911764144897461, "rewards/margins": 2.7806568145751953, "rewards/rejected": -4.692420959472656, "step": 2140 }, { "epoch": 2.02, "grad_norm": 11.257520826376258, "learning_rate": 1.2743384160966952e-07, "logps/chosen": -44.471527099609375, "logps/rejected": -70.75926208496094, "loss": 0.1858, "losses/dpo": 0.03862371668219566, "losses/sft": 0.9098672270774841, "losses/total": 0.03862371668219566, "ref_logps/chosen": -27.50552749633789, "ref_logps/rejected": -29.470001220703125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6965997219085693, "rewards/margins": 2.432326555252075, "rewards/rejected": -4.1289262771606445, "step": 2141 }, { "epoch": 2.02, "grad_norm": 10.238340316835453, "learning_rate": 1.272119427489333e-07, "logps/chosen": -52.94512939453125, "logps/rejected": -79.36495971679688, "loss": 0.1654, "losses/dpo": 0.018964489921927452, "losses/sft": 1.8311392068862915, "losses/total": 0.018964489921927452, "ref_logps/chosen": -33.5514030456543, "ref_logps/rejected": -34.62751388549805, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9393728971481323, "rewards/margins": 2.534371852874756, "rewards/rejected": -4.473744869232178, "step": 2142 }, { "epoch": 2.02, "grad_norm": 12.54058442528547, "learning_rate": 1.269901713051154e-07, "logps/chosen": -52.69453048706055, "logps/rejected": -86.10647583007812, "loss": 0.1659, "losses/dpo": 0.1692153811454773, "losses/sft": 0.330059677362442, "losses/total": 0.1692153811454773, "ref_logps/chosen": -32.18424606323242, "ref_logps/rejected": -39.132904052734375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0510287284851074, "rewards/margins": 2.646329402923584, "rewards/rejected": -4.697357654571533, "step": 2143 }, { "epoch": 2.02, "grad_norm": 5.271385418717238, "learning_rate": 1.267685275083475e-07, "logps/chosen": -59.81181335449219, "logps/rejected": -110.8592529296875, "loss": 0.0407, "losses/dpo": 0.08467496186494827, "losses/sft": 1.1830005645751953, "losses/total": 0.08467496186494827, "ref_logps/chosen": -41.73639678955078, "ref_logps/rejected": -53.140724182128906, "rewards/accuracies": 1.0, "rewards/chosen": -1.807542324066162, "rewards/margins": 3.9643101692199707, "rewards/rejected": -5.771852493286133, "step": 2144 }, { "epoch": 2.02, "grad_norm": 6.819235190299233, "learning_rate": 1.2654701158862918e-07, "logps/chosen": -43.20905685424805, "logps/rejected": -84.8586196899414, "loss": 0.0916, "losses/dpo": 0.0375312976539135, "losses/sft": 1.6514275074005127, "losses/total": 0.0375312976539135, "ref_logps/chosen": -31.10227394104004, "ref_logps/rejected": -42.9612922668457, "rewards/accuracies": 1.0, "rewards/chosen": -1.2106781005859375, "rewards/margins": 2.9790546894073486, "rewards/rejected": -4.189732551574707, "step": 2145 }, { "epoch": 2.02, "grad_norm": 13.650695105601603, "learning_rate": 1.2632562377582672e-07, "logps/chosen": -49.14841842651367, "logps/rejected": -80.3166732788086, "loss": 0.1988, "losses/dpo": 0.11761199682950974, "losses/sft": 0.23081831634044647, "losses/total": 0.11761199682950974, "ref_logps/chosen": -26.914670944213867, "ref_logps/rejected": -35.100833892822266, "rewards/accuracies": 1.0, "rewards/chosen": -2.223374366760254, "rewards/margins": 2.2982091903686523, "rewards/rejected": -4.521583557128906, "step": 2146 }, { "epoch": 2.03, "grad_norm": 13.077625160291245, "learning_rate": 1.2610436429967414e-07, "logps/chosen": -43.04429626464844, "logps/rejected": -71.41952514648438, "loss": 0.2023, "losses/dpo": 0.0058986712247133255, "losses/sft": 1.0705749988555908, "losses/total": 0.0058986712247133255, "ref_logps/chosen": -28.13217544555664, "ref_logps/rejected": -28.759275436401367, "rewards/accuracies": 0.875, "rewards/chosen": -1.4912126064300537, "rewards/margins": 2.7748122215270996, "rewards/rejected": -4.266024589538574, "step": 2147 }, { "epoch": 2.03, "grad_norm": 10.457366023779342, "learning_rate": 1.258832333897717e-07, "logps/chosen": -53.60513687133789, "logps/rejected": -75.67955017089844, "loss": 0.1112, "losses/dpo": 0.20583222806453705, "losses/sft": 1.4386086463928223, "losses/total": 0.20583222806453705, "ref_logps/chosen": -34.64515686035156, "ref_logps/rejected": -29.976940155029297, "rewards/accuracies": 1.0, "rewards/chosen": -1.895998477935791, "rewards/margins": 2.6742632389068604, "rewards/rejected": -4.5702619552612305, "step": 2148 }, { "epoch": 2.03, "grad_norm": 7.270615694334274, "learning_rate": 1.256622312755866e-07, "logps/chosen": -45.31295394897461, "logps/rejected": -83.78875732421875, "loss": 0.0789, "losses/dpo": 0.023386038839817047, "losses/sft": 1.3272950649261475, "losses/total": 0.023386038839817047, "ref_logps/chosen": -33.546546936035156, "ref_logps/rejected": -42.1944694519043, "rewards/accuracies": 1.0, "rewards/chosen": -1.1766408681869507, "rewards/margins": 2.982787609100342, "rewards/rejected": -4.159428596496582, "step": 2149 }, { "epoch": 2.03, "grad_norm": 7.794443381654126, "learning_rate": 1.2544135818645227e-07, "logps/chosen": -39.60930252075195, "logps/rejected": -81.87744903564453, "loss": 0.1051, "losses/dpo": 0.047190383076667786, "losses/sft": 0.8928359150886536, "losses/total": 0.047190383076667786, "ref_logps/chosen": -24.059459686279297, "ref_logps/rejected": -37.42584991455078, "rewards/accuracies": 1.0, "rewards/chosen": -1.5549845695495605, "rewards/margins": 2.8901753425598145, "rewards/rejected": -4.445159912109375, "step": 2150 }, { "epoch": 2.03, "grad_norm": 11.976889691457032, "learning_rate": 1.2522061435156823e-07, "logps/chosen": -44.75035095214844, "logps/rejected": -74.5401382446289, "loss": 0.147, "losses/dpo": 0.18477655947208405, "losses/sft": 1.1473087072372437, "losses/total": 0.18477655947208405, "ref_logps/chosen": -26.79261016845703, "ref_logps/rejected": -31.966766357421875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7957743406295776, "rewards/margins": 2.4615631103515625, "rewards/rejected": -4.25733757019043, "step": 2151 }, { "epoch": 2.03, "grad_norm": 13.200199923925231, "learning_rate": 1.2500000000000005e-07, "logps/chosen": -42.27772521972656, "logps/rejected": -82.61863708496094, "loss": 0.1675, "losses/dpo": 0.2619583010673523, "losses/sft": 1.89329993724823, "losses/total": 0.2619583010673523, "ref_logps/chosen": -24.735498428344727, "ref_logps/rejected": -35.61096954345703, "rewards/accuracies": 1.0, "rewards/chosen": -1.7542229890823364, "rewards/margins": 2.9465441703796387, "rewards/rejected": -4.7007670402526855, "step": 2152 }, { "epoch": 2.03, "grad_norm": 12.98189032575309, "learning_rate": 1.247795153606786e-07, "logps/chosen": -35.94146728515625, "logps/rejected": -66.2434310913086, "loss": 0.1752, "losses/dpo": 0.23229075968265533, "losses/sft": 1.9800264835357666, "losses/total": 0.23229075968265533, "ref_logps/chosen": -21.696014404296875, "ref_logps/rejected": -31.390771865844727, "rewards/accuracies": 1.0, "rewards/chosen": -1.4245452880859375, "rewards/margins": 2.060720443725586, "rewards/rejected": -3.4852654933929443, "step": 2153 }, { "epoch": 2.03, "grad_norm": 6.81928956856872, "learning_rate": 1.245591606624005e-07, "logps/chosen": -37.64311218261719, "logps/rejected": -80.76299285888672, "loss": 0.0901, "losses/dpo": 0.0057234917767345905, "losses/sft": 0.9925311803817749, "losses/total": 0.0057234917767345905, "ref_logps/chosen": -22.46023178100586, "ref_logps/rejected": -35.211570739746094, "rewards/accuracies": 1.0, "rewards/chosen": -1.5182878971099854, "rewards/margins": 3.036853790283203, "rewards/rejected": -4.555142402648926, "step": 2154 }, { "epoch": 2.03, "grad_norm": 8.281873508180835, "learning_rate": 1.2433893613382728e-07, "logps/chosen": -38.18583297729492, "logps/rejected": -71.86067199707031, "loss": 0.095, "losses/dpo": 0.11146533489227295, "losses/sft": 1.3249351978302002, "losses/total": 0.11146533489227295, "ref_logps/chosen": -24.768245697021484, "ref_logps/rejected": -28.400711059570312, "rewards/accuracies": 1.0, "rewards/chosen": -1.3417589664459229, "rewards/margins": 3.004237651824951, "rewards/rejected": -4.345996856689453, "step": 2155 }, { "epoch": 2.03, "grad_norm": 11.523618273205253, "learning_rate": 1.241188420034856e-07, "logps/chosen": -48.10525894165039, "logps/rejected": -86.05026245117188, "loss": 0.1217, "losses/dpo": 0.010356403887271881, "losses/sft": 1.6011615991592407, "losses/total": 0.010356403887271881, "ref_logps/chosen": -28.312950134277344, "ref_logps/rejected": -37.5443115234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.9792309999465942, "rewards/margins": 2.871364116668701, "rewards/rejected": -4.850594997406006, "step": 2156 }, { "epoch": 2.03, "grad_norm": 7.728788351245043, "learning_rate": 1.2389887849976648e-07, "logps/chosen": -47.39064025878906, "logps/rejected": -94.99240112304688, "loss": 0.0861, "losses/dpo": 0.13508956134319305, "losses/sft": 0.13487498462200165, "losses/total": 0.13508956134319305, "ref_logps/chosen": -32.530967712402344, "ref_logps/rejected": -45.14848327636719, "rewards/accuracies": 1.0, "rewards/chosen": -1.4859672784805298, "rewards/margins": 3.498425006866455, "rewards/rejected": -4.984392166137695, "step": 2157 }, { "epoch": 2.04, "grad_norm": 10.658410487632894, "learning_rate": 1.236790458509259e-07, "logps/chosen": -48.947025299072266, "logps/rejected": -83.7430419921875, "loss": 0.1266, "losses/dpo": 0.18385504186153412, "losses/sft": 1.8532642126083374, "losses/total": 0.18385504186153412, "ref_logps/chosen": -31.22490882873535, "ref_logps/rejected": -38.935752868652344, "rewards/accuracies": 1.0, "rewards/chosen": -1.7722115516662598, "rewards/margins": 2.7085180282592773, "rewards/rejected": -4.480730056762695, "step": 2158 }, { "epoch": 2.04, "grad_norm": 8.842871422563375, "learning_rate": 1.2345934428508344e-07, "logps/chosen": -39.57358169555664, "logps/rejected": -74.60368347167969, "loss": 0.1231, "losses/dpo": 0.09341555833816528, "losses/sft": 0.9391907453536987, "losses/total": 0.09341555833816528, "ref_logps/chosen": -26.89555549621582, "ref_logps/rejected": -36.10014343261719, "rewards/accuracies": 1.0, "rewards/chosen": -1.2678024768829346, "rewards/margins": 2.5825514793395996, "rewards/rejected": -3.850353717803955, "step": 2159 }, { "epoch": 2.04, "grad_norm": 16.87848117615211, "learning_rate": 1.2323977403022315e-07, "logps/chosen": -44.746116638183594, "logps/rejected": -74.113037109375, "loss": 0.182, "losses/dpo": 1.2806577682495117, "losses/sft": 2.1271414756774902, "losses/total": 1.2806577682495117, "ref_logps/chosen": -29.633899688720703, "ref_logps/rejected": -32.36134719848633, "rewards/accuracies": 0.9375, "rewards/chosen": -1.511222004890442, "rewards/margins": 2.663947582244873, "rewards/rejected": -4.175169467926025, "step": 2160 }, { "epoch": 2.04, "grad_norm": 8.74981588325398, "learning_rate": 1.230203353141927e-07, "logps/chosen": -48.645286560058594, "logps/rejected": -71.11538696289062, "loss": 0.1038, "losses/dpo": 0.21336206793785095, "losses/sft": 1.3439743518829346, "losses/total": 0.21336206793785095, "ref_logps/chosen": -31.324094772338867, "ref_logps/rejected": -28.00745391845703, "rewards/accuracies": 1.0, "rewards/chosen": -1.7321194410324097, "rewards/margins": 2.5786733627319336, "rewards/rejected": -4.310792922973633, "step": 2161 }, { "epoch": 2.04, "grad_norm": 10.235439845303715, "learning_rate": 1.22801028364703e-07, "logps/chosen": -47.91560745239258, "logps/rejected": -78.59089660644531, "loss": 0.1201, "losses/dpo": 0.09965977072715759, "losses/sft": 2.3859636783599854, "losses/total": 0.09965977072715759, "ref_logps/chosen": -30.074966430664062, "ref_logps/rejected": -33.41664123535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.7840642929077148, "rewards/margins": 2.7333617210388184, "rewards/rejected": -4.517425537109375, "step": 2162 }, { "epoch": 2.04, "grad_norm": 16.359602255581382, "learning_rate": 1.2258185340932865e-07, "logps/chosen": -45.02626419067383, "logps/rejected": -79.07301330566406, "loss": 0.2023, "losses/dpo": 0.08036478608846664, "losses/sft": 2.4563422203063965, "losses/total": 0.08036478608846664, "ref_logps/chosen": -31.007183074951172, "ref_logps/rejected": -34.990806579589844, "rewards/accuracies": 0.875, "rewards/chosen": -1.401908040046692, "rewards/margins": 3.006312370300293, "rewards/rejected": -4.408220291137695, "step": 2163 }, { "epoch": 2.04, "grad_norm": 10.769370118926133, "learning_rate": 1.2236281067550686e-07, "logps/chosen": -39.576698303222656, "logps/rejected": -64.650634765625, "loss": 0.1715, "losses/dpo": 0.1254437267780304, "losses/sft": 0.6647036671638489, "losses/total": 0.1254437267780304, "ref_logps/chosen": -26.375839233398438, "ref_logps/rejected": -28.645854949951172, "rewards/accuracies": 1.0, "rewards/chosen": -1.320085883140564, "rewards/margins": 2.280392646789551, "rewards/rejected": -3.600478410720825, "step": 2164 }, { "epoch": 2.04, "grad_norm": 8.924713915766402, "learning_rate": 1.2214390039053798e-07, "logps/chosen": -42.519691467285156, "logps/rejected": -81.2677001953125, "loss": 0.117, "losses/dpo": 0.05344432592391968, "losses/sft": 2.4607748985290527, "losses/total": 0.05344432592391968, "ref_logps/chosen": -28.202659606933594, "ref_logps/rejected": -39.038604736328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4317030906677246, "rewards/margins": 2.7912063598632812, "rewards/rejected": -4.222908973693848, "step": 2165 }, { "epoch": 2.04, "grad_norm": 16.205630986471707, "learning_rate": 1.219251227815846e-07, "logps/chosen": -58.876949310302734, "logps/rejected": -85.0107192993164, "loss": 0.1386, "losses/dpo": 0.0801529511809349, "losses/sft": 2.009794235229492, "losses/total": 0.0801529511809349, "ref_logps/chosen": -41.89856719970703, "ref_logps/rejected": -40.76127624511719, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6978380680084229, "rewards/margins": 2.7271063327789307, "rewards/rejected": -4.4249444007873535, "step": 2166 }, { "epoch": 2.04, "grad_norm": 7.341226403577314, "learning_rate": 1.2170647807567205e-07, "logps/chosen": -39.10841369628906, "logps/rejected": -75.20257568359375, "loss": 0.0945, "losses/dpo": 0.04401090368628502, "losses/sft": 1.150805950164795, "losses/total": 0.04401090368628502, "ref_logps/chosen": -23.899532318115234, "ref_logps/rejected": -32.40522003173828, "rewards/accuracies": 1.0, "rewards/chosen": -1.5208879709243774, "rewards/margins": 2.758847951889038, "rewards/rejected": -4.279735565185547, "step": 2167 }, { "epoch": 2.05, "grad_norm": 18.83978563755619, "learning_rate": 1.2148796649968725e-07, "logps/chosen": -44.63532257080078, "logps/rejected": -63.59761047363281, "loss": 0.2988, "losses/dpo": 0.9146918058395386, "losses/sft": 0.6542050838470459, "losses/total": 0.9146918058395386, "ref_logps/chosen": -28.907508850097656, "ref_logps/rejected": -29.647891998291016, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5727810859680176, "rewards/margins": 1.8221908807754517, "rewards/rejected": -3.3949718475341797, "step": 2168 }, { "epoch": 2.05, "grad_norm": 7.476641235522318, "learning_rate": 1.2126958828037933e-07, "logps/chosen": -48.58612823486328, "logps/rejected": -84.16402435302734, "loss": 0.0913, "losses/dpo": 0.06616285443305969, "losses/sft": 1.8937175273895264, "losses/total": 0.06616285443305969, "ref_logps/chosen": -28.034809112548828, "ref_logps/rejected": -36.490028381347656, "rewards/accuracies": 1.0, "rewards/chosen": -2.0551319122314453, "rewards/margins": 2.7122678756713867, "rewards/rejected": -4.767399787902832, "step": 2169 }, { "epoch": 2.05, "grad_norm": 9.989000553562608, "learning_rate": 1.210513436443589e-07, "logps/chosen": -33.05354309082031, "logps/rejected": -65.17121124267578, "loss": 0.1452, "losses/dpo": 0.0006840461282990873, "losses/sft": 1.1321461200714111, "losses/total": 0.0006840461282990873, "ref_logps/chosen": -23.81131362915039, "ref_logps/rejected": -27.488643646240234, "rewards/accuracies": 1.0, "rewards/chosen": -0.9242230653762817, "rewards/margins": 2.84403395652771, "rewards/rejected": -3.7682571411132812, "step": 2170 }, { "epoch": 2.05, "grad_norm": 8.024295240147, "learning_rate": 1.2083323281809812e-07, "logps/chosen": -42.78569030761719, "logps/rejected": -90.59514617919922, "loss": 0.0985, "losses/dpo": 0.02436971664428711, "losses/sft": 1.3591309785842896, "losses/total": 0.02436971664428711, "ref_logps/chosen": -28.847078323364258, "ref_logps/rejected": -43.68105697631836, "rewards/accuracies": 1.0, "rewards/chosen": -1.393861174583435, "rewards/margins": 3.2975473403930664, "rewards/rejected": -4.691408634185791, "step": 2171 }, { "epoch": 2.05, "grad_norm": 11.293265604893364, "learning_rate": 1.2061525602792993e-07, "logps/chosen": -43.08900833129883, "logps/rejected": -71.43305969238281, "loss": 0.1632, "losses/dpo": 0.009037239477038383, "losses/sft": 1.7597953081130981, "losses/total": 0.009037239477038383, "ref_logps/chosen": -27.531234741210938, "ref_logps/rejected": -32.164669036865234, "rewards/accuracies": 1.0, "rewards/chosen": -1.5557775497436523, "rewards/margins": 2.3710618019104004, "rewards/rejected": -3.9268393516540527, "step": 2172 }, { "epoch": 2.05, "grad_norm": 5.930892533704848, "learning_rate": 1.203974135000485e-07, "logps/chosen": -32.23483657836914, "logps/rejected": -72.42863464355469, "loss": 0.0714, "losses/dpo": 0.009351562708616257, "losses/sft": 1.1551945209503174, "losses/total": 0.009351562708616257, "ref_logps/chosen": -19.4290771484375, "ref_logps/rejected": -26.541500091552734, "rewards/accuracies": 1.0, "rewards/chosen": -1.2805759906768799, "rewards/margins": 3.3081371784210205, "rewards/rejected": -4.5887131690979, "step": 2173 }, { "epoch": 2.05, "grad_norm": 13.547941578578985, "learning_rate": 1.2017970546050868e-07, "logps/chosen": -50.041107177734375, "logps/rejected": -91.24354553222656, "loss": 0.116, "losses/dpo": 0.1408526450395584, "losses/sft": 1.2353579998016357, "losses/total": 0.1408526450395584, "ref_logps/chosen": -31.878299713134766, "ref_logps/rejected": -34.95756149291992, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8162806034088135, "rewards/margins": 3.8123183250427246, "rewards/rejected": -5.628599166870117, "step": 2174 }, { "epoch": 2.05, "grad_norm": 13.187529303572537, "learning_rate": 1.1996213213522548e-07, "logps/chosen": -45.46194839477539, "logps/rejected": -67.1530532836914, "loss": 0.1662, "losses/dpo": 0.16481570899486542, "losses/sft": 1.0563271045684814, "losses/total": 0.16481570899486542, "ref_logps/chosen": -29.106517791748047, "ref_logps/rejected": -28.841598510742188, "rewards/accuracies": 0.9375, "rewards/chosen": -1.635542869567871, "rewards/margins": 2.1956024169921875, "rewards/rejected": -3.831145763397217, "step": 2175 }, { "epoch": 2.05, "grad_norm": 13.144555838929481, "learning_rate": 1.1974469374997445e-07, "logps/chosen": -49.12852478027344, "logps/rejected": -86.35916900634766, "loss": 0.1407, "losses/dpo": 0.0245917197316885, "losses/sft": 0.6216521263122559, "losses/total": 0.0245917197316885, "ref_logps/chosen": -29.978710174560547, "ref_logps/rejected": -38.28242874145508, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9149813652038574, "rewards/margins": 2.8926925659179688, "rewards/rejected": -4.807674407958984, "step": 2176 }, { "epoch": 2.05, "grad_norm": 15.008363570763455, "learning_rate": 1.1952739053039085e-07, "logps/chosen": -37.912967681884766, "logps/rejected": -66.46011352539062, "loss": 0.1927, "losses/dpo": 0.27261027693748474, "losses/sft": 1.786821722984314, "losses/total": 0.27261027693748474, "ref_logps/chosen": -26.412193298339844, "ref_logps/rejected": -32.71965026855469, "rewards/accuracies": 0.875, "rewards/chosen": -1.1500775814056396, "rewards/margins": 2.223968267440796, "rewards/rejected": -3.3740458488464355, "step": 2177 }, { "epoch": 2.05, "grad_norm": 10.610590617815463, "learning_rate": 1.1931022270196989e-07, "logps/chosen": -38.02528762817383, "logps/rejected": -68.31451416015625, "loss": 0.091, "losses/dpo": 0.011617018841207027, "losses/sft": 1.150631070137024, "losses/total": 0.011617018841207027, "ref_logps/chosen": -24.4703369140625, "ref_logps/rejected": -24.715208053588867, "rewards/accuracies": 1.0, "rewards/chosen": -1.3554950952529907, "rewards/margins": 3.0044357776641846, "rewards/rejected": -4.359930992126465, "step": 2178 }, { "epoch": 2.06, "grad_norm": 16.317936228279482, "learning_rate": 1.1909319049006606e-07, "logps/chosen": -38.927433013916016, "logps/rejected": -62.61871337890625, "loss": 0.2793, "losses/dpo": 0.0947960615158081, "losses/sft": 0.8370547890663147, "losses/total": 0.0947960615158081, "ref_logps/chosen": -22.31976318359375, "ref_logps/rejected": -26.44713592529297, "rewards/accuracies": 0.8125, "rewards/chosen": -1.660766839981079, "rewards/margins": 1.9563913345336914, "rewards/rejected": -3.6171581745147705, "step": 2179 }, { "epoch": 2.06, "grad_norm": 14.353087933511706, "learning_rate": 1.1887629411989342e-07, "logps/chosen": -41.38824462890625, "logps/rejected": -74.22559356689453, "loss": 0.2371, "losses/dpo": 0.04541129246354103, "losses/sft": 1.554266095161438, "losses/total": 0.04541129246354103, "ref_logps/chosen": -24.476205825805664, "ref_logps/rejected": -30.626605987548828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6912040710449219, "rewards/margins": 2.668694496154785, "rewards/rejected": -4.359898567199707, "step": 2180 }, { "epoch": 2.06, "grad_norm": 13.87545131281895, "learning_rate": 1.1865953381652469e-07, "logps/chosen": -49.94427490234375, "logps/rejected": -80.4924087524414, "loss": 0.1753, "losses/dpo": 0.19994884729385376, "losses/sft": 1.5792789459228516, "losses/total": 0.19994884729385376, "ref_logps/chosen": -31.149211883544922, "ref_logps/rejected": -37.7044677734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.879506230354309, "rewards/margins": 2.3992881774902344, "rewards/rejected": -4.278794288635254, "step": 2181 }, { "epoch": 2.06, "grad_norm": 9.530034604395702, "learning_rate": 1.1844290980489172e-07, "logps/chosen": -60.884796142578125, "logps/rejected": -88.59149169921875, "loss": 0.0927, "losses/dpo": 0.18514086306095123, "losses/sft": 0.9215603470802307, "losses/total": 0.18514086306095123, "ref_logps/chosen": -40.023536682128906, "ref_logps/rejected": -38.47643280029297, "rewards/accuracies": 1.0, "rewards/chosen": -2.086125612258911, "rewards/margins": 2.925379991531372, "rewards/rejected": -5.011505603790283, "step": 2182 }, { "epoch": 2.06, "grad_norm": 11.611820908449667, "learning_rate": 1.1822642230978488e-07, "logps/chosen": -43.78041076660156, "logps/rejected": -82.94664001464844, "loss": 0.1599, "losses/dpo": 0.022719083353877068, "losses/sft": 0.9952024817466736, "losses/total": 0.022719083353877068, "ref_logps/chosen": -27.761154174804688, "ref_logps/rejected": -38.847145080566406, "rewards/accuracies": 1.0, "rewards/chosen": -1.6019258499145508, "rewards/margins": 2.808023452758789, "rewards/rejected": -4.40994930267334, "step": 2183 }, { "epoch": 2.06, "grad_norm": 10.702255767065202, "learning_rate": 1.180100715558526e-07, "logps/chosen": -29.93195152282715, "logps/rejected": -58.60809326171875, "loss": 0.1715, "losses/dpo": 0.14399783313274384, "losses/sft": 0.5088459253311157, "losses/total": 0.14399783313274384, "ref_logps/chosen": -17.393451690673828, "ref_logps/rejected": -23.636043548583984, "rewards/accuracies": 1.0, "rewards/chosen": -1.2538501024246216, "rewards/margins": 2.2433547973632812, "rewards/rejected": -3.4972047805786133, "step": 2184 }, { "epoch": 2.06, "grad_norm": 12.431905053243197, "learning_rate": 1.1779385776760186e-07, "logps/chosen": -31.421470642089844, "logps/rejected": -60.59765625, "loss": 0.1894, "losses/dpo": 0.015584110282361507, "losses/sft": 1.1656800508499146, "losses/total": 0.015584110282361507, "ref_logps/chosen": -18.724498748779297, "ref_logps/rejected": -25.060659408569336, "rewards/accuracies": 1.0, "rewards/chosen": -1.2696971893310547, "rewards/margins": 2.2840023040771484, "rewards/rejected": -3.553699493408203, "step": 2185 }, { "epoch": 2.06, "grad_norm": 14.127963298639632, "learning_rate": 1.1757778116939704e-07, "logps/chosen": -38.54521942138672, "logps/rejected": -68.75927734375, "loss": 0.232, "losses/dpo": 0.3067886531352997, "losses/sft": 1.8830955028533936, "losses/total": 0.3067886531352997, "ref_logps/chosen": -22.65321159362793, "ref_logps/rejected": -27.616798400878906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5892009735107422, "rewards/margins": 2.5250468254089355, "rewards/rejected": -4.1142473220825195, "step": 2186 }, { "epoch": 2.06, "grad_norm": 10.354336380210542, "learning_rate": 1.1736184198546059e-07, "logps/chosen": -37.61503219604492, "logps/rejected": -75.29013061523438, "loss": 0.2156, "losses/dpo": 0.009381212294101715, "losses/sft": 0.7990490794181824, "losses/total": 0.009381212294101715, "ref_logps/chosen": -23.76809310913086, "ref_logps/rejected": -31.605134963989258, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3846938610076904, "rewards/margins": 2.9838056564331055, "rewards/rejected": -4.368499755859375, "step": 2187 }, { "epoch": 2.06, "grad_norm": 12.736897095761782, "learning_rate": 1.1714604043987197e-07, "logps/chosen": -47.89482116699219, "logps/rejected": -71.06681823730469, "loss": 0.159, "losses/dpo": 0.09888505190610886, "losses/sft": 1.7294467687606812, "losses/total": 0.09888505190610886, "ref_logps/chosen": -30.463287353515625, "ref_logps/rejected": -30.704200744628906, "rewards/accuracies": 1.0, "rewards/chosen": -1.7431533336639404, "rewards/margins": 2.2931079864501953, "rewards/rejected": -4.036261558532715, "step": 2188 }, { "epoch": 2.07, "grad_norm": 9.383661352495684, "learning_rate": 1.1693037675656811e-07, "logps/chosen": -32.85325622558594, "logps/rejected": -66.98477172851562, "loss": 0.1158, "losses/dpo": 0.22807711362838745, "losses/sft": 1.688382863998413, "losses/total": 0.22807711362838745, "ref_logps/chosen": -20.98690414428711, "ref_logps/rejected": -30.87148666381836, "rewards/accuracies": 1.0, "rewards/chosen": -1.1866353750228882, "rewards/margins": 2.4246928691864014, "rewards/rejected": -3.611328125, "step": 2189 }, { "epoch": 2.07, "grad_norm": 10.731363217163738, "learning_rate": 1.1671485115934284e-07, "logps/chosen": -57.922523498535156, "logps/rejected": -82.49656677246094, "loss": 0.1431, "losses/dpo": 0.1710861176252365, "losses/sft": 2.0710229873657227, "losses/total": 0.1710861176252365, "ref_logps/chosen": -40.785118103027344, "ref_logps/rejected": -37.72419738769531, "rewards/accuracies": 1.0, "rewards/chosen": -1.7137410640716553, "rewards/margins": 2.7634963989257812, "rewards/rejected": -4.477237224578857, "step": 2190 }, { "epoch": 2.07, "grad_norm": 10.652913856820824, "learning_rate": 1.164994638718465e-07, "logps/chosen": -39.08518600463867, "logps/rejected": -82.00020599365234, "loss": 0.115, "losses/dpo": 0.07512964308261871, "losses/sft": 1.2824007272720337, "losses/total": 0.07512964308261871, "ref_logps/chosen": -23.233766555786133, "ref_logps/rejected": -34.99971008300781, "rewards/accuracies": 1.0, "rewards/chosen": -1.585141897201538, "rewards/margins": 3.1149072647094727, "rewards/rejected": -4.70004940032959, "step": 2191 }, { "epoch": 2.07, "grad_norm": 17.08310235795727, "learning_rate": 1.1628421511758621e-07, "logps/chosen": -49.587703704833984, "logps/rejected": -89.45162963867188, "loss": 0.245, "losses/dpo": 0.04928503558039665, "losses/sft": 0.855913519859314, "losses/total": 0.04928503558039665, "ref_logps/chosen": -30.45762062072754, "ref_logps/rejected": -44.19218444824219, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9130080938339233, "rewards/margins": 2.612936496734619, "rewards/rejected": -4.525944709777832, "step": 2192 }, { "epoch": 2.07, "grad_norm": 12.87976794311551, "learning_rate": 1.1606910511992497e-07, "logps/chosen": -46.8291015625, "logps/rejected": -71.28050231933594, "loss": 0.1735, "losses/dpo": 0.29325997829437256, "losses/sft": 1.0467424392700195, "losses/total": 0.29325997829437256, "ref_logps/chosen": -30.530471801757812, "ref_logps/rejected": -30.275989532470703, "rewards/accuracies": 1.0, "rewards/chosen": -1.629862904548645, "rewards/margins": 2.470588207244873, "rewards/rejected": -4.1004509925842285, "step": 2193 }, { "epoch": 2.07, "grad_norm": 11.843381472619036, "learning_rate": 1.1585413410208217e-07, "logps/chosen": -43.35476303100586, "logps/rejected": -72.98719787597656, "loss": 0.1436, "losses/dpo": 0.044637586921453476, "losses/sft": 1.9985264539718628, "losses/total": 0.044637586921453476, "ref_logps/chosen": -28.285198211669922, "ref_logps/rejected": -31.76837921142578, "rewards/accuracies": 1.0, "rewards/chosen": -1.5069565773010254, "rewards/margins": 2.6149251461029053, "rewards/rejected": -4.121881484985352, "step": 2194 }, { "epoch": 2.07, "grad_norm": 10.239188603314075, "learning_rate": 1.1563930228713262e-07, "logps/chosen": -45.767887115478516, "logps/rejected": -80.96997833251953, "loss": 0.13, "losses/dpo": 0.08831697702407837, "losses/sft": 1.67668616771698, "losses/total": 0.08831697702407837, "ref_logps/chosen": -28.730587005615234, "ref_logps/rejected": -34.29181671142578, "rewards/accuracies": 1.0, "rewards/chosen": -1.7037302255630493, "rewards/margins": 2.964085340499878, "rewards/rejected": -4.667815685272217, "step": 2195 }, { "epoch": 2.07, "grad_norm": 9.478387931439725, "learning_rate": 1.1542460989800703e-07, "logps/chosen": -51.02070617675781, "logps/rejected": -77.03134155273438, "loss": 0.1096, "losses/dpo": 0.23350085318088531, "losses/sft": 1.9866466522216797, "losses/total": 0.23350085318088531, "ref_logps/chosen": -35.51627731323242, "ref_logps/rejected": -34.81273651123047, "rewards/accuracies": 1.0, "rewards/chosen": -1.5504429340362549, "rewards/margins": 2.671417713165283, "rewards/rejected": -4.221860885620117, "step": 2196 }, { "epoch": 2.07, "grad_norm": 10.26215821978315, "learning_rate": 1.1521005715749113e-07, "logps/chosen": -45.43396759033203, "logps/rejected": -75.09335327148438, "loss": 0.1267, "losses/dpo": 0.08170255273580551, "losses/sft": 1.394622802734375, "losses/total": 0.08170255273580551, "ref_logps/chosen": -31.269014358520508, "ref_logps/rejected": -32.449989318847656, "rewards/accuracies": 1.0, "rewards/chosen": -1.4164953231811523, "rewards/margins": 2.84784197807312, "rewards/rejected": -4.264337539672852, "step": 2197 }, { "epoch": 2.07, "grad_norm": 11.56874823544783, "learning_rate": 1.149956442882259e-07, "logps/chosen": -48.19874954223633, "logps/rejected": -81.83503723144531, "loss": 0.1475, "losses/dpo": 0.13148701190948486, "losses/sft": 1.3152461051940918, "losses/total": 0.13148701190948486, "ref_logps/chosen": -29.285686492919922, "ref_logps/rejected": -38.771644592285156, "rewards/accuracies": 1.0, "rewards/chosen": -1.8913064002990723, "rewards/margins": 2.4150328636169434, "rewards/rejected": -4.306339263916016, "step": 2198 }, { "epoch": 2.07, "grad_norm": 9.451014430275237, "learning_rate": 1.1478137151270723e-07, "logps/chosen": -46.69227600097656, "logps/rejected": -81.54839324951172, "loss": 0.1004, "losses/dpo": 0.0296200979501009, "losses/sft": 0.9435161352157593, "losses/total": 0.0296200979501009, "ref_logps/chosen": -31.091217041015625, "ref_logps/rejected": -35.40216827392578, "rewards/accuracies": 1.0, "rewards/chosen": -1.5601060390472412, "rewards/margins": 3.054516315460205, "rewards/rejected": -4.614622592926025, "step": 2199 }, { "epoch": 2.08, "grad_norm": 12.342731577877975, "learning_rate": 1.1456723905328539e-07, "logps/chosen": -43.63287353515625, "logps/rejected": -71.47578430175781, "loss": 0.2061, "losses/dpo": 0.09445653855800629, "losses/sft": 0.42876437306404114, "losses/total": 0.09445653855800629, "ref_logps/chosen": -26.08645248413086, "ref_logps/rejected": -31.86846923828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7546420097351074, "rewards/margins": 2.206089496612549, "rewards/rejected": -3.960731267929077, "step": 2200 }, { "epoch": 2.08, "grad_norm": 8.941416656998554, "learning_rate": 1.1435324713216537e-07, "logps/chosen": -42.85490036010742, "logps/rejected": -76.31805419921875, "loss": 0.1067, "losses/dpo": 0.16937650740146637, "losses/sft": 2.154387950897217, "losses/total": 0.16937650740146637, "ref_logps/chosen": -27.247922897338867, "ref_logps/rejected": -31.177818298339844, "rewards/accuracies": 1.0, "rewards/chosen": -1.5606975555419922, "rewards/margins": 2.9533255100250244, "rewards/rejected": -4.5140228271484375, "step": 2201 }, { "epoch": 2.08, "grad_norm": 11.13291583564804, "learning_rate": 1.1413939597140595e-07, "logps/chosen": -45.784114837646484, "logps/rejected": -85.52584075927734, "loss": 0.1494, "losses/dpo": 0.00016596360364928842, "losses/sft": 0.323039710521698, "losses/total": 0.00016596360364928842, "ref_logps/chosen": -27.232868194580078, "ref_logps/rejected": -38.1536750793457, "rewards/accuracies": 1.0, "rewards/chosen": -1.8551249504089355, "rewards/margins": 2.882091522216797, "rewards/rejected": -4.737216472625732, "step": 2202 }, { "epoch": 2.08, "grad_norm": 8.726763136050444, "learning_rate": 1.139256857929203e-07, "logps/chosen": -48.571441650390625, "logps/rejected": -82.07615661621094, "loss": 0.1254, "losses/dpo": 0.13040298223495483, "losses/sft": 1.3543094396591187, "losses/total": 0.13040298223495483, "ref_logps/chosen": -30.76817512512207, "ref_logps/rejected": -37.15629577636719, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7803263664245605, "rewards/margins": 2.7116594314575195, "rewards/rejected": -4.49198579788208, "step": 2203 }, { "epoch": 2.08, "grad_norm": 14.908798955342915, "learning_rate": 1.1371211681847478e-07, "logps/chosen": -45.90631103515625, "logps/rejected": -103.38220977783203, "loss": 0.1123, "losses/dpo": 0.0035804584622383118, "losses/sft": 1.4331105947494507, "losses/total": 0.0035804584622383118, "ref_logps/chosen": -28.041959762573242, "ref_logps/rejected": -48.69733428955078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7864348888397217, "rewards/margins": 3.6820523738861084, "rewards/rejected": -5.46848726272583, "step": 2204 }, { "epoch": 2.08, "grad_norm": 9.555668484763766, "learning_rate": 1.1349868926968969e-07, "logps/chosen": -45.85242462158203, "logps/rejected": -79.88895416259766, "loss": 0.0945, "losses/dpo": 0.02249525673687458, "losses/sft": 0.38143861293792725, "losses/total": 0.02249525673687458, "ref_logps/chosen": -29.34164047241211, "ref_logps/rejected": -30.087553024291992, "rewards/accuracies": 1.0, "rewards/chosen": -1.651078701019287, "rewards/margins": 3.329061508178711, "rewards/rejected": -4.980140686035156, "step": 2205 }, { "epoch": 2.08, "grad_norm": 12.113206031333648, "learning_rate": 1.1328540336803819e-07, "logps/chosen": -40.11097717285156, "logps/rejected": -70.10870361328125, "loss": 0.1917, "losses/dpo": 0.24282138049602509, "losses/sft": 1.5934255123138428, "losses/total": 0.24282138049602509, "ref_logps/chosen": -23.18143081665039, "ref_logps/rejected": -28.662700653076172, "rewards/accuracies": 0.875, "rewards/chosen": -1.6929547786712646, "rewards/margins": 2.451645851135254, "rewards/rejected": -4.144600868225098, "step": 2206 }, { "epoch": 2.08, "grad_norm": 13.26730184872255, "learning_rate": 1.130722593348467e-07, "logps/chosen": -49.05754852294922, "logps/rejected": -73.75599670410156, "loss": 0.1685, "losses/dpo": 0.07317560166120529, "losses/sft": 2.114898443222046, "losses/total": 0.07317560166120529, "ref_logps/chosen": -28.03213119506836, "ref_logps/rejected": -30.455089569091797, "rewards/accuracies": 0.9375, "rewards/chosen": -2.102541923522949, "rewards/margins": 2.227548599243164, "rewards/rejected": -4.330090522766113, "step": 2207 }, { "epoch": 2.08, "grad_norm": 14.35383205232881, "learning_rate": 1.128592573912945e-07, "logps/chosen": -47.418941497802734, "logps/rejected": -98.49038696289062, "loss": 0.1574, "losses/dpo": 0.014388901181519032, "losses/sft": 1.381947636604309, "losses/total": 0.014388901181519032, "ref_logps/chosen": -26.80439567565918, "ref_logps/rejected": -44.975929260253906, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0614547729492188, "rewards/margins": 3.2899909019470215, "rewards/rejected": -5.351446151733398, "step": 2208 }, { "epoch": 2.08, "grad_norm": 10.526914445924135, "learning_rate": 1.1264639775841304e-07, "logps/chosen": -48.764984130859375, "logps/rejected": -74.84951782226562, "loss": 0.1258, "losses/dpo": 0.15242470800876617, "losses/sft": 1.794494867324829, "losses/total": 0.15242470800876617, "ref_logps/chosen": -28.516889572143555, "ref_logps/rejected": -29.543785095214844, "rewards/accuracies": 1.0, "rewards/chosen": -2.0248093605041504, "rewards/margins": 2.5057640075683594, "rewards/rejected": -4.53057336807251, "step": 2209 }, { "epoch": 2.08, "grad_norm": 14.480096950743576, "learning_rate": 1.124336806570865e-07, "logps/chosen": -44.18028259277344, "logps/rejected": -69.23780822753906, "loss": 0.2401, "losses/dpo": 0.02897561341524124, "losses/sft": 1.4699822664260864, "losses/total": 0.02897561341524124, "ref_logps/chosen": -26.775657653808594, "ref_logps/rejected": -29.263145446777344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7404626607894897, "rewards/margins": 2.257002830505371, "rewards/rejected": -3.9974656105041504, "step": 2210 }, { "epoch": 2.09, "grad_norm": 6.456273636173656, "learning_rate": 1.1222110630805082e-07, "logps/chosen": -54.4349250793457, "logps/rejected": -84.3758544921875, "loss": 0.0737, "losses/dpo": 0.05851196497678757, "losses/sft": 0.8665388822555542, "losses/total": 0.05851196497678757, "ref_logps/chosen": -37.86872100830078, "ref_logps/rejected": -35.830665588378906, "rewards/accuracies": 1.0, "rewards/chosen": -1.6566208600997925, "rewards/margins": 3.1978986263275146, "rewards/rejected": -4.854519367218018, "step": 2211 }, { "epoch": 2.09, "grad_norm": 12.379983421927896, "learning_rate": 1.1200867493189418e-07, "logps/chosen": -35.330047607421875, "logps/rejected": -64.25127410888672, "loss": 0.16, "losses/dpo": 0.02934953384101391, "losses/sft": 1.0443224906921387, "losses/total": 0.02934953384101391, "ref_logps/chosen": -21.708362579345703, "ref_logps/rejected": -24.67249298095703, "rewards/accuracies": 1.0, "rewards/chosen": -1.362168550491333, "rewards/margins": 2.5957095623016357, "rewards/rejected": -3.957878351211548, "step": 2212 }, { "epoch": 2.09, "grad_norm": 8.277359071915592, "learning_rate": 1.1179638674905597e-07, "logps/chosen": -58.485984802246094, "logps/rejected": -100.09819030761719, "loss": 0.0762, "losses/dpo": 0.04285106807947159, "losses/sft": 1.488760232925415, "losses/total": 0.04285106807947159, "ref_logps/chosen": -39.36682891845703, "ref_logps/rejected": -43.805423736572266, "rewards/accuracies": 1.0, "rewards/chosen": -1.9119153022766113, "rewards/margins": 3.7173614501953125, "rewards/rejected": -5.629276752471924, "step": 2213 }, { "epoch": 2.09, "grad_norm": 12.581405724895198, "learning_rate": 1.1158424197982741e-07, "logps/chosen": -51.27766418457031, "logps/rejected": -80.75887298583984, "loss": 0.1468, "losses/dpo": 0.5052915215492249, "losses/sft": 1.1555249691009521, "losses/total": 0.5052915215492249, "ref_logps/chosen": -32.187339782714844, "ref_logps/rejected": -34.150146484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.9090325832366943, "rewards/margins": 2.751840114593506, "rewards/rejected": -4.660872459411621, "step": 2214 }, { "epoch": 2.09, "grad_norm": 7.907266770041839, "learning_rate": 1.1137224084435051e-07, "logps/chosen": -52.71190643310547, "logps/rejected": -82.60507202148438, "loss": 0.0909, "losses/dpo": 0.448567271232605, "losses/sft": 0.6332703828811646, "losses/total": 0.448567271232605, "ref_logps/chosen": -35.862937927246094, "ref_logps/rejected": -35.04417037963867, "rewards/accuracies": 1.0, "rewards/chosen": -1.6848969459533691, "rewards/margins": 3.071192741394043, "rewards/rejected": -4.756089687347412, "step": 2215 }, { "epoch": 2.09, "grad_norm": 10.471576157850054, "learning_rate": 1.1116038356261854e-07, "logps/chosen": -41.61774444580078, "logps/rejected": -75.27802276611328, "loss": 0.139, "losses/dpo": 0.46418166160583496, "losses/sft": 0.24594233930110931, "losses/total": 0.46418166160583496, "ref_logps/chosen": -24.85269546508789, "ref_logps/rejected": -33.36272430419922, "rewards/accuracies": 1.0, "rewards/chosen": -1.6765048503875732, "rewards/margins": 2.5150249004364014, "rewards/rejected": -4.191529750823975, "step": 2216 }, { "epoch": 2.09, "grad_norm": 11.235411713955992, "learning_rate": 1.1094867035447542e-07, "logps/chosen": -47.689292907714844, "logps/rejected": -101.18516540527344, "loss": 0.1136, "losses/dpo": 0.12327257543802261, "losses/sft": 1.6913293600082397, "losses/total": 0.12327257543802261, "ref_logps/chosen": -25.80521583557129, "ref_logps/rejected": -47.13555908203125, "rewards/accuracies": 1.0, "rewards/chosen": -2.1884078979492188, "rewards/margins": 3.216553211212158, "rewards/rejected": -5.404961109161377, "step": 2217 }, { "epoch": 2.09, "grad_norm": 8.792200987772203, "learning_rate": 1.1073710143961532e-07, "logps/chosen": -39.96220397949219, "logps/rejected": -86.56826782226562, "loss": 0.1062, "losses/dpo": 0.026784054934978485, "losses/sft": 1.4278353452682495, "losses/total": 0.026784054934978485, "ref_logps/chosen": -23.990907669067383, "ref_logps/rejected": -34.803497314453125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5971293449401855, "rewards/margins": 3.579347610473633, "rewards/rejected": -5.176476955413818, "step": 2218 }, { "epoch": 2.09, "grad_norm": 11.29504014971257, "learning_rate": 1.1052567703758312e-07, "logps/chosen": -44.77820587158203, "logps/rejected": -79.6952133178711, "loss": 0.1231, "losses/dpo": 0.05992653965950012, "losses/sft": 2.135526180267334, "losses/total": 0.05992653965950012, "ref_logps/chosen": -28.975906372070312, "ref_logps/rejected": -36.6085205078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5802297592163086, "rewards/margins": 2.7284395694732666, "rewards/rejected": -4.308669090270996, "step": 2219 }, { "epoch": 2.09, "grad_norm": 15.004274565970787, "learning_rate": 1.1031439736777326e-07, "logps/chosen": -40.11346435546875, "logps/rejected": -68.258056640625, "loss": 0.2682, "losses/dpo": 0.02200400084257126, "losses/sft": 2.037472724914551, "losses/total": 0.02200400084257126, "ref_logps/chosen": -22.119686126708984, "ref_logps/rejected": -26.95930290222168, "rewards/accuracies": 0.875, "rewards/chosen": -1.7993779182434082, "rewards/margins": 2.330498218536377, "rewards/rejected": -4.129876136779785, "step": 2220 }, { "epoch": 2.1, "grad_norm": 16.101445663063753, "learning_rate": 1.1010326264943043e-07, "logps/chosen": -51.20277404785156, "logps/rejected": -81.49531555175781, "loss": 0.1694, "losses/dpo": 0.3884062170982361, "losses/sft": 2.5310935974121094, "losses/total": 0.3884062170982361, "ref_logps/chosen": -31.175430297851562, "ref_logps/rejected": -35.22425079345703, "rewards/accuracies": 1.0, "rewards/chosen": -2.002734422683716, "rewards/margins": 2.6243724822998047, "rewards/rejected": -4.627106666564941, "step": 2221 }, { "epoch": 2.1, "grad_norm": 11.455107869160972, "learning_rate": 1.098922731016485e-07, "logps/chosen": -51.33045959472656, "logps/rejected": -80.83840942382812, "loss": 0.1407, "losses/dpo": 0.21712979674339294, "losses/sft": 1.5976245403289795, "losses/total": 0.21712979674339294, "ref_logps/chosen": -32.0926513671875, "ref_logps/rejected": -35.70891571044922, "rewards/accuracies": 1.0, "rewards/chosen": -1.9237810373306274, "rewards/margins": 2.5891690254211426, "rewards/rejected": -4.5129499435424805, "step": 2222 }, { "epoch": 2.1, "grad_norm": 14.719036040763964, "learning_rate": 1.0968142894337101e-07, "logps/chosen": -53.777915954589844, "logps/rejected": -68.24440002441406, "loss": 0.1775, "losses/dpo": 0.09034456312656403, "losses/sft": 2.014115810394287, "losses/total": 0.09034456312656403, "ref_logps/chosen": -30.171222686767578, "ref_logps/rejected": -24.076732635498047, "rewards/accuracies": 1.0, "rewards/chosen": -2.3606693744659424, "rewards/margins": 2.0560967922210693, "rewards/rejected": -4.416766166687012, "step": 2223 }, { "epoch": 2.1, "grad_norm": 9.769301346598667, "learning_rate": 1.0947073039339058e-07, "logps/chosen": -38.23662185668945, "logps/rejected": -72.99211120605469, "loss": 0.1244, "losses/dpo": 0.12136444449424744, "losses/sft": 1.736124873161316, "losses/total": 0.12136444449424744, "ref_logps/chosen": -23.774356842041016, "ref_logps/rejected": -30.67220687866211, "rewards/accuracies": 1.0, "rewards/chosen": -1.4462263584136963, "rewards/margins": 2.785763740539551, "rewards/rejected": -4.231989860534668, "step": 2224 }, { "epoch": 2.1, "grad_norm": 7.973448822086857, "learning_rate": 1.0926017767034854e-07, "logps/chosen": -44.31915283203125, "logps/rejected": -86.9792709350586, "loss": 0.0792, "losses/dpo": 0.40411266684532166, "losses/sft": 1.3717572689056396, "losses/total": 0.40411266684532166, "ref_logps/chosen": -27.316587448120117, "ref_logps/rejected": -35.496891021728516, "rewards/accuracies": 1.0, "rewards/chosen": -1.70025634765625, "rewards/margins": 3.4479820728302, "rewards/rejected": -5.148238182067871, "step": 2225 }, { "epoch": 2.1, "grad_norm": 14.16960402538539, "learning_rate": 1.0904977099273513e-07, "logps/chosen": -43.756690979003906, "logps/rejected": -66.85258483886719, "loss": 0.2118, "losses/dpo": 0.004568714648485184, "losses/sft": 0.9926053881645203, "losses/total": 0.004568714648485184, "ref_logps/chosen": -29.820579528808594, "ref_logps/rejected": -28.113059997558594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3936107158660889, "rewards/margins": 2.480341911315918, "rewards/rejected": -3.8739523887634277, "step": 2226 }, { "epoch": 2.1, "grad_norm": 12.37616166710338, "learning_rate": 1.0883951057888882e-07, "logps/chosen": -44.92584228515625, "logps/rejected": -77.2823715209961, "loss": 0.1296, "losses/dpo": 0.009947658516466618, "losses/sft": 1.8764362335205078, "losses/total": 0.009947658516466618, "ref_logps/chosen": -28.95404815673828, "ref_logps/rejected": -31.12073516845703, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5971792936325073, "rewards/margins": 3.018984317779541, "rewards/rejected": -4.61616325378418, "step": 2227 }, { "epoch": 2.1, "grad_norm": 10.749559804972941, "learning_rate": 1.0862939664699655e-07, "logps/chosen": -48.708526611328125, "logps/rejected": -86.93450927734375, "loss": 0.1625, "losses/dpo": 0.9779943823814392, "losses/sft": 0.769503653049469, "losses/total": 0.9779943823814392, "ref_logps/chosen": -29.663692474365234, "ref_logps/rejected": -36.26073455810547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9044833183288574, "rewards/margins": 3.162893772125244, "rewards/rejected": -5.067377090454102, "step": 2228 }, { "epoch": 2.1, "grad_norm": 7.659463501310138, "learning_rate": 1.0841942941509296e-07, "logps/chosen": -42.793514251708984, "logps/rejected": -81.95841217041016, "loss": 0.0875, "losses/dpo": 0.06217296048998833, "losses/sft": 1.35915207862854, "losses/total": 0.06217296048998833, "ref_logps/chosen": -27.270132064819336, "ref_logps/rejected": -32.476985931396484, "rewards/accuracies": 1.0, "rewards/chosen": -1.5523383617401123, "rewards/margins": 3.3958044052124023, "rewards/rejected": -4.948143005371094, "step": 2229 }, { "epoch": 2.1, "grad_norm": 8.511364812295843, "learning_rate": 1.0820960910106084e-07, "logps/chosen": -46.77088928222656, "logps/rejected": -80.91567993164062, "loss": 0.0825, "losses/dpo": 0.061447326093912125, "losses/sft": 1.0916664600372314, "losses/total": 0.061447326093912125, "ref_logps/chosen": -32.25337219238281, "ref_logps/rejected": -36.935516357421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.451751470565796, "rewards/margins": 2.9462647438049316, "rewards/rejected": -4.398015975952148, "step": 2230 }, { "epoch": 2.1, "grad_norm": 10.905511708359834, "learning_rate": 1.0799993592263005e-07, "logps/chosen": -50.155826568603516, "logps/rejected": -93.2027587890625, "loss": 0.088, "losses/dpo": 0.1650899052619934, "losses/sft": 1.5325959920883179, "losses/total": 0.1650899052619934, "ref_logps/chosen": -29.105976104736328, "ref_logps/rejected": -41.00617980957031, "rewards/accuracies": 1.0, "rewards/chosen": -2.104985237121582, "rewards/margins": 3.1146726608276367, "rewards/rejected": -5.219657897949219, "step": 2231 }, { "epoch": 2.11, "grad_norm": 10.332431932457524, "learning_rate": 1.0779041009737813e-07, "logps/chosen": -47.49070739746094, "logps/rejected": -75.00289916992188, "loss": 0.1277, "losses/dpo": 0.1500478833913803, "losses/sft": 1.676781415939331, "losses/total": 0.1500478833913803, "ref_logps/chosen": -28.349605560302734, "ref_logps/rejected": -29.022018432617188, "rewards/accuracies": 1.0, "rewards/chosen": -1.9141103029251099, "rewards/margins": 2.6839778423309326, "rewards/rejected": -4.598088264465332, "step": 2232 }, { "epoch": 2.11, "grad_norm": 13.99564446102421, "learning_rate": 1.0758103184272974e-07, "logps/chosen": -58.535247802734375, "logps/rejected": -76.47395324707031, "loss": 0.1409, "losses/dpo": 0.3188338577747345, "losses/sft": 1.2629178762435913, "losses/total": 0.3188338577747345, "ref_logps/chosen": -37.499420166015625, "ref_logps/rejected": -30.59921646118164, "rewards/accuracies": 0.9375, "rewards/chosen": -2.103583335876465, "rewards/margins": 2.4838902950286865, "rewards/rejected": -4.587473392486572, "step": 2233 }, { "epoch": 2.11, "grad_norm": 14.532575100011456, "learning_rate": 1.0737180137595606e-07, "logps/chosen": -45.733253479003906, "logps/rejected": -76.87425994873047, "loss": 0.1714, "losses/dpo": 0.504072904586792, "losses/sft": 2.067323684692383, "losses/total": 0.504072904586792, "ref_logps/chosen": -27.84711265563965, "ref_logps/rejected": -33.575992584228516, "rewards/accuracies": 1.0, "rewards/chosen": -1.78861403465271, "rewards/margins": 2.541213035583496, "rewards/rejected": -4.329827308654785, "step": 2234 }, { "epoch": 2.11, "grad_norm": 18.506397798289182, "learning_rate": 1.071627189141753e-07, "logps/chosen": -38.842864990234375, "logps/rejected": -92.38355255126953, "loss": 0.228, "losses/dpo": 0.03984125331044197, "losses/sft": 1.3141028881072998, "losses/total": 0.03984125331044197, "ref_logps/chosen": -23.886123657226562, "ref_logps/rejected": -40.593345642089844, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4956741333007812, "rewards/margins": 3.6833465099334717, "rewards/rejected": -5.179020881652832, "step": 2235 }, { "epoch": 2.11, "grad_norm": 10.309891679747391, "learning_rate": 1.0695378467435173e-07, "logps/chosen": -49.282413482666016, "logps/rejected": -78.412841796875, "loss": 0.1166, "losses/dpo": 0.10329000651836395, "losses/sft": 1.1328872442245483, "losses/total": 0.10329000651836395, "ref_logps/chosen": -29.31887435913086, "ref_logps/rejected": -33.50062561035156, "rewards/accuracies": 1.0, "rewards/chosen": -1.9963539838790894, "rewards/margins": 2.4948678016662598, "rewards/rejected": -4.491221904754639, "step": 2236 }, { "epoch": 2.11, "grad_norm": 6.557004781848228, "learning_rate": 1.0674499887329616e-07, "logps/chosen": -53.486656188964844, "logps/rejected": -95.47046661376953, "loss": 0.0589, "losses/dpo": 0.05329171195626259, "losses/sft": 1.4722223281860352, "losses/total": 0.05329171195626259, "ref_logps/chosen": -36.189453125, "ref_logps/rejected": -43.2060432434082, "rewards/accuracies": 1.0, "rewards/chosen": -1.7297203540802002, "rewards/margins": 3.4967222213745117, "rewards/rejected": -5.226442337036133, "step": 2237 }, { "epoch": 2.11, "grad_norm": 9.978085980404659, "learning_rate": 1.0653636172766504e-07, "logps/chosen": -47.981712341308594, "logps/rejected": -91.26229095458984, "loss": 0.0881, "losses/dpo": 0.17511767148971558, "losses/sft": 0.49155452847480774, "losses/total": 0.17511767148971558, "ref_logps/chosen": -27.638784408569336, "ref_logps/rejected": -34.076839447021484, "rewards/accuracies": 1.0, "rewards/chosen": -2.034292697906494, "rewards/margins": 3.6842522621154785, "rewards/rejected": -5.718544960021973, "step": 2238 }, { "epoch": 2.11, "grad_norm": 10.831214353804132, "learning_rate": 1.0632787345396088e-07, "logps/chosen": -41.367794036865234, "logps/rejected": -71.06507873535156, "loss": 0.1385, "losses/dpo": 0.013778986409306526, "losses/sft": 1.5866807699203491, "losses/total": 0.013778986409306526, "ref_logps/chosen": -24.74808120727539, "ref_logps/rejected": -27.807571411132812, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6619715690612793, "rewards/margins": 2.6637790203094482, "rewards/rejected": -4.325750350952148, "step": 2239 }, { "epoch": 2.11, "grad_norm": 7.635970485053312, "learning_rate": 1.0611953426853135e-07, "logps/chosen": -50.101478576660156, "logps/rejected": -98.05734252929688, "loss": 0.0775, "losses/dpo": 0.09040594100952148, "losses/sft": 0.46065235137939453, "losses/total": 0.09040594100952148, "ref_logps/chosen": -33.54548645019531, "ref_logps/rejected": -43.94580841064453, "rewards/accuracies": 1.0, "rewards/chosen": -1.6555991172790527, "rewards/margins": 3.7555556297302246, "rewards/rejected": -5.411154747009277, "step": 2240 }, { "epoch": 2.11, "grad_norm": 17.340948721257373, "learning_rate": 1.0591134438756968e-07, "logps/chosen": -44.980079650878906, "logps/rejected": -93.25665283203125, "loss": 0.1963, "losses/dpo": 0.17611077427864075, "losses/sft": 1.7650660276412964, "losses/total": 0.17611077427864075, "ref_logps/chosen": -28.35101318359375, "ref_logps/rejected": -44.89325714111328, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6629070043563843, "rewards/margins": 3.1734328269958496, "rewards/rejected": -4.836339473724365, "step": 2241 }, { "epoch": 2.12, "grad_norm": 19.179081207794248, "learning_rate": 1.057033040271142e-07, "logps/chosen": -41.77638626098633, "logps/rejected": -69.70048522949219, "loss": 0.253, "losses/dpo": 0.1493512988090515, "losses/sft": 0.8241049647331238, "losses/total": 0.1493512988090515, "ref_logps/chosen": -22.887189865112305, "ref_logps/rejected": -29.82610321044922, "rewards/accuracies": 0.875, "rewards/chosen": -1.8889195919036865, "rewards/margins": 2.0985188484191895, "rewards/rejected": -3.987438440322876, "step": 2242 }, { "epoch": 2.12, "grad_norm": 13.448658242023669, "learning_rate": 1.054954134030478e-07, "logps/chosen": -57.50992965698242, "logps/rejected": -88.09429931640625, "loss": 0.1042, "losses/dpo": 0.10805398225784302, "losses/sft": 1.495133638381958, "losses/total": 0.10805398225784302, "ref_logps/chosen": -35.621944427490234, "ref_logps/rejected": -35.4661865234375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.188798666000366, "rewards/margins": 3.074012517929077, "rewards/rejected": -5.262811183929443, "step": 2243 }, { "epoch": 2.12, "grad_norm": 12.97998797620543, "learning_rate": 1.0528767273109829e-07, "logps/chosen": -48.599876403808594, "logps/rejected": -86.8459243774414, "loss": 0.1568, "losses/dpo": 0.09851895272731781, "losses/sft": 2.322675943374634, "losses/total": 0.09851895272731781, "ref_logps/chosen": -31.085552215576172, "ref_logps/rejected": -39.8287239074707, "rewards/accuracies": 0.875, "rewards/chosen": -1.7514327764511108, "rewards/margins": 2.950287342071533, "rewards/rejected": -4.701720237731934, "step": 2244 }, { "epoch": 2.12, "grad_norm": 12.175470753256324, "learning_rate": 1.0508008222683763e-07, "logps/chosen": -54.886573791503906, "logps/rejected": -85.5859375, "loss": 0.1648, "losses/dpo": 0.11361725628376007, "losses/sft": 2.15736722946167, "losses/total": 0.11361725628376007, "ref_logps/chosen": -33.93705749511719, "ref_logps/rejected": -38.047874450683594, "rewards/accuracies": 1.0, "rewards/chosen": -2.094951868057251, "rewards/margins": 2.6588551998138428, "rewards/rejected": -4.753807067871094, "step": 2245 }, { "epoch": 2.12, "grad_norm": 9.331335006994738, "learning_rate": 1.048726421056822e-07, "logps/chosen": -50.40648651123047, "logps/rejected": -90.22016906738281, "loss": 0.1169, "losses/dpo": 0.0005588420899584889, "losses/sft": 1.437244176864624, "losses/total": 0.0005588420899584889, "ref_logps/chosen": -31.778207778930664, "ref_logps/rejected": -37.79987716674805, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8628278970718384, "rewards/margins": 3.379201889038086, "rewards/rejected": -5.242030143737793, "step": 2246 }, { "epoch": 2.12, "grad_norm": 11.487487134284674, "learning_rate": 1.0466535258289202e-07, "logps/chosen": -46.62491226196289, "logps/rejected": -84.13427734375, "loss": 0.1489, "losses/dpo": 0.00144297257065773, "losses/sft": 1.310230016708374, "losses/total": 0.00144297257065773, "ref_logps/chosen": -25.53343963623047, "ref_logps/rejected": -33.84922790527344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.109147310256958, "rewards/margins": 2.9193577766418457, "rewards/rejected": -5.028504848480225, "step": 2247 }, { "epoch": 2.12, "grad_norm": 7.664454176427727, "learning_rate": 1.0445821387357123e-07, "logps/chosen": -40.64099884033203, "logps/rejected": -86.80860900878906, "loss": 0.0677, "losses/dpo": 0.13995838165283203, "losses/sft": 1.9852463006973267, "losses/total": 0.13995838165283203, "ref_logps/chosen": -23.973596572875977, "ref_logps/rejected": -39.45750045776367, "rewards/accuracies": 1.0, "rewards/chosen": -1.6667400598526, "rewards/margins": 3.0683703422546387, "rewards/rejected": -4.735110282897949, "step": 2248 }, { "epoch": 2.12, "grad_norm": 12.896989363922174, "learning_rate": 1.0425122619266699e-07, "logps/chosen": -43.04811096191406, "logps/rejected": -70.47932434082031, "loss": 0.1546, "losses/dpo": 0.034376464784145355, "losses/sft": 0.7658905982971191, "losses/total": 0.034376464784145355, "ref_logps/chosen": -25.287540435791016, "ref_logps/rejected": -28.44869613647461, "rewards/accuracies": 1.0, "rewards/chosen": -1.7760570049285889, "rewards/margins": 2.427006483078003, "rewards/rejected": -4.203063488006592, "step": 2249 }, { "epoch": 2.12, "grad_norm": 20.47475216945288, "learning_rate": 1.0404438975497018e-07, "logps/chosen": -50.561092376708984, "logps/rejected": -96.14694213867188, "loss": 0.2386, "losses/dpo": 0.08034365624189377, "losses/sft": 2.434922218322754, "losses/total": 0.08034365624189377, "ref_logps/chosen": -28.414505004882812, "ref_logps/rejected": -44.74824523925781, "rewards/accuracies": 0.9375, "rewards/chosen": -2.214658737182617, "rewards/margins": 2.925210952758789, "rewards/rejected": -5.139869689941406, "step": 2250 }, { "epoch": 2.12, "grad_norm": 14.480510970480857, "learning_rate": 1.0383770477511455e-07, "logps/chosen": -40.2322998046875, "logps/rejected": -62.27214813232422, "loss": 0.2271, "losses/dpo": 0.021547453477978706, "losses/sft": 1.5189640522003174, "losses/total": 0.021547453477978706, "ref_logps/chosen": -26.279552459716797, "ref_logps/rejected": -27.94512939453125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3952746391296387, "rewards/margins": 2.0374269485473633, "rewards/rejected": -3.432701587677002, "step": 2251 }, { "epoch": 2.12, "grad_norm": 10.113392417609798, "learning_rate": 1.0363117146757658e-07, "logps/chosen": -34.551109313964844, "logps/rejected": -70.93092346191406, "loss": 0.1536, "losses/dpo": 0.39792555570602417, "losses/sft": 0.4076381325721741, "losses/total": 0.39792555570602417, "ref_logps/chosen": -20.900480270385742, "ref_logps/rejected": -29.679821014404297, "rewards/accuracies": 1.0, "rewards/chosen": -1.365063190460205, "rewards/margins": 2.76004695892334, "rewards/rejected": -4.125110149383545, "step": 2252 }, { "epoch": 2.13, "grad_norm": 13.066484060987584, "learning_rate": 1.0342479004667559e-07, "logps/chosen": -56.54070281982422, "logps/rejected": -77.60009002685547, "loss": 0.1804, "losses/dpo": 0.11363489925861359, "losses/sft": 0.6760768890380859, "losses/total": 0.11363489925861359, "ref_logps/chosen": -34.18458557128906, "ref_logps/rejected": -30.069608688354492, "rewards/accuracies": 0.875, "rewards/chosen": -2.2356114387512207, "rewards/margins": 2.5174365043640137, "rewards/rejected": -4.753047943115234, "step": 2253 }, { "epoch": 2.13, "grad_norm": 10.573898758248154, "learning_rate": 1.0321856072657303e-07, "logps/chosen": -51.8715934753418, "logps/rejected": -88.58811950683594, "loss": 0.1284, "losses/dpo": 0.05480831861495972, "losses/sft": 0.8734775185585022, "losses/total": 0.05480831861495972, "ref_logps/chosen": -29.79532814025879, "ref_logps/rejected": -34.43863296508789, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2076263427734375, "rewards/margins": 3.207322597503662, "rewards/rejected": -5.4149489402771, "step": 2254 }, { "epoch": 2.13, "grad_norm": 10.053431202451037, "learning_rate": 1.0301248372127278e-07, "logps/chosen": -41.40800857543945, "logps/rejected": -70.00504302978516, "loss": 0.1516, "losses/dpo": 0.6931471824645996, "losses/sft": 0.34543246030807495, "losses/total": 0.6931471824645996, "ref_logps/chosen": -27.087383270263672, "ref_logps/rejected": -29.940921783447266, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4320625066757202, "rewards/margins": 2.5743496417999268, "rewards/rejected": -4.006412506103516, "step": 2255 }, { "epoch": 2.13, "grad_norm": 12.052652816215485, "learning_rate": 1.0280655924462039e-07, "logps/chosen": -49.817867279052734, "logps/rejected": -73.63569641113281, "loss": 0.1602, "losses/dpo": 0.10382665693759918, "losses/sft": 1.129256010055542, "losses/total": 0.10382665693759918, "ref_logps/chosen": -26.591548919677734, "ref_logps/rejected": -26.11652374267578, "rewards/accuracies": 1.0, "rewards/chosen": -2.322632074356079, "rewards/margins": 2.4292850494384766, "rewards/rejected": -4.751916885375977, "step": 2256 }, { "epoch": 2.13, "grad_norm": 8.585047997434911, "learning_rate": 1.0260078751030335e-07, "logps/chosen": -43.887535095214844, "logps/rejected": -78.06687927246094, "loss": 0.1166, "losses/dpo": 0.17429743707180023, "losses/sft": 1.9920616149902344, "losses/total": 0.17429743707180023, "ref_logps/chosen": -27.03072738647461, "ref_logps/rejected": -34.79218673706055, "rewards/accuracies": 1.0, "rewards/chosen": -1.685680627822876, "rewards/margins": 2.6417884826660156, "rewards/rejected": -4.3274688720703125, "step": 2257 }, { "epoch": 2.13, "grad_norm": 8.643278630866336, "learning_rate": 1.0239516873185067e-07, "logps/chosen": -68.61470794677734, "logps/rejected": -96.1800537109375, "loss": 0.1393, "losses/dpo": 0.47611159086227417, "losses/sft": 2.0124685764312744, "losses/total": 0.47611159086227417, "ref_logps/chosen": -48.06904983520508, "ref_logps/rejected": -44.32471466064453, "rewards/accuracies": 0.9375, "rewards/chosen": -2.054565906524658, "rewards/margins": 3.1309683322906494, "rewards/rejected": -5.1855340003967285, "step": 2258 }, { "epoch": 2.13, "grad_norm": 8.628746483043777, "learning_rate": 1.0218970312263234e-07, "logps/chosen": -47.78765106201172, "logps/rejected": -85.28738403320312, "loss": 0.0826, "losses/dpo": 0.025450846180319786, "losses/sft": 1.1534479856491089, "losses/total": 0.025450846180319786, "ref_logps/chosen": -31.456893920898438, "ref_logps/rejected": -35.49958801269531, "rewards/accuracies": 1.0, "rewards/chosen": -1.6330759525299072, "rewards/margins": 3.345703363418579, "rewards/rejected": -4.978779315948486, "step": 2259 }, { "epoch": 2.13, "grad_norm": 9.166601096061243, "learning_rate": 1.0198439089585978e-07, "logps/chosen": -49.03463363647461, "logps/rejected": -88.40318298339844, "loss": 0.0992, "losses/dpo": 0.07704802602529526, "losses/sft": 1.777321696281433, "losses/total": 0.07704802602529526, "ref_logps/chosen": -31.566043853759766, "ref_logps/rejected": -40.408477783203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.746859073638916, "rewards/margins": 3.0526106357574463, "rewards/rejected": -4.799469947814941, "step": 2260 }, { "epoch": 2.13, "grad_norm": 10.927519135406682, "learning_rate": 1.0177923226458492e-07, "logps/chosen": -43.32666778564453, "logps/rejected": -83.72695922851562, "loss": 0.1137, "losses/dpo": 0.09243253618478775, "losses/sft": 1.724333643913269, "losses/total": 0.09243253618478775, "ref_logps/chosen": -24.200454711914062, "ref_logps/rejected": -32.823326110839844, "rewards/accuracies": 1.0, "rewards/chosen": -1.9126213788986206, "rewards/margins": 3.1777420043945312, "rewards/rejected": -5.090363502502441, "step": 2261 }, { "epoch": 2.13, "grad_norm": 7.140789017141607, "learning_rate": 1.0157422744170063e-07, "logps/chosen": -45.16083908081055, "logps/rejected": -92.7386474609375, "loss": 0.103, "losses/dpo": 0.02263716608285904, "losses/sft": 0.49641281366348267, "losses/total": 0.02263716608285904, "ref_logps/chosen": -28.340742111206055, "ref_logps/rejected": -38.058189392089844, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6820096969604492, "rewards/margins": 3.7860360145568848, "rewards/rejected": -5.468045711517334, "step": 2262 }, { "epoch": 2.13, "grad_norm": 5.898777331538511, "learning_rate": 1.013693766399398e-07, "logps/chosen": -50.158447265625, "logps/rejected": -83.10966491699219, "loss": 0.0565, "losses/dpo": 0.03845364227890968, "losses/sft": 2.06715726852417, "losses/total": 0.03845364227890968, "ref_logps/chosen": -32.193050384521484, "ref_logps/rejected": -33.69212341308594, "rewards/accuracies": 1.0, "rewards/chosen": -1.7965394258499146, "rewards/margins": 3.145214319229126, "rewards/rejected": -4.94175386428833, "step": 2263 }, { "epoch": 2.14, "grad_norm": 8.401412211187857, "learning_rate": 1.0116468007187589e-07, "logps/chosen": -41.67938995361328, "logps/rejected": -83.44007873535156, "loss": 0.0934, "losses/dpo": 0.04770128056406975, "losses/sft": 2.3543336391448975, "losses/total": 0.04770128056406975, "ref_logps/chosen": -24.328044891357422, "ref_logps/rejected": -31.736970901489258, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7351343631744385, "rewards/margins": 3.435175895690918, "rewards/rejected": -5.1703104972839355, "step": 2264 }, { "epoch": 2.14, "grad_norm": 12.113420444513253, "learning_rate": 1.0096013794992186e-07, "logps/chosen": -43.24827575683594, "logps/rejected": -85.9427261352539, "loss": 0.159, "losses/dpo": 0.07430648058652878, "losses/sft": 1.4910544157028198, "losses/total": 0.07430648058652878, "ref_logps/chosen": -26.328353881835938, "ref_logps/rejected": -37.66239547729492, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6919922828674316, "rewards/margins": 3.1360414028167725, "rewards/rejected": -4.828033447265625, "step": 2265 }, { "epoch": 2.14, "grad_norm": 10.106753768066678, "learning_rate": 1.0075575048633098e-07, "logps/chosen": -49.03837966918945, "logps/rejected": -77.31513977050781, "loss": 0.1215, "losses/dpo": 0.00869788695126772, "losses/sft": 2.0961813926696777, "losses/total": 0.00869788695126772, "ref_logps/chosen": -28.92594337463379, "ref_logps/rejected": -32.73644256591797, "rewards/accuracies": 1.0, "rewards/chosen": -2.0112438201904297, "rewards/margins": 2.4466264247894287, "rewards/rejected": -4.457870006561279, "step": 2266 }, { "epoch": 2.14, "grad_norm": 8.451693524923806, "learning_rate": 1.0055151789319547e-07, "logps/chosen": -45.3800048828125, "logps/rejected": -84.84606170654297, "loss": 0.0965, "losses/dpo": 0.2628178298473358, "losses/sft": 1.0797560214996338, "losses/total": 0.2628178298473358, "ref_logps/chosen": -27.777427673339844, "ref_logps/rejected": -38.618717193603516, "rewards/accuracies": 1.0, "rewards/chosen": -1.7602577209472656, "rewards/margins": 2.8624773025512695, "rewards/rejected": -4.622735023498535, "step": 2267 }, { "epoch": 2.14, "grad_norm": 22.342526500518346, "learning_rate": 1.0034744038244722e-07, "logps/chosen": -47.00971221923828, "logps/rejected": -86.55609130859375, "loss": 0.2269, "losses/dpo": 1.439897060394287, "losses/sft": 2.264075756072998, "losses/total": 1.439897060394287, "ref_logps/chosen": -24.753803253173828, "ref_logps/rejected": -40.08842086791992, "rewards/accuracies": 0.9375, "rewards/chosen": -2.225590705871582, "rewards/margins": 2.4211764335632324, "rewards/rejected": -4.646767616271973, "step": 2268 }, { "epoch": 2.14, "grad_norm": 23.25123170079346, "learning_rate": 1.001435181658569e-07, "logps/chosen": -51.77416229248047, "logps/rejected": -86.44189453125, "loss": 0.3277, "losses/dpo": 0.05936424061655998, "losses/sft": 0.4401223659515381, "losses/total": 0.05936424061655998, "ref_logps/chosen": -28.360553741455078, "ref_logps/rejected": -38.66815948486328, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3413615226745605, "rewards/margins": 2.436011552810669, "rewards/rejected": -4.777373313903809, "step": 2269 }, { "epoch": 2.14, "grad_norm": 14.79362999797781, "learning_rate": 9.993975145503426e-08, "logps/chosen": -48.733543395996094, "logps/rejected": -77.1689453125, "loss": 0.1633, "losses/dpo": 0.019480876624584198, "losses/sft": 0.43245941400527954, "losses/total": 0.019480876624584198, "ref_logps/chosen": -26.808849334716797, "ref_logps/rejected": -30.594139099121094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.192469596862793, "rewards/margins": 2.465010404586792, "rewards/rejected": -4.657480239868164, "step": 2270 }, { "epoch": 2.14, "grad_norm": 9.262276130997943, "learning_rate": 9.973614046142768e-08, "logps/chosen": -39.41355895996094, "logps/rejected": -74.64805603027344, "loss": 0.1102, "losses/dpo": 0.18322418630123138, "losses/sft": 0.7197924256324768, "losses/total": 0.18322418630123138, "ref_logps/chosen": -21.68212127685547, "ref_logps/rejected": -28.84975814819336, "rewards/accuracies": 1.0, "rewards/chosen": -1.7731438875198364, "rewards/margins": 2.8066864013671875, "rewards/rejected": -4.579830169677734, "step": 2271 }, { "epoch": 2.14, "grad_norm": 15.227444635359586, "learning_rate": 9.953268539632373e-08, "logps/chosen": -41.999122619628906, "logps/rejected": -71.65292358398438, "loss": 0.1842, "losses/dpo": 0.05188501253724098, "losses/sft": 1.7199177742004395, "losses/total": 0.05188501253724098, "ref_logps/chosen": -23.957246780395508, "ref_logps/rejected": -29.917552947998047, "rewards/accuracies": 0.9375, "rewards/chosen": -1.804187297821045, "rewards/margins": 2.369349956512451, "rewards/rejected": -4.173537254333496, "step": 2272 }, { "epoch": 2.14, "grad_norm": 17.131053281650637, "learning_rate": 9.932938647084746e-08, "logps/chosen": -64.07225036621094, "logps/rejected": -95.66474151611328, "loss": 0.1784, "losses/dpo": 0.00015467133198399097, "losses/sft": 2.230900764465332, "losses/total": 0.00015467133198399097, "ref_logps/chosen": -41.25385284423828, "ref_logps/rejected": -42.83660125732422, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2818398475646973, "rewards/margins": 3.000973701477051, "rewards/rejected": -5.28281307220459, "step": 2273 }, { "epoch": 2.15, "grad_norm": 11.688376524458373, "learning_rate": 9.912624389596161e-08, "logps/chosen": -45.67418670654297, "logps/rejected": -74.45220947265625, "loss": 0.1456, "losses/dpo": 0.038387976586818695, "losses/sft": 1.5658456087112427, "losses/total": 0.038387976586818695, "ref_logps/chosen": -24.860321044921875, "ref_logps/rejected": -29.028989791870117, "rewards/accuracies": 1.0, "rewards/chosen": -2.0813868045806885, "rewards/margins": 2.460935354232788, "rewards/rejected": -4.542322158813477, "step": 2274 }, { "epoch": 2.15, "grad_norm": 17.303434369416784, "learning_rate": 9.892325788246697e-08, "logps/chosen": -38.974220275878906, "logps/rejected": -64.3918685913086, "loss": 0.2294, "losses/dpo": 0.04352192208170891, "losses/sft": 1.3222259283065796, "losses/total": 0.04352192208170891, "ref_logps/chosen": -25.4884090423584, "ref_logps/rejected": -26.41399574279785, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3485811948776245, "rewards/margins": 2.4492061138153076, "rewards/rejected": -3.7977871894836426, "step": 2275 }, { "epoch": 2.15, "grad_norm": 12.143465985829938, "learning_rate": 9.872042864100156e-08, "logps/chosen": -38.45072937011719, "logps/rejected": -81.41290283203125, "loss": 0.1917, "losses/dpo": 0.03971622884273529, "losses/sft": 1.3413797616958618, "losses/total": 0.03971622884273529, "ref_logps/chosen": -24.75694465637207, "ref_logps/rejected": -37.229129791259766, "rewards/accuracies": 0.875, "rewards/chosen": -1.3693783283233643, "rewards/margins": 3.0489985942840576, "rewards/rejected": -4.418376922607422, "step": 2276 }, { "epoch": 2.15, "grad_norm": 6.330960464336373, "learning_rate": 9.851775638204101e-08, "logps/chosen": -40.86653137207031, "logps/rejected": -86.31683349609375, "loss": 0.0612, "losses/dpo": 0.046116746962070465, "losses/sft": 1.0003730058670044, "losses/total": 0.046116746962070465, "ref_logps/chosen": -27.346664428710938, "ref_logps/rejected": -37.25704574584961, "rewards/accuracies": 1.0, "rewards/chosen": -1.3519867658615112, "rewards/margins": 3.5539915561676025, "rewards/rejected": -4.905978202819824, "step": 2277 }, { "epoch": 2.15, "grad_norm": 9.83650318121141, "learning_rate": 9.83152413158978e-08, "logps/chosen": -42.51390075683594, "logps/rejected": -110.13246154785156, "loss": 0.0954, "losses/dpo": 0.0467352494597435, "losses/sft": 0.3963724672794342, "losses/total": 0.0467352494597435, "ref_logps/chosen": -26.447317123413086, "ref_logps/rejected": -49.28016662597656, "rewards/accuracies": 1.0, "rewards/chosen": -1.6066583395004272, "rewards/margins": 4.478571891784668, "rewards/rejected": -6.085229873657227, "step": 2278 }, { "epoch": 2.15, "grad_norm": 11.749793069175714, "learning_rate": 9.811288365272144e-08, "logps/chosen": -57.44866943359375, "logps/rejected": -91.53313446044922, "loss": 0.1092, "losses/dpo": 0.08971155434846878, "losses/sft": 1.1344451904296875, "losses/total": 0.08971155434846878, "ref_logps/chosen": -32.58356857299805, "ref_logps/rejected": -36.14807891845703, "rewards/accuracies": 1.0, "rewards/chosen": -2.4865102767944336, "rewards/margins": 3.0519957542419434, "rewards/rejected": -5.538505554199219, "step": 2279 }, { "epoch": 2.15, "grad_norm": 12.641077911461997, "learning_rate": 9.791068360249819e-08, "logps/chosen": -42.86229705810547, "logps/rejected": -80.66983032226562, "loss": 0.1419, "losses/dpo": 0.05470779910683632, "losses/sft": 1.611227035522461, "losses/total": 0.05470779910683632, "ref_logps/chosen": -26.97314453125, "ref_logps/rejected": -34.804039001464844, "rewards/accuracies": 1.0, "rewards/chosen": -1.5889153480529785, "rewards/margins": 2.9976634979248047, "rewards/rejected": -4.586578845977783, "step": 2280 }, { "epoch": 2.15, "grad_norm": 10.352989049374907, "learning_rate": 9.770864137505041e-08, "logps/chosen": -53.326805114746094, "logps/rejected": -89.27940368652344, "loss": 0.1462, "losses/dpo": 0.3117276430130005, "losses/sft": 2.329155206680298, "losses/total": 0.3117276430130005, "ref_logps/chosen": -35.650516510009766, "ref_logps/rejected": -40.38536071777344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7676289081573486, "rewards/margins": 3.1217756271362305, "rewards/rejected": -4.889404773712158, "step": 2281 }, { "epoch": 2.15, "grad_norm": 8.441053578336602, "learning_rate": 9.750675718003712e-08, "logps/chosen": -44.072662353515625, "logps/rejected": -106.59440612792969, "loss": 0.0907, "losses/dpo": 0.0780394971370697, "losses/sft": 0.5805241465568542, "losses/total": 0.0780394971370697, "ref_logps/chosen": -25.203392028808594, "ref_logps/rejected": -47.80165100097656, "rewards/accuracies": 1.0, "rewards/chosen": -1.8869272470474243, "rewards/margins": 3.9923484325408936, "rewards/rejected": -5.879275321960449, "step": 2282 }, { "epoch": 2.15, "grad_norm": 12.90657066425611, "learning_rate": 9.730503122695297e-08, "logps/chosen": -45.71598815917969, "logps/rejected": -82.64043426513672, "loss": 0.1586, "losses/dpo": 0.08060836791992188, "losses/sft": 1.9827135801315308, "losses/total": 0.08060836791992188, "ref_logps/chosen": -25.630878448486328, "ref_logps/rejected": -35.185794830322266, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0085113048553467, "rewards/margins": 2.736952304840088, "rewards/rejected": -4.745463848114014, "step": 2283 }, { "epoch": 2.15, "grad_norm": 19.471101171890112, "learning_rate": 9.71034637251287e-08, "logps/chosen": -54.34114074707031, "logps/rejected": -85.51876831054688, "loss": 0.1813, "losses/dpo": 0.8217165470123291, "losses/sft": 1.5989336967468262, "losses/total": 0.8217165470123291, "ref_logps/chosen": -34.30613708496094, "ref_logps/rejected": -38.381614685058594, "rewards/accuracies": 0.875, "rewards/chosen": -2.003499746322632, "rewards/margins": 2.7102150917053223, "rewards/rejected": -4.713715553283691, "step": 2284 }, { "epoch": 2.16, "grad_norm": 11.529843991924695, "learning_rate": 9.69020548837304e-08, "logps/chosen": -41.155967712402344, "logps/rejected": -78.47367858886719, "loss": 0.1353, "losses/dpo": 0.03297772258520126, "losses/sft": 1.131345272064209, "losses/total": 0.03297772258520126, "ref_logps/chosen": -25.855226516723633, "ref_logps/rejected": -33.80516052246094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5300743579864502, "rewards/margins": 2.9367775917053223, "rewards/rejected": -4.466852188110352, "step": 2285 }, { "epoch": 2.16, "grad_norm": 9.362529558340839, "learning_rate": 9.670080491175966e-08, "logps/chosen": -41.678077697753906, "logps/rejected": -72.17430877685547, "loss": 0.1307, "losses/dpo": 0.20874036848545074, "losses/sft": 2.482130289077759, "losses/total": 0.20874036848545074, "ref_logps/chosen": -23.459209442138672, "ref_logps/rejected": -25.106517791748047, "rewards/accuracies": 1.0, "rewards/chosen": -1.8218870162963867, "rewards/margins": 2.884892463684082, "rewards/rejected": -4.706779479980469, "step": 2286 }, { "epoch": 2.16, "grad_norm": 10.758368284192768, "learning_rate": 9.649971401805327e-08, "logps/chosen": -55.36038589477539, "logps/rejected": -74.23218536376953, "loss": 0.1412, "losses/dpo": 0.09101774543523788, "losses/sft": 2.395521402359009, "losses/total": 0.09101774543523788, "ref_logps/chosen": -32.53956604003906, "ref_logps/rejected": -27.903879165649414, "rewards/accuracies": 1.0, "rewards/chosen": -2.2820820808410645, "rewards/margins": 2.3507485389709473, "rewards/rejected": -4.632830619812012, "step": 2287 }, { "epoch": 2.16, "grad_norm": 14.666936615012204, "learning_rate": 9.629878241128262e-08, "logps/chosen": -53.58887481689453, "logps/rejected": -91.60167694091797, "loss": 0.1633, "losses/dpo": 0.003932891879230738, "losses/sft": 1.6866576671600342, "losses/total": 0.003932891879230738, "ref_logps/chosen": -30.074054718017578, "ref_logps/rejected": -41.08518981933594, "rewards/accuracies": 0.875, "rewards/chosen": -2.3514819145202637, "rewards/margins": 2.700166702270508, "rewards/rejected": -5.05164909362793, "step": 2288 }, { "epoch": 2.16, "grad_norm": 8.356937236759299, "learning_rate": 9.609801029995426e-08, "logps/chosen": -39.33181381225586, "logps/rejected": -80.04535675048828, "loss": 0.1256, "losses/dpo": 0.2196580320596695, "losses/sft": 1.048388123512268, "losses/total": 0.2196580320596695, "ref_logps/chosen": -22.420143127441406, "ref_logps/rejected": -33.868831634521484, "rewards/accuracies": 0.9375, "rewards/chosen": -1.691166877746582, "rewards/margins": 2.926485538482666, "rewards/rejected": -4.617652893066406, "step": 2289 }, { "epoch": 2.16, "grad_norm": 9.808001383401127, "learning_rate": 9.589739789240883e-08, "logps/chosen": -37.799232482910156, "logps/rejected": -82.16169738769531, "loss": 0.0992, "losses/dpo": 1.3964254321763292e-05, "losses/sft": 0.5434660911560059, "losses/total": 1.3964254321763292e-05, "ref_logps/chosen": -21.668222427368164, "ref_logps/rejected": -31.51094627380371, "rewards/accuracies": 1.0, "rewards/chosen": -1.6131007671356201, "rewards/margins": 3.4519741535186768, "rewards/rejected": -5.065074920654297, "step": 2290 }, { "epoch": 2.16, "grad_norm": 9.016711605493182, "learning_rate": 9.569694539682158e-08, "logps/chosen": -54.69706726074219, "logps/rejected": -84.59507751464844, "loss": 0.0894, "losses/dpo": 0.11471153050661087, "losses/sft": 2.758104085922241, "losses/total": 0.11471153050661087, "ref_logps/chosen": -32.43153762817383, "ref_logps/rejected": -32.52476119995117, "rewards/accuracies": 1.0, "rewards/chosen": -2.226552963256836, "rewards/margins": 2.9804787635803223, "rewards/rejected": -5.207031726837158, "step": 2291 }, { "epoch": 2.16, "grad_norm": 7.999282164891963, "learning_rate": 9.549665302120144e-08, "logps/chosen": -53.69896697998047, "logps/rejected": -96.43431854248047, "loss": 0.0769, "losses/dpo": 0.00360211287625134, "losses/sft": 1.9249672889709473, "losses/total": 0.00360211287625134, "ref_logps/chosen": -27.575281143188477, "ref_logps/rejected": -40.005401611328125, "rewards/accuracies": 1.0, "rewards/chosen": -2.6123688220977783, "rewards/margins": 3.0305228233337402, "rewards/rejected": -5.642891883850098, "step": 2292 }, { "epoch": 2.16, "grad_norm": 11.782377848528812, "learning_rate": 9.529652097339167e-08, "logps/chosen": -41.07733917236328, "logps/rejected": -79.00381469726562, "loss": 0.1498, "losses/dpo": 0.006227277684956789, "losses/sft": 1.49384343624115, "losses/total": 0.006227277684956789, "ref_logps/chosen": -23.284408569335938, "ref_logps/rejected": -31.857622146606445, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7792928218841553, "rewards/margins": 2.935326337814331, "rewards/rejected": -4.714619159698486, "step": 2293 }, { "epoch": 2.16, "grad_norm": 5.475216986570258, "learning_rate": 9.509654946106862e-08, "logps/chosen": -45.12257385253906, "logps/rejected": -91.73826599121094, "loss": 0.0678, "losses/dpo": 0.0010088320123031735, "losses/sft": 2.5522730350494385, "losses/total": 0.0010088320123031735, "ref_logps/chosen": -28.09095573425293, "ref_logps/rejected": -37.22523498535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.7031619548797607, "rewards/margins": 3.748141050338745, "rewards/rejected": -5.451303005218506, "step": 2294 }, { "epoch": 2.17, "grad_norm": 8.57376212635333, "learning_rate": 9.489673869174247e-08, "logps/chosen": -38.063053131103516, "logps/rejected": -93.73212432861328, "loss": 0.0785, "losses/dpo": 0.17792554199695587, "losses/sft": 1.0983860492706299, "losses/total": 0.17792554199695587, "ref_logps/chosen": -21.521339416503906, "ref_logps/rejected": -39.20994567871094, "rewards/accuracies": 1.0, "rewards/chosen": -1.6541714668273926, "rewards/margins": 3.798046588897705, "rewards/rejected": -5.452218055725098, "step": 2295 }, { "epoch": 2.17, "grad_norm": 12.971006698300858, "learning_rate": 9.469708887275654e-08, "logps/chosen": -40.10315704345703, "logps/rejected": -77.54689025878906, "loss": 0.2045, "losses/dpo": 0.2495623528957367, "losses/sft": 1.1296131610870361, "losses/total": 0.2495623528957367, "ref_logps/chosen": -24.12929344177246, "ref_logps/rejected": -32.54499435424805, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5973864793777466, "rewards/margins": 2.9028029441833496, "rewards/rejected": -4.500189781188965, "step": 2296 }, { "epoch": 2.17, "grad_norm": 8.518563257162636, "learning_rate": 9.449760021128683e-08, "logps/chosen": -41.89512252807617, "logps/rejected": -88.87464141845703, "loss": 0.0861, "losses/dpo": 0.033109113574028015, "losses/sft": 1.3073718547821045, "losses/total": 0.033109113574028015, "ref_logps/chosen": -23.49239730834961, "ref_logps/rejected": -37.69924545288086, "rewards/accuracies": 1.0, "rewards/chosen": -1.8402721881866455, "rewards/margins": 3.2772679328918457, "rewards/rejected": -5.11754035949707, "step": 2297 }, { "epoch": 2.17, "grad_norm": 14.202957771222897, "learning_rate": 9.429827291434254e-08, "logps/chosen": -35.232486724853516, "logps/rejected": -64.50856018066406, "loss": 0.2114, "losses/dpo": 0.04216964170336723, "losses/sft": 0.3687379062175751, "losses/total": 0.04216964170336723, "ref_logps/chosen": -19.221424102783203, "ref_logps/rejected": -25.233028411865234, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6011061668395996, "rewards/margins": 2.326446771621704, "rewards/rejected": -3.9275527000427246, "step": 2298 }, { "epoch": 2.17, "grad_norm": 16.675877067067137, "learning_rate": 9.409910718876496e-08, "logps/chosen": -49.703216552734375, "logps/rejected": -85.56543731689453, "loss": 0.1838, "losses/dpo": 0.38181594014167786, "losses/sft": 1.682181715965271, "losses/total": 0.38181594014167786, "ref_logps/chosen": -28.5102481842041, "ref_logps/rejected": -34.58164978027344, "rewards/accuracies": 0.875, "rewards/chosen": -2.1192967891693115, "rewards/margins": 2.979081869125366, "rewards/rejected": -5.098378658294678, "step": 2299 }, { "epoch": 2.17, "grad_norm": 13.923741400029902, "learning_rate": 9.390010324122821e-08, "logps/chosen": -50.40496063232422, "logps/rejected": -78.6431884765625, "loss": 0.153, "losses/dpo": 0.5139957666397095, "losses/sft": 1.7263083457946777, "losses/total": 0.5139957666397095, "ref_logps/chosen": -29.021825790405273, "ref_logps/rejected": -30.398479461669922, "rewards/accuracies": 1.0, "rewards/chosen": -2.1383132934570312, "rewards/margins": 2.6861572265625, "rewards/rejected": -4.824470520019531, "step": 2300 }, { "epoch": 2.17, "grad_norm": 10.635723338978424, "learning_rate": 9.370126127823807e-08, "logps/chosen": -42.92018127441406, "logps/rejected": -90.3354721069336, "loss": 0.1409, "losses/dpo": 0.17785586416721344, "losses/sft": 2.0850510597229004, "losses/total": 0.17785586416721344, "ref_logps/chosen": -22.97719955444336, "ref_logps/rejected": -38.9228515625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9942984580993652, "rewards/margins": 3.1469640731811523, "rewards/rejected": -5.141263008117676, "step": 2301 }, { "epoch": 2.17, "grad_norm": 10.500397532830965, "learning_rate": 9.35025815061326e-08, "logps/chosen": -43.99000549316406, "logps/rejected": -75.27838134765625, "loss": 0.1317, "losses/dpo": 0.013275595381855965, "losses/sft": 1.745124340057373, "losses/total": 0.013275595381855965, "ref_logps/chosen": -23.54998779296875, "ref_logps/rejected": -28.351909637451172, "rewards/accuracies": 1.0, "rewards/chosen": -2.044001817703247, "rewards/margins": 2.6486458778381348, "rewards/rejected": -4.692647933959961, "step": 2302 }, { "epoch": 2.17, "grad_norm": 16.393512015556897, "learning_rate": 9.330406413108127e-08, "logps/chosen": -40.699398040771484, "logps/rejected": -81.609375, "loss": 0.1469, "losses/dpo": 0.14034073054790497, "losses/sft": 1.111220359802246, "losses/total": 0.14034073054790497, "ref_logps/chosen": -23.102142333984375, "ref_logps/rejected": -34.257179260253906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7597256898880005, "rewards/margins": 2.9754934310913086, "rewards/rejected": -4.7352190017700195, "step": 2303 }, { "epoch": 2.17, "grad_norm": 7.505559021711025, "learning_rate": 9.310570935908523e-08, "logps/chosen": -51.88289260864258, "logps/rejected": -86.70557403564453, "loss": 0.0772, "losses/dpo": 0.0293138287961483, "losses/sft": 2.690086841583252, "losses/total": 0.0293138287961483, "ref_logps/chosen": -32.1309814453125, "ref_logps/rejected": -36.339599609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.9751912355422974, "rewards/margins": 3.0614066123962402, "rewards/rejected": -5.036597728729248, "step": 2304 }, { "epoch": 2.17, "grad_norm": 7.665327127259155, "learning_rate": 9.290751739597689e-08, "logps/chosen": -35.85095977783203, "logps/rejected": -73.96261596679688, "loss": 0.1051, "losses/dpo": 0.004990467336028814, "losses/sft": 0.23312385380268097, "losses/total": 0.004990467336028814, "ref_logps/chosen": -20.99015235900879, "ref_logps/rejected": -29.40886116027832, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4860806465148926, "rewards/margins": 2.9692955017089844, "rewards/rejected": -4.455376148223877, "step": 2305 }, { "epoch": 2.18, "grad_norm": 6.800859622198287, "learning_rate": 9.270948844741949e-08, "logps/chosen": -58.905906677246094, "logps/rejected": -91.69227600097656, "loss": 0.0743, "losses/dpo": 0.02615065686404705, "losses/sft": 1.306505560874939, "losses/total": 0.02615065686404705, "ref_logps/chosen": -34.434425354003906, "ref_logps/rejected": -36.6334114074707, "rewards/accuracies": 1.0, "rewards/chosen": -2.447148561477661, "rewards/margins": 3.0587379932403564, "rewards/rejected": -5.505887031555176, "step": 2306 }, { "epoch": 2.18, "grad_norm": 7.098458468797135, "learning_rate": 9.251162271890744e-08, "logps/chosen": -54.442359924316406, "logps/rejected": -103.37690734863281, "loss": 0.0765, "losses/dpo": 0.02607535384595394, "losses/sft": 1.6763050556182861, "losses/total": 0.02607535384595394, "ref_logps/chosen": -32.105140686035156, "ref_logps/rejected": -46.7886962890625, "rewards/accuracies": 1.0, "rewards/chosen": -2.23372220993042, "rewards/margins": 3.4250996112823486, "rewards/rejected": -5.658822059631348, "step": 2307 }, { "epoch": 2.18, "grad_norm": 9.616566074777266, "learning_rate": 9.231392041576544e-08, "logps/chosen": -40.67315673828125, "logps/rejected": -90.57272338867188, "loss": 0.1047, "losses/dpo": 0.34304651618003845, "losses/sft": 0.7975167632102966, "losses/total": 0.34304651618003845, "ref_logps/chosen": -21.13884735107422, "ref_logps/rejected": -39.5004768371582, "rewards/accuracies": 1.0, "rewards/chosen": -1.9534308910369873, "rewards/margins": 3.1537938117980957, "rewards/rejected": -5.107224464416504, "step": 2308 }, { "epoch": 2.18, "grad_norm": 18.42602806053875, "learning_rate": 9.211638174314893e-08, "logps/chosen": -46.36473083496094, "logps/rejected": -76.20829010009766, "loss": 0.241, "losses/dpo": 0.7205797433853149, "losses/sft": 1.0464842319488525, "losses/total": 0.7205797433853149, "ref_logps/chosen": -26.82806396484375, "ref_logps/rejected": -30.779743194580078, "rewards/accuracies": 0.875, "rewards/chosen": -1.9536668062210083, "rewards/margins": 2.5891876220703125, "rewards/rejected": -4.542854309082031, "step": 2309 }, { "epoch": 2.18, "grad_norm": 15.914615264954021, "learning_rate": 9.19190069060432e-08, "logps/chosen": -39.73893356323242, "logps/rejected": -53.409034729003906, "loss": 0.2504, "losses/dpo": 0.06557842344045639, "losses/sft": 1.2213945388793945, "losses/total": 0.06557842344045639, "ref_logps/chosen": -26.254688262939453, "ref_logps/rejected": -22.047744750976562, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3484244346618652, "rewards/margins": 1.787704586982727, "rewards/rejected": -3.136129140853882, "step": 2310 }, { "epoch": 2.18, "grad_norm": 11.84588775658531, "learning_rate": 9.172179610926395e-08, "logps/chosen": -47.404720306396484, "logps/rejected": -71.78424072265625, "loss": 0.1486, "losses/dpo": 0.1734827160835266, "losses/sft": 2.7816309928894043, "losses/total": 0.1734827160835266, "ref_logps/chosen": -24.213176727294922, "ref_logps/rejected": -25.58399200439453, "rewards/accuracies": 1.0, "rewards/chosen": -2.3191545009613037, "rewards/margins": 2.300869941711426, "rewards/rejected": -4.620024681091309, "step": 2311 }, { "epoch": 2.18, "grad_norm": 13.26105408791042, "learning_rate": 9.15247495574562e-08, "logps/chosen": -49.547821044921875, "logps/rejected": -84.70402526855469, "loss": 0.1128, "losses/dpo": 0.0833454355597496, "losses/sft": 1.8438447713851929, "losses/total": 0.0833454355597496, "ref_logps/chosen": -27.553070068359375, "ref_logps/rejected": -33.95627975463867, "rewards/accuracies": 1.0, "rewards/chosen": -2.1994752883911133, "rewards/margins": 2.8752996921539307, "rewards/rejected": -5.074774742126465, "step": 2312 }, { "epoch": 2.18, "grad_norm": 11.388227298468477, "learning_rate": 9.132786745509488e-08, "logps/chosen": -41.44120407104492, "logps/rejected": -67.24870300292969, "loss": 0.2036, "losses/dpo": 0.03153174743056297, "losses/sft": 1.58273446559906, "losses/total": 0.03153174743056297, "ref_logps/chosen": -26.613130569458008, "ref_logps/rejected": -32.7054443359375, "rewards/accuracies": 0.875, "rewards/chosen": -1.4828076362609863, "rewards/margins": 1.971518635749817, "rewards/rejected": -3.4543261528015137, "step": 2313 }, { "epoch": 2.18, "grad_norm": 15.990282666423868, "learning_rate": 9.11311500064842e-08, "logps/chosen": -52.24189758300781, "logps/rejected": -65.22584533691406, "loss": 0.2948, "losses/dpo": 0.09940177947282791, "losses/sft": 2.177506685256958, "losses/total": 0.09940177947282791, "ref_logps/chosen": -34.30650329589844, "ref_logps/rejected": -28.829605102539062, "rewards/accuracies": 0.875, "rewards/chosen": -1.7935388088226318, "rewards/margins": 1.8460848331451416, "rewards/rejected": -3.6396236419677734, "step": 2314 }, { "epoch": 2.18, "grad_norm": 14.472990732003826, "learning_rate": 9.093459741575729e-08, "logps/chosen": -41.93412399291992, "logps/rejected": -84.8960189819336, "loss": 0.1408, "losses/dpo": 0.07585930824279785, "losses/sft": 0.152274951338768, "losses/total": 0.07585930824279785, "ref_logps/chosen": -23.572303771972656, "ref_logps/rejected": -36.919918060302734, "rewards/accuracies": 1.0, "rewards/chosen": -1.836181879043579, "rewards/margins": 2.96142840385437, "rewards/rejected": -4.797610282897949, "step": 2315 }, { "epoch": 2.18, "grad_norm": 13.561426574925317, "learning_rate": 9.073820988687655e-08, "logps/chosen": -49.449981689453125, "logps/rejected": -91.18669128417969, "loss": 0.1257, "losses/dpo": 0.1306702345609665, "losses/sft": 0.5854824185371399, "losses/total": 0.1306702345609665, "ref_logps/chosen": -26.57770347595215, "ref_logps/rejected": -38.29747009277344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2872276306152344, "rewards/margins": 3.0016942024230957, "rewards/rejected": -5.288921356201172, "step": 2316 }, { "epoch": 2.19, "grad_norm": 12.138381631537007, "learning_rate": 9.05419876236328e-08, "logps/chosen": -36.67090606689453, "logps/rejected": -83.66153717041016, "loss": 0.1459, "losses/dpo": 0.12123847007751465, "losses/sft": 1.7860803604125977, "losses/total": 0.12123847007751465, "ref_logps/chosen": -21.900524139404297, "ref_logps/rejected": -40.02154541015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4770383834838867, "rewards/margins": 2.886960506439209, "rewards/rejected": -4.363999366760254, "step": 2317 }, { "epoch": 2.19, "grad_norm": 12.294626615331186, "learning_rate": 9.034593082964559e-08, "logps/chosen": -49.87745666503906, "logps/rejected": -91.0318832397461, "loss": 0.1266, "losses/dpo": 0.03185887634754181, "losses/sft": 1.6311466693878174, "losses/total": 0.03185887634754181, "ref_logps/chosen": -28.896835327148438, "ref_logps/rejected": -39.16761016845703, "rewards/accuracies": 1.0, "rewards/chosen": -2.098062515258789, "rewards/margins": 3.088364601135254, "rewards/rejected": -5.186427116394043, "step": 2318 }, { "epoch": 2.19, "grad_norm": 9.164007267941075, "learning_rate": 9.015003970836249e-08, "logps/chosen": -48.58573913574219, "logps/rejected": -80.5652847290039, "loss": 0.1177, "losses/dpo": 0.10539907217025757, "losses/sft": 1.1245936155319214, "losses/total": 0.10539907217025757, "ref_logps/chosen": -28.562593460083008, "ref_logps/rejected": -34.45570755004883, "rewards/accuracies": 1.0, "rewards/chosen": -2.002314567565918, "rewards/margins": 2.608643054962158, "rewards/rejected": -4.610957622528076, "step": 2319 }, { "epoch": 2.19, "grad_norm": 11.64947394763026, "learning_rate": 8.995431446305953e-08, "logps/chosen": -40.589176177978516, "logps/rejected": -86.49176025390625, "loss": 0.1216, "losses/dpo": 0.00697792274877429, "losses/sft": 1.755894660949707, "losses/total": 0.00697792274877429, "ref_logps/chosen": -19.930240631103516, "ref_logps/rejected": -31.008628845214844, "rewards/accuracies": 1.0, "rewards/chosen": -2.0658938884735107, "rewards/margins": 3.482419490814209, "rewards/rejected": -5.548313617706299, "step": 2320 }, { "epoch": 2.19, "grad_norm": 11.685083426408607, "learning_rate": 8.975875529684021e-08, "logps/chosen": -41.03142547607422, "logps/rejected": -83.91868591308594, "loss": 0.1166, "losses/dpo": 0.010401955805718899, "losses/sft": 0.800810694694519, "losses/total": 0.010401955805718899, "ref_logps/chosen": -23.951641082763672, "ref_logps/rejected": -34.68817901611328, "rewards/accuracies": 1.0, "rewards/chosen": -1.7079782485961914, "rewards/margins": 3.2150719165802, "rewards/rejected": -4.923050403594971, "step": 2321 }, { "epoch": 2.19, "grad_norm": 11.053672455400452, "learning_rate": 8.956336241263599e-08, "logps/chosen": -43.76572799682617, "logps/rejected": -76.18287658691406, "loss": 0.1263, "losses/dpo": 0.09336508810520172, "losses/sft": 1.355105996131897, "losses/total": 0.09336508810520172, "ref_logps/chosen": -29.08004379272461, "ref_logps/rejected": -35.222015380859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4685683250427246, "rewards/margins": 2.627518653869629, "rewards/rejected": -4.0960869789123535, "step": 2322 }, { "epoch": 2.19, "grad_norm": 9.495085315635558, "learning_rate": 8.93681360132057e-08, "logps/chosen": -40.10436248779297, "logps/rejected": -76.61000061035156, "loss": 0.1111, "losses/dpo": 0.0715000331401825, "losses/sft": 1.9449025392532349, "losses/total": 0.0715000331401825, "ref_logps/chosen": -23.467506408691406, "ref_logps/rejected": -30.719154357910156, "rewards/accuracies": 1.0, "rewards/chosen": -1.6636857986450195, "rewards/margins": 2.925398826599121, "rewards/rejected": -4.589084625244141, "step": 2323 }, { "epoch": 2.19, "grad_norm": 14.587901207174077, "learning_rate": 8.917307630113527e-08, "logps/chosen": -54.3773307800293, "logps/rejected": -86.08594512939453, "loss": 0.192, "losses/dpo": 0.008107288740575314, "losses/sft": 2.7903034687042236, "losses/total": 0.008107288740575314, "ref_logps/chosen": -31.09047508239746, "ref_logps/rejected": -37.159584045410156, "rewards/accuracies": 0.9375, "rewards/chosen": -2.328685760498047, "rewards/margins": 2.563950538635254, "rewards/rejected": -4.892636299133301, "step": 2324 }, { "epoch": 2.19, "grad_norm": 11.764210904006497, "learning_rate": 8.89781834788379e-08, "logps/chosen": -41.657188415527344, "logps/rejected": -87.8310317993164, "loss": 0.1379, "losses/dpo": 0.5864167213439941, "losses/sft": 1.5826382637023926, "losses/total": 0.5864167213439941, "ref_logps/chosen": -22.559934616088867, "ref_logps/rejected": -36.64807891845703, "rewards/accuracies": 1.0, "rewards/chosen": -1.9097254276275635, "rewards/margins": 3.2085695266723633, "rewards/rejected": -5.118295192718506, "step": 2325 }, { "epoch": 2.19, "grad_norm": 10.51718317577919, "learning_rate": 8.878345774855331e-08, "logps/chosen": -49.54386520385742, "logps/rejected": -85.72610473632812, "loss": 0.1103, "losses/dpo": 0.004820036236196756, "losses/sft": 0.5917369723320007, "losses/total": 0.004820036236196756, "ref_logps/chosen": -32.6445198059082, "ref_logps/rejected": -36.36201477050781, "rewards/accuracies": 0.9375, "rewards/chosen": -1.689934492111206, "rewards/margins": 3.246474504470825, "rewards/rejected": -4.936408519744873, "step": 2326 }, { "epoch": 2.2, "grad_norm": 11.868303093773806, "learning_rate": 8.858889931234817e-08, "logps/chosen": -42.650211334228516, "logps/rejected": -81.3026123046875, "loss": 0.1134, "losses/dpo": 0.19233879446983337, "losses/sft": 2.257504940032959, "losses/total": 0.19233879446983337, "ref_logps/chosen": -22.97986602783203, "ref_logps/rejected": -33.13606262207031, "rewards/accuracies": 1.0, "rewards/chosen": -1.9670345783233643, "rewards/margins": 2.8496201038360596, "rewards/rejected": -4.816654205322266, "step": 2327 }, { "epoch": 2.2, "grad_norm": 12.143068678100864, "learning_rate": 8.839450837211521e-08, "logps/chosen": -44.494850158691406, "logps/rejected": -98.35244750976562, "loss": 0.1401, "losses/dpo": 0.04296213760972023, "losses/sft": 1.0867230892181396, "losses/total": 0.04296213760972023, "ref_logps/chosen": -28.04985237121582, "ref_logps/rejected": -44.51864242553711, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6444995403289795, "rewards/margins": 3.7388806343078613, "rewards/rejected": -5.38338041305542, "step": 2328 }, { "epoch": 2.2, "grad_norm": 8.549927942928374, "learning_rate": 8.820028512957362e-08, "logps/chosen": -43.39472961425781, "logps/rejected": -70.291015625, "loss": 0.0945, "losses/dpo": 0.021351175382733345, "losses/sft": 1.1412537097930908, "losses/total": 0.021351175382733345, "ref_logps/chosen": -30.41812515258789, "ref_logps/rejected": -28.11444854736328, "rewards/accuracies": 1.0, "rewards/chosen": -1.2976598739624023, "rewards/margins": 2.9199962615966797, "rewards/rejected": -4.217656135559082, "step": 2329 }, { "epoch": 2.2, "grad_norm": 8.383384415067328, "learning_rate": 8.800622978626851e-08, "logps/chosen": -54.37509536743164, "logps/rejected": -99.22549438476562, "loss": 0.0713, "losses/dpo": 0.1408928632736206, "losses/sft": 2.575016975402832, "losses/total": 0.1408928632736206, "ref_logps/chosen": -32.219337463378906, "ref_logps/rejected": -39.37085723876953, "rewards/accuracies": 1.0, "rewards/chosen": -2.2155754566192627, "rewards/margins": 3.7698874473571777, "rewards/rejected": -5.9854631423950195, "step": 2330 }, { "epoch": 2.2, "grad_norm": 14.950132616772162, "learning_rate": 8.781234254357056e-08, "logps/chosen": -38.00943374633789, "logps/rejected": -68.89897155761719, "loss": 0.2215, "losses/dpo": 0.025201981887221336, "losses/sft": 0.7207736968994141, "losses/total": 0.025201981887221336, "ref_logps/chosen": -20.510242462158203, "ref_logps/rejected": -28.7283992767334, "rewards/accuracies": 1.0, "rewards/chosen": -1.7499191761016846, "rewards/margins": 2.2671382427215576, "rewards/rejected": -4.017057418823242, "step": 2331 }, { "epoch": 2.2, "grad_norm": 11.39316471873883, "learning_rate": 8.761862360267635e-08, "logps/chosen": -51.9931640625, "logps/rejected": -97.65374755859375, "loss": 0.1569, "losses/dpo": 0.008472473360598087, "losses/sft": 0.8662149906158447, "losses/total": 0.008472473360598087, "ref_logps/chosen": -28.696533203125, "ref_logps/rejected": -41.24634552001953, "rewards/accuracies": 0.9375, "rewards/chosen": -2.329663038253784, "rewards/margins": 3.311077117919922, "rewards/rejected": -5.640740394592285, "step": 2332 }, { "epoch": 2.2, "grad_norm": 6.748006838854551, "learning_rate": 8.742507316460745e-08, "logps/chosen": -39.09984588623047, "logps/rejected": -87.27787780761719, "loss": 0.0799, "losses/dpo": 0.03712639957666397, "losses/sft": 1.751347303390503, "losses/total": 0.03712639957666397, "ref_logps/chosen": -19.77364730834961, "ref_logps/rejected": -35.825340270996094, "rewards/accuracies": 1.0, "rewards/chosen": -1.9326200485229492, "rewards/margins": 3.2126338481903076, "rewards/rejected": -5.145254135131836, "step": 2333 }, { "epoch": 2.2, "grad_norm": 6.699287896961989, "learning_rate": 8.723169143021097e-08, "logps/chosen": -51.861793518066406, "logps/rejected": -93.61497497558594, "loss": 0.077, "losses/dpo": 0.041411470621824265, "losses/sft": 2.4571456909179688, "losses/total": 0.041411470621824265, "ref_logps/chosen": -27.11573028564453, "ref_logps/rejected": -38.77467346191406, "rewards/accuracies": 1.0, "rewards/chosen": -2.474606513977051, "rewards/margins": 3.0094242095947266, "rewards/rejected": -5.484030723571777, "step": 2334 }, { "epoch": 2.2, "grad_norm": 3.640500542831191, "learning_rate": 8.70384786001585e-08, "logps/chosen": -45.44734191894531, "logps/rejected": -109.542724609375, "loss": 0.03, "losses/dpo": 0.21546754240989685, "losses/sft": 0.786060631275177, "losses/total": 0.21546754240989685, "ref_logps/chosen": -26.44692039489746, "ref_logps/rejected": -45.69999313354492, "rewards/accuracies": 1.0, "rewards/chosen": -1.9000424146652222, "rewards/margins": 4.484230041503906, "rewards/rejected": -6.384272575378418, "step": 2335 }, { "epoch": 2.2, "grad_norm": 12.924291647507776, "learning_rate": 8.684543487494683e-08, "logps/chosen": -43.36237335205078, "logps/rejected": -85.29972839355469, "loss": 0.1842, "losses/dpo": 0.0031821681186556816, "losses/sft": 1.3383070230484009, "losses/total": 0.0031821681186556816, "ref_logps/chosen": -25.19538116455078, "ref_logps/rejected": -36.747474670410156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.816699504852295, "rewards/margins": 3.0385258197784424, "rewards/rejected": -4.855225563049316, "step": 2336 }, { "epoch": 2.2, "grad_norm": 11.444513285365613, "learning_rate": 8.665256045489685e-08, "logps/chosen": -42.75412368774414, "logps/rejected": -82.72666931152344, "loss": 0.1049, "losses/dpo": 0.0143513735383749, "losses/sft": 1.589110016822815, "losses/total": 0.0143513735383749, "ref_logps/chosen": -22.15102767944336, "ref_logps/rejected": -32.97490310668945, "rewards/accuracies": 1.0, "rewards/chosen": -2.060309886932373, "rewards/margins": 2.914867401123047, "rewards/rejected": -4.975176811218262, "step": 2337 }, { "epoch": 2.21, "grad_norm": 14.894077505971989, "learning_rate": 8.645985554015403e-08, "logps/chosen": -54.83761215209961, "logps/rejected": -86.92826080322266, "loss": 0.1749, "losses/dpo": 0.024551259353756905, "losses/sft": 1.2139889001846313, "losses/total": 0.024551259353756905, "ref_logps/chosen": -32.203643798828125, "ref_logps/rejected": -36.332069396972656, "rewards/accuracies": 0.9375, "rewards/chosen": -2.263396739959717, "rewards/margins": 2.796222448348999, "rewards/rejected": -5.059618949890137, "step": 2338 }, { "epoch": 2.21, "grad_norm": 9.397292470438076, "learning_rate": 8.626732033068799e-08, "logps/chosen": -42.83110046386719, "logps/rejected": -96.61054992675781, "loss": 0.0765, "losses/dpo": 0.024828681722283363, "losses/sft": 1.303725004196167, "losses/total": 0.024828681722283363, "ref_logps/chosen": -26.07196044921875, "ref_logps/rejected": -42.310829162597656, "rewards/accuracies": 1.0, "rewards/chosen": -1.6759141683578491, "rewards/margins": 3.7540574073791504, "rewards/rejected": -5.429971694946289, "step": 2339 }, { "epoch": 2.21, "grad_norm": 8.15057005575329, "learning_rate": 8.607495502629192e-08, "logps/chosen": -48.94721984863281, "logps/rejected": -76.72650909423828, "loss": 0.0956, "losses/dpo": 0.04888176918029785, "losses/sft": 2.302863121032715, "losses/total": 0.04888176918029785, "ref_logps/chosen": -30.215293884277344, "ref_logps/rejected": -29.498254776000977, "rewards/accuracies": 1.0, "rewards/chosen": -1.873192548751831, "rewards/margins": 2.849632740020752, "rewards/rejected": -4.722825050354004, "step": 2340 }, { "epoch": 2.21, "grad_norm": 20.913451775655037, "learning_rate": 8.588275982658311e-08, "logps/chosen": -46.92582702636719, "logps/rejected": -74.96311950683594, "loss": 0.2087, "losses/dpo": 0.017826568335294724, "losses/sft": 2.6574554443359375, "losses/total": 0.017826568335294724, "ref_logps/chosen": -27.75857162475586, "ref_logps/rejected": -28.537906646728516, "rewards/accuracies": 0.875, "rewards/chosen": -1.916725754737854, "rewards/margins": 2.725795269012451, "rewards/rejected": -4.642521381378174, "step": 2341 }, { "epoch": 2.21, "grad_norm": 9.190507725813132, "learning_rate": 8.569073493100193e-08, "logps/chosen": -36.628517150878906, "logps/rejected": -72.0887451171875, "loss": 0.1095, "losses/dpo": 0.04564071074128151, "losses/sft": 1.3148151636123657, "losses/total": 0.04564071074128151, "ref_logps/chosen": -19.864219665527344, "ref_logps/rejected": -27.860872268676758, "rewards/accuracies": 1.0, "rewards/chosen": -1.676430106163025, "rewards/margins": 2.7463574409484863, "rewards/rejected": -4.422787666320801, "step": 2342 }, { "epoch": 2.21, "grad_norm": 8.951228776925898, "learning_rate": 8.54988805388124e-08, "logps/chosen": -42.143402099609375, "logps/rejected": -75.32247924804688, "loss": 0.1194, "losses/dpo": 0.0668647363781929, "losses/sft": 1.7714217901229858, "losses/total": 0.0668647363781929, "ref_logps/chosen": -26.72049903869629, "ref_logps/rejected": -27.854663848876953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.542290210723877, "rewards/margins": 3.20449161529541, "rewards/rejected": -4.746781826019287, "step": 2343 }, { "epoch": 2.21, "grad_norm": 18.260048024919225, "learning_rate": 8.530719684910129e-08, "logps/chosen": -44.364234924316406, "logps/rejected": -69.99848937988281, "loss": 0.2312, "losses/dpo": 0.2793322205543518, "losses/sft": 0.6820459365844727, "losses/total": 0.2793322205543518, "ref_logps/chosen": -24.731225967407227, "ref_logps/rejected": -29.371051788330078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9633009433746338, "rewards/margins": 2.099442958831787, "rewards/rejected": -4.062744140625, "step": 2344 }, { "epoch": 2.21, "grad_norm": 16.23022471189402, "learning_rate": 8.511568406077852e-08, "logps/chosen": -43.218292236328125, "logps/rejected": -73.65396118164062, "loss": 0.2383, "losses/dpo": 0.16503793001174927, "losses/sft": 1.0692516565322876, "losses/total": 0.16503793001174927, "ref_logps/chosen": -22.641456604003906, "ref_logps/rejected": -30.145553588867188, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0576837062835693, "rewards/margins": 2.2931571006774902, "rewards/rejected": -4.3508405685424805, "step": 2345 }, { "epoch": 2.21, "grad_norm": 14.484144703754204, "learning_rate": 8.492434237257635e-08, "logps/chosen": -58.77733612060547, "logps/rejected": -99.2030258178711, "loss": 0.1511, "losses/dpo": 0.01636129803955555, "losses/sft": 3.121086359024048, "losses/total": 0.01636129803955555, "ref_logps/chosen": -36.95500564575195, "ref_logps/rejected": -41.28485107421875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1822335720062256, "rewards/margins": 3.609584331512451, "rewards/rejected": -5.791817665100098, "step": 2346 }, { "epoch": 2.21, "grad_norm": 10.208060919925208, "learning_rate": 8.473317198304977e-08, "logps/chosen": -42.039344787597656, "logps/rejected": -100.15824890136719, "loss": 0.0944, "losses/dpo": 0.016829775646328926, "losses/sft": 2.236762046813965, "losses/total": 0.016829775646328926, "ref_logps/chosen": -22.70730972290039, "ref_logps/rejected": -42.034454345703125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.933203935623169, "rewards/margins": 3.8791754245758057, "rewards/rejected": -5.812379360198975, "step": 2347 }, { "epoch": 2.22, "grad_norm": 13.411644831708486, "learning_rate": 8.454217309057596e-08, "logps/chosen": -46.20165252685547, "logps/rejected": -83.4698715209961, "loss": 0.2061, "losses/dpo": 0.2330167293548584, "losses/sft": 1.3093169927597046, "losses/total": 0.2330167293548584, "ref_logps/chosen": -25.743492126464844, "ref_logps/rejected": -33.7563362121582, "rewards/accuracies": 0.9375, "rewards/chosen": -2.045815944671631, "rewards/margins": 2.925537586212158, "rewards/rejected": -4.971353530883789, "step": 2348 }, { "epoch": 2.22, "grad_norm": 10.64424857358819, "learning_rate": 8.435134589335397e-08, "logps/chosen": -44.34653091430664, "logps/rejected": -75.83753967285156, "loss": 0.1439, "losses/dpo": 0.03548957034945488, "losses/sft": 1.6667158603668213, "losses/total": 0.03548957034945488, "ref_logps/chosen": -23.519285202026367, "ref_logps/rejected": -29.09332847595215, "rewards/accuracies": 1.0, "rewards/chosen": -2.0827248096466064, "rewards/margins": 2.5916972160339355, "rewards/rejected": -4.674422264099121, "step": 2349 }, { "epoch": 2.22, "grad_norm": 8.440484529622553, "learning_rate": 8.41606905894049e-08, "logps/chosen": -52.591041564941406, "logps/rejected": -87.89804077148438, "loss": 0.0798, "losses/dpo": 0.14734376966953278, "losses/sft": 2.3361384868621826, "losses/total": 0.14734376966953278, "ref_logps/chosen": -29.628952026367188, "ref_logps/rejected": -35.50071716308594, "rewards/accuracies": 1.0, "rewards/chosen": -2.2962090969085693, "rewards/margins": 2.943523406982422, "rewards/rejected": -5.239732265472412, "step": 2350 }, { "epoch": 2.22, "grad_norm": 20.645020291203732, "learning_rate": 8.397020737657123e-08, "logps/chosen": -43.358848571777344, "logps/rejected": -82.37478637695312, "loss": 0.1955, "losses/dpo": 0.02277899906039238, "losses/sft": 0.7413766980171204, "losses/total": 0.02277899906039238, "ref_logps/chosen": -21.935848236083984, "ref_logps/rejected": -33.658897399902344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1423001289367676, "rewards/margins": 2.7292890548706055, "rewards/rejected": -4.871588706970215, "step": 2351 }, { "epoch": 2.22, "grad_norm": 12.91205972638826, "learning_rate": 8.377989645251718e-08, "logps/chosen": -44.244083404541016, "logps/rejected": -75.51408386230469, "loss": 0.1377, "losses/dpo": 0.25694549083709717, "losses/sft": 0.8145913481712341, "losses/total": 0.25694549083709717, "ref_logps/chosen": -25.842239379882812, "ref_logps/rejected": -30.261768341064453, "rewards/accuracies": 1.0, "rewards/chosen": -1.8401844501495361, "rewards/margins": 2.685046672821045, "rewards/rejected": -4.52523136138916, "step": 2352 }, { "epoch": 2.22, "grad_norm": 9.097840341410256, "learning_rate": 8.358975801472784e-08, "logps/chosen": -44.75466537475586, "logps/rejected": -80.78216552734375, "loss": 0.1127, "losses/dpo": 0.04170437902212143, "losses/sft": 0.4192425012588501, "losses/total": 0.04170437902212143, "ref_logps/chosen": -27.66936492919922, "ref_logps/rejected": -31.556472778320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.708530068397522, "rewards/margins": 3.2140393257141113, "rewards/rejected": -4.922569274902344, "step": 2353 }, { "epoch": 2.22, "grad_norm": 14.220525656397296, "learning_rate": 8.339979226050963e-08, "logps/chosen": -46.15920639038086, "logps/rejected": -68.82672119140625, "loss": 0.1672, "losses/dpo": 0.07867621630430222, "losses/sft": 1.1151442527770996, "losses/total": 0.07867621630430222, "ref_logps/chosen": -28.206008911132812, "ref_logps/rejected": -23.666519165039062, "rewards/accuracies": 0.875, "rewards/chosen": -1.7953195571899414, "rewards/margins": 2.720700740814209, "rewards/rejected": -4.516019821166992, "step": 2354 }, { "epoch": 2.22, "grad_norm": 9.900117142209114, "learning_rate": 8.32099993869895e-08, "logps/chosen": -47.028411865234375, "logps/rejected": -82.56153869628906, "loss": 0.1029, "losses/dpo": 0.1456521600484848, "losses/sft": 1.0124304294586182, "losses/total": 0.1456521600484848, "ref_logps/chosen": -28.60712242126465, "ref_logps/rejected": -34.771453857421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8421287536621094, "rewards/margins": 2.9368791580200195, "rewards/rejected": -4.779007911682129, "step": 2355 }, { "epoch": 2.22, "grad_norm": 10.702536697369094, "learning_rate": 8.302037959111519e-08, "logps/chosen": -46.84947204589844, "logps/rejected": -77.13580322265625, "loss": 0.1169, "losses/dpo": 0.018130401149392128, "losses/sft": 0.9609178900718689, "losses/total": 0.018130401149392128, "ref_logps/chosen": -25.898836135864258, "ref_logps/rejected": -31.179950714111328, "rewards/accuracies": 1.0, "rewards/chosen": -2.0950636863708496, "rewards/margins": 2.500521183013916, "rewards/rejected": -4.595584869384766, "step": 2356 }, { "epoch": 2.22, "grad_norm": 8.122546650908829, "learning_rate": 8.283093306965486e-08, "logps/chosen": -50.50022888183594, "logps/rejected": -85.39930725097656, "loss": 0.1006, "losses/dpo": 0.10726267844438553, "losses/sft": 1.6896758079528809, "losses/total": 0.10726267844438553, "ref_logps/chosen": -26.511720657348633, "ref_logps/rejected": -33.48474884033203, "rewards/accuracies": 1.0, "rewards/chosen": -2.398850917816162, "rewards/margins": 2.792604923248291, "rewards/rejected": -5.191455841064453, "step": 2357 }, { "epoch": 2.22, "grad_norm": 11.619713179268468, "learning_rate": 8.264166001919662e-08, "logps/chosen": -50.40601348876953, "logps/rejected": -87.94591522216797, "loss": 0.1151, "losses/dpo": 0.03715772181749344, "losses/sft": 1.214793086051941, "losses/total": 0.03715772181749344, "ref_logps/chosen": -32.73052215576172, "ref_logps/rejected": -36.55631637573242, "rewards/accuracies": 1.0, "rewards/chosen": -1.7675490379333496, "rewards/margins": 3.371410846710205, "rewards/rejected": -5.138959884643555, "step": 2358 }, { "epoch": 2.23, "grad_norm": 9.92000789613952, "learning_rate": 8.245256063614888e-08, "logps/chosen": -40.410980224609375, "logps/rejected": -86.01213073730469, "loss": 0.1346, "losses/dpo": 0.06240982934832573, "losses/sft": 0.6143261194229126, "losses/total": 0.06240982934832573, "ref_logps/chosen": -20.3916072845459, "ref_logps/rejected": -34.85695266723633, "rewards/accuracies": 1.0, "rewards/chosen": -2.0019371509552, "rewards/margins": 3.1135807037353516, "rewards/rejected": -5.115517616271973, "step": 2359 }, { "epoch": 2.23, "grad_norm": 11.137246185147504, "learning_rate": 8.226363511673954e-08, "logps/chosen": -44.47901153564453, "logps/rejected": -85.75287628173828, "loss": 0.1194, "losses/dpo": 0.023394634947180748, "losses/sft": 1.273769497871399, "losses/total": 0.023394634947180748, "ref_logps/chosen": -25.572816848754883, "ref_logps/rejected": -35.75577163696289, "rewards/accuracies": 1.0, "rewards/chosen": -1.8906195163726807, "rewards/margins": 3.109090805053711, "rewards/rejected": -4.9997100830078125, "step": 2360 }, { "epoch": 2.23, "grad_norm": 10.991278932138762, "learning_rate": 8.207488365701631e-08, "logps/chosen": -49.70770263671875, "logps/rejected": -70.85233306884766, "loss": 0.143, "losses/dpo": 0.08783113956451416, "losses/sft": 1.518804669380188, "losses/total": 0.08783113956451416, "ref_logps/chosen": -26.761531829833984, "ref_logps/rejected": -24.808452606201172, "rewards/accuracies": 1.0, "rewards/chosen": -2.294617176055908, "rewards/margins": 2.3097708225250244, "rewards/rejected": -4.604388236999512, "step": 2361 }, { "epoch": 2.23, "grad_norm": 16.368511179665052, "learning_rate": 8.188630645284621e-08, "logps/chosen": -55.98023223876953, "logps/rejected": -92.50224304199219, "loss": 0.1388, "losses/dpo": 0.1349620372056961, "losses/sft": 0.9725703001022339, "losses/total": 0.1349620372056961, "ref_logps/chosen": -29.80296516418457, "ref_logps/rejected": -37.36064529418945, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6177268028259277, "rewards/margins": 2.8964338302612305, "rewards/rejected": -5.514160633087158, "step": 2362 }, { "epoch": 2.23, "grad_norm": 11.304056354631426, "learning_rate": 8.169790369991547e-08, "logps/chosen": -44.924888610839844, "logps/rejected": -82.62913513183594, "loss": 0.1306, "losses/dpo": 0.009368150494992733, "losses/sft": 1.2639442682266235, "losses/total": 0.009368150494992733, "ref_logps/chosen": -26.043901443481445, "ref_logps/rejected": -34.9957275390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.8880987167358398, "rewards/margins": 2.875241279602051, "rewards/rejected": -4.763339996337891, "step": 2363 }, { "epoch": 2.23, "grad_norm": 17.204974135096293, "learning_rate": 8.150967559372913e-08, "logps/chosen": -60.783973693847656, "logps/rejected": -91.137939453125, "loss": 0.2006, "losses/dpo": 0.019058823585510254, "losses/sft": 1.7737044095993042, "losses/total": 0.019058823585510254, "ref_logps/chosen": -35.024192810058594, "ref_logps/rejected": -35.713096618652344, "rewards/accuracies": 1.0, "rewards/chosen": -2.5759780406951904, "rewards/margins": 2.966505765914917, "rewards/rejected": -5.542483806610107, "step": 2364 }, { "epoch": 2.23, "grad_norm": 16.412324508665236, "learning_rate": 8.132162232961124e-08, "logps/chosen": -50.62512969970703, "logps/rejected": -72.49867248535156, "loss": 0.1922, "losses/dpo": 0.287606805562973, "losses/sft": 2.00203800201416, "losses/total": 0.287606805562973, "ref_logps/chosen": -29.332088470458984, "ref_logps/rejected": -27.642967224121094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1293044090270996, "rewards/margins": 2.3562660217285156, "rewards/rejected": -4.485570907592773, "step": 2365 }, { "epoch": 2.23, "grad_norm": 10.877600002278678, "learning_rate": 8.113374410270412e-08, "logps/chosen": -55.64144515991211, "logps/rejected": -82.42342376708984, "loss": 0.1267, "losses/dpo": 0.1944810301065445, "losses/sft": 2.1793315410614014, "losses/total": 0.1944810301065445, "ref_logps/chosen": -34.101890563964844, "ref_logps/rejected": -29.718955993652344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1539556980133057, "rewards/margins": 3.1164913177490234, "rewards/rejected": -5.27044677734375, "step": 2366 }, { "epoch": 2.23, "grad_norm": 6.726413171367586, "learning_rate": 8.094604110796874e-08, "logps/chosen": -50.271366119384766, "logps/rejected": -82.0071792602539, "loss": 0.0797, "losses/dpo": 0.0010117812780663371, "losses/sft": 0.9011655449867249, "losses/total": 0.0010117812780663371, "ref_logps/chosen": -29.630233764648438, "ref_logps/rejected": -30.99500274658203, "rewards/accuracies": 1.0, "rewards/chosen": -2.064113140106201, "rewards/margins": 3.0371041297912598, "rewards/rejected": -5.101217269897461, "step": 2367 }, { "epoch": 2.23, "grad_norm": 9.103974824887679, "learning_rate": 8.075851354018418e-08, "logps/chosen": -38.435829162597656, "logps/rejected": -80.13067626953125, "loss": 0.1531, "losses/dpo": 0.046944841742515564, "losses/sft": 0.7429807782173157, "losses/total": 0.046944841742515564, "ref_logps/chosen": -22.522464752197266, "ref_logps/rejected": -35.384986877441406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5913368463516235, "rewards/margins": 2.8832318782806396, "rewards/rejected": -4.474568843841553, "step": 2368 }, { "epoch": 2.23, "grad_norm": 9.464131330258201, "learning_rate": 8.057116159394725e-08, "logps/chosen": -48.74671173095703, "logps/rejected": -98.31950378417969, "loss": 0.0745, "losses/dpo": 0.08921023458242416, "losses/sft": 1.932194709777832, "losses/total": 0.08921023458242416, "ref_logps/chosen": -27.562580108642578, "ref_logps/rejected": -42.471275329589844, "rewards/accuracies": 1.0, "rewards/chosen": -2.118412971496582, "rewards/margins": 3.466409683227539, "rewards/rejected": -5.584823131561279, "step": 2369 }, { "epoch": 2.24, "grad_norm": 13.768573575796491, "learning_rate": 8.038398546367286e-08, "logps/chosen": -76.1904296875, "logps/rejected": -103.02357482910156, "loss": 0.1279, "losses/dpo": 0.18714264035224915, "losses/sft": 3.099661350250244, "losses/total": 0.18714264035224915, "ref_logps/chosen": -43.8936767578125, "ref_logps/rejected": -41.46439743041992, "rewards/accuracies": 1.0, "rewards/chosen": -3.2296760082244873, "rewards/margins": 2.9262423515319824, "rewards/rejected": -6.155918121337891, "step": 2370 }, { "epoch": 2.24, "grad_norm": 13.233850078486025, "learning_rate": 8.019698534359314e-08, "logps/chosen": -53.21002197265625, "logps/rejected": -79.7447509765625, "loss": 0.1343, "losses/dpo": 0.003527077380567789, "losses/sft": 2.421079635620117, "losses/total": 0.003527077380567789, "ref_logps/chosen": -33.44427490234375, "ref_logps/rejected": -29.87104034423828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9765746593475342, "rewards/margins": 3.0107967853546143, "rewards/rejected": -4.987371444702148, "step": 2371 }, { "epoch": 2.24, "grad_norm": 14.709569678350542, "learning_rate": 8.001016142775787e-08, "logps/chosen": -34.19728469848633, "logps/rejected": -84.18295288085938, "loss": 0.1498, "losses/dpo": 0.00013799862063024193, "losses/sft": 1.686987042427063, "losses/total": 0.00013799862063024193, "ref_logps/chosen": -16.44482421875, "ref_logps/rejected": -33.53253936767578, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7752461433410645, "rewards/margins": 3.2897958755493164, "rewards/rejected": -5.065041542053223, "step": 2372 }, { "epoch": 2.24, "grad_norm": 7.249816255119534, "learning_rate": 7.982351391003376e-08, "logps/chosen": -45.32319259643555, "logps/rejected": -89.57202911376953, "loss": 0.0652, "losses/dpo": 0.005408951546996832, "losses/sft": 0.5855697989463806, "losses/total": 0.005408951546996832, "ref_logps/chosen": -28.736042022705078, "ref_logps/rejected": -35.56996154785156, "rewards/accuracies": 1.0, "rewards/chosen": -1.658715009689331, "rewards/margins": 3.7414917945861816, "rewards/rejected": -5.400207042694092, "step": 2373 }, { "epoch": 2.24, "grad_norm": 12.418135455332118, "learning_rate": 7.963704298410462e-08, "logps/chosen": -55.48877716064453, "logps/rejected": -75.12660217285156, "loss": 0.1202, "losses/dpo": 0.15125232934951782, "losses/sft": 1.9332480430603027, "losses/total": 0.15125232934951782, "ref_logps/chosen": -34.32666778564453, "ref_logps/rejected": -27.896621704101562, "rewards/accuracies": 1.0, "rewards/chosen": -2.116211175918579, "rewards/margins": 2.6067872047424316, "rewards/rejected": -4.72299861907959, "step": 2374 }, { "epoch": 2.24, "grad_norm": 11.19988603189981, "learning_rate": 7.945074884347088e-08, "logps/chosen": -58.746429443359375, "logps/rejected": -95.58360290527344, "loss": 0.1633, "losses/dpo": 0.8797607421875, "losses/sft": 0.8088181018829346, "losses/total": 0.8797607421875, "ref_logps/chosen": -34.18205642700195, "ref_logps/rejected": -42.7416877746582, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4564368724823, "rewards/margins": 2.8277549743652344, "rewards/rejected": -5.284191608428955, "step": 2375 }, { "epoch": 2.24, "grad_norm": 8.562639725701008, "learning_rate": 7.926463168144961e-08, "logps/chosen": -62.32880401611328, "logps/rejected": -89.41764831542969, "loss": 0.1023, "losses/dpo": 0.2153392732143402, "losses/sft": 2.372520685195923, "losses/total": 0.2153392732143402, "ref_logps/chosen": -38.84333419799805, "ref_logps/rejected": -32.01014709472656, "rewards/accuracies": 1.0, "rewards/chosen": -2.3485469818115234, "rewards/margins": 3.3922033309936523, "rewards/rejected": -5.740750312805176, "step": 2376 }, { "epoch": 2.24, "grad_norm": 9.731820509833042, "learning_rate": 7.907869169117434e-08, "logps/chosen": -50.76564025878906, "logps/rejected": -83.6798095703125, "loss": 0.1197, "losses/dpo": 0.5099337100982666, "losses/sft": 1.3732564449310303, "losses/total": 0.5099337100982666, "ref_logps/chosen": -31.009536743164062, "ref_logps/rejected": -37.026424407958984, "rewards/accuracies": 1.0, "rewards/chosen": -1.9756100177764893, "rewards/margins": 2.6897284984588623, "rewards/rejected": -4.665338516235352, "step": 2377 }, { "epoch": 2.24, "grad_norm": 13.986157951769888, "learning_rate": 7.889292906559444e-08, "logps/chosen": -51.093650817871094, "logps/rejected": -88.90645599365234, "loss": 0.1536, "losses/dpo": 0.16737107932567596, "losses/sft": 1.9425419569015503, "losses/total": 0.16737107932567596, "ref_logps/chosen": -27.147132873535156, "ref_logps/rejected": -35.31585693359375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3946518898010254, "rewards/margins": 2.9644081592559814, "rewards/rejected": -5.359060287475586, "step": 2378 }, { "epoch": 2.24, "grad_norm": 13.738357856684267, "learning_rate": 7.870734399747556e-08, "logps/chosen": -49.13029479980469, "logps/rejected": -97.89824676513672, "loss": 0.169, "losses/dpo": 0.6340610980987549, "losses/sft": 0.6243157982826233, "losses/total": 0.6340610980987549, "ref_logps/chosen": -28.60980796813965, "ref_logps/rejected": -43.36943817138672, "rewards/accuracies": 1.0, "rewards/chosen": -2.052048683166504, "rewards/margins": 3.400831460952759, "rewards/rejected": -5.452879905700684, "step": 2379 }, { "epoch": 2.25, "grad_norm": 14.580473481945024, "learning_rate": 7.85219366793988e-08, "logps/chosen": -55.8433837890625, "logps/rejected": -84.5622329711914, "loss": 0.1445, "losses/dpo": 0.28490447998046875, "losses/sft": 1.8441522121429443, "losses/total": 0.28490447998046875, "ref_logps/chosen": -34.567657470703125, "ref_logps/rejected": -33.36802291870117, "rewards/accuracies": 0.9375, "rewards/chosen": -2.127573013305664, "rewards/margins": 2.9918479919433594, "rewards/rejected": -5.119421005249023, "step": 2380 }, { "epoch": 2.25, "grad_norm": 9.264783269263337, "learning_rate": 7.833670730376113e-08, "logps/chosen": -36.83060073852539, "logps/rejected": -73.5365219116211, "loss": 0.1681, "losses/dpo": 0.03409925848245621, "losses/sft": 1.5937761068344116, "losses/total": 0.03409925848245621, "ref_logps/chosen": -21.839548110961914, "ref_logps/rejected": -27.323200225830078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4991053342819214, "rewards/margins": 3.1222264766693115, "rewards/rejected": -4.621331691741943, "step": 2381 }, { "epoch": 2.25, "grad_norm": 11.325445903082503, "learning_rate": 7.815165606277452e-08, "logps/chosen": -59.813541412353516, "logps/rejected": -115.01339721679688, "loss": 0.0925, "losses/dpo": 0.018717458471655846, "losses/sft": 2.703646421432495, "losses/total": 0.018717458471655846, "ref_logps/chosen": -36.51038360595703, "ref_logps/rejected": -53.27164840698242, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3303158283233643, "rewards/margins": 3.8438587188720703, "rewards/rejected": -6.1741743087768555, "step": 2382 }, { "epoch": 2.25, "grad_norm": 12.579576113927164, "learning_rate": 7.796678314846642e-08, "logps/chosen": -45.58055877685547, "logps/rejected": -87.9195556640625, "loss": 0.1053, "losses/dpo": 0.08467628061771393, "losses/sft": 1.9587827920913696, "losses/total": 0.08467628061771393, "ref_logps/chosen": -27.922561645507812, "ref_logps/rejected": -37.98948669433594, "rewards/accuracies": 1.0, "rewards/chosen": -1.7657999992370605, "rewards/margins": 3.2272071838378906, "rewards/rejected": -4.993007183074951, "step": 2383 }, { "epoch": 2.25, "grad_norm": 9.72736656983725, "learning_rate": 7.778208875267895e-08, "logps/chosen": -45.07447814941406, "logps/rejected": -94.34195709228516, "loss": 0.0771, "losses/dpo": 0.023028872907161713, "losses/sft": 2.255192279815674, "losses/total": 0.023028872907161713, "ref_logps/chosen": -29.103660583496094, "ref_logps/rejected": -39.86174011230469, "rewards/accuracies": 1.0, "rewards/chosen": -1.5970818996429443, "rewards/margins": 3.8509395122528076, "rewards/rejected": -5.448020935058594, "step": 2384 }, { "epoch": 2.25, "grad_norm": 5.938374905230058, "learning_rate": 7.759757306706916e-08, "logps/chosen": -52.27143859863281, "logps/rejected": -101.54741668701172, "loss": 0.056, "losses/dpo": 0.0002836044004652649, "losses/sft": 1.3395633697509766, "losses/total": 0.0002836044004652649, "ref_logps/chosen": -28.477277755737305, "ref_logps/rejected": -35.6519775390625, "rewards/accuracies": 1.0, "rewards/chosen": -2.3794164657592773, "rewards/margins": 4.210127830505371, "rewards/rejected": -6.589544296264648, "step": 2385 }, { "epoch": 2.25, "grad_norm": 8.628174411150336, "learning_rate": 7.74132362831087e-08, "logps/chosen": -40.70648193359375, "logps/rejected": -83.95143127441406, "loss": 0.13, "losses/dpo": 0.057001784443855286, "losses/sft": 1.5398056507110596, "losses/total": 0.057001784443855286, "ref_logps/chosen": -21.98432159423828, "ref_logps/rejected": -31.994436264038086, "rewards/accuracies": 0.875, "rewards/chosen": -1.872215986251831, "rewards/margins": 3.323484420776367, "rewards/rejected": -5.195700645446777, "step": 2386 }, { "epoch": 2.25, "grad_norm": 7.882797325047598, "learning_rate": 7.722907859208332e-08, "logps/chosen": -43.034366607666016, "logps/rejected": -86.33154296875, "loss": 0.0758, "losses/dpo": 0.048636868596076965, "losses/sft": 1.5337249040603638, "losses/total": 0.048636868596076965, "ref_logps/chosen": -22.051284790039062, "ref_logps/rejected": -29.40105438232422, "rewards/accuracies": 1.0, "rewards/chosen": -2.0983080863952637, "rewards/margins": 3.594740867614746, "rewards/rejected": -5.69304895401001, "step": 2387 }, { "epoch": 2.25, "grad_norm": 6.511000909887653, "learning_rate": 7.704510018509325e-08, "logps/chosen": -53.06363296508789, "logps/rejected": -109.85511779785156, "loss": 0.0559, "losses/dpo": 0.38758131861686707, "losses/sft": 1.6386446952819824, "losses/total": 0.38758131861686707, "ref_logps/chosen": -27.201797485351562, "ref_logps/rejected": -48.10161590576172, "rewards/accuracies": 1.0, "rewards/chosen": -2.586183547973633, "rewards/margins": 3.5891666412353516, "rewards/rejected": -6.175350189208984, "step": 2388 }, { "epoch": 2.25, "grad_norm": 10.65280989303549, "learning_rate": 7.686130125305232e-08, "logps/chosen": -52.74580001831055, "logps/rejected": -96.88943481445312, "loss": 0.1122, "losses/dpo": 0.020696043968200684, "losses/sft": 1.3742834329605103, "losses/total": 0.020696043968200684, "ref_logps/chosen": -31.218128204345703, "ref_logps/rejected": -43.424476623535156, "rewards/accuracies": 1.0, "rewards/chosen": -2.1527671813964844, "rewards/margins": 3.1937289237976074, "rewards/rejected": -5.346496105194092, "step": 2389 }, { "epoch": 2.25, "grad_norm": 10.818719539352173, "learning_rate": 7.667768198668847e-08, "logps/chosen": -44.08531188964844, "logps/rejected": -76.92121887207031, "loss": 0.1402, "losses/dpo": 0.03648604452610016, "losses/sft": 1.6798224449157715, "losses/total": 0.03648604452610016, "ref_logps/chosen": -26.054676055908203, "ref_logps/rejected": -32.257965087890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.8030637502670288, "rewards/margins": 2.6632614135742188, "rewards/rejected": -4.466325283050537, "step": 2390 }, { "epoch": 2.26, "grad_norm": 13.81250735169376, "learning_rate": 7.649424257654292e-08, "logps/chosen": -48.847496032714844, "logps/rejected": -82.49829864501953, "loss": 0.2112, "losses/dpo": 0.44249773025512695, "losses/sft": 2.1539289951324463, "losses/total": 0.44249773025512695, "ref_logps/chosen": -30.514202117919922, "ref_logps/rejected": -33.65948486328125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.833329439163208, "rewards/margins": 3.0505523681640625, "rewards/rejected": -4.88388204574585, "step": 2391 }, { "epoch": 2.26, "grad_norm": 11.36526100648654, "learning_rate": 7.631098321297041e-08, "logps/chosen": -44.95690155029297, "logps/rejected": -99.72042846679688, "loss": 0.1637, "losses/dpo": 1.187156319618225, "losses/sft": 0.9849140048027039, "losses/total": 1.187156319618225, "ref_logps/chosen": -24.837446212768555, "ref_logps/rejected": -41.86737823486328, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0119457244873047, "rewards/margins": 3.773359537124634, "rewards/rejected": -5.785305023193359, "step": 2392 }, { "epoch": 2.26, "grad_norm": 11.352223541720182, "learning_rate": 7.612790408613895e-08, "logps/chosen": -37.697418212890625, "logps/rejected": -70.11640930175781, "loss": 0.1384, "losses/dpo": 0.20368853211402893, "losses/sft": 0.7749658226966858, "losses/total": 0.20368853211402893, "ref_logps/chosen": -20.454402923583984, "ref_logps/rejected": -27.004947662353516, "rewards/accuracies": 1.0, "rewards/chosen": -1.7243015766143799, "rewards/margins": 2.5868453979492188, "rewards/rejected": -4.3111467361450195, "step": 2393 }, { "epoch": 2.26, "grad_norm": 10.182989672319694, "learning_rate": 7.594500538602917e-08, "logps/chosen": -39.2525520324707, "logps/rejected": -81.13178253173828, "loss": 0.0952, "losses/dpo": 0.0037699539680033922, "losses/sft": 1.8897596597671509, "losses/total": 0.0037699539680033922, "ref_logps/chosen": -20.890117645263672, "ref_logps/rejected": -30.202871322631836, "rewards/accuracies": 1.0, "rewards/chosen": -1.8362431526184082, "rewards/margins": 3.2566475868225098, "rewards/rejected": -5.092890739440918, "step": 2394 }, { "epoch": 2.26, "grad_norm": 10.398382653832634, "learning_rate": 7.576228730243483e-08, "logps/chosen": -46.02165603637695, "logps/rejected": -88.02220916748047, "loss": 0.1226, "losses/dpo": 0.4906512498855591, "losses/sft": 0.28248026967048645, "losses/total": 0.4906512498855591, "ref_logps/chosen": -28.66380500793457, "ref_logps/rejected": -35.432334899902344, "rewards/accuracies": 1.0, "rewards/chosen": -1.7357850074768066, "rewards/margins": 3.523202419281006, "rewards/rejected": -5.2589874267578125, "step": 2395 }, { "epoch": 2.26, "grad_norm": 9.035219916940017, "learning_rate": 7.557975002496198e-08, "logps/chosen": -39.98387908935547, "logps/rejected": -85.63088989257812, "loss": 0.0829, "losses/dpo": 0.0333956703543663, "losses/sft": 1.3486703634262085, "losses/total": 0.0333956703543663, "ref_logps/chosen": -20.58121109008789, "ref_logps/rejected": -33.149288177490234, "rewards/accuracies": 1.0, "rewards/chosen": -1.9402670860290527, "rewards/margins": 3.3078927993774414, "rewards/rejected": -5.248159885406494, "step": 2396 }, { "epoch": 2.26, "grad_norm": 18.24192318167906, "learning_rate": 7.539739374302936e-08, "logps/chosen": -57.29951858520508, "logps/rejected": -81.83607482910156, "loss": 0.1815, "losses/dpo": 0.08285889774560928, "losses/sft": 1.8517040014266968, "losses/total": 0.08285889774560928, "ref_logps/chosen": -31.720874786376953, "ref_logps/rejected": -29.922687530517578, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5578644275665283, "rewards/margins": 2.633474826812744, "rewards/rejected": -5.191339015960693, "step": 2397 }, { "epoch": 2.26, "grad_norm": 12.729390531153529, "learning_rate": 7.521521864586752e-08, "logps/chosen": -42.65928268432617, "logps/rejected": -69.46676635742188, "loss": 0.1464, "losses/dpo": 0.1162622720003128, "losses/sft": 1.4791607856750488, "losses/total": 0.1162622720003128, "ref_logps/chosen": -25.236053466796875, "ref_logps/rejected": -26.394290924072266, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7423224449157715, "rewards/margins": 2.564924716949463, "rewards/rejected": -4.307247161865234, "step": 2398 }, { "epoch": 2.26, "grad_norm": 8.685644088936296, "learning_rate": 7.503322492251937e-08, "logps/chosen": -46.90311813354492, "logps/rejected": -85.83321380615234, "loss": 0.1046, "losses/dpo": 0.017378240823745728, "losses/sft": 1.0605846643447876, "losses/total": 0.017378240823745728, "ref_logps/chosen": -28.225112915039062, "ref_logps/rejected": -34.314453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.8678007125854492, "rewards/margins": 3.2840757369995117, "rewards/rejected": -5.151876449584961, "step": 2399 }, { "epoch": 2.26, "grad_norm": 12.337357851353985, "learning_rate": 7.485141276183926e-08, "logps/chosen": -40.300270080566406, "logps/rejected": -73.33683776855469, "loss": 0.152, "losses/dpo": 0.012997850775718689, "losses/sft": 1.1789551973342896, "losses/total": 0.012997850775718689, "ref_logps/chosen": -21.819232940673828, "ref_logps/rejected": -28.845672607421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8481040000915527, "rewards/margins": 2.6010122299194336, "rewards/rejected": -4.449116230010986, "step": 2400 }, { "epoch": 2.27, "grad_norm": 17.06847919403477, "learning_rate": 7.466978235249338e-08, "logps/chosen": -50.566368103027344, "logps/rejected": -88.7744140625, "loss": 0.213, "losses/dpo": 0.06410735100507736, "losses/sft": 1.601102352142334, "losses/total": 0.06410735100507736, "ref_logps/chosen": -28.641387939453125, "ref_logps/rejected": -33.604148864746094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.192497730255127, "rewards/margins": 3.324528694152832, "rewards/rejected": -5.517026424407959, "step": 2401 }, { "epoch": 2.27, "grad_norm": 9.109012135495826, "learning_rate": 7.448833388295934e-08, "logps/chosen": -61.40499496459961, "logps/rejected": -91.49293518066406, "loss": 0.1171, "losses/dpo": 0.041842687875032425, "losses/sft": 2.343252182006836, "losses/total": 0.041842687875032425, "ref_logps/chosen": -36.34279251098633, "ref_logps/rejected": -37.431644439697266, "rewards/accuracies": 0.9375, "rewards/chosen": -2.506220579147339, "rewards/margins": 2.899909496307373, "rewards/rejected": -5.406129837036133, "step": 2402 }, { "epoch": 2.27, "grad_norm": 15.362264794712683, "learning_rate": 7.430706754152563e-08, "logps/chosen": -48.600852966308594, "logps/rejected": -95.51527404785156, "loss": 0.1552, "losses/dpo": 0.16731618344783783, "losses/sft": 2.289929151535034, "losses/total": 0.16731618344783783, "ref_logps/chosen": -23.678245544433594, "ref_logps/rejected": -38.66536331176758, "rewards/accuracies": 0.9375, "rewards/chosen": -2.492260694503784, "rewards/margins": 3.1927294731140137, "rewards/rejected": -5.684990406036377, "step": 2403 }, { "epoch": 2.27, "grad_norm": 13.014457895130832, "learning_rate": 7.412598351629221e-08, "logps/chosen": -42.02848815917969, "logps/rejected": -60.97235870361328, "loss": 0.1619, "losses/dpo": 0.023568201810121536, "losses/sft": 0.7262465953826904, "losses/total": 0.023568201810121536, "ref_logps/chosen": -23.166805267333984, "ref_logps/rejected": -20.756797790527344, "rewards/accuracies": 1.0, "rewards/chosen": -1.8861685991287231, "rewards/margins": 2.1353869438171387, "rewards/rejected": -4.021555423736572, "step": 2404 }, { "epoch": 2.27, "grad_norm": 18.034733849280876, "learning_rate": 7.394508199516938e-08, "logps/chosen": -47.064186096191406, "logps/rejected": -74.24620056152344, "loss": 0.1794, "losses/dpo": 0.08028129488229752, "losses/sft": 1.2045714855194092, "losses/total": 0.08028129488229752, "ref_logps/chosen": -26.796987533569336, "ref_logps/rejected": -27.06381607055664, "rewards/accuracies": 0.875, "rewards/chosen": -2.026719570159912, "rewards/margins": 2.691519260406494, "rewards/rejected": -4.718238830566406, "step": 2405 }, { "epoch": 2.27, "grad_norm": 8.667600624695519, "learning_rate": 7.376436316587845e-08, "logps/chosen": -46.14865493774414, "logps/rejected": -109.47056579589844, "loss": 0.082, "losses/dpo": 0.44510385394096375, "losses/sft": 1.166013240814209, "losses/total": 0.44510385394096375, "ref_logps/chosen": -23.070919036865234, "ref_logps/rejected": -48.34431457519531, "rewards/accuracies": 1.0, "rewards/chosen": -2.3077735900878906, "rewards/margins": 3.804852247238159, "rewards/rejected": -6.112626075744629, "step": 2406 }, { "epoch": 2.27, "grad_norm": 7.283511602544891, "learning_rate": 7.358382721595085e-08, "logps/chosen": -41.301666259765625, "logps/rejected": -80.4039535522461, "loss": 0.0956, "losses/dpo": 0.000586473906878382, "losses/sft": 1.8339414596557617, "losses/total": 0.000586473906878382, "ref_logps/chosen": -23.463499069213867, "ref_logps/rejected": -30.517263412475586, "rewards/accuracies": 1.0, "rewards/chosen": -1.7838166952133179, "rewards/margins": 3.20485258102417, "rewards/rejected": -4.988668918609619, "step": 2407 }, { "epoch": 2.27, "grad_norm": 8.832543964560724, "learning_rate": 7.340347433272853e-08, "logps/chosen": -47.72789001464844, "logps/rejected": -79.6998291015625, "loss": 0.0965, "losses/dpo": 0.20143450796604156, "losses/sft": 0.7956730127334595, "losses/total": 0.20143450796604156, "ref_logps/chosen": -26.02257537841797, "ref_logps/rejected": -30.56067657470703, "rewards/accuracies": 1.0, "rewards/chosen": -2.1705312728881836, "rewards/margins": 2.74338436126709, "rewards/rejected": -4.913915634155273, "step": 2408 }, { "epoch": 2.27, "grad_norm": 17.479494749175565, "learning_rate": 7.322330470336313e-08, "logps/chosen": -58.132545471191406, "logps/rejected": -95.17835235595703, "loss": 0.1924, "losses/dpo": 0.03927107900381088, "losses/sft": 1.744028091430664, "losses/total": 0.03927107900381088, "ref_logps/chosen": -30.64853286743164, "ref_logps/rejected": -41.874183654785156, "rewards/accuracies": 0.9375, "rewards/chosen": -2.748401165008545, "rewards/margins": 2.5820152759552, "rewards/rejected": -5.330416202545166, "step": 2409 }, { "epoch": 2.27, "grad_norm": 6.615305863049021, "learning_rate": 7.30433185148164e-08, "logps/chosen": -41.213523864746094, "logps/rejected": -110.40701293945312, "loss": 0.0487, "losses/dpo": 0.0001476170727983117, "losses/sft": 1.1235805749893188, "losses/total": 0.0001476170727983117, "ref_logps/chosen": -23.478195190429688, "ref_logps/rejected": -44.25409698486328, "rewards/accuracies": 1.0, "rewards/chosen": -1.7735328674316406, "rewards/margins": 4.841759204864502, "rewards/rejected": -6.615291595458984, "step": 2410 }, { "epoch": 2.27, "grad_norm": 11.280930909724322, "learning_rate": 7.286351595385973e-08, "logps/chosen": -52.57825469970703, "logps/rejected": -88.55087280273438, "loss": 0.1075, "losses/dpo": 0.019279828295111656, "losses/sft": 1.6658655405044556, "losses/total": 0.019279828295111656, "ref_logps/chosen": -27.781139373779297, "ref_logps/rejected": -33.3935546875, "rewards/accuracies": 1.0, "rewards/chosen": -2.4797115325927734, "rewards/margins": 3.036020040512085, "rewards/rejected": -5.5157318115234375, "step": 2411 }, { "epoch": 2.28, "grad_norm": 14.343275071014737, "learning_rate": 7.268389720707368e-08, "logps/chosen": -57.725677490234375, "logps/rejected": -101.26808166503906, "loss": 0.1373, "losses/dpo": 0.10621468722820282, "losses/sft": 1.666741967201233, "losses/total": 0.10621468722820282, "ref_logps/chosen": -28.727584838867188, "ref_logps/rejected": -37.19432067871094, "rewards/accuracies": 1.0, "rewards/chosen": -2.899808645248413, "rewards/margins": 3.5075674057006836, "rewards/rejected": -6.407376289367676, "step": 2412 }, { "epoch": 2.28, "grad_norm": 13.14654557126012, "learning_rate": 7.250446246084847e-08, "logps/chosen": -47.97246551513672, "logps/rejected": -81.852294921875, "loss": 0.1204, "losses/dpo": 0.02482135035097599, "losses/sft": 1.898773193359375, "losses/total": 0.02482135035097599, "ref_logps/chosen": -23.73011589050293, "ref_logps/rejected": -28.06102752685547, "rewards/accuracies": 1.0, "rewards/chosen": -2.4242353439331055, "rewards/margins": 2.9548919200897217, "rewards/rejected": -5.379127502441406, "step": 2413 }, { "epoch": 2.28, "grad_norm": 16.63589312919629, "learning_rate": 7.232521190138297e-08, "logps/chosen": -51.56632995605469, "logps/rejected": -97.41108703613281, "loss": 0.1619, "losses/dpo": 0.07830829173326492, "losses/sft": 2.0400404930114746, "losses/total": 0.07830829173326492, "ref_logps/chosen": -25.14029312133789, "ref_logps/rejected": -36.35055160522461, "rewards/accuracies": 0.9375, "rewards/chosen": -2.642603874206543, "rewards/margins": 3.4634501934051514, "rewards/rejected": -6.106054306030273, "step": 2414 }, { "epoch": 2.28, "grad_norm": 13.850242635862449, "learning_rate": 7.21461457146853e-08, "logps/chosen": -44.673240661621094, "logps/rejected": -72.939453125, "loss": 0.1515, "losses/dpo": 0.07450686395168304, "losses/sft": 1.5841000080108643, "losses/total": 0.07450686395168304, "ref_logps/chosen": -24.298398971557617, "ref_logps/rejected": -29.585384368896484, "rewards/accuracies": 1.0, "rewards/chosen": -2.0374841690063477, "rewards/margins": 2.2979226112365723, "rewards/rejected": -4.335406303405762, "step": 2415 }, { "epoch": 2.28, "grad_norm": 21.134169833785908, "learning_rate": 7.196726408657192e-08, "logps/chosen": -55.75285339355469, "logps/rejected": -91.98855590820312, "loss": 0.1952, "losses/dpo": 0.06847335398197174, "losses/sft": 1.1362639665603638, "losses/total": 0.06847335398197174, "ref_logps/chosen": -27.95813751220703, "ref_logps/rejected": -32.23032760620117, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7794713973999023, "rewards/margins": 3.1963510513305664, "rewards/rejected": -5.975822925567627, "step": 2416 }, { "epoch": 2.28, "grad_norm": 11.18701727625524, "learning_rate": 7.17885672026681e-08, "logps/chosen": -54.78962707519531, "logps/rejected": -98.15068817138672, "loss": 0.1437, "losses/dpo": 0.15482944250106812, "losses/sft": 1.4983007907867432, "losses/total": 0.15482944250106812, "ref_logps/chosen": -27.579082489013672, "ref_logps/rejected": -39.90020751953125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.721054792404175, "rewards/margins": 3.1039931774139404, "rewards/rejected": -5.825047969818115, "step": 2417 }, { "epoch": 2.28, "grad_norm": 8.796163231819612, "learning_rate": 7.161005524840704e-08, "logps/chosen": -45.96002197265625, "logps/rejected": -81.54541015625, "loss": 0.1004, "losses/dpo": 0.2330007404088974, "losses/sft": 2.2725234031677246, "losses/total": 0.2330007404088974, "ref_logps/chosen": -27.332834243774414, "ref_logps/rejected": -29.931543350219727, "rewards/accuracies": 1.0, "rewards/chosen": -1.862718939781189, "rewards/margins": 3.2986679077148438, "rewards/rejected": -5.161386966705322, "step": 2418 }, { "epoch": 2.28, "grad_norm": 16.433082513307887, "learning_rate": 7.143172840903036e-08, "logps/chosen": -53.503562927246094, "logps/rejected": -83.71632385253906, "loss": 0.1576, "losses/dpo": 0.09016431868076324, "losses/sft": 1.2643556594848633, "losses/total": 0.09016431868076324, "ref_logps/chosen": -28.515439987182617, "ref_logps/rejected": -32.67597198486328, "rewards/accuracies": 0.875, "rewards/chosen": -2.498812437057495, "rewards/margins": 2.6052231788635254, "rewards/rejected": -5.1040358543396, "step": 2419 }, { "epoch": 2.28, "grad_norm": 21.83529236284417, "learning_rate": 7.125358686958752e-08, "logps/chosen": -52.768375396728516, "logps/rejected": -88.97179412841797, "loss": 0.178, "losses/dpo": 1.035089135169983, "losses/sft": 2.686922550201416, "losses/total": 1.035089135169983, "ref_logps/chosen": -27.54034423828125, "ref_logps/rejected": -34.6373405456543, "rewards/accuracies": 0.9375, "rewards/chosen": -2.522803544998169, "rewards/margins": 2.910642147064209, "rewards/rejected": -5.433445453643799, "step": 2420 }, { "epoch": 2.28, "grad_norm": 14.77941647034559, "learning_rate": 7.107563081493548e-08, "logps/chosen": -47.5499267578125, "logps/rejected": -77.77040100097656, "loss": 0.1845, "losses/dpo": 0.019148409366607666, "losses/sft": 1.989184856414795, "losses/total": 0.019148409366607666, "ref_logps/chosen": -24.788877487182617, "ref_logps/rejected": -29.892108917236328, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2761051654815674, "rewards/margins": 2.5117244720458984, "rewards/rejected": -4.787829399108887, "step": 2421 }, { "epoch": 2.28, "grad_norm": 10.53520597174079, "learning_rate": 7.089786042973903e-08, "logps/chosen": -54.507686614990234, "logps/rejected": -95.54151153564453, "loss": 0.0985, "losses/dpo": 0.0583513006567955, "losses/sft": 1.288939118385315, "losses/total": 0.0583513006567955, "ref_logps/chosen": -29.65081787109375, "ref_logps/rejected": -38.29169464111328, "rewards/accuracies": 1.0, "rewards/chosen": -2.485686779022217, "rewards/margins": 3.23929500579834, "rewards/rejected": -5.724981784820557, "step": 2422 }, { "epoch": 2.29, "grad_norm": 14.112900004256264, "learning_rate": 7.072027589846995e-08, "logps/chosen": -36.506961822509766, "logps/rejected": -76.33271789550781, "loss": 0.1592, "losses/dpo": 0.09932266920804977, "losses/sft": 2.0688281059265137, "losses/total": 0.09932266920804977, "ref_logps/chosen": -18.70574951171875, "ref_logps/rejected": -28.792430877685547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7801213264465332, "rewards/margins": 2.973907232284546, "rewards/rejected": -4.7540283203125, "step": 2423 }, { "epoch": 2.29, "grad_norm": 15.326376660365952, "learning_rate": 7.05428774054076e-08, "logps/chosen": -44.8723258972168, "logps/rejected": -94.21074676513672, "loss": 0.13, "losses/dpo": 0.16879694163799286, "losses/sft": 0.47720709443092346, "losses/total": 0.16879694163799286, "ref_logps/chosen": -25.940731048583984, "ref_logps/rejected": -39.3236083984375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8931598663330078, "rewards/margins": 3.5955543518066406, "rewards/rejected": -5.488714694976807, "step": 2424 }, { "epoch": 2.29, "grad_norm": 14.854759081536146, "learning_rate": 7.036566513463776e-08, "logps/chosen": -53.76167678833008, "logps/rejected": -82.50770568847656, "loss": 0.1421, "losses/dpo": 0.24359111487865448, "losses/sft": 1.3553709983825684, "losses/total": 0.24359111487865448, "ref_logps/chosen": -27.25876235961914, "ref_logps/rejected": -29.993408203125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6502914428710938, "rewards/margins": 2.6011383533477783, "rewards/rejected": -5.251429557800293, "step": 2425 }, { "epoch": 2.29, "grad_norm": 8.593497735621721, "learning_rate": 7.01886392700535e-08, "logps/chosen": -54.05184555053711, "logps/rejected": -92.79304504394531, "loss": 0.077, "losses/dpo": 0.04232688620686531, "losses/sft": 0.7511841058731079, "losses/total": 0.04232688620686531, "ref_logps/chosen": -34.07633972167969, "ref_logps/rejected": -37.155086517333984, "rewards/accuracies": 1.0, "rewards/chosen": -1.9975504875183105, "rewards/margins": 3.5662457942962646, "rewards/rejected": -5.563796043395996, "step": 2426 }, { "epoch": 2.29, "grad_norm": 9.330747506796312, "learning_rate": 7.001179999535398e-08, "logps/chosen": -49.97709274291992, "logps/rejected": -82.86619567871094, "loss": 0.0933, "losses/dpo": 0.015399634838104248, "losses/sft": 1.5055418014526367, "losses/total": 0.015399634838104248, "ref_logps/chosen": -29.395353317260742, "ref_logps/rejected": -30.5920352935791, "rewards/accuracies": 1.0, "rewards/chosen": -2.058173656463623, "rewards/margins": 3.1692428588867188, "rewards/rejected": -5.227416038513184, "step": 2427 }, { "epoch": 2.29, "grad_norm": 11.261591935389973, "learning_rate": 6.983514749404506e-08, "logps/chosen": -46.94280242919922, "logps/rejected": -87.62156677246094, "loss": 0.1104, "losses/dpo": 0.000994333648122847, "losses/sft": 1.0540763139724731, "losses/total": 0.000994333648122847, "ref_logps/chosen": -25.8293514251709, "ref_logps/rejected": -35.48684310913086, "rewards/accuracies": 1.0, "rewards/chosen": -2.111344814300537, "rewards/margins": 3.1021275520324707, "rewards/rejected": -5.213472366333008, "step": 2428 }, { "epoch": 2.29, "grad_norm": 10.586302414256934, "learning_rate": 6.965868194943878e-08, "logps/chosen": -54.55357360839844, "logps/rejected": -100.99085998535156, "loss": 0.0949, "losses/dpo": 0.011073073372244835, "losses/sft": 1.5428855419158936, "losses/total": 0.011073073372244835, "ref_logps/chosen": -31.795238494873047, "ref_logps/rejected": -43.578582763671875, "rewards/accuracies": 1.0, "rewards/chosen": -2.2758336067199707, "rewards/margins": 3.465395212173462, "rewards/rejected": -5.7412285804748535, "step": 2429 }, { "epoch": 2.29, "grad_norm": 9.200437449326348, "learning_rate": 6.948240354465287e-08, "logps/chosen": -47.32967758178711, "logps/rejected": -99.3116455078125, "loss": 0.1339, "losses/dpo": 0.0008190381922759116, "losses/sft": 1.5682072639465332, "losses/total": 0.0008190381922759116, "ref_logps/chosen": -27.438884735107422, "ref_logps/rejected": -42.86970138549805, "rewards/accuracies": 1.0, "rewards/chosen": -1.989079236984253, "rewards/margins": 3.6551146507263184, "rewards/rejected": -5.644194602966309, "step": 2430 }, { "epoch": 2.29, "grad_norm": 7.03766183808652, "learning_rate": 6.930631246261128e-08, "logps/chosen": -54.26518249511719, "logps/rejected": -105.01849365234375, "loss": 0.06, "losses/dpo": 0.027170049026608467, "losses/sft": 2.3826613426208496, "losses/total": 0.027170049026608467, "ref_logps/chosen": -31.58014678955078, "ref_logps/rejected": -45.121849060058594, "rewards/accuracies": 1.0, "rewards/chosen": -2.2685041427612305, "rewards/margins": 3.7211592197418213, "rewards/rejected": -5.989663124084473, "step": 2431 }, { "epoch": 2.29, "grad_norm": 13.268481121963994, "learning_rate": 6.913040888604319e-08, "logps/chosen": -46.83760070800781, "logps/rejected": -88.4345703125, "loss": 0.1345, "losses/dpo": 0.10263676196336746, "losses/sft": 2.876195192337036, "losses/total": 0.10263676196336746, "ref_logps/chosen": -25.886455535888672, "ref_logps/rejected": -38.486846923828125, "rewards/accuracies": 1.0, "rewards/chosen": -2.0951147079467773, "rewards/margins": 2.8996567726135254, "rewards/rejected": -4.994771957397461, "step": 2432 }, { "epoch": 2.3, "grad_norm": 11.130147637839578, "learning_rate": 6.895469299748355e-08, "logps/chosen": -45.432456970214844, "logps/rejected": -72.15469360351562, "loss": 0.162, "losses/dpo": 0.041241005063056946, "losses/sft": 2.046617031097412, "losses/total": 0.041241005063056946, "ref_logps/chosen": -27.612995147705078, "ref_logps/rejected": -28.624061584472656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7819461822509766, "rewards/margins": 2.5711169242858887, "rewards/rejected": -4.353063583374023, "step": 2433 }, { "epoch": 2.3, "grad_norm": 22.560188841349174, "learning_rate": 6.877916497927222e-08, "logps/chosen": -43.992889404296875, "logps/rejected": -65.51176452636719, "loss": 0.3354, "losses/dpo": 0.07235845178365707, "losses/sft": 0.7785294651985168, "losses/total": 0.07235845178365707, "ref_logps/chosen": -25.176843643188477, "ref_logps/rejected": -24.26583480834961, "rewards/accuracies": 0.875, "rewards/chosen": -1.8816049098968506, "rewards/margins": 2.242988348007202, "rewards/rejected": -4.1245927810668945, "step": 2434 }, { "epoch": 2.3, "grad_norm": 21.473900252946308, "learning_rate": 6.860382501355432e-08, "logps/chosen": -54.38262176513672, "logps/rejected": -92.5719985961914, "loss": 0.1837, "losses/dpo": 0.008028344251215458, "losses/sft": 1.390054702758789, "losses/total": 0.008028344251215458, "ref_logps/chosen": -31.661710739135742, "ref_logps/rejected": -39.00130844116211, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2720909118652344, "rewards/margins": 3.084977865219116, "rewards/rejected": -5.35706901550293, "step": 2435 }, { "epoch": 2.3, "grad_norm": 17.86064014884376, "learning_rate": 6.842867328227994e-08, "logps/chosen": -54.908775329589844, "logps/rejected": -103.2605209350586, "loss": 0.1298, "losses/dpo": 2.6402265575597994e-05, "losses/sft": 1.4815863370895386, "losses/total": 2.6402265575597994e-05, "ref_logps/chosen": -30.76681900024414, "ref_logps/rejected": -44.44053649902344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4141957759857178, "rewards/margins": 3.4678025245666504, "rewards/rejected": -5.881998062133789, "step": 2436 }, { "epoch": 2.3, "grad_norm": 16.646302950000912, "learning_rate": 6.825370996720342e-08, "logps/chosen": -47.83030700683594, "logps/rejected": -81.14591979980469, "loss": 0.201, "losses/dpo": 0.004453115630894899, "losses/sft": 0.5700121521949768, "losses/total": 0.004453115630894899, "ref_logps/chosen": -24.53354263305664, "ref_logps/rejected": -29.573156356811523, "rewards/accuracies": 0.9375, "rewards/chosen": -2.329676866531372, "rewards/margins": 2.8276000022888184, "rewards/rejected": -5.1572771072387695, "step": 2437 }, { "epoch": 2.3, "grad_norm": 12.269938626032094, "learning_rate": 6.807893524988406e-08, "logps/chosen": -37.42242431640625, "logps/rejected": -89.50765991210938, "loss": 0.1524, "losses/dpo": 0.5219810009002686, "losses/sft": 0.6418821215629578, "losses/total": 0.5219810009002686, "ref_logps/chosen": -19.898313522338867, "ref_logps/rejected": -37.256587982177734, "rewards/accuracies": 0.9375, "rewards/chosen": -1.752410888671875, "rewards/margins": 3.472696304321289, "rewards/rejected": -5.225107192993164, "step": 2438 }, { "epoch": 2.3, "grad_norm": 14.683163124664604, "learning_rate": 6.790434931168504e-08, "logps/chosen": -45.08622741699219, "logps/rejected": -89.93927001953125, "loss": 0.128, "losses/dpo": 0.5432968139648438, "losses/sft": 0.9788967370986938, "losses/total": 0.5432968139648438, "ref_logps/chosen": -26.173805236816406, "ref_logps/rejected": -38.137046813964844, "rewards/accuracies": 1.0, "rewards/chosen": -1.8912420272827148, "rewards/margins": 3.288980484008789, "rewards/rejected": -5.180222511291504, "step": 2439 }, { "epoch": 2.3, "grad_norm": 10.04361845923633, "learning_rate": 6.772995233377402e-08, "logps/chosen": -48.692291259765625, "logps/rejected": -89.26136779785156, "loss": 0.1067, "losses/dpo": 0.0013335268013179302, "losses/sft": 1.652388095855713, "losses/total": 0.0013335268013179302, "ref_logps/chosen": -31.30554962158203, "ref_logps/rejected": -34.83957290649414, "rewards/accuracies": 1.0, "rewards/chosen": -1.738674283027649, "rewards/margins": 3.7035045623779297, "rewards/rejected": -5.442178726196289, "step": 2440 }, { "epoch": 2.3, "grad_norm": 20.122792753939223, "learning_rate": 6.75557444971222e-08, "logps/chosen": -51.24436569213867, "logps/rejected": -84.15013122558594, "loss": 0.1802, "losses/dpo": 0.07209227979183197, "losses/sft": 1.4812955856323242, "losses/total": 0.07209227979183197, "ref_logps/chosen": -27.310747146606445, "ref_logps/rejected": -31.691980361938477, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3933615684509277, "rewards/margins": 2.8524534702301025, "rewards/rejected": -5.245815277099609, "step": 2441 }, { "epoch": 2.3, "grad_norm": 7.082668538947104, "learning_rate": 6.738172598250488e-08, "logps/chosen": -36.72899627685547, "logps/rejected": -78.12136840820312, "loss": 0.0726, "losses/dpo": 0.017435815185308456, "losses/sft": 0.6487583518028259, "losses/total": 0.017435815185308456, "ref_logps/chosen": -18.2921142578125, "ref_logps/rejected": -28.240650177001953, "rewards/accuracies": 1.0, "rewards/chosen": -1.8436882495880127, "rewards/margins": 3.144383430480957, "rewards/rejected": -4.988071441650391, "step": 2442 }, { "epoch": 2.3, "grad_norm": 9.679339558554785, "learning_rate": 6.720789697050055e-08, "logps/chosen": -41.31434631347656, "logps/rejected": -74.704833984375, "loss": 0.1574, "losses/dpo": 0.2959994375705719, "losses/sft": 0.6432849168777466, "losses/total": 0.2959994375705719, "ref_logps/chosen": -24.321300506591797, "ref_logps/rejected": -28.120264053344727, "rewards/accuracies": 1.0, "rewards/chosen": -1.6993045806884766, "rewards/margins": 2.9591526985168457, "rewards/rejected": -4.658457279205322, "step": 2443 }, { "epoch": 2.31, "grad_norm": 19.417932373183408, "learning_rate": 6.703425764149132e-08, "logps/chosen": -46.12525939941406, "logps/rejected": -69.14031982421875, "loss": 0.3206, "losses/dpo": 0.045732807368040085, "losses/sft": 2.3022637367248535, "losses/total": 0.045732807368040085, "ref_logps/chosen": -24.30396842956543, "ref_logps/rejected": -27.28146743774414, "rewards/accuracies": 0.8125, "rewards/chosen": -2.182129144668579, "rewards/margins": 2.003756523132324, "rewards/rejected": -4.185885429382324, "step": 2444 }, { "epoch": 2.31, "grad_norm": 8.74619585121807, "learning_rate": 6.686080817566241e-08, "logps/chosen": -59.90646743774414, "logps/rejected": -90.42112731933594, "loss": 0.078, "losses/dpo": 0.061096999794244766, "losses/sft": 1.3348792791366577, "losses/total": 0.061096999794244766, "ref_logps/chosen": -35.717247009277344, "ref_logps/rejected": -32.67002868652344, "rewards/accuracies": 1.0, "rewards/chosen": -2.418921947479248, "rewards/margins": 3.3561882972717285, "rewards/rejected": -5.775110244750977, "step": 2445 }, { "epoch": 2.31, "grad_norm": 8.991592539065095, "learning_rate": 6.66875487530019e-08, "logps/chosen": -56.82960510253906, "logps/rejected": -106.64200592041016, "loss": 0.0647, "losses/dpo": 0.07042186707258224, "losses/sft": 2.0942444801330566, "losses/total": 0.07042186707258224, "ref_logps/chosen": -31.495046615600586, "ref_logps/rejected": -42.944557189941406, "rewards/accuracies": 1.0, "rewards/chosen": -2.5334558486938477, "rewards/margins": 3.8362886905670166, "rewards/rejected": -6.369744777679443, "step": 2446 }, { "epoch": 2.31, "grad_norm": 10.086622819089492, "learning_rate": 6.651447955330081e-08, "logps/chosen": -52.55323791503906, "logps/rejected": -96.32311248779297, "loss": 0.1294, "losses/dpo": 0.03714875131845474, "losses/sft": 2.190786600112915, "losses/total": 0.03714875131845474, "ref_logps/chosen": -31.83238410949707, "ref_logps/rejected": -35.769432067871094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.072085380554199, "rewards/margins": 3.9832825660705566, "rewards/rejected": -6.055367946624756, "step": 2447 }, { "epoch": 2.31, "grad_norm": 12.452476584270197, "learning_rate": 6.634160075615258e-08, "logps/chosen": -56.83106231689453, "logps/rejected": -85.47064208984375, "loss": 0.1284, "losses/dpo": 0.0030084799509495497, "losses/sft": 2.8513083457946777, "losses/total": 0.0030084799509495497, "ref_logps/chosen": -34.92241668701172, "ref_logps/rejected": -32.723541259765625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1908650398254395, "rewards/margins": 3.0838451385498047, "rewards/rejected": -5.274709701538086, "step": 2448 }, { "epoch": 2.31, "grad_norm": 12.63469784998967, "learning_rate": 6.616891254095336e-08, "logps/chosen": -40.20646667480469, "logps/rejected": -74.95649719238281, "loss": 0.1496, "losses/dpo": 0.06163876876235008, "losses/sft": 1.8180625438690186, "losses/total": 0.06163876876235008, "ref_logps/chosen": -25.934669494628906, "ref_logps/rejected": -29.666057586669922, "rewards/accuracies": 1.0, "rewards/chosen": -1.4271793365478516, "rewards/margins": 3.1018645763397217, "rewards/rejected": -4.529044151306152, "step": 2449 }, { "epoch": 2.31, "grad_norm": 14.056376520352515, "learning_rate": 6.599641508690117e-08, "logps/chosen": -47.48586654663086, "logps/rejected": -89.97550964355469, "loss": 0.1428, "losses/dpo": 0.11364739388227463, "losses/sft": 1.782745599746704, "losses/total": 0.11364739388227463, "ref_logps/chosen": -25.923372268676758, "ref_logps/rejected": -35.819786071777344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.156249523162842, "rewards/margins": 3.2593231201171875, "rewards/rejected": -5.415572643280029, "step": 2450 }, { "epoch": 2.31, "grad_norm": 11.322820429939666, "learning_rate": 6.582410857299647e-08, "logps/chosen": -49.31037139892578, "logps/rejected": -94.13135528564453, "loss": 0.1254, "losses/dpo": 0.42256733775138855, "losses/sft": 0.37698784470558167, "losses/total": 0.42256733775138855, "ref_logps/chosen": -26.958715438842773, "ref_logps/rejected": -39.89036560058594, "rewards/accuracies": 1.0, "rewards/chosen": -2.23516583442688, "rewards/margins": 3.1889333724975586, "rewards/rejected": -5.424098968505859, "step": 2451 }, { "epoch": 2.31, "grad_norm": 14.263297330767834, "learning_rate": 6.565199317804118e-08, "logps/chosen": -48.75492858886719, "logps/rejected": -86.19011688232422, "loss": 0.1392, "losses/dpo": 0.00907914899289608, "losses/sft": 1.4108113050460815, "losses/total": 0.00907914899289608, "ref_logps/chosen": -27.772682189941406, "ref_logps/rejected": -33.70388412475586, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0982248783111572, "rewards/margins": 3.1503984928131104, "rewards/rejected": -5.248623371124268, "step": 2452 }, { "epoch": 2.31, "grad_norm": 14.13826678071658, "learning_rate": 6.548006908063922e-08, "logps/chosen": -38.81008529663086, "logps/rejected": -71.65089416503906, "loss": 0.168, "losses/dpo": 0.32999682426452637, "losses/sft": 1.3231308460235596, "losses/total": 0.32999682426452637, "ref_logps/chosen": -19.23772430419922, "ref_logps/rejected": -26.031208038330078, "rewards/accuracies": 1.0, "rewards/chosen": -1.9572359323501587, "rewards/margins": 2.6047327518463135, "rewards/rejected": -4.561968803405762, "step": 2453 }, { "epoch": 2.32, "grad_norm": 24.52917508452013, "learning_rate": 6.530833645919589e-08, "logps/chosen": -49.7413330078125, "logps/rejected": -79.92938995361328, "loss": 0.2971, "losses/dpo": 0.541107714176178, "losses/sft": 2.787144422531128, "losses/total": 0.541107714176178, "ref_logps/chosen": -26.33194923400879, "ref_logps/rejected": -34.28358459472656, "rewards/accuracies": 0.875, "rewards/chosen": -2.3409385681152344, "rewards/margins": 2.223641872406006, "rewards/rejected": -4.564579963684082, "step": 2454 }, { "epoch": 2.32, "grad_norm": 9.842107545352684, "learning_rate": 6.513679549191767e-08, "logps/chosen": -55.76287078857422, "logps/rejected": -94.92960357666016, "loss": 0.0973, "losses/dpo": 0.08791015297174454, "losses/sft": 3.5805537700653076, "losses/total": 0.08791015297174454, "ref_logps/chosen": -33.426204681396484, "ref_logps/rejected": -38.67597198486328, "rewards/accuracies": 1.0, "rewards/chosen": -2.23366641998291, "rewards/margins": 3.3916966915130615, "rewards/rejected": -5.625363349914551, "step": 2455 }, { "epoch": 2.32, "grad_norm": 9.684909041766431, "learning_rate": 6.496544635681245e-08, "logps/chosen": -57.36408615112305, "logps/rejected": -95.09259033203125, "loss": 0.0877, "losses/dpo": 0.010613520629703999, "losses/sft": 2.7819180488586426, "losses/total": 0.010613520629703999, "ref_logps/chosen": -34.28029251098633, "ref_logps/rejected": -38.55858612060547, "rewards/accuracies": 1.0, "rewards/chosen": -2.3083794116973877, "rewards/margins": 3.3450207710266113, "rewards/rejected": -5.653400421142578, "step": 2456 }, { "epoch": 2.32, "grad_norm": 9.17877791242254, "learning_rate": 6.47942892316887e-08, "logps/chosen": -60.395599365234375, "logps/rejected": -97.51744842529297, "loss": 0.0809, "losses/dpo": 0.04174577817320824, "losses/sft": 2.3199985027313232, "losses/total": 0.04174577817320824, "ref_logps/chosen": -34.531005859375, "ref_logps/rejected": -39.62914276123047, "rewards/accuracies": 1.0, "rewards/chosen": -2.586459159851074, "rewards/margins": 3.20237135887146, "rewards/rejected": -5.788830757141113, "step": 2457 }, { "epoch": 2.32, "grad_norm": 9.645912520155889, "learning_rate": 6.462332429415587e-08, "logps/chosen": -39.00136184692383, "logps/rejected": -91.77491760253906, "loss": 0.0817, "losses/dpo": 0.011906815692782402, "losses/sft": 1.348075270652771, "losses/total": 0.011906815692782402, "ref_logps/chosen": -21.391582489013672, "ref_logps/rejected": -35.89926528930664, "rewards/accuracies": 1.0, "rewards/chosen": -1.760977864265442, "rewards/margins": 3.826587677001953, "rewards/rejected": -5.5875654220581055, "step": 2458 }, { "epoch": 2.32, "grad_norm": 13.48365296121846, "learning_rate": 6.445255172162398e-08, "logps/chosen": -42.41533660888672, "logps/rejected": -79.12549591064453, "loss": 0.1869, "losses/dpo": 0.025629684329032898, "losses/sft": 2.215329170227051, "losses/total": 0.025629684329032898, "ref_logps/chosen": -23.066179275512695, "ref_logps/rejected": -34.4935188293457, "rewards/accuracies": 1.0, "rewards/chosen": -1.9349154233932495, "rewards/margins": 2.5282821655273438, "rewards/rejected": -4.463197708129883, "step": 2459 }, { "epoch": 2.32, "grad_norm": 11.089802103722619, "learning_rate": 6.428197169130345e-08, "logps/chosen": -52.45281219482422, "logps/rejected": -82.98652648925781, "loss": 0.1181, "losses/dpo": 0.0684826597571373, "losses/sft": 1.3634045124053955, "losses/total": 0.0684826597571373, "ref_logps/chosen": -31.650550842285156, "ref_logps/rejected": -33.271080017089844, "rewards/accuracies": 0.9375, "rewards/chosen": -2.080226421356201, "rewards/margins": 2.8913190364837646, "rewards/rejected": -4.971545219421387, "step": 2460 }, { "epoch": 2.32, "grad_norm": 17.208582886486088, "learning_rate": 6.411158438020465e-08, "logps/chosen": -60.72191619873047, "logps/rejected": -96.33525085449219, "loss": 0.1645, "losses/dpo": 0.037247635424137115, "losses/sft": 0.9734750390052795, "losses/total": 0.037247635424137115, "ref_logps/chosen": -36.809181213378906, "ref_logps/rejected": -35.140228271484375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3912734985351562, "rewards/margins": 3.728229284286499, "rewards/rejected": -6.119502544403076, "step": 2461 }, { "epoch": 2.32, "grad_norm": 15.733341551857457, "learning_rate": 6.394138996513831e-08, "logps/chosen": -47.62530517578125, "logps/rejected": -78.33169555664062, "loss": 0.2058, "losses/dpo": 0.7799222469329834, "losses/sft": 2.163569450378418, "losses/total": 0.7799222469329834, "ref_logps/chosen": -26.26806640625, "ref_logps/rejected": -30.238279342651367, "rewards/accuracies": 0.875, "rewards/chosen": -2.135723829269409, "rewards/margins": 2.673618793487549, "rewards/rejected": -4.809342384338379, "step": 2462 }, { "epoch": 2.32, "grad_norm": 8.461514587082474, "learning_rate": 6.377138862271472e-08, "logps/chosen": -42.78850555419922, "logps/rejected": -88.81306457519531, "loss": 0.0883, "losses/dpo": 0.22098271548748016, "losses/sft": 2.671246290206909, "losses/total": 0.22098271548748016, "ref_logps/chosen": -24.31090545654297, "ref_logps/rejected": -35.99437713623047, "rewards/accuracies": 1.0, "rewards/chosen": -1.8477602005004883, "rewards/margins": 3.4341089725494385, "rewards/rejected": -5.281868934631348, "step": 2463 }, { "epoch": 2.32, "grad_norm": 16.098986068030538, "learning_rate": 6.360158052934395e-08, "logps/chosen": -52.64945983886719, "logps/rejected": -91.68144226074219, "loss": 0.1675, "losses/dpo": 0.020064378157258034, "losses/sft": 1.6796029806137085, "losses/total": 0.020064378157258034, "ref_logps/chosen": -29.289047241210938, "ref_logps/rejected": -37.30128479003906, "rewards/accuracies": 0.875, "rewards/chosen": -2.336040735244751, "rewards/margins": 3.1019749641418457, "rewards/rejected": -5.438015937805176, "step": 2464 }, { "epoch": 2.33, "grad_norm": 12.782891707606785, "learning_rate": 6.343196586123561e-08, "logps/chosen": -51.71548080444336, "logps/rejected": -86.4343490600586, "loss": 0.127, "losses/dpo": 0.11841075867414474, "losses/sft": 1.8684957027435303, "losses/total": 0.11841075867414474, "ref_logps/chosen": -28.885072708129883, "ref_logps/rejected": -35.226348876953125, "rewards/accuracies": 1.0, "rewards/chosen": -2.2830405235290527, "rewards/margins": 2.837759494781494, "rewards/rejected": -5.120800018310547, "step": 2465 }, { "epoch": 2.33, "grad_norm": 9.583259073505548, "learning_rate": 6.326254479439835e-08, "logps/chosen": -60.91111755371094, "logps/rejected": -94.30615997314453, "loss": 0.1026, "losses/dpo": 0.31525006890296936, "losses/sft": 1.2225524187088013, "losses/total": 0.31525006890296936, "ref_logps/chosen": -33.18967819213867, "ref_logps/rejected": -38.24048614501953, "rewards/accuracies": 1.0, "rewards/chosen": -2.772143840789795, "rewards/margins": 2.834423542022705, "rewards/rejected": -5.6065673828125, "step": 2466 }, { "epoch": 2.33, "grad_norm": 6.8763681232885565, "learning_rate": 6.309331750464023e-08, "logps/chosen": -49.30305480957031, "logps/rejected": -93.39641571044922, "loss": 0.0803, "losses/dpo": 0.008549025282263756, "losses/sft": 1.5419135093688965, "losses/total": 0.008549025282263756, "ref_logps/chosen": -28.18161964416504, "ref_logps/rejected": -34.68167495727539, "rewards/accuracies": 1.0, "rewards/chosen": -2.1121435165405273, "rewards/margins": 3.7593307495117188, "rewards/rejected": -5.871474266052246, "step": 2467 }, { "epoch": 2.33, "grad_norm": 21.001305401724064, "learning_rate": 6.292428416756789e-08, "logps/chosen": -52.97474670410156, "logps/rejected": -87.66156005859375, "loss": 0.2511, "losses/dpo": 0.49325263500213623, "losses/sft": 1.4493902921676636, "losses/total": 0.49325263500213623, "ref_logps/chosen": -26.289520263671875, "ref_logps/rejected": -35.65961456298828, "rewards/accuracies": 0.9375, "rewards/chosen": -2.668522834777832, "rewards/margins": 2.531672239303589, "rewards/rejected": -5.2001953125, "step": 2468 }, { "epoch": 2.33, "grad_norm": 12.198363409152414, "learning_rate": 6.275544495858703e-08, "logps/chosen": -47.226070404052734, "logps/rejected": -78.83092498779297, "loss": 0.1456, "losses/dpo": 0.01892193593084812, "losses/sft": 2.6787726879119873, "losses/total": 0.01892193593084812, "ref_logps/chosen": -28.042675018310547, "ref_logps/rejected": -30.181184768676758, "rewards/accuracies": 0.9375, "rewards/chosen": -1.918339729309082, "rewards/margins": 2.946634292602539, "rewards/rejected": -4.864974021911621, "step": 2469 }, { "epoch": 2.33, "grad_norm": 5.702493401800902, "learning_rate": 6.258680005290165e-08, "logps/chosen": -37.62389373779297, "logps/rejected": -79.179931640625, "loss": 0.0571, "losses/dpo": 0.11961573362350464, "losses/sft": 0.8082253336906433, "losses/total": 0.11961573362350464, "ref_logps/chosen": -19.317020416259766, "ref_logps/rejected": -28.966442108154297, "rewards/accuracies": 1.0, "rewards/chosen": -1.8306872844696045, "rewards/margins": 3.190661907196045, "rewards/rejected": -5.0213494300842285, "step": 2470 }, { "epoch": 2.33, "grad_norm": 16.178236856356172, "learning_rate": 6.241834962551432e-08, "logps/chosen": -54.950218200683594, "logps/rejected": -79.47747802734375, "loss": 0.1565, "losses/dpo": 0.014648182317614555, "losses/sft": 1.3320918083190918, "losses/total": 0.014648182317614555, "ref_logps/chosen": -31.565433502197266, "ref_logps/rejected": -30.22665023803711, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3384785652160645, "rewards/margins": 2.586604118347168, "rewards/rejected": -4.925082683563232, "step": 2471 }, { "epoch": 2.33, "grad_norm": 10.063568682701154, "learning_rate": 6.22500938512256e-08, "logps/chosen": -42.57269287109375, "logps/rejected": -67.95745849609375, "loss": 0.1257, "losses/dpo": 0.24035413563251495, "losses/sft": 1.9734654426574707, "losses/total": 0.24035413563251495, "ref_logps/chosen": -25.03997230529785, "ref_logps/rejected": -24.70186424255371, "rewards/accuracies": 1.0, "rewards/chosen": -1.7532718181610107, "rewards/margins": 2.5722880363464355, "rewards/rejected": -4.325559616088867, "step": 2472 }, { "epoch": 2.33, "grad_norm": 15.684044120773555, "learning_rate": 6.20820329046342e-08, "logps/chosen": -50.887359619140625, "logps/rejected": -91.46089172363281, "loss": 0.1391, "losses/dpo": 0.011105386540293694, "losses/sft": 0.5834668874740601, "losses/total": 0.011105386540293694, "ref_logps/chosen": -28.090072631835938, "ref_logps/rejected": -34.66209411621094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.279728651046753, "rewards/margins": 3.400151252746582, "rewards/rejected": -5.679880142211914, "step": 2473 }, { "epoch": 2.33, "grad_norm": 11.324796618649762, "learning_rate": 6.191416696013674e-08, "logps/chosen": -53.8972053527832, "logps/rejected": -93.39762878417969, "loss": 0.1109, "losses/dpo": 0.0008283862844109535, "losses/sft": 1.7748315334320068, "losses/total": 0.0008283862844109535, "ref_logps/chosen": -27.77981185913086, "ref_logps/rejected": -36.52339172363281, "rewards/accuracies": 1.0, "rewards/chosen": -2.61173939704895, "rewards/margins": 3.0756850242614746, "rewards/rejected": -5.687424659729004, "step": 2474 }, { "epoch": 2.33, "grad_norm": 16.915475654559682, "learning_rate": 6.17464961919272e-08, "logps/chosen": -55.90142822265625, "logps/rejected": -83.30272674560547, "loss": 0.1911, "losses/dpo": 0.1962929219007492, "losses/sft": 0.8188110589981079, "losses/total": 0.1962929219007492, "ref_logps/chosen": -33.30770492553711, "ref_logps/rejected": -35.42737579345703, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2593722343444824, "rewards/margins": 2.528162956237793, "rewards/rejected": -4.787535667419434, "step": 2475 }, { "epoch": 2.34, "grad_norm": 12.239337451132203, "learning_rate": 6.157902077399735e-08, "logps/chosen": -58.49480056762695, "logps/rejected": -86.94174194335938, "loss": 0.1095, "losses/dpo": 0.026830552145838737, "losses/sft": 1.5785197019577026, "losses/total": 0.026830552145838737, "ref_logps/chosen": -34.13359832763672, "ref_logps/rejected": -29.788036346435547, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4361202716827393, "rewards/margins": 3.279250144958496, "rewards/rejected": -5.715370178222656, "step": 2476 }, { "epoch": 2.34, "grad_norm": 16.503782566368088, "learning_rate": 6.141174088013595e-08, "logps/chosen": -47.6297607421875, "logps/rejected": -89.23281860351562, "loss": 0.1754, "losses/dpo": 0.10078131407499313, "losses/sft": 1.5505940914154053, "losses/total": 0.10078131407499313, "ref_logps/chosen": -27.381423950195312, "ref_logps/rejected": -36.58209991455078, "rewards/accuracies": 0.9375, "rewards/chosen": -2.024833917617798, "rewards/margins": 3.2402374744415283, "rewards/rejected": -5.265071392059326, "step": 2477 }, { "epoch": 2.34, "grad_norm": 13.214286142793537, "learning_rate": 6.124465668392914e-08, "logps/chosen": -61.360145568847656, "logps/rejected": -85.51277923583984, "loss": 0.1094, "losses/dpo": 0.07340383529663086, "losses/sft": 0.8070600628852844, "losses/total": 0.07340383529663086, "ref_logps/chosen": -39.00689697265625, "ref_logps/rejected": -32.219398498535156, "rewards/accuracies": 1.0, "rewards/chosen": -2.2353250980377197, "rewards/margins": 3.0940134525299072, "rewards/rejected": -5.329338550567627, "step": 2478 }, { "epoch": 2.34, "grad_norm": 7.997200247119282, "learning_rate": 6.107776835875972e-08, "logps/chosen": -62.05547332763672, "logps/rejected": -103.68057250976562, "loss": 0.064, "losses/dpo": 0.08651916682720184, "losses/sft": 2.9135141372680664, "losses/total": 0.08651916682720184, "ref_logps/chosen": -38.16618347167969, "ref_logps/rejected": -41.33454513549805, "rewards/accuracies": 1.0, "rewards/chosen": -2.388929843902588, "rewards/margins": 3.845673084259033, "rewards/rejected": -6.234602451324463, "step": 2479 }, { "epoch": 2.34, "grad_norm": 16.952418589006857, "learning_rate": 6.091107607780749e-08, "logps/chosen": -47.971351623535156, "logps/rejected": -80.29824829101562, "loss": 0.1731, "losses/dpo": 0.027138330042362213, "losses/sft": 1.4434787034988403, "losses/total": 0.027138330042362213, "ref_logps/chosen": -26.82118797302246, "ref_logps/rejected": -32.88490676879883, "rewards/accuracies": 0.9375, "rewards/chosen": -2.115016222000122, "rewards/margins": 2.6263184547424316, "rewards/rejected": -4.741334915161133, "step": 2480 }, { "epoch": 2.34, "grad_norm": 8.688935270576616, "learning_rate": 6.074458001404859e-08, "logps/chosen": -58.468353271484375, "logps/rejected": -103.06016540527344, "loss": 0.0879, "losses/dpo": 0.18780119717121124, "losses/sft": 2.0836539268493652, "losses/total": 0.18780119717121124, "ref_logps/chosen": -29.70208168029785, "ref_logps/rejected": -40.25128173828125, "rewards/accuracies": 1.0, "rewards/chosen": -2.87662672996521, "rewards/margins": 3.404261350631714, "rewards/rejected": -6.280888557434082, "step": 2481 }, { "epoch": 2.34, "grad_norm": 14.444642860014424, "learning_rate": 6.057828034025566e-08, "logps/chosen": -51.07634353637695, "logps/rejected": -80.97213745117188, "loss": 0.1983, "losses/dpo": 0.020588941872119904, "losses/sft": 1.8583917617797852, "losses/total": 0.020588941872119904, "ref_logps/chosen": -30.23394012451172, "ref_logps/rejected": -32.448055267333984, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0842409133911133, "rewards/margins": 2.76816725730896, "rewards/rejected": -4.852407932281494, "step": 2482 }, { "epoch": 2.34, "grad_norm": 16.336166774845168, "learning_rate": 6.041217722899767e-08, "logps/chosen": -37.1588134765625, "logps/rejected": -67.21138763427734, "loss": 0.2422, "losses/dpo": 0.0932135060429573, "losses/sft": 2.4103260040283203, "losses/total": 0.0932135060429573, "ref_logps/chosen": -18.142343521118164, "ref_logps/rejected": -24.088085174560547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9016470909118652, "rewards/margins": 2.4106831550598145, "rewards/rejected": -4.31233024597168, "step": 2483 }, { "epoch": 2.34, "grad_norm": 7.439935228747468, "learning_rate": 6.024627085263928e-08, "logps/chosen": -38.216064453125, "logps/rejected": -76.37513732910156, "loss": 0.0943, "losses/dpo": 0.0544583797454834, "losses/sft": 0.932412326335907, "losses/total": 0.0544583797454834, "ref_logps/chosen": -23.820737838745117, "ref_logps/rejected": -30.371973037719727, "rewards/accuracies": 1.0, "rewards/chosen": -1.43953275680542, "rewards/margins": 3.1607837677001953, "rewards/rejected": -4.600317001342773, "step": 2484 }, { "epoch": 2.34, "grad_norm": 13.76886062819182, "learning_rate": 6.008056138334139e-08, "logps/chosen": -58.3422966003418, "logps/rejected": -96.5542984008789, "loss": 0.1455, "losses/dpo": 0.6601214408874512, "losses/sft": 2.688690423965454, "losses/total": 0.6601214408874512, "ref_logps/chosen": -28.782699584960938, "ref_logps/rejected": -35.59059143066406, "rewards/accuracies": 1.0, "rewards/chosen": -2.9559597969055176, "rewards/margins": 3.140410900115967, "rewards/rejected": -6.096370697021484, "step": 2485 }, { "epoch": 2.35, "grad_norm": 13.527443083775118, "learning_rate": 5.991504899306022e-08, "logps/chosen": -45.1123046875, "logps/rejected": -78.11001586914062, "loss": 0.1458, "losses/dpo": 0.0074897767044603825, "losses/sft": 1.3507438898086548, "losses/total": 0.0074897767044603825, "ref_logps/chosen": -24.874359130859375, "ref_logps/rejected": -29.65108299255371, "rewards/accuracies": 1.0, "rewards/chosen": -2.023794174194336, "rewards/margins": 2.822099208831787, "rewards/rejected": -4.845893383026123, "step": 2486 }, { "epoch": 2.35, "grad_norm": 6.153190515435214, "learning_rate": 5.974973385354778e-08, "logps/chosen": -54.067771911621094, "logps/rejected": -101.79252624511719, "loss": 0.05, "losses/dpo": 0.09270991384983063, "losses/sft": 2.1182899475097656, "losses/total": 0.09270991384983063, "ref_logps/chosen": -29.482959747314453, "ref_logps/rejected": -40.97761535644531, "rewards/accuracies": 1.0, "rewards/chosen": -2.4584813117980957, "rewards/margins": 3.6230101585388184, "rewards/rejected": -6.081491470336914, "step": 2487 }, { "epoch": 2.35, "grad_norm": 8.371851348961636, "learning_rate": 5.9584616136351126e-08, "logps/chosen": -50.37097930908203, "logps/rejected": -116.90811157226562, "loss": 0.0711, "losses/dpo": 0.06671243160963058, "losses/sft": 0.42193037271499634, "losses/total": 0.06671243160963058, "ref_logps/chosen": -26.4874267578125, "ref_logps/rejected": -49.73341369628906, "rewards/accuracies": 1.0, "rewards/chosen": -2.388355255126953, "rewards/margins": 4.329115390777588, "rewards/rejected": -6.717470645904541, "step": 2488 }, { "epoch": 2.35, "grad_norm": 10.203195863761373, "learning_rate": 5.941969601281272e-08, "logps/chosen": -54.625709533691406, "logps/rejected": -91.72148895263672, "loss": 0.0903, "losses/dpo": 0.36963939666748047, "losses/sft": 1.8543617725372314, "losses/total": 0.36963939666748047, "ref_logps/chosen": -31.843677520751953, "ref_logps/rejected": -35.14140319824219, "rewards/accuracies": 1.0, "rewards/chosen": -2.278203248977661, "rewards/margins": 3.379805564880371, "rewards/rejected": -5.658008575439453, "step": 2489 }, { "epoch": 2.35, "grad_norm": 8.183892541194945, "learning_rate": 5.9254973654069675e-08, "logps/chosen": -54.142494201660156, "logps/rejected": -101.15138244628906, "loss": 0.0726, "losses/dpo": 0.02792343497276306, "losses/sft": 0.6435433626174927, "losses/total": 0.02792343497276306, "ref_logps/chosen": -32.32624053955078, "ref_logps/rejected": -41.42847442626953, "rewards/accuracies": 1.0, "rewards/chosen": -2.1816253662109375, "rewards/margins": 3.7906653881073, "rewards/rejected": -5.972290515899658, "step": 2490 }, { "epoch": 2.35, "grad_norm": 7.533861953656046, "learning_rate": 5.909044923105413e-08, "logps/chosen": -43.27823257446289, "logps/rejected": -84.04753112792969, "loss": 0.077, "losses/dpo": 0.060449838638305664, "losses/sft": 1.7553725242614746, "losses/total": 0.060449838638305664, "ref_logps/chosen": -24.63449478149414, "ref_logps/rejected": -31.82433319091797, "rewards/accuracies": 1.0, "rewards/chosen": -1.8643736839294434, "rewards/margins": 3.3579463958740234, "rewards/rejected": -5.222320079803467, "step": 2491 }, { "epoch": 2.35, "grad_norm": 12.744300189827763, "learning_rate": 5.8926122914492835e-08, "logps/chosen": -59.46965789794922, "logps/rejected": -104.09396362304688, "loss": 0.1126, "losses/dpo": 0.028572922572493553, "losses/sft": 1.1692535877227783, "losses/total": 0.028572922572493553, "ref_logps/chosen": -31.306964874267578, "ref_logps/rejected": -42.500797271728516, "rewards/accuracies": 1.0, "rewards/chosen": -2.8162693977355957, "rewards/margins": 3.3430466651916504, "rewards/rejected": -6.159316062927246, "step": 2492 }, { "epoch": 2.35, "grad_norm": 11.25128524507767, "learning_rate": 5.876199487490674e-08, "logps/chosen": -49.78718185424805, "logps/rejected": -92.37328338623047, "loss": 0.1268, "losses/dpo": 0.0041667683981359005, "losses/sft": 1.1314847469329834, "losses/total": 0.0041667683981359005, "ref_logps/chosen": -25.014759063720703, "ref_logps/rejected": -35.7772331237793, "rewards/accuracies": 1.0, "rewards/chosen": -2.4772424697875977, "rewards/margins": 3.182363271713257, "rewards/rejected": -5.659605503082275, "step": 2493 }, { "epoch": 2.35, "grad_norm": 12.222869833120615, "learning_rate": 5.8598065282611296e-08, "logps/chosen": -55.0211181640625, "logps/rejected": -97.90899658203125, "loss": 0.1162, "losses/dpo": 0.10943197458982468, "losses/sft": 4.1409077644348145, "losses/total": 0.10943197458982468, "ref_logps/chosen": -29.15453338623047, "ref_logps/rejected": -42.321311950683594, "rewards/accuracies": 1.0, "rewards/chosen": -2.586658000946045, "rewards/margins": 2.9721109867095947, "rewards/rejected": -5.558769226074219, "step": 2494 }, { "epoch": 2.35, "grad_norm": 11.103489326659604, "learning_rate": 5.843433430771574e-08, "logps/chosen": -47.170379638671875, "logps/rejected": -83.77902221679688, "loss": 0.1293, "losses/dpo": 0.08025161921977997, "losses/sft": 0.6593008637428284, "losses/total": 0.08025161921977997, "ref_logps/chosen": -23.802120208740234, "ref_logps/rejected": -33.27418899536133, "rewards/accuracies": 1.0, "rewards/chosen": -2.3368263244628906, "rewards/margins": 2.7136573791503906, "rewards/rejected": -5.050483703613281, "step": 2495 }, { "epoch": 2.35, "grad_norm": 11.081215513314316, "learning_rate": 5.827080212012359e-08, "logps/chosen": -49.189727783203125, "logps/rejected": -83.03419494628906, "loss": 0.1135, "losses/dpo": 0.29749101400375366, "losses/sft": 0.7074821591377258, "losses/total": 0.29749101400375366, "ref_logps/chosen": -31.325729370117188, "ref_logps/rejected": -33.33237838745117, "rewards/accuracies": 1.0, "rewards/chosen": -1.7864001989364624, "rewards/margins": 3.183781147003174, "rewards/rejected": -4.970181465148926, "step": 2496 }, { "epoch": 2.36, "grad_norm": 9.61406186716922, "learning_rate": 5.810746888953166e-08, "logps/chosen": -61.62954330444336, "logps/rejected": -98.72171020507812, "loss": 0.1058, "losses/dpo": 0.06423758715391159, "losses/sft": 1.855358362197876, "losses/total": 0.06423758715391159, "ref_logps/chosen": -36.47919464111328, "ref_logps/rejected": -42.7855224609375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5150346755981445, "rewards/margins": 3.0785837173461914, "rewards/rejected": -5.593618392944336, "step": 2497 }, { "epoch": 2.36, "grad_norm": 11.200188637130326, "learning_rate": 5.794433478543062e-08, "logps/chosen": -47.914119720458984, "logps/rejected": -82.49176788330078, "loss": 0.1095, "losses/dpo": 0.021217800676822662, "losses/sft": 1.7831709384918213, "losses/total": 0.021217800676822662, "ref_logps/chosen": -24.621274948120117, "ref_logps/rejected": -32.737510681152344, "rewards/accuracies": 1.0, "rewards/chosen": -2.32928466796875, "rewards/margins": 2.6461410522460938, "rewards/rejected": -4.975425720214844, "step": 2498 }, { "epoch": 2.36, "grad_norm": 23.945175847348064, "learning_rate": 5.7781399977104436e-08, "logps/chosen": -49.840579986572266, "logps/rejected": -80.03289031982422, "loss": 0.2378, "losses/dpo": 0.085589699447155, "losses/sft": 0.6738280653953552, "losses/total": 0.085589699447155, "ref_logps/chosen": -25.755483627319336, "ref_logps/rejected": -31.232261657714844, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4085094928741455, "rewards/margins": 2.471553325653076, "rewards/rejected": -4.880063056945801, "step": 2499 }, { "epoch": 2.36, "grad_norm": 13.667960738242193, "learning_rate": 5.761866463363013e-08, "logps/chosen": -41.62419891357422, "logps/rejected": -66.40196228027344, "loss": 0.2162, "losses/dpo": 0.052098147571086884, "losses/sft": 1.780278205871582, "losses/total": 0.052098147571086884, "ref_logps/chosen": -21.16683006286621, "ref_logps/rejected": -26.08302116394043, "rewards/accuracies": 0.9375, "rewards/chosen": -2.045736789703369, "rewards/margins": 1.9861575365066528, "rewards/rejected": -4.031894683837891, "step": 2500 }, { "epoch": 2.36, "grad_norm": 11.22625810253396, "learning_rate": 5.7456128923877936e-08, "logps/chosen": -53.68470001220703, "logps/rejected": -89.39196014404297, "loss": 0.1774, "losses/dpo": 0.01531263254582882, "losses/sft": 2.2463595867156982, "losses/total": 0.01531263254582882, "ref_logps/chosen": -29.015731811523438, "ref_logps/rejected": -35.64305877685547, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4668967723846436, "rewards/margins": 2.9079933166503906, "rewards/rejected": -5.374889850616455, "step": 2501 }, { "epoch": 2.36, "grad_norm": 9.363981248555508, "learning_rate": 5.729379301651072e-08, "logps/chosen": -44.8216438293457, "logps/rejected": -76.57180786132812, "loss": 0.1083, "losses/dpo": 0.14511001110076904, "losses/sft": 0.6695989966392517, "losses/total": 0.14511001110076904, "ref_logps/chosen": -23.353195190429688, "ref_logps/rejected": -27.867847442626953, "rewards/accuracies": 1.0, "rewards/chosen": -2.1468448638916016, "rewards/margins": 2.723550796508789, "rewards/rejected": -4.870395660400391, "step": 2502 }, { "epoch": 2.36, "grad_norm": 11.439537757756304, "learning_rate": 5.713165707998427e-08, "logps/chosen": -52.60350799560547, "logps/rejected": -102.55532836914062, "loss": 0.1068, "losses/dpo": 0.23198938369750977, "losses/sft": 1.6189769506454468, "losses/total": 0.23198938369750977, "ref_logps/chosen": -30.32005500793457, "ref_logps/rejected": -43.214691162109375, "rewards/accuracies": 1.0, "rewards/chosen": -2.2283458709716797, "rewards/margins": 3.705718994140625, "rewards/rejected": -5.9340643882751465, "step": 2503 }, { "epoch": 2.36, "grad_norm": 7.729457551117102, "learning_rate": 5.696972128254654e-08, "logps/chosen": -47.702796936035156, "logps/rejected": -100.13792419433594, "loss": 0.1139, "losses/dpo": 0.00852581113576889, "losses/sft": 0.5011367797851562, "losses/total": 0.00852581113576889, "ref_logps/chosen": -26.839189529418945, "ref_logps/rejected": -38.01255416870117, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0863606929779053, "rewards/margins": 4.126175880432129, "rewards/rejected": -6.212536334991455, "step": 2504 }, { "epoch": 2.36, "grad_norm": 8.474540473901698, "learning_rate": 5.680798579223817e-08, "logps/chosen": -48.407676696777344, "logps/rejected": -87.25175476074219, "loss": 0.0828, "losses/dpo": 0.03406066820025444, "losses/sft": 2.1178066730499268, "losses/total": 0.03406066820025444, "ref_logps/chosen": -25.122377395629883, "ref_logps/rejected": -32.73267364501953, "rewards/accuracies": 1.0, "rewards/chosen": -2.3285298347473145, "rewards/margins": 3.123378276824951, "rewards/rejected": -5.451908111572266, "step": 2505 }, { "epoch": 2.36, "grad_norm": 7.113475526912228, "learning_rate": 5.6646450776891566e-08, "logps/chosen": -52.96394348144531, "logps/rejected": -101.39563751220703, "loss": 0.0765, "losses/dpo": 0.0003459801955614239, "losses/sft": 2.5531342029571533, "losses/total": 0.0003459801955614239, "ref_logps/chosen": -32.016502380371094, "ref_logps/rejected": -40.231040954589844, "rewards/accuracies": 1.0, "rewards/chosen": -2.0947442054748535, "rewards/margins": 4.0217156410217285, "rewards/rejected": -6.116459846496582, "step": 2506 }, { "epoch": 2.37, "grad_norm": 11.81850294207077, "learning_rate": 5.648511640413139e-08, "logps/chosen": -38.14630889892578, "logps/rejected": -69.6519775390625, "loss": 0.1602, "losses/dpo": 0.14984215795993805, "losses/sft": 1.7647110223770142, "losses/total": 0.14984215795993805, "ref_logps/chosen": -19.852397918701172, "ref_logps/rejected": -24.996362686157227, "rewards/accuracies": 1.0, "rewards/chosen": -1.8293914794921875, "rewards/margins": 2.63616943359375, "rewards/rejected": -4.4655609130859375, "step": 2507 }, { "epoch": 2.37, "grad_norm": 8.808393733781381, "learning_rate": 5.632398284137405e-08, "logps/chosen": -56.44976806640625, "logps/rejected": -113.56744384765625, "loss": 0.1004, "losses/dpo": 0.03952431678771973, "losses/sft": 1.748262882232666, "losses/total": 0.03952431678771973, "ref_logps/chosen": -31.917434692382812, "ref_logps/rejected": -48.321746826171875, "rewards/accuracies": 1.0, "rewards/chosen": -2.453233242034912, "rewards/margins": 4.07133674621582, "rewards/rejected": -6.524569511413574, "step": 2508 }, { "epoch": 2.37, "grad_norm": 4.515401686024401, "learning_rate": 5.616305025582735e-08, "logps/chosen": -53.97819519042969, "logps/rejected": -112.7757568359375, "loss": 0.0345, "losses/dpo": 0.03067706525325775, "losses/sft": 1.1158589124679565, "losses/total": 0.03067706525325775, "ref_logps/chosen": -27.641765594482422, "ref_logps/rejected": -46.469276428222656, "rewards/accuracies": 1.0, "rewards/chosen": -2.633643388748169, "rewards/margins": 3.9970054626464844, "rewards/rejected": -6.630648612976074, "step": 2509 }, { "epoch": 2.37, "grad_norm": 11.720691920657828, "learning_rate": 5.6002318814490884e-08, "logps/chosen": -52.55008316040039, "logps/rejected": -90.73937225341797, "loss": 0.1208, "losses/dpo": 0.008625705726444721, "losses/sft": 1.567247748374939, "losses/total": 0.008625705726444721, "ref_logps/chosen": -29.172821044921875, "ref_logps/rejected": -35.77800750732422, "rewards/accuracies": 1.0, "rewards/chosen": -2.33772611618042, "rewards/margins": 3.1584105491638184, "rewards/rejected": -5.496136665344238, "step": 2510 }, { "epoch": 2.37, "grad_norm": 33.43358177975205, "learning_rate": 5.5841788684155205e-08, "logps/chosen": -58.93376922607422, "logps/rejected": -85.39608764648438, "loss": 0.4132, "losses/dpo": 0.0001778889709385112, "losses/sft": 1.0213769674301147, "losses/total": 0.0001778889709385112, "ref_logps/chosen": -30.60880470275879, "ref_logps/rejected": -34.0464973449707, "rewards/accuracies": 0.75, "rewards/chosen": -2.8324966430664062, "rewards/margins": 2.302462100982666, "rewards/rejected": -5.1349592208862305, "step": 2511 }, { "epoch": 2.37, "grad_norm": 8.565078039226524, "learning_rate": 5.5681460031402226e-08, "logps/chosen": -50.434574127197266, "logps/rejected": -89.53651428222656, "loss": 0.1064, "losses/dpo": 0.19870175421237946, "losses/sft": 1.873494267463684, "losses/total": 0.19870175421237946, "ref_logps/chosen": -28.85816764831543, "ref_logps/rejected": -35.20616149902344, "rewards/accuracies": 1.0, "rewards/chosen": -2.1576409339904785, "rewards/margins": 3.2753942012786865, "rewards/rejected": -5.433034896850586, "step": 2512 }, { "epoch": 2.37, "grad_norm": 6.838794119104954, "learning_rate": 5.552133302260453e-08, "logps/chosen": -53.11662673950195, "logps/rejected": -95.49479675292969, "loss": 0.0584, "losses/dpo": 0.09784672409296036, "losses/sft": 2.0882344245910645, "losses/total": 0.09784672409296036, "ref_logps/chosen": -28.76864242553711, "ref_logps/rejected": -33.303733825683594, "rewards/accuracies": 1.0, "rewards/chosen": -2.434798240661621, "rewards/margins": 3.7843079566955566, "rewards/rejected": -6.219106674194336, "step": 2513 }, { "epoch": 2.37, "grad_norm": 16.211414321003264, "learning_rate": 5.536140782392573e-08, "logps/chosen": -50.615135192871094, "logps/rejected": -76.34944915771484, "loss": 0.1944, "losses/dpo": 0.059881117194890976, "losses/sft": 0.58478844165802, "losses/total": 0.059881117194890976, "ref_logps/chosen": -28.482728958129883, "ref_logps/rejected": -28.992891311645508, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2132408618927, "rewards/margins": 2.5224151611328125, "rewards/rejected": -4.735655784606934, "step": 2514 }, { "epoch": 2.37, "grad_norm": 8.552532055698986, "learning_rate": 5.520168460131977e-08, "logps/chosen": -50.13624572753906, "logps/rejected": -91.90261840820312, "loss": 0.0748, "losses/dpo": 0.012623824179172516, "losses/sft": 2.0275096893310547, "losses/total": 0.012623824179172516, "ref_logps/chosen": -25.63751983642578, "ref_logps/rejected": -34.29613494873047, "rewards/accuracies": 1.0, "rewards/chosen": -2.4498724937438965, "rewards/margins": 3.3107757568359375, "rewards/rejected": -5.760648727416992, "step": 2515 }, { "epoch": 2.37, "grad_norm": 4.013096174654395, "learning_rate": 5.504216352053112e-08, "logps/chosen": -59.08881759643555, "logps/rejected": -98.8931884765625, "loss": 0.0329, "losses/dpo": 0.0055832345969974995, "losses/sft": 1.7391122579574585, "losses/total": 0.0055832345969974995, "ref_logps/chosen": -35.637725830078125, "ref_logps/rejected": -32.71720504760742, "rewards/accuracies": 1.0, "rewards/chosen": -2.345109224319458, "rewards/margins": 4.272489547729492, "rewards/rejected": -6.617598533630371, "step": 2516 }, { "epoch": 2.37, "grad_norm": 6.944751398527647, "learning_rate": 5.488284474709459e-08, "logps/chosen": -62.66202163696289, "logps/rejected": -106.7015609741211, "loss": 0.0729, "losses/dpo": 0.022001786157488823, "losses/sft": 2.7758848667144775, "losses/total": 0.022001786157488823, "ref_logps/chosen": -32.863956451416016, "ref_logps/rejected": -42.07421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.979806900024414, "rewards/margins": 3.4829277992248535, "rewards/rejected": -6.462734699249268, "step": 2517 }, { "epoch": 2.38, "grad_norm": 9.477242050142486, "learning_rate": 5.472372844633483e-08, "logps/chosen": -49.700008392333984, "logps/rejected": -90.93949890136719, "loss": 0.0952, "losses/dpo": 0.0743306577205658, "losses/sft": 1.8263269662857056, "losses/total": 0.0743306577205658, "ref_logps/chosen": -28.800148010253906, "ref_logps/rejected": -34.234153747558594, "rewards/accuracies": 1.0, "rewards/chosen": -2.0899860858917236, "rewards/margins": 3.58054780960083, "rewards/rejected": -5.670533657073975, "step": 2518 }, { "epoch": 2.38, "grad_norm": 12.467718148925224, "learning_rate": 5.456481478336661e-08, "logps/chosen": -55.51654815673828, "logps/rejected": -88.55767822265625, "loss": 0.1084, "losses/dpo": 0.022085554897785187, "losses/sft": 0.9626584053039551, "losses/total": 0.022085554897785187, "ref_logps/chosen": -29.923770904541016, "ref_logps/rejected": -33.64368438720703, "rewards/accuracies": 1.0, "rewards/chosen": -2.5592775344848633, "rewards/margins": 2.9321224689483643, "rewards/rejected": -5.491399765014648, "step": 2519 }, { "epoch": 2.38, "grad_norm": 14.252207003422487, "learning_rate": 5.440610392309419e-08, "logps/chosen": -47.694801330566406, "logps/rejected": -82.18034362792969, "loss": 0.1505, "losses/dpo": 0.00022352208907250315, "losses/sft": 3.4584250450134277, "losses/total": 0.00022352208907250315, "ref_logps/chosen": -23.85776138305664, "ref_logps/rejected": -31.50176429748535, "rewards/accuracies": 1.0, "rewards/chosen": -2.3837037086486816, "rewards/margins": 2.6841535568237305, "rewards/rejected": -5.067857265472412, "step": 2520 }, { "epoch": 2.38, "grad_norm": 13.53690346277356, "learning_rate": 5.424759603021165e-08, "logps/chosen": -43.075138092041016, "logps/rejected": -77.91140747070312, "loss": 0.1605, "losses/dpo": 0.034168556332588196, "losses/sft": 1.3451577425003052, "losses/total": 0.034168556332588196, "ref_logps/chosen": -19.787960052490234, "ref_logps/rejected": -29.1668758392334, "rewards/accuracies": 1.0, "rewards/chosen": -2.3287179470062256, "rewards/margins": 2.5457358360290527, "rewards/rejected": -4.874453544616699, "step": 2521 }, { "epoch": 2.38, "grad_norm": 10.75358425305355, "learning_rate": 5.408929126920217e-08, "logps/chosen": -41.61283874511719, "logps/rejected": -78.55885314941406, "loss": 0.1386, "losses/dpo": 0.029829520732164383, "losses/sft": 0.09256710857152939, "losses/total": 0.029829520732164383, "ref_logps/chosen": -19.1347599029541, "ref_logps/rejected": -28.48369598388672, "rewards/accuracies": 1.0, "rewards/chosen": -2.247807741165161, "rewards/margins": 2.7597079277038574, "rewards/rejected": -5.007515907287598, "step": 2522 }, { "epoch": 2.38, "grad_norm": 9.586957432802716, "learning_rate": 5.393118980433839e-08, "logps/chosen": -52.20649719238281, "logps/rejected": -102.22343444824219, "loss": 0.1034, "losses/dpo": 0.17750179767608643, "losses/sft": 0.024680938571691513, "losses/total": 0.17750179767608643, "ref_logps/chosen": -27.883798599243164, "ref_logps/rejected": -42.218753814697266, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4322702884674072, "rewards/margins": 3.5681982040405273, "rewards/rejected": -6.0004682540893555, "step": 2523 }, { "epoch": 2.38, "grad_norm": 9.318144296330253, "learning_rate": 5.377329179968179e-08, "logps/chosen": -46.84856414794922, "logps/rejected": -93.04896545410156, "loss": 0.0839, "losses/dpo": 0.5715543031692505, "losses/sft": 1.68071711063385, "losses/total": 0.5715543031692505, "ref_logps/chosen": -26.377145767211914, "ref_logps/rejected": -35.203941345214844, "rewards/accuracies": 1.0, "rewards/chosen": -2.0471420288085938, "rewards/margins": 3.737360715866089, "rewards/rejected": -5.7845025062561035, "step": 2524 }, { "epoch": 2.38, "grad_norm": 8.296737879290701, "learning_rate": 5.3615597419082834e-08, "logps/chosen": -49.74724578857422, "logps/rejected": -88.47314453125, "loss": 0.0661, "losses/dpo": 0.025082694366574287, "losses/sft": 0.41776448488235474, "losses/total": 0.025082694366574287, "ref_logps/chosen": -27.43016815185547, "ref_logps/rejected": -33.48902130126953, "rewards/accuracies": 1.0, "rewards/chosen": -2.231707811355591, "rewards/margins": 3.2667040824890137, "rewards/rejected": -5.498411655426025, "step": 2525 }, { "epoch": 2.38, "grad_norm": 12.982423299252243, "learning_rate": 5.3458106826180765e-08, "logps/chosen": -35.87092971801758, "logps/rejected": -83.56098937988281, "loss": 0.1255, "losses/dpo": 0.5576165914535522, "losses/sft": 1.8833770751953125, "losses/total": 0.5576165914535522, "ref_logps/chosen": -19.385074615478516, "ref_logps/rejected": -34.62205123901367, "rewards/accuracies": 1.0, "rewards/chosen": -1.6485857963562012, "rewards/margins": 3.2453083992004395, "rewards/rejected": -4.893894195556641, "step": 2526 }, { "epoch": 2.38, "grad_norm": 11.110971103425259, "learning_rate": 5.33008201844031e-08, "logps/chosen": -55.11167907714844, "logps/rejected": -102.92338562011719, "loss": 0.0952, "losses/dpo": 0.03464629873633385, "losses/sft": 0.9240120053291321, "losses/total": 0.03464629873633385, "ref_logps/chosen": -28.93885040283203, "ref_logps/rejected": -40.60065841674805, "rewards/accuracies": 0.9375, "rewards/chosen": -2.617283344268799, "rewards/margins": 3.6149892807006836, "rewards/rejected": -6.232272624969482, "step": 2527 }, { "epoch": 2.38, "grad_norm": 14.67276404050101, "learning_rate": 5.3143737656966e-08, "logps/chosen": -38.92262268066406, "logps/rejected": -74.70950317382812, "loss": 0.1861, "losses/dpo": 0.002676728880032897, "losses/sft": 0.9304791688919067, "losses/total": 0.002676728880032897, "ref_logps/chosen": -21.671680450439453, "ref_logps/rejected": -30.000213623046875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7250943183898926, "rewards/margins": 2.7458343505859375, "rewards/rejected": -4.470928192138672, "step": 2528 }, { "epoch": 2.39, "grad_norm": 13.775657723385114, "learning_rate": 5.2986859406873555e-08, "logps/chosen": -50.7269401550293, "logps/rejected": -80.0516357421875, "loss": 0.1685, "losses/dpo": 0.0540376640856266, "losses/sft": 1.2744781970977783, "losses/total": 0.0540376640856266, "ref_logps/chosen": -28.854326248168945, "ref_logps/rejected": -32.596641540527344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1872611045837402, "rewards/margins": 2.5582380294799805, "rewards/rejected": -4.745499134063721, "step": 2529 }, { "epoch": 2.39, "grad_norm": 8.050024579732577, "learning_rate": 5.2830185596918154e-08, "logps/chosen": -41.045799255371094, "logps/rejected": -67.69140625, "loss": 0.1336, "losses/dpo": 0.0416705496609211, "losses/sft": 1.7220492362976074, "losses/total": 0.0416705496609211, "ref_logps/chosen": -23.267807006835938, "ref_logps/rejected": -24.19548797607422, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7777994871139526, "rewards/margins": 2.5717926025390625, "rewards/rejected": -4.349592208862305, "step": 2530 }, { "epoch": 2.39, "grad_norm": 17.00952725872835, "learning_rate": 5.267371638967971e-08, "logps/chosen": -57.98863220214844, "logps/rejected": -85.09255981445312, "loss": 0.1594, "losses/dpo": 0.36046096682548523, "losses/sft": 1.8373514413833618, "losses/total": 0.36046096682548523, "ref_logps/chosen": -33.56342315673828, "ref_logps/rejected": -32.008384704589844, "rewards/accuracies": 0.9375, "rewards/chosen": -2.442521095275879, "rewards/margins": 2.865896224975586, "rewards/rejected": -5.308417320251465, "step": 2531 }, { "epoch": 2.39, "grad_norm": 11.108610114572286, "learning_rate": 5.2517451947526214e-08, "logps/chosen": -52.2525520324707, "logps/rejected": -98.55172729492188, "loss": 0.1087, "losses/dpo": 0.09333202987909317, "losses/sft": 1.2998175621032715, "losses/total": 0.09333202987909317, "ref_logps/chosen": -26.711387634277344, "ref_logps/rejected": -38.859310150146484, "rewards/accuracies": 1.0, "rewards/chosen": -2.5541164875030518, "rewards/margins": 3.4151251316070557, "rewards/rejected": -5.969241619110107, "step": 2532 }, { "epoch": 2.39, "grad_norm": 8.015837032608239, "learning_rate": 5.2361392432612755e-08, "logps/chosen": -52.89011764526367, "logps/rejected": -94.86116027832031, "loss": 0.0683, "losses/dpo": 0.09276513755321503, "losses/sft": 2.075674057006836, "losses/total": 0.09276513755321503, "ref_logps/chosen": -31.29096794128418, "ref_logps/rejected": -40.218353271484375, "rewards/accuracies": 1.0, "rewards/chosen": -2.159914970397949, "rewards/margins": 3.304365873336792, "rewards/rejected": -5.46428108215332, "step": 2533 }, { "epoch": 2.39, "grad_norm": 13.407456728367611, "learning_rate": 5.22055380068821e-08, "logps/chosen": -48.811424255371094, "logps/rejected": -95.6765365600586, "loss": 0.1014, "losses/dpo": 0.03440171480178833, "losses/sft": 2.8199901580810547, "losses/total": 0.03440171480178833, "ref_logps/chosen": -26.851463317871094, "ref_logps/rejected": -37.20808410644531, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1959962844848633, "rewards/margins": 3.6508493423461914, "rewards/rejected": -5.846845626831055, "step": 2534 }, { "epoch": 2.39, "grad_norm": 9.958752364091294, "learning_rate": 5.20498888320641e-08, "logps/chosen": -55.97610855102539, "logps/rejected": -93.61894989013672, "loss": 0.0841, "losses/dpo": 0.0006501578609459102, "losses/sft": 0.6345639228820801, "losses/total": 0.0006501578609459102, "ref_logps/chosen": -32.96406173706055, "ref_logps/rejected": -37.122283935546875, "rewards/accuracies": 1.0, "rewards/chosen": -2.3012046813964844, "rewards/margins": 3.3484621047973633, "rewards/rejected": -5.649666786193848, "step": 2535 }, { "epoch": 2.39, "grad_norm": 12.481393251019218, "learning_rate": 5.189444506967547e-08, "logps/chosen": -33.84223556518555, "logps/rejected": -76.26316833496094, "loss": 0.2055, "losses/dpo": 0.2476281225681305, "losses/sft": 0.39975225925445557, "losses/total": 0.2476281225681305, "ref_logps/chosen": -16.855270385742188, "ref_logps/rejected": -30.832176208496094, "rewards/accuracies": 0.875, "rewards/chosen": -1.698696255683899, "rewards/margins": 2.844403028488159, "rewards/rejected": -4.543099403381348, "step": 2536 }, { "epoch": 2.39, "grad_norm": 12.605572281660985, "learning_rate": 5.1739206881020016e-08, "logps/chosen": -46.48650360107422, "logps/rejected": -88.34135437011719, "loss": 0.1087, "losses/dpo": 0.002137091476470232, "losses/sft": 0.07751625776290894, "losses/total": 0.002137091476470232, "ref_logps/chosen": -27.9970760345459, "ref_logps/rejected": -35.420555114746094, "rewards/accuracies": 1.0, "rewards/chosen": -1.848942518234253, "rewards/margins": 3.4431371688842773, "rewards/rejected": -5.292079925537109, "step": 2537 }, { "epoch": 2.39, "grad_norm": 11.702519732192124, "learning_rate": 5.158417442718796e-08, "logps/chosen": -51.746910095214844, "logps/rejected": -84.29936218261719, "loss": 0.1223, "losses/dpo": 0.03189888596534729, "losses/sft": 1.3492896556854248, "losses/total": 0.03189888596534729, "ref_logps/chosen": -29.701210021972656, "ref_logps/rejected": -34.6558837890625, "rewards/accuracies": 1.0, "rewards/chosen": -2.2045702934265137, "rewards/margins": 2.759777069091797, "rewards/rejected": -4.964346885681152, "step": 2538 }, { "epoch": 2.4, "grad_norm": 12.18601924926373, "learning_rate": 5.1429347869056285e-08, "logps/chosen": -37.785987854003906, "logps/rejected": -77.19645690917969, "loss": 0.1374, "losses/dpo": 0.3319104313850403, "losses/sft": 0.5070889592170715, "losses/total": 0.3319104313850403, "ref_logps/chosen": -19.18831443786621, "ref_logps/rejected": -26.022960662841797, "rewards/accuracies": 1.0, "rewards/chosen": -1.8597673177719116, "rewards/margins": 3.257582187652588, "rewards/rejected": -5.117349624633789, "step": 2539 }, { "epoch": 2.4, "grad_norm": 13.75280130931429, "learning_rate": 5.127472736728811e-08, "logps/chosen": -52.570308685302734, "logps/rejected": -96.11468505859375, "loss": 0.0825, "losses/dpo": 0.008887202478945255, "losses/sft": 2.8904833793640137, "losses/total": 0.008887202478945255, "ref_logps/chosen": -26.347129821777344, "ref_logps/rejected": -39.05414581298828, "rewards/accuracies": 1.0, "rewards/chosen": -2.6223182678222656, "rewards/margins": 3.083735942840576, "rewards/rejected": -5.706054210662842, "step": 2540 }, { "epoch": 2.4, "grad_norm": 10.714235714925678, "learning_rate": 5.112031308233281e-08, "logps/chosen": -44.72364044189453, "logps/rejected": -72.32769775390625, "loss": 0.1244, "losses/dpo": 0.0672258660197258, "losses/sft": 2.386364698410034, "losses/total": 0.0672258660197258, "ref_logps/chosen": -26.97154998779297, "ref_logps/rejected": -22.931781768798828, "rewards/accuracies": 1.0, "rewards/chosen": -1.7752090692520142, "rewards/margins": 3.1643829345703125, "rewards/rejected": -4.939591407775879, "step": 2541 }, { "epoch": 2.4, "grad_norm": 16.098800356860767, "learning_rate": 5.09661051744259e-08, "logps/chosen": -58.246681213378906, "logps/rejected": -89.69792175292969, "loss": 0.1121, "losses/dpo": 0.025495298206806183, "losses/sft": 1.7726359367370605, "losses/total": 0.025495298206806183, "ref_logps/chosen": -35.03335952758789, "ref_logps/rejected": -32.99957275390625, "rewards/accuracies": 1.0, "rewards/chosen": -2.3213324546813965, "rewards/margins": 3.3485023975372314, "rewards/rejected": -5.669834613800049, "step": 2542 }, { "epoch": 2.4, "grad_norm": 10.67777736137005, "learning_rate": 5.081210380358847e-08, "logps/chosen": -51.15179443359375, "logps/rejected": -102.00736999511719, "loss": 0.103, "losses/dpo": 0.004866909235715866, "losses/sft": 2.5112650394439697, "losses/total": 0.004866909235715866, "ref_logps/chosen": -26.300765991210938, "ref_logps/rejected": -44.19701385498047, "rewards/accuracies": 1.0, "rewards/chosen": -2.485103130340576, "rewards/margins": 3.2959322929382324, "rewards/rejected": -5.781035423278809, "step": 2543 }, { "epoch": 2.4, "grad_norm": 9.349295114425507, "learning_rate": 5.065830912962754e-08, "logps/chosen": -40.93965148925781, "logps/rejected": -81.04048156738281, "loss": 0.115, "losses/dpo": 0.35329023003578186, "losses/sft": 0.6407262086868286, "losses/total": 0.35329023003578186, "ref_logps/chosen": -24.314537048339844, "ref_logps/rejected": -31.861515045166016, "rewards/accuracies": 1.0, "rewards/chosen": -1.6625111103057861, "rewards/margins": 3.255385398864746, "rewards/rejected": -4.917896270751953, "step": 2544 }, { "epoch": 2.4, "grad_norm": 12.096363573194315, "learning_rate": 5.050472131213543e-08, "logps/chosen": -53.288673400878906, "logps/rejected": -88.15020751953125, "loss": 0.1095, "losses/dpo": 0.009176469407975674, "losses/sft": 0.4107613265514374, "losses/total": 0.009176469407975674, "ref_logps/chosen": -29.068824768066406, "ref_logps/rejected": -33.52912902832031, "rewards/accuracies": 1.0, "rewards/chosen": -2.421985149383545, "rewards/margins": 3.0401225090026855, "rewards/rejected": -5.4621076583862305, "step": 2545 }, { "epoch": 2.4, "grad_norm": 12.696821025588504, "learning_rate": 5.035134051049003e-08, "logps/chosen": -57.99869155883789, "logps/rejected": -107.31500244140625, "loss": 0.11, "losses/dpo": 0.7208854556083679, "losses/sft": 1.8334240913391113, "losses/total": 0.7208854556083679, "ref_logps/chosen": -29.93027114868164, "ref_logps/rejected": -41.71564483642578, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8068418502807617, "rewards/margins": 3.753093719482422, "rewards/rejected": -6.559935569763184, "step": 2546 }, { "epoch": 2.4, "grad_norm": 17.21317895618404, "learning_rate": 5.019816688385414e-08, "logps/chosen": -48.604148864746094, "logps/rejected": -73.64396667480469, "loss": 0.2365, "losses/dpo": 0.4291173815727234, "losses/sft": 2.439079523086548, "losses/total": 0.4291173815727234, "ref_logps/chosen": -26.0575008392334, "ref_logps/rejected": -27.31593132019043, "rewards/accuracies": 0.9375, "rewards/chosen": -2.254664659500122, "rewards/margins": 2.3781392574310303, "rewards/rejected": -4.632803916931152, "step": 2547 }, { "epoch": 2.4, "grad_norm": 12.92209240181344, "learning_rate": 5.0045200591175866e-08, "logps/chosen": -56.65664291381836, "logps/rejected": -94.96382141113281, "loss": 0.1157, "losses/dpo": 0.005529471207410097, "losses/sft": 1.2384644746780396, "losses/total": 0.005529471207410097, "ref_logps/chosen": -32.85851287841797, "ref_logps/rejected": -34.51328659057617, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3798131942749023, "rewards/margins": 3.6652400493621826, "rewards/rejected": -6.045053005218506, "step": 2548 }, { "epoch": 2.4, "grad_norm": 14.331825752544123, "learning_rate": 4.98924417911879e-08, "logps/chosen": -52.74739074707031, "logps/rejected": -90.63993072509766, "loss": 0.1824, "losses/dpo": 0.32082822918891907, "losses/sft": 1.8076846599578857, "losses/total": 0.32082822918891907, "ref_logps/chosen": -28.5712890625, "ref_logps/rejected": -37.146697998046875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4176101684570312, "rewards/margins": 2.931713342666626, "rewards/rejected": -5.349323749542236, "step": 2549 }, { "epoch": 2.41, "grad_norm": 9.772526926050446, "learning_rate": 4.973989064240777e-08, "logps/chosen": -52.38165283203125, "logps/rejected": -100.31861877441406, "loss": 0.0785, "losses/dpo": 0.029859749600291252, "losses/sft": 2.088087558746338, "losses/total": 0.029859749600291252, "ref_logps/chosen": -26.23602294921875, "ref_logps/rejected": -35.720115661621094, "rewards/accuracies": 1.0, "rewards/chosen": -2.61456298828125, "rewards/margins": 3.845287799835205, "rewards/rejected": -6.459850311279297, "step": 2550 }, { "epoch": 2.41, "grad_norm": 9.756072265734035, "learning_rate": 4.958754730313758e-08, "logps/chosen": -55.95258331298828, "logps/rejected": -86.77003479003906, "loss": 0.151, "losses/dpo": 0.012003387324512005, "losses/sft": 1.6673554182052612, "losses/total": 0.012003387324512005, "ref_logps/chosen": -29.933589935302734, "ref_logps/rejected": -32.7676887512207, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6018996238708496, "rewards/margins": 2.798335075378418, "rewards/rejected": -5.400234699249268, "step": 2551 }, { "epoch": 2.41, "grad_norm": 10.743765935357976, "learning_rate": 4.9435411931463584e-08, "logps/chosen": -48.730857849121094, "logps/rejected": -87.78193664550781, "loss": 0.1352, "losses/dpo": 0.10449020564556122, "losses/sft": 1.2762911319732666, "losses/total": 0.10449020564556122, "ref_logps/chosen": -23.321353912353516, "ref_logps/rejected": -32.25444030761719, "rewards/accuracies": 1.0, "rewards/chosen": -2.540950059890747, "rewards/margins": 3.0117993354797363, "rewards/rejected": -5.552749156951904, "step": 2552 }, { "epoch": 2.41, "grad_norm": 10.73576526255834, "learning_rate": 4.928348468525648e-08, "logps/chosen": -49.82389831542969, "logps/rejected": -78.47589874267578, "loss": 0.1102, "losses/dpo": 0.13142216205596924, "losses/sft": 0.487385630607605, "losses/total": 0.13142216205596924, "ref_logps/chosen": -27.819683074951172, "ref_logps/rejected": -28.998912811279297, "rewards/accuracies": 1.0, "rewards/chosen": -2.2004218101501465, "rewards/margins": 2.747276782989502, "rewards/rejected": -4.947698593139648, "step": 2553 }, { "epoch": 2.41, "grad_norm": 14.279624639559575, "learning_rate": 4.913176572217065e-08, "logps/chosen": -61.28639221191406, "logps/rejected": -92.97659301757812, "loss": 0.1646, "losses/dpo": 0.0390595979988575, "losses/sft": 1.817474365234375, "losses/total": 0.0390595979988575, "ref_logps/chosen": -32.84109878540039, "ref_logps/rejected": -37.579811096191406, "rewards/accuracies": 0.9375, "rewards/chosen": -2.844529151916504, "rewards/margins": 2.6951487064361572, "rewards/rejected": -5.539677619934082, "step": 2554 }, { "epoch": 2.41, "grad_norm": 14.96721736021692, "learning_rate": 4.898025519964483e-08, "logps/chosen": -55.49047088623047, "logps/rejected": -90.28202056884766, "loss": 0.2029, "losses/dpo": 0.031070727854967117, "losses/sft": 1.6074532270431519, "losses/total": 0.031070727854967117, "ref_logps/chosen": -28.77897834777832, "ref_logps/rejected": -34.43097686767578, "rewards/accuracies": 0.9375, "rewards/chosen": -2.671149253845215, "rewards/margins": 2.9139552116394043, "rewards/rejected": -5.585104465484619, "step": 2555 }, { "epoch": 2.41, "grad_norm": 21.824316001363233, "learning_rate": 4.882895327490097e-08, "logps/chosen": -47.52152633666992, "logps/rejected": -82.9278564453125, "loss": 0.1754, "losses/dpo": 0.021213386207818985, "losses/sft": 1.0018869638442993, "losses/total": 0.021213386207818985, "ref_logps/chosen": -23.299617767333984, "ref_logps/rejected": -26.982402801513672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4221909046173096, "rewards/margins": 3.172354221343994, "rewards/rejected": -5.594545364379883, "step": 2556 }, { "epoch": 2.41, "grad_norm": 5.7041444640788805, "learning_rate": 4.867786010494493e-08, "logps/chosen": -57.691932678222656, "logps/rejected": -100.14796447753906, "loss": 0.0467, "losses/dpo": 0.05729825422167778, "losses/sft": 1.9874383211135864, "losses/total": 0.05729825422167778, "ref_logps/chosen": -36.9215202331543, "ref_logps/rejected": -41.52635192871094, "rewards/accuracies": 1.0, "rewards/chosen": -2.0770411491394043, "rewards/margins": 3.7851202487945557, "rewards/rejected": -5.862161636352539, "step": 2557 }, { "epoch": 2.41, "grad_norm": 11.661418766061509, "learning_rate": 4.852697584656562e-08, "logps/chosen": -47.32194519042969, "logps/rejected": -84.57112121582031, "loss": 0.1073, "losses/dpo": 0.6116894483566284, "losses/sft": 1.554773211479187, "losses/total": 0.6116894483566284, "ref_logps/chosen": -28.642501831054688, "ref_logps/rejected": -29.66653060913086, "rewards/accuracies": 1.0, "rewards/chosen": -1.8679447174072266, "rewards/margins": 3.622514247894287, "rewards/rejected": -5.490459442138672, "step": 2558 }, { "epoch": 2.41, "grad_norm": 9.565601124102502, "learning_rate": 4.8376300656335495e-08, "logps/chosen": -43.7745361328125, "logps/rejected": -78.63570404052734, "loss": 0.1332, "losses/dpo": 0.017182039096951485, "losses/sft": 1.4743989706039429, "losses/total": 0.017182039096951485, "ref_logps/chosen": -23.52145767211914, "ref_logps/rejected": -29.970096588134766, "rewards/accuracies": 1.0, "rewards/chosen": -2.0253076553344727, "rewards/margins": 2.8412537574768066, "rewards/rejected": -4.866561412811279, "step": 2559 }, { "epoch": 2.42, "grad_norm": 11.898386750362231, "learning_rate": 4.822583469060973e-08, "logps/chosen": -53.29932403564453, "logps/rejected": -91.23963928222656, "loss": 0.1297, "losses/dpo": 0.16005249321460724, "losses/sft": 1.4717011451721191, "losses/total": 0.16005249321460724, "ref_logps/chosen": -29.665372848510742, "ref_logps/rejected": -37.4647216796875, "rewards/accuracies": 1.0, "rewards/chosen": -2.3633949756622314, "rewards/margins": 3.014097213745117, "rewards/rejected": -5.3774919509887695, "step": 2560 }, { "epoch": 2.42, "grad_norm": 19.715472732522702, "learning_rate": 4.8075578105526704e-08, "logps/chosen": -55.88805389404297, "logps/rejected": -86.87693786621094, "loss": 0.1703, "losses/dpo": 0.06221839413046837, "losses/sft": 1.790152907371521, "losses/total": 0.06221839413046837, "ref_logps/chosen": -32.54371643066406, "ref_logps/rejected": -37.609527587890625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3344337940216064, "rewards/margins": 2.5923075675964355, "rewards/rejected": -4.926741600036621, "step": 2561 }, { "epoch": 2.42, "grad_norm": 11.538794335993577, "learning_rate": 4.792553105700725e-08, "logps/chosen": -50.07918167114258, "logps/rejected": -88.70314025878906, "loss": 0.1093, "losses/dpo": 0.20474548637866974, "losses/sft": 1.7510005235671997, "losses/total": 0.20474548637866974, "ref_logps/chosen": -26.629962921142578, "ref_logps/rejected": -35.22599792480469, "rewards/accuracies": 1.0, "rewards/chosen": -2.3449220657348633, "rewards/margins": 3.0027916431427, "rewards/rejected": -5.347713470458984, "step": 2562 }, { "epoch": 2.42, "grad_norm": 9.888944809662805, "learning_rate": 4.7775693700754905e-08, "logps/chosen": -37.787384033203125, "logps/rejected": -90.55126953125, "loss": 0.0794, "losses/dpo": 0.005301692057400942, "losses/sft": 0.2316817045211792, "losses/total": 0.005301692057400942, "ref_logps/chosen": -20.534423828125, "ref_logps/rejected": -35.929649353027344, "rewards/accuracies": 1.0, "rewards/chosen": -1.7252957820892334, "rewards/margins": 3.7368662357330322, "rewards/rejected": -5.462162017822266, "step": 2563 }, { "epoch": 2.42, "grad_norm": 9.250867097333082, "learning_rate": 4.7626066192255734e-08, "logps/chosen": -61.643062591552734, "logps/rejected": -98.92178344726562, "loss": 0.1367, "losses/dpo": 0.005997644737362862, "losses/sft": 1.8816196918487549, "losses/total": 0.005997644737362862, "ref_logps/chosen": -32.917598724365234, "ref_logps/rejected": -39.29053497314453, "rewards/accuracies": 0.9375, "rewards/chosen": -2.872546434402466, "rewards/margins": 3.0905776023864746, "rewards/rejected": -5.963123798370361, "step": 2564 }, { "epoch": 2.42, "grad_norm": 7.7300574044571135, "learning_rate": 4.74766486867777e-08, "logps/chosen": -59.223506927490234, "logps/rejected": -90.52142333984375, "loss": 0.079, "losses/dpo": 0.013196777552366257, "losses/sft": 0.8064956068992615, "losses/total": 0.013196777552366257, "ref_logps/chosen": -33.80642318725586, "ref_logps/rejected": -35.42997741699219, "rewards/accuracies": 1.0, "rewards/chosen": -2.541708469390869, "rewards/margins": 2.967435836791992, "rewards/rejected": -5.5091447830200195, "step": 2565 }, { "epoch": 2.42, "grad_norm": 14.240053994040881, "learning_rate": 4.732744133937119e-08, "logps/chosen": -65.79824829101562, "logps/rejected": -102.89645385742188, "loss": 0.1406, "losses/dpo": 0.11815966665744781, "losses/sft": 1.55833899974823, "losses/total": 0.11815966665744781, "ref_logps/chosen": -35.529151916503906, "ref_logps/rejected": -44.491127014160156, "rewards/accuracies": 1.0, "rewards/chosen": -3.026909828186035, "rewards/margins": 2.8136234283447266, "rewards/rejected": -5.840533256530762, "step": 2566 }, { "epoch": 2.42, "grad_norm": 16.08145852830718, "learning_rate": 4.7178444304868284e-08, "logps/chosen": -51.59674835205078, "logps/rejected": -83.01612091064453, "loss": 0.1437, "losses/dpo": 0.4571632742881775, "losses/sft": 1.5191712379455566, "losses/total": 0.4571632742881775, "ref_logps/chosen": -29.921707153320312, "ref_logps/rejected": -31.109519958496094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.167503833770752, "rewards/margins": 3.023155689239502, "rewards/rejected": -5.190659999847412, "step": 2567 }, { "epoch": 2.42, "grad_norm": 11.47011240504692, "learning_rate": 4.702965773788295e-08, "logps/chosen": -49.306785583496094, "logps/rejected": -96.90950775146484, "loss": 0.0922, "losses/dpo": 0.11370083689689636, "losses/sft": 1.6571416854858398, "losses/total": 0.11370083689689636, "ref_logps/chosen": -25.164844512939453, "ref_logps/rejected": -36.08013916015625, "rewards/accuracies": 1.0, "rewards/chosen": -2.414193868637085, "rewards/margins": 3.66874361038208, "rewards/rejected": -6.082937240600586, "step": 2568 }, { "epoch": 2.42, "grad_norm": 14.27536513560572, "learning_rate": 4.6881081792810667e-08, "logps/chosen": -47.34332275390625, "logps/rejected": -94.31884002685547, "loss": 0.1624, "losses/dpo": 0.06147804111242294, "losses/sft": 2.234593391418457, "losses/total": 0.06147804111242294, "ref_logps/chosen": -24.92853546142578, "ref_logps/rejected": -39.202789306640625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2414791584014893, "rewards/margins": 3.2701256275177, "rewards/rejected": -5.511605262756348, "step": 2569 }, { "epoch": 2.42, "grad_norm": 9.389676863877039, "learning_rate": 4.6732716623828385e-08, "logps/chosen": -46.68357849121094, "logps/rejected": -93.79119873046875, "loss": 0.0942, "losses/dpo": 0.0012552151456475258, "losses/sft": 1.9227054119110107, "losses/total": 0.0012552151456475258, "ref_logps/chosen": -22.727500915527344, "ref_logps/rejected": -34.367103576660156, "rewards/accuracies": 1.0, "rewards/chosen": -2.3956079483032227, "rewards/margins": 3.5468015670776367, "rewards/rejected": -5.942409515380859, "step": 2570 }, { "epoch": 2.43, "grad_norm": 14.652934553613079, "learning_rate": 4.658456238489444e-08, "logps/chosen": -56.12397003173828, "logps/rejected": -90.3096923828125, "loss": 0.1435, "losses/dpo": 0.010447688400745392, "losses/sft": 1.3884626626968384, "losses/total": 0.010447688400745392, "ref_logps/chosen": -29.563840866088867, "ref_logps/rejected": -32.51200866699219, "rewards/accuracies": 1.0, "rewards/chosen": -2.656013011932373, "rewards/margins": 3.1237549781799316, "rewards/rejected": -5.779767990112305, "step": 2571 }, { "epoch": 2.43, "grad_norm": 12.881553466902547, "learning_rate": 4.6436619229748034e-08, "logps/chosen": -45.10182189941406, "logps/rejected": -82.27485656738281, "loss": 0.1677, "losses/dpo": 0.19127929210662842, "losses/sft": 2.1255555152893066, "losses/total": 0.19127929210662842, "ref_logps/chosen": -21.300838470458984, "ref_logps/rejected": -29.233747482299805, "rewards/accuracies": 0.9375, "rewards/chosen": -2.380098342895508, "rewards/margins": 2.9240126609802246, "rewards/rejected": -5.304111003875732, "step": 2572 }, { "epoch": 2.43, "grad_norm": 9.793337171993537, "learning_rate": 4.628888731190964e-08, "logps/chosen": -71.11317443847656, "logps/rejected": -94.77876281738281, "loss": 0.0827, "losses/dpo": 0.04818829894065857, "losses/sft": 1.3609886169433594, "losses/total": 0.04818829894065857, "ref_logps/chosen": -43.124454498291016, "ref_logps/rejected": -37.3254508972168, "rewards/accuracies": 1.0, "rewards/chosen": -2.7988715171813965, "rewards/margins": 2.946460723876953, "rewards/rejected": -5.74533224105835, "step": 2573 }, { "epoch": 2.43, "grad_norm": 8.68683936543536, "learning_rate": 4.6141366784680225e-08, "logps/chosen": -49.65240478515625, "logps/rejected": -98.4720230102539, "loss": 0.0917, "losses/dpo": 0.4344930350780487, "losses/sft": 1.4429712295532227, "losses/total": 0.4344930350780487, "ref_logps/chosen": -26.645963668823242, "ref_logps/rejected": -36.15586471557617, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3006443977355957, "rewards/margins": 3.93097186088562, "rewards/rejected": -6.231616020202637, "step": 2574 }, { "epoch": 2.43, "grad_norm": 11.17448549326311, "learning_rate": 4.599405780114166e-08, "logps/chosen": -49.746429443359375, "logps/rejected": -92.4546127319336, "loss": 0.1182, "losses/dpo": 0.3685522675514221, "losses/sft": 2.3588552474975586, "losses/total": 0.3685522675514221, "ref_logps/chosen": -27.830446243286133, "ref_logps/rejected": -40.356815338134766, "rewards/accuracies": 1.0, "rewards/chosen": -2.1915981769561768, "rewards/margins": 3.018181800842285, "rewards/rejected": -5.209779739379883, "step": 2575 }, { "epoch": 2.43, "grad_norm": 10.919190618215545, "learning_rate": 4.584696051415607e-08, "logps/chosen": -52.25593948364258, "logps/rejected": -91.53016662597656, "loss": 0.0943, "losses/dpo": 0.04734724014997482, "losses/sft": 1.6740498542785645, "losses/total": 0.04734724014997482, "ref_logps/chosen": -30.346702575683594, "ref_logps/rejected": -36.20817184448242, "rewards/accuracies": 1.0, "rewards/chosen": -2.1909239292144775, "rewards/margins": 3.341275691986084, "rewards/rejected": -5.532199382781982, "step": 2576 }, { "epoch": 2.43, "grad_norm": 12.05181441050591, "learning_rate": 4.5700075076366115e-08, "logps/chosen": -45.39623260498047, "logps/rejected": -77.2787857055664, "loss": 0.1205, "losses/dpo": 0.07013572752475739, "losses/sft": 0.9261975288391113, "losses/total": 0.07013572752475739, "ref_logps/chosen": -23.009212493896484, "ref_logps/rejected": -29.48380470275879, "rewards/accuracies": 1.0, "rewards/chosen": -2.2387020587921143, "rewards/margins": 2.5407962799072266, "rewards/rejected": -4.779498100280762, "step": 2577 }, { "epoch": 2.43, "grad_norm": 9.796271900823479, "learning_rate": 4.555340164019442e-08, "logps/chosen": -41.74982833862305, "logps/rejected": -79.31538391113281, "loss": 0.0809, "losses/dpo": 0.0039123608730733395, "losses/sft": 1.1744085550308228, "losses/total": 0.0039123608730733395, "ref_logps/chosen": -25.683151245117188, "ref_logps/rejected": -31.100833892822266, "rewards/accuracies": 1.0, "rewards/chosen": -1.6066678762435913, "rewards/margins": 3.214787006378174, "rewards/rejected": -4.8214545249938965, "step": 2578 }, { "epoch": 2.43, "grad_norm": 7.628150930486684, "learning_rate": 4.540694035784373e-08, "logps/chosen": -50.30974578857422, "logps/rejected": -102.34226989746094, "loss": 0.0721, "losses/dpo": 0.01900474913418293, "losses/sft": 2.271937608718872, "losses/total": 0.01900474913418293, "ref_logps/chosen": -26.42770767211914, "ref_logps/rejected": -39.849395751953125, "rewards/accuracies": 1.0, "rewards/chosen": -2.3882036209106445, "rewards/margins": 3.8610832691192627, "rewards/rejected": -6.249287128448486, "step": 2579 }, { "epoch": 2.43, "grad_norm": 10.855322225396414, "learning_rate": 4.5260691381296733e-08, "logps/chosen": -50.554176330566406, "logps/rejected": -72.99510192871094, "loss": 0.1527, "losses/dpo": 0.09109723567962646, "losses/sft": 1.493789553642273, "losses/total": 0.09109723567962646, "ref_logps/chosen": -30.354856491088867, "ref_logps/rejected": -26.035560607910156, "rewards/accuracies": 0.9375, "rewards/chosen": -2.019932270050049, "rewards/margins": 2.676022529602051, "rewards/rejected": -4.6959547996521, "step": 2580 }, { "epoch": 2.43, "grad_norm": 8.141687219014317, "learning_rate": 4.511465486231555e-08, "logps/chosen": -54.34214782714844, "logps/rejected": -98.07677459716797, "loss": 0.0739, "losses/dpo": 0.03617195785045624, "losses/sft": 1.2978075742721558, "losses/total": 0.03617195785045624, "ref_logps/chosen": -30.24698829650879, "ref_logps/rejected": -37.29758834838867, "rewards/accuracies": 1.0, "rewards/chosen": -2.4095163345336914, "rewards/margins": 3.6684019565582275, "rewards/rejected": -6.07791805267334, "step": 2581 }, { "epoch": 2.44, "grad_norm": 15.177027703855257, "learning_rate": 4.496883095244211e-08, "logps/chosen": -52.21919250488281, "logps/rejected": -90.92189025878906, "loss": 0.1481, "losses/dpo": 0.05134037137031555, "losses/sft": 1.749255895614624, "losses/total": 0.05134037137031555, "ref_logps/chosen": -30.397382736206055, "ref_logps/rejected": -37.582855224609375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1821813583374023, "rewards/margins": 3.1517226696014404, "rewards/rejected": -5.333904266357422, "step": 2582 }, { "epoch": 2.44, "grad_norm": 14.617887220067873, "learning_rate": 4.482321980299744e-08, "logps/chosen": -51.27632141113281, "logps/rejected": -93.46284484863281, "loss": 0.1504, "losses/dpo": 0.00016110438446048647, "losses/sft": 0.7826722264289856, "losses/total": 0.00016110438446048647, "ref_logps/chosen": -26.689876556396484, "ref_logps/rejected": -33.69302749633789, "rewards/accuracies": 1.0, "rewards/chosen": -2.458644390106201, "rewards/margins": 3.5183377265930176, "rewards/rejected": -5.976982116699219, "step": 2583 }, { "epoch": 2.44, "grad_norm": 7.243028054560645, "learning_rate": 4.4677821565082126e-08, "logps/chosen": -54.68217468261719, "logps/rejected": -93.85865020751953, "loss": 0.0623, "losses/dpo": 0.01622113026678562, "losses/sft": 1.4722155332565308, "losses/total": 0.01622113026678562, "ref_logps/chosen": -29.777616500854492, "ref_logps/rejected": -33.49024963378906, "rewards/accuracies": 1.0, "rewards/chosen": -2.4904561042785645, "rewards/margins": 3.546383857727051, "rewards/rejected": -6.036839962005615, "step": 2584 }, { "epoch": 2.44, "grad_norm": 15.090848279712336, "learning_rate": 4.453263638957541e-08, "logps/chosen": -44.51856994628906, "logps/rejected": -86.31275939941406, "loss": 0.1529, "losses/dpo": 0.060995154082775116, "losses/sft": 0.19122165441513062, "losses/total": 0.060995154082775116, "ref_logps/chosen": -24.08584213256836, "ref_logps/rejected": -31.97066307067871, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0432729721069336, "rewards/margins": 3.390936851501465, "rewards/rejected": -5.434209823608398, "step": 2585 }, { "epoch": 2.44, "grad_norm": 16.93288002748481, "learning_rate": 4.4387664427135844e-08, "logps/chosen": -50.908599853515625, "logps/rejected": -73.29310607910156, "loss": 0.2217, "losses/dpo": 0.2190237194299698, "losses/sft": 0.3265767991542816, "losses/total": 0.2190237194299698, "ref_logps/chosen": -32.411155700683594, "ref_logps/rejected": -27.680688858032227, "rewards/accuracies": 0.9375, "rewards/chosen": -1.849744439125061, "rewards/margins": 2.711496591567993, "rewards/rejected": -4.561241149902344, "step": 2586 }, { "epoch": 2.44, "grad_norm": 27.045676317342462, "learning_rate": 4.4242905828200395e-08, "logps/chosen": -59.92374038696289, "logps/rejected": -79.13553619384766, "loss": 0.2508, "losses/dpo": 1.3558247089385986, "losses/sft": 1.7524330615997314, "losses/total": 1.3558247089385986, "ref_logps/chosen": -33.929500579833984, "ref_logps/rejected": -28.84128761291504, "rewards/accuracies": 0.875, "rewards/chosen": -2.599423885345459, "rewards/margins": 2.4300007820129395, "rewards/rejected": -5.029424667358398, "step": 2587 }, { "epoch": 2.44, "grad_norm": 14.479744746916536, "learning_rate": 4.4098360742984814e-08, "logps/chosen": -45.317108154296875, "logps/rejected": -78.15686798095703, "loss": 0.1858, "losses/dpo": 0.37352922558784485, "losses/sft": 1.4939039945602417, "losses/total": 0.37352922558784485, "ref_logps/chosen": -25.286638259887695, "ref_logps/rejected": -32.548763275146484, "rewards/accuracies": 1.0, "rewards/chosen": -2.003047466278076, "rewards/margins": 2.5577633380889893, "rewards/rejected": -4.560810565948486, "step": 2588 }, { "epoch": 2.44, "grad_norm": 10.146335411666106, "learning_rate": 4.3954029321483345e-08, "logps/chosen": -59.16047286987305, "logps/rejected": -100.69452667236328, "loss": 0.0592, "losses/dpo": 0.005806533619761467, "losses/sft": 1.5592405796051025, "losses/total": 0.005806533619761467, "ref_logps/chosen": -36.466556549072266, "ref_logps/rejected": -38.61592483520508, "rewards/accuracies": 1.0, "rewards/chosen": -2.2693917751312256, "rewards/margins": 3.9384689331054688, "rewards/rejected": -6.207860946655273, "step": 2589 }, { "epoch": 2.44, "grad_norm": 14.03120972842, "learning_rate": 4.3809911713468234e-08, "logps/chosen": -32.79899215698242, "logps/rejected": -66.8756103515625, "loss": 0.1556, "losses/dpo": 0.27600544691085815, "losses/sft": 0.9589486122131348, "losses/total": 0.27600544691085815, "ref_logps/chosen": -16.445423126220703, "ref_logps/rejected": -21.536972045898438, "rewards/accuracies": 1.0, "rewards/chosen": -1.6353572607040405, "rewards/margins": 2.8985066413879395, "rewards/rejected": -4.5338640213012695, "step": 2590 }, { "epoch": 2.44, "grad_norm": 14.964549782999638, "learning_rate": 4.366600806849019e-08, "logps/chosen": -53.715240478515625, "logps/rejected": -99.16275024414062, "loss": 0.151, "losses/dpo": 0.11191148310899734, "losses/sft": 2.5380358695983887, "losses/total": 0.11191148310899734, "ref_logps/chosen": -28.091711044311523, "ref_logps/rejected": -42.259613037109375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5623531341552734, "rewards/margins": 3.127960443496704, "rewards/rejected": -5.690313339233398, "step": 2591 }, { "epoch": 2.45, "grad_norm": 16.285743877390846, "learning_rate": 4.3522318535877624e-08, "logps/chosen": -52.952186584472656, "logps/rejected": -79.18155670166016, "loss": 0.1965, "losses/dpo": 0.22036102414131165, "losses/sft": 1.2092045545578003, "losses/total": 0.22036102414131165, "ref_logps/chosen": -27.530567169189453, "ref_logps/rejected": -29.558120727539062, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5421621799468994, "rewards/margins": 2.4201812744140625, "rewards/rejected": -4.962343215942383, "step": 2592 }, { "epoch": 2.45, "grad_norm": 8.82559186262943, "learning_rate": 4.337884326473698e-08, "logps/chosen": -61.66985321044922, "logps/rejected": -109.85359191894531, "loss": 0.075, "losses/dpo": 0.18600554764270782, "losses/sft": 0.8924756646156311, "losses/total": 0.18600554764270782, "ref_logps/chosen": -38.132015228271484, "ref_logps/rejected": -49.588096618652344, "rewards/accuracies": 1.0, "rewards/chosen": -2.3537838459014893, "rewards/margins": 3.6727652549743652, "rewards/rejected": -6.026548862457275, "step": 2593 }, { "epoch": 2.45, "grad_norm": 13.27014987060364, "learning_rate": 4.3235582403952114e-08, "logps/chosen": -55.904998779296875, "logps/rejected": -105.64794158935547, "loss": 0.1359, "losses/dpo": 0.00416839262470603, "losses/sft": 1.5465335845947266, "losses/total": 0.00416839262470603, "ref_logps/chosen": -30.79557228088379, "ref_logps/rejected": -44.90947723388672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5109429359436035, "rewards/margins": 3.562903881072998, "rewards/rejected": -6.073846340179443, "step": 2594 }, { "epoch": 2.45, "grad_norm": 20.428418702021613, "learning_rate": 4.309253610218469e-08, "logps/chosen": -52.287803649902344, "logps/rejected": -82.97416687011719, "loss": 0.2186, "losses/dpo": 0.1284637451171875, "losses/sft": 1.9608168601989746, "losses/total": 0.1284637451171875, "ref_logps/chosen": -28.849491119384766, "ref_logps/rejected": -28.24970245361328, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3438310623168945, "rewards/margins": 3.128615379333496, "rewards/rejected": -5.472446441650391, "step": 2595 }, { "epoch": 2.45, "grad_norm": 12.139337884327567, "learning_rate": 4.294970450787339e-08, "logps/chosen": -46.561920166015625, "logps/rejected": -78.93379974365234, "loss": 0.0964, "losses/dpo": 0.08002860844135284, "losses/sft": 1.3986728191375732, "losses/total": 0.08002860844135284, "ref_logps/chosen": -25.6644344329834, "ref_logps/rejected": -25.27840805053711, "rewards/accuracies": 1.0, "rewards/chosen": -2.0897488594055176, "rewards/margins": 3.2757906913757324, "rewards/rejected": -5.36553955078125, "step": 2596 }, { "epoch": 2.45, "grad_norm": 8.591150978882157, "learning_rate": 4.280708776923439e-08, "logps/chosen": -57.99597930908203, "logps/rejected": -107.06019592285156, "loss": 0.0712, "losses/dpo": 0.0011406907578930259, "losses/sft": 2.2863729000091553, "losses/total": 0.0011406907578930259, "ref_logps/chosen": -31.913537979125977, "ref_logps/rejected": -41.52598571777344, "rewards/accuracies": 1.0, "rewards/chosen": -2.6082441806793213, "rewards/margins": 3.9451770782470703, "rewards/rejected": -6.553421497344971, "step": 2597 }, { "epoch": 2.45, "grad_norm": 30.605998306383967, "learning_rate": 4.266468603426082e-08, "logps/chosen": -58.177406311035156, "logps/rejected": -77.76284790039062, "loss": 0.3317, "losses/dpo": 0.07254192233085632, "losses/sft": 1.5934504270553589, "losses/total": 0.07254192233085632, "ref_logps/chosen": -35.103755950927734, "ref_logps/rejected": -29.420181274414062, "rewards/accuracies": 0.75, "rewards/chosen": -2.3073649406433105, "rewards/margins": 2.5269017219543457, "rewards/rejected": -4.834266662597656, "step": 2598 }, { "epoch": 2.45, "grad_norm": 10.605821613012901, "learning_rate": 4.252249945072256e-08, "logps/chosen": -53.22776794433594, "logps/rejected": -98.8343734741211, "loss": 0.0548, "losses/dpo": 0.08002713322639465, "losses/sft": 1.5662713050842285, "losses/total": 0.08002713322639465, "ref_logps/chosen": -33.589168548583984, "ref_logps/rejected": -39.31565856933594, "rewards/accuracies": 1.0, "rewards/chosen": -1.9638596773147583, "rewards/margins": 3.9880120754241943, "rewards/rejected": -5.951871871948242, "step": 2599 }, { "epoch": 2.45, "grad_norm": 16.94622811601375, "learning_rate": 4.2380528166166466e-08, "logps/chosen": -52.44696044921875, "logps/rejected": -87.76622009277344, "loss": 0.1783, "losses/dpo": 0.6064998507499695, "losses/sft": 3.023655891418457, "losses/total": 0.6064998507499695, "ref_logps/chosen": -24.207965850830078, "ref_logps/rejected": -32.542572021484375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.823899269104004, "rewards/margins": 2.698465347290039, "rewards/rejected": -5.522364616394043, "step": 2600 }, { "epoch": 2.45, "grad_norm": 10.12183831827525, "learning_rate": 4.22387723279157e-08, "logps/chosen": -44.864906311035156, "logps/rejected": -96.03511047363281, "loss": 0.0782, "losses/dpo": 0.0021337687503546476, "losses/sft": 1.3769079446792603, "losses/total": 0.0021337687503546476, "ref_logps/chosen": -26.15048599243164, "ref_logps/rejected": -39.651611328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.871442198753357, "rewards/margins": 3.7669081687927246, "rewards/rejected": -5.638350486755371, "step": 2601 }, { "epoch": 2.45, "grad_norm": 18.17289419365447, "learning_rate": 4.209723208307017e-08, "logps/chosen": -52.45624923706055, "logps/rejected": -83.76322174072266, "loss": 0.159, "losses/dpo": 0.06982685625553131, "losses/sft": 2.1783041954040527, "losses/total": 0.06982685625553131, "ref_logps/chosen": -29.136831283569336, "ref_logps/rejected": -29.22007942199707, "rewards/accuracies": 0.9375, "rewards/chosen": -2.331942081451416, "rewards/margins": 3.1223721504211426, "rewards/rejected": -5.454314231872559, "step": 2602 }, { "epoch": 2.46, "grad_norm": 8.12809438505887, "learning_rate": 4.195590757850576e-08, "logps/chosen": -39.50404357910156, "logps/rejected": -76.49710083007812, "loss": 0.103, "losses/dpo": 0.08084659278392792, "losses/sft": 1.1013925075531006, "losses/total": 0.08084659278392792, "ref_logps/chosen": -24.370561599731445, "ref_logps/rejected": -30.138389587402344, "rewards/accuracies": 1.0, "rewards/chosen": -1.5133479833602905, "rewards/margins": 3.122523546218872, "rewards/rejected": -4.635871410369873, "step": 2603 }, { "epoch": 2.46, "grad_norm": 10.52094751766026, "learning_rate": 4.18147989608747e-08, "logps/chosen": -56.65145492553711, "logps/rejected": -79.73261260986328, "loss": 0.1075, "losses/dpo": 0.0068103596568107605, "losses/sft": 2.400883674621582, "losses/total": 0.0068103596568107605, "ref_logps/chosen": -29.098285675048828, "ref_logps/rejected": -25.150371551513672, "rewards/accuracies": 1.0, "rewards/chosen": -2.755317211151123, "rewards/margins": 2.702907085418701, "rewards/rejected": -5.458224296569824, "step": 2604 }, { "epoch": 2.46, "grad_norm": 12.272808600760033, "learning_rate": 4.1673906376605145e-08, "logps/chosen": -43.41936492919922, "logps/rejected": -83.14106750488281, "loss": 0.1313, "losses/dpo": 0.0272201057523489, "losses/sft": 0.8910903334617615, "losses/total": 0.0272201057523489, "ref_logps/chosen": -22.30083465576172, "ref_logps/rejected": -33.451168060302734, "rewards/accuracies": 0.9375, "rewards/chosen": -2.11185359954834, "rewards/margins": 2.857137441635132, "rewards/rejected": -4.968991279602051, "step": 2605 }, { "epoch": 2.46, "grad_norm": 7.337245853460361, "learning_rate": 4.153322997190095e-08, "logps/chosen": -55.299095153808594, "logps/rejected": -88.11856079101562, "loss": 0.0666, "losses/dpo": 0.006432465277612209, "losses/sft": 1.9760733842849731, "losses/total": 0.006432465277612209, "ref_logps/chosen": -33.15419387817383, "ref_logps/rejected": -29.55999755859375, "rewards/accuracies": 1.0, "rewards/chosen": -2.2144899368286133, "rewards/margins": 3.6413662433624268, "rewards/rejected": -5.855856418609619, "step": 2606 }, { "epoch": 2.46, "grad_norm": 12.940978719940698, "learning_rate": 4.139276989274182e-08, "logps/chosen": -45.28966522216797, "logps/rejected": -72.03724670410156, "loss": 0.1658, "losses/dpo": 0.11762786656618118, "losses/sft": 0.8325470089912415, "losses/total": 0.11762786656618118, "ref_logps/chosen": -24.30261993408203, "ref_logps/rejected": -26.52325439453125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0987043380737305, "rewards/margins": 2.4526944160461426, "rewards/rejected": -4.551398754119873, "step": 2607 }, { "epoch": 2.46, "grad_norm": 11.963293442126147, "learning_rate": 4.125252628488282e-08, "logps/chosen": -50.474666595458984, "logps/rejected": -96.94334411621094, "loss": 0.101, "losses/dpo": 0.09981000423431396, "losses/sft": 2.035552740097046, "losses/total": 0.09981000423431396, "ref_logps/chosen": -31.301172256469727, "ref_logps/rejected": -37.70844268798828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.917349100112915, "rewards/margins": 4.00614070892334, "rewards/rejected": -5.923489570617676, "step": 2608 }, { "epoch": 2.46, "grad_norm": 19.981007471195067, "learning_rate": 4.111249929385458e-08, "logps/chosen": -37.555206298828125, "logps/rejected": -63.57000732421875, "loss": 0.2284, "losses/dpo": 0.5359127521514893, "losses/sft": 0.9503449201583862, "losses/total": 0.5359127521514893, "ref_logps/chosen": -18.12391471862793, "ref_logps/rejected": -22.595504760742188, "rewards/accuracies": 0.875, "rewards/chosen": -1.9431291818618774, "rewards/margins": 2.1543211936950684, "rewards/rejected": -4.097450256347656, "step": 2609 }, { "epoch": 2.46, "grad_norm": 12.25203668298395, "learning_rate": 4.097268906496265e-08, "logps/chosen": -64.97499084472656, "logps/rejected": -101.2823486328125, "loss": 0.1144, "losses/dpo": 0.0002474568609613925, "losses/sft": 1.7721425294876099, "losses/total": 0.0002474568609613925, "ref_logps/chosen": -36.006553649902344, "ref_logps/rejected": -39.9622688293457, "rewards/accuracies": 1.0, "rewards/chosen": -2.8968429565429688, "rewards/margins": 3.23516583442688, "rewards/rejected": -6.1320085525512695, "step": 2610 }, { "epoch": 2.46, "grad_norm": 10.612490028148265, "learning_rate": 4.0833095743288e-08, "logps/chosen": -47.52824020385742, "logps/rejected": -86.60816955566406, "loss": 0.0884, "losses/dpo": 0.05945480987429619, "losses/sft": 2.2431607246398926, "losses/total": 0.05945480987429619, "ref_logps/chosen": -26.69143295288086, "ref_logps/rejected": -34.854549407958984, "rewards/accuracies": 1.0, "rewards/chosen": -2.0836806297302246, "rewards/margins": 3.0916810035705566, "rewards/rejected": -5.175361633300781, "step": 2611 }, { "epoch": 2.46, "grad_norm": 7.509685422721822, "learning_rate": 4.069371947368619e-08, "logps/chosen": -44.575931549072266, "logps/rejected": -95.34391021728516, "loss": 0.08, "losses/dpo": 0.07697848975658417, "losses/sft": 1.1213959455490112, "losses/total": 0.07697848975658417, "ref_logps/chosen": -22.724609375, "ref_logps/rejected": -36.15858459472656, "rewards/accuracies": 1.0, "rewards/chosen": -2.1851320266723633, "rewards/margins": 3.733400821685791, "rewards/rejected": -5.9185333251953125, "step": 2612 }, { "epoch": 2.47, "grad_norm": 9.19167200761221, "learning_rate": 4.0554560400787754e-08, "logps/chosen": -48.80958557128906, "logps/rejected": -120.26585388183594, "loss": 0.0678, "losses/dpo": 0.009068584069609642, "losses/sft": 2.3365745544433594, "losses/total": 0.009068584069609642, "ref_logps/chosen": -26.19810676574707, "ref_logps/rejected": -48.98887634277344, "rewards/accuracies": 1.0, "rewards/chosen": -2.261147975921631, "rewards/margins": 4.866549968719482, "rewards/rejected": -7.127697944641113, "step": 2613 }, { "epoch": 2.47, "grad_norm": 27.080643450909847, "learning_rate": 4.0415618668997845e-08, "logps/chosen": -60.06767654418945, "logps/rejected": -85.28572082519531, "loss": 0.269, "losses/dpo": 0.3251399099826813, "losses/sft": 0.362210214138031, "losses/total": 0.3251399099826813, "ref_logps/chosen": -31.63421630859375, "ref_logps/rejected": -33.014305114746094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.843346118927002, "rewards/margins": 2.383795738220215, "rewards/rejected": -5.227141857147217, "step": 2614 }, { "epoch": 2.47, "grad_norm": 12.002812184825004, "learning_rate": 4.0276894422495875e-08, "logps/chosen": -55.28301239013672, "logps/rejected": -89.57423400878906, "loss": 0.1231, "losses/dpo": 0.27272626757621765, "losses/sft": 2.5501768589019775, "losses/total": 0.27272626757621765, "ref_logps/chosen": -29.94855499267578, "ref_logps/rejected": -31.10499382019043, "rewards/accuracies": 1.0, "rewards/chosen": -2.5334458351135254, "rewards/margins": 3.3134779930114746, "rewards/rejected": -5.846923828125, "step": 2615 }, { "epoch": 2.47, "grad_norm": 8.2757093403984, "learning_rate": 4.0138387805235874e-08, "logps/chosen": -48.89826965332031, "logps/rejected": -82.14445495605469, "loss": 0.0758, "losses/dpo": 0.004826088435947895, "losses/sft": 2.561535120010376, "losses/total": 0.004826088435947895, "ref_logps/chosen": -29.487445831298828, "ref_logps/rejected": -28.425687789916992, "rewards/accuracies": 1.0, "rewards/chosen": -1.9410825967788696, "rewards/margins": 3.4307942390441895, "rewards/rejected": -5.3718767166137695, "step": 2616 }, { "epoch": 2.47, "grad_norm": 12.58300210078696, "learning_rate": 4.000009896094572e-08, "logps/chosen": -52.72842025756836, "logps/rejected": -87.17658996582031, "loss": 0.1306, "losses/dpo": 0.09054283797740936, "losses/sft": 0.25017377734184265, "losses/total": 0.09054283797740936, "ref_logps/chosen": -26.08812713623047, "ref_logps/rejected": -33.13079071044922, "rewards/accuracies": 1.0, "rewards/chosen": -2.664029359817505, "rewards/margins": 2.740550994873047, "rewards/rejected": -5.404580116271973, "step": 2617 }, { "epoch": 2.47, "grad_norm": 10.309624359549733, "learning_rate": 3.98620280331276e-08, "logps/chosen": -54.31832504272461, "logps/rejected": -95.88821411132812, "loss": 0.0974, "losses/dpo": 0.016848597675561905, "losses/sft": 1.7464864253997803, "losses/total": 0.016848597675561905, "ref_logps/chosen": -29.52193832397461, "ref_logps/rejected": -37.62067413330078, "rewards/accuracies": 1.0, "rewards/chosen": -2.4796385765075684, "rewards/margins": 3.3471157550811768, "rewards/rejected": -5.826754570007324, "step": 2618 }, { "epoch": 2.47, "grad_norm": 9.304696611742651, "learning_rate": 3.972417516505736e-08, "logps/chosen": -53.54047393798828, "logps/rejected": -98.81790161132812, "loss": 0.0875, "losses/dpo": 0.11293582618236542, "losses/sft": 2.143489360809326, "losses/total": 0.11293582618236542, "ref_logps/chosen": -25.458820343017578, "ref_logps/rejected": -37.17475128173828, "rewards/accuracies": 1.0, "rewards/chosen": -2.8081650733947754, "rewards/margins": 3.356149673461914, "rewards/rejected": -6.164315223693848, "step": 2619 }, { "epoch": 2.47, "grad_norm": 18.48367593720205, "learning_rate": 3.9586540499784685e-08, "logps/chosen": -44.87347412109375, "logps/rejected": -91.21470642089844, "loss": 0.1501, "losses/dpo": 0.006818925496190786, "losses/sft": 0.938701868057251, "losses/total": 0.006818925496190786, "ref_logps/chosen": -24.798999786376953, "ref_logps/rejected": -35.205101013183594, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0074474811553955, "rewards/margins": 3.5935134887695312, "rewards/rejected": -5.600960731506348, "step": 2620 }, { "epoch": 2.47, "grad_norm": 12.394362126853018, "learning_rate": 3.9449124180132685e-08, "logps/chosen": -51.379966735839844, "logps/rejected": -81.71673583984375, "loss": 0.1185, "losses/dpo": 0.06592794507741928, "losses/sft": 0.8842443227767944, "losses/total": 0.06592794507741928, "ref_logps/chosen": -30.122661590576172, "ref_logps/rejected": -30.153339385986328, "rewards/accuracies": 1.0, "rewards/chosen": -2.125730514526367, "rewards/margins": 3.030608654022217, "rewards/rejected": -5.156338691711426, "step": 2621 }, { "epoch": 2.47, "grad_norm": 22.507594045208617, "learning_rate": 3.9311926348698124e-08, "logps/chosen": -41.1981201171875, "logps/rejected": -75.82054138183594, "loss": 0.2574, "losses/dpo": 1.501495599746704, "losses/sft": 2.6332457065582275, "losses/total": 1.501495599746704, "ref_logps/chosen": -23.93972396850586, "ref_logps/rejected": -29.517780303955078, "rewards/accuracies": 0.875, "rewards/chosen": -1.7258398532867432, "rewards/margins": 2.9044361114501953, "rewards/rejected": -4.630275726318359, "step": 2622 }, { "epoch": 2.47, "grad_norm": 13.736580547220056, "learning_rate": 3.917494714785088e-08, "logps/chosen": -59.34613800048828, "logps/rejected": -84.93162536621094, "loss": 0.1447, "losses/dpo": 0.5490550994873047, "losses/sft": 1.3513484001159668, "losses/total": 0.5490550994873047, "ref_logps/chosen": -29.92303466796875, "ref_logps/rejected": -27.72544288635254, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9423105716705322, "rewards/margins": 2.7783071994781494, "rewards/rejected": -5.72061824798584, "step": 2623 }, { "epoch": 2.48, "grad_norm": 7.582428460563856, "learning_rate": 3.903818671973397e-08, "logps/chosen": -47.642852783203125, "logps/rejected": -101.15375518798828, "loss": 0.0693, "losses/dpo": 0.00047277507837861776, "losses/sft": 1.1133564710617065, "losses/total": 0.00047277507837861776, "ref_logps/chosen": -29.204410552978516, "ref_logps/rejected": -42.688385009765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.8438441753387451, "rewards/margins": 4.002693176269531, "rewards/rejected": -5.846537113189697, "step": 2624 }, { "epoch": 2.48, "grad_norm": 12.894694309366601, "learning_rate": 3.890164520626343e-08, "logps/chosen": -53.41375732421875, "logps/rejected": -92.78626251220703, "loss": 0.1749, "losses/dpo": 0.04343239590525627, "losses/sft": 1.7472223043441772, "losses/total": 0.04343239590525627, "ref_logps/chosen": -28.258586883544922, "ref_logps/rejected": -38.34803771972656, "rewards/accuracies": 0.9375, "rewards/chosen": -2.515517234802246, "rewards/margins": 2.9283056259155273, "rewards/rejected": -5.443822860717773, "step": 2625 }, { "epoch": 2.48, "grad_norm": 11.899000704289254, "learning_rate": 3.876532274912808e-08, "logps/chosen": -61.05156707763672, "logps/rejected": -96.23432922363281, "loss": 0.119, "losses/dpo": 0.4339641332626343, "losses/sft": 1.3875727653503418, "losses/total": 0.4339641332626343, "ref_logps/chosen": -35.87606430053711, "ref_logps/rejected": -41.509735107421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.5175509452819824, "rewards/margins": 2.954908847808838, "rewards/rejected": -5.47245979309082, "step": 2626 }, { "epoch": 2.48, "grad_norm": 13.92800704363627, "learning_rate": 3.8629219489789504e-08, "logps/chosen": -54.916473388671875, "logps/rejected": -88.2210464477539, "loss": 0.1565, "losses/dpo": 0.0732639953494072, "losses/sft": 2.3402090072631836, "losses/total": 0.0732639953494072, "ref_logps/chosen": -30.268587112426758, "ref_logps/rejected": -35.68766784667969, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4647889137268066, "rewards/margins": 2.7885489463806152, "rewards/rejected": -5.253337860107422, "step": 2627 }, { "epoch": 2.48, "grad_norm": 9.739918512938775, "learning_rate": 3.849333556948173e-08, "logps/chosen": -41.180599212646484, "logps/rejected": -92.169921875, "loss": 0.0933, "losses/dpo": 0.04525012522935867, "losses/sft": 0.9331750273704529, "losses/total": 0.04525012522935867, "ref_logps/chosen": -23.997337341308594, "ref_logps/rejected": -39.9053955078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7183265686035156, "rewards/margins": 3.5081262588500977, "rewards/rejected": -5.226452827453613, "step": 2628 }, { "epoch": 2.48, "grad_norm": 4.39058870697694, "learning_rate": 3.8357671129211286e-08, "logps/chosen": -56.234825134277344, "logps/rejected": -103.88525390625, "loss": 0.0373, "losses/dpo": 0.11708486825227737, "losses/sft": 1.0856505632400513, "losses/total": 0.11708486825227737, "ref_logps/chosen": -31.557985305786133, "ref_logps/rejected": -41.15559387207031, "rewards/accuracies": 1.0, "rewards/chosen": -2.467684030532837, "rewards/margins": 3.8052821159362793, "rewards/rejected": -6.272966384887695, "step": 2629 }, { "epoch": 2.48, "grad_norm": 15.114445125931983, "learning_rate": 3.822222630975683e-08, "logps/chosen": -45.4117431640625, "logps/rejected": -82.90974426269531, "loss": 0.1285, "losses/dpo": 0.0041596307419240475, "losses/sft": 1.3365198373794556, "losses/total": 0.0041596307419240475, "ref_logps/chosen": -23.978221893310547, "ref_logps/rejected": -30.787752151489258, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1433522701263428, "rewards/margins": 3.068847179412842, "rewards/rejected": -5.2121992111206055, "step": 2630 }, { "epoch": 2.48, "grad_norm": 8.292377421325908, "learning_rate": 3.8087001251669165e-08, "logps/chosen": -53.24065399169922, "logps/rejected": -101.28495788574219, "loss": 0.0689, "losses/dpo": 0.005061564035713673, "losses/sft": 1.6199276447296143, "losses/total": 0.005061564035713673, "ref_logps/chosen": -27.901649475097656, "ref_logps/rejected": -38.16233444213867, "rewards/accuracies": 1.0, "rewards/chosen": -2.533900260925293, "rewards/margins": 3.77836275100708, "rewards/rejected": -6.312263011932373, "step": 2631 }, { "epoch": 2.48, "grad_norm": 9.382797695915169, "learning_rate": 3.795199609527117e-08, "logps/chosen": -48.95608139038086, "logps/rejected": -85.09114074707031, "loss": 0.0806, "losses/dpo": 0.05530970171093941, "losses/sft": 1.46912682056427, "losses/total": 0.05530970171093941, "ref_logps/chosen": -27.903430938720703, "ref_logps/rejected": -31.881967544555664, "rewards/accuracies": 1.0, "rewards/chosen": -2.1052651405334473, "rewards/margins": 3.2156527042388916, "rewards/rejected": -5.320918083190918, "step": 2632 }, { "epoch": 2.48, "grad_norm": 20.20884364769018, "learning_rate": 3.781721098065729e-08, "logps/chosen": -39.374473571777344, "logps/rejected": -70.61911010742188, "loss": 0.3252, "losses/dpo": 1.7694076299667358, "losses/sft": 1.6637299060821533, "losses/total": 1.7694076299667358, "ref_logps/chosen": -21.96051025390625, "ref_logps/rejected": -32.78803253173828, "rewards/accuracies": 0.8125, "rewards/chosen": -1.741396427154541, "rewards/margins": 2.0417110919952393, "rewards/rejected": -3.783107280731201, "step": 2633 }, { "epoch": 2.48, "grad_norm": 17.21108336924717, "learning_rate": 3.768264604769386e-08, "logps/chosen": -54.49037170410156, "logps/rejected": -83.65675354003906, "loss": 0.2347, "losses/dpo": 0.46051567792892456, "losses/sft": 1.7675354480743408, "losses/total": 0.46051567792892456, "ref_logps/chosen": -28.27248764038086, "ref_logps/rejected": -28.054807662963867, "rewards/accuracies": 0.9375, "rewards/chosen": -2.621788501739502, "rewards/margins": 2.938406467437744, "rewards/rejected": -5.560194492340088, "step": 2634 }, { "epoch": 2.49, "grad_norm": 10.258705620639086, "learning_rate": 3.754830143601856e-08, "logps/chosen": -48.486820220947266, "logps/rejected": -75.41670227050781, "loss": 0.16, "losses/dpo": 0.1245240643620491, "losses/sft": 1.1678003072738647, "losses/total": 0.1245240643620491, "ref_logps/chosen": -27.720090866088867, "ref_logps/rejected": -28.00031280517578, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0766727924346924, "rewards/margins": 2.664966583251953, "rewards/rejected": -4.741639137268066, "step": 2635 }, { "epoch": 2.49, "grad_norm": 10.111904433103495, "learning_rate": 3.741417728504062e-08, "logps/chosen": -59.39948272705078, "logps/rejected": -85.35057067871094, "loss": 0.1085, "losses/dpo": 0.18381258845329285, "losses/sft": 1.7041651010513306, "losses/total": 0.18381258845329285, "ref_logps/chosen": -29.718448638916016, "ref_logps/rejected": -30.601539611816406, "rewards/accuracies": 1.0, "rewards/chosen": -2.9681036472320557, "rewards/margins": 2.5067989826202393, "rewards/rejected": -5.474902153015137, "step": 2636 }, { "epoch": 2.49, "grad_norm": 17.76107205121227, "learning_rate": 3.7280273733940296e-08, "logps/chosen": -54.63761901855469, "logps/rejected": -90.65769958496094, "loss": 0.1627, "losses/dpo": 0.5711193680763245, "losses/sft": 1.3112726211547852, "losses/total": 0.5711193680763245, "ref_logps/chosen": -31.871614456176758, "ref_logps/rejected": -38.583030700683594, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2766005992889404, "rewards/margins": 2.93086576461792, "rewards/rejected": -5.207466125488281, "step": 2637 }, { "epoch": 2.49, "grad_norm": 11.611036833648082, "learning_rate": 3.7146590921669054e-08, "logps/chosen": -52.04790115356445, "logps/rejected": -85.89372253417969, "loss": 0.1214, "losses/dpo": 0.0476919561624527, "losses/sft": 1.9965085983276367, "losses/total": 0.0476919561624527, "ref_logps/chosen": -29.479999542236328, "ref_logps/rejected": -32.62158203125, "rewards/accuracies": 1.0, "rewards/chosen": -2.2567901611328125, "rewards/margins": 3.0704240798950195, "rewards/rejected": -5.327214241027832, "step": 2638 }, { "epoch": 2.49, "grad_norm": 7.586326498103917, "learning_rate": 3.7013128986949386e-08, "logps/chosen": -50.51923370361328, "logps/rejected": -92.4671630859375, "loss": 0.106, "losses/dpo": 0.11612872779369354, "losses/sft": 2.361053705215454, "losses/total": 0.11612872779369354, "ref_logps/chosen": -28.854888916015625, "ref_logps/rejected": -37.737403869628906, "rewards/accuracies": 1.0, "rewards/chosen": -2.1664345264434814, "rewards/margins": 3.306541681289673, "rewards/rejected": -5.472976207733154, "step": 2639 }, { "epoch": 2.49, "grad_norm": 23.785165365828888, "learning_rate": 3.6879888068274304e-08, "logps/chosen": -53.8675537109375, "logps/rejected": -89.50325012207031, "loss": 0.272, "losses/dpo": 0.021677857264876366, "losses/sft": 2.1300699710845947, "losses/total": 0.021677857264876366, "ref_logps/chosen": -27.687314987182617, "ref_logps/rejected": -34.64741134643555, "rewards/accuracies": 0.875, "rewards/chosen": -2.6180238723754883, "rewards/margins": 2.8675601482391357, "rewards/rejected": -5.485584259033203, "step": 2640 }, { "epoch": 2.49, "grad_norm": 8.546313226559903, "learning_rate": 3.67468683039078e-08, "logps/chosen": -37.465057373046875, "logps/rejected": -88.30116271972656, "loss": 0.055, "losses/dpo": 0.2087099850177765, "losses/sft": 0.39377740025520325, "losses/total": 0.2087099850177765, "ref_logps/chosen": -21.272916793823242, "ref_logps/rejected": -31.31125259399414, "rewards/accuracies": 1.0, "rewards/chosen": -1.6192142963409424, "rewards/margins": 4.079776763916016, "rewards/rejected": -5.698991298675537, "step": 2641 }, { "epoch": 2.49, "grad_norm": 14.684703056191514, "learning_rate": 3.661406983188406e-08, "logps/chosen": -44.699951171875, "logps/rejected": -70.4007568359375, "loss": 0.2057, "losses/dpo": 0.6059960126876831, "losses/sft": 1.2432807683944702, "losses/total": 0.6059960126876831, "ref_logps/chosen": -28.354402542114258, "ref_logps/rejected": -30.312908172607422, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6345549821853638, "rewards/margins": 2.3742294311523438, "rewards/rejected": -4.008784294128418, "step": 2642 }, { "epoch": 2.49, "grad_norm": 5.958093523862478, "learning_rate": 3.648149279000795e-08, "logps/chosen": -45.13789749145508, "logps/rejected": -93.2790756225586, "loss": 0.0488, "losses/dpo": 0.010127810761332512, "losses/sft": 1.720119833946228, "losses/total": 0.010127810761332512, "ref_logps/chosen": -23.405868530273438, "ref_logps/rejected": -34.11711883544922, "rewards/accuracies": 1.0, "rewards/chosen": -2.1732029914855957, "rewards/margins": 3.742992877960205, "rewards/rejected": -5.916195869445801, "step": 2643 }, { "epoch": 2.49, "grad_norm": 7.148326251089714, "learning_rate": 3.6349137315854236e-08, "logps/chosen": -46.744834899902344, "logps/rejected": -88.80457305908203, "loss": 0.0965, "losses/dpo": 0.0016183351399376988, "losses/sft": 2.0271048545837402, "losses/total": 0.0016183351399376988, "ref_logps/chosen": -24.914535522460938, "ref_logps/rejected": -31.909320831298828, "rewards/accuracies": 0.9375, "rewards/chosen": -2.183030128479004, "rewards/margins": 3.5064949989318848, "rewards/rejected": -5.6895246505737305, "step": 2644 }, { "epoch": 2.5, "grad_norm": 8.007465937633473, "learning_rate": 3.621700354676807e-08, "logps/chosen": -57.26786804199219, "logps/rejected": -102.14315032958984, "loss": 0.0516, "losses/dpo": 0.0035750074312090874, "losses/sft": 0.38906946778297424, "losses/total": 0.0035750074312090874, "ref_logps/chosen": -37.8175048828125, "ref_logps/rejected": -40.29123306274414, "rewards/accuracies": 1.0, "rewards/chosen": -1.9450366497039795, "rewards/margins": 4.240154266357422, "rewards/rejected": -6.1851911544799805, "step": 2645 }, { "epoch": 2.5, "grad_norm": 10.207334171633365, "learning_rate": 3.608509161986426e-08, "logps/chosen": -41.38456726074219, "logps/rejected": -91.33695220947266, "loss": 0.1077, "losses/dpo": 0.047613903880119324, "losses/sft": 2.3291571140289307, "losses/total": 0.047613903880119324, "ref_logps/chosen": -21.808879852294922, "ref_logps/rejected": -38.55962371826172, "rewards/accuracies": 1.0, "rewards/chosen": -1.9575685262680054, "rewards/margins": 3.320164203643799, "rewards/rejected": -5.277732849121094, "step": 2646 }, { "epoch": 2.5, "grad_norm": 17.117924516522592, "learning_rate": 3.5953401672027625e-08, "logps/chosen": -50.04778289794922, "logps/rejected": -89.35240173339844, "loss": 0.1542, "losses/dpo": 0.28088778257369995, "losses/sft": 1.1872541904449463, "losses/total": 0.28088778257369995, "ref_logps/chosen": -25.949142456054688, "ref_logps/rejected": -38.784828186035156, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4098639488220215, "rewards/margins": 2.6468939781188965, "rewards/rejected": -5.056757926940918, "step": 2647 }, { "epoch": 2.5, "grad_norm": 8.166588975506025, "learning_rate": 3.582193383991261e-08, "logps/chosen": -51.34450912475586, "logps/rejected": -86.17306518554688, "loss": 0.0764, "losses/dpo": 0.013617048040032387, "losses/sft": 1.1037962436676025, "losses/total": 0.013617048040032387, "ref_logps/chosen": -31.402423858642578, "ref_logps/rejected": -31.856475830078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.9942083358764648, "rewards/margins": 3.437450408935547, "rewards/rejected": -5.431658744812012, "step": 2648 }, { "epoch": 2.5, "grad_norm": 7.374726210890184, "learning_rate": 3.569068825994298e-08, "logps/chosen": -55.06612777709961, "logps/rejected": -91.62129211425781, "loss": 0.0812, "losses/dpo": 0.004949949216097593, "losses/sft": 2.0828511714935303, "losses/total": 0.004949949216097593, "ref_logps/chosen": -32.60859680175781, "ref_logps/rejected": -35.277381896972656, "rewards/accuracies": 1.0, "rewards/chosen": -2.245753049850464, "rewards/margins": 3.3886380195617676, "rewards/rejected": -5.634390830993652, "step": 2649 }, { "epoch": 2.5, "grad_norm": 22.735747086195293, "learning_rate": 3.5559665068312093e-08, "logps/chosen": -56.92021179199219, "logps/rejected": -95.34210205078125, "loss": 0.2562, "losses/dpo": 0.6069148778915405, "losses/sft": 0.9762298464775085, "losses/total": 0.6069148778915405, "ref_logps/chosen": -31.590852737426758, "ref_logps/rejected": -36.753475189208984, "rewards/accuracies": 0.8125, "rewards/chosen": -2.532935857772827, "rewards/margins": 3.325927495956421, "rewards/rejected": -5.858863353729248, "step": 2650 }, { "epoch": 2.5, "grad_norm": 5.366486561973608, "learning_rate": 3.542886440098244e-08, "logps/chosen": -56.974205017089844, "logps/rejected": -88.9166259765625, "loss": 0.0437, "losses/dpo": 0.016014987602829933, "losses/sft": 2.311936616897583, "losses/total": 0.016014987602829933, "ref_logps/chosen": -33.49897766113281, "ref_logps/rejected": -28.4985294342041, "rewards/accuracies": 1.0, "rewards/chosen": -2.347522497177124, "rewards/margins": 3.694286346435547, "rewards/rejected": -6.04180908203125, "step": 2651 }, { "epoch": 2.5, "grad_norm": 12.938143486244012, "learning_rate": 3.5298286393685675e-08, "logps/chosen": -50.68658447265625, "logps/rejected": -91.26463317871094, "loss": 0.138, "losses/dpo": 0.020939258858561516, "losses/sft": 3.2031219005584717, "losses/total": 0.020939258858561516, "ref_logps/chosen": -28.82832908630371, "ref_logps/rejected": -35.37699890136719, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1858253479003906, "rewards/margins": 3.4029388427734375, "rewards/rejected": -5.588764190673828, "step": 2652 }, { "epoch": 2.5, "grad_norm": 5.141400499738, "learning_rate": 3.5167931181922185e-08, "logps/chosen": -48.598941802978516, "logps/rejected": -97.4646987915039, "loss": 0.0556, "losses/dpo": 0.03176893666386604, "losses/sft": 0.9512225985527039, "losses/total": 0.03176893666386604, "ref_logps/chosen": -26.292282104492188, "ref_logps/rejected": -38.44354248046875, "rewards/accuracies": 1.0, "rewards/chosen": -2.230665683746338, "rewards/margins": 3.671450138092041, "rewards/rejected": -5.902115821838379, "step": 2653 }, { "epoch": 2.5, "grad_norm": 12.11188688400505, "learning_rate": 3.503779890096148e-08, "logps/chosen": -41.05271911621094, "logps/rejected": -72.87355041503906, "loss": 0.112, "losses/dpo": 0.0436759889125824, "losses/sft": 1.1732561588287354, "losses/total": 0.0436759889125824, "ref_logps/chosen": -20.395401000976562, "ref_logps/rejected": -23.199569702148438, "rewards/accuracies": 0.9375, "rewards/chosen": -2.065732002258301, "rewards/margins": 2.9016659259796143, "rewards/rejected": -4.967398643493652, "step": 2654 }, { "epoch": 2.5, "grad_norm": 4.90769391581424, "learning_rate": 3.490788968584138e-08, "logps/chosen": -43.07527160644531, "logps/rejected": -92.87586975097656, "loss": 0.0395, "losses/dpo": 0.1075555607676506, "losses/sft": 1.825156569480896, "losses/total": 0.1075555607676506, "ref_logps/chosen": -21.152433395385742, "ref_logps/rejected": -31.83258628845215, "rewards/accuracies": 1.0, "rewards/chosen": -2.192283868789673, "rewards/margins": 3.912043809890747, "rewards/rejected": -6.10432767868042, "step": 2655 }, { "epoch": 2.51, "grad_norm": 8.29721321559461, "learning_rate": 3.4778203671368575e-08, "logps/chosen": -44.15467071533203, "logps/rejected": -92.93818664550781, "loss": 0.0646, "losses/dpo": 0.12334519624710083, "losses/sft": 1.755689263343811, "losses/total": 0.12334519624710083, "ref_logps/chosen": -24.75591278076172, "ref_logps/rejected": -36.13294982910156, "rewards/accuracies": 1.0, "rewards/chosen": -1.9398759603500366, "rewards/margins": 3.740647554397583, "rewards/rejected": -5.680523872375488, "step": 2656 }, { "epoch": 2.51, "grad_norm": 23.746092407078905, "learning_rate": 3.464874099211787e-08, "logps/chosen": -49.58080291748047, "logps/rejected": -80.74503326416016, "loss": 0.2172, "losses/dpo": 0.24394358694553375, "losses/sft": 1.262919306755066, "losses/total": 0.24394358694553375, "ref_logps/chosen": -25.276691436767578, "ref_logps/rejected": -27.431507110595703, "rewards/accuracies": 0.875, "rewards/chosen": -2.4304113388061523, "rewards/margins": 2.9009413719177246, "rewards/rejected": -5.331352710723877, "step": 2657 }, { "epoch": 2.51, "grad_norm": 16.631357821143563, "learning_rate": 3.4519501782432514e-08, "logps/chosen": -58.412750244140625, "logps/rejected": -111.90633392333984, "loss": 0.1377, "losses/dpo": 0.012635301798582077, "losses/sft": 1.7626612186431885, "losses/total": 0.012635301798582077, "ref_logps/chosen": -33.81650161743164, "ref_logps/rejected": -48.87652587890625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.459625005722046, "rewards/margins": 3.843355655670166, "rewards/rejected": -6.302980899810791, "step": 2658 }, { "epoch": 2.51, "grad_norm": 9.911046681700558, "learning_rate": 3.439048617642368e-08, "logps/chosen": -51.685218811035156, "logps/rejected": -96.52110290527344, "loss": 0.1128, "losses/dpo": 0.14987969398498535, "losses/sft": 0.3211422264575958, "losses/total": 0.14987969398498535, "ref_logps/chosen": -28.53314208984375, "ref_logps/rejected": -37.47174072265625, "rewards/accuracies": 1.0, "rewards/chosen": -2.3152077198028564, "rewards/margins": 3.589728593826294, "rewards/rejected": -5.90493631362915, "step": 2659 }, { "epoch": 2.51, "grad_norm": 17.19924346947107, "learning_rate": 3.4261694307970626e-08, "logps/chosen": -53.2867546081543, "logps/rejected": -81.69369506835938, "loss": 0.1696, "losses/dpo": 0.8073215484619141, "losses/sft": 2.019181251525879, "losses/total": 0.8073215484619141, "ref_logps/chosen": -31.508651733398438, "ref_logps/rejected": -31.961380004882812, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1778106689453125, "rewards/margins": 2.7954211235046387, "rewards/rejected": -4.973231315612793, "step": 2660 }, { "epoch": 2.51, "grad_norm": 11.672458299545585, "learning_rate": 3.4133126310720516e-08, "logps/chosen": -54.67784118652344, "logps/rejected": -78.70245361328125, "loss": 0.1719, "losses/dpo": 0.2155279964208603, "losses/sft": 1.868168592453003, "losses/total": 0.2155279964208603, "ref_logps/chosen": -33.49391555786133, "ref_logps/rejected": -28.75627899169922, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1183927059173584, "rewards/margins": 2.8762240409851074, "rewards/rejected": -4.994616985321045, "step": 2661 }, { "epoch": 2.51, "grad_norm": 14.255293273262351, "learning_rate": 3.400478231808795e-08, "logps/chosen": -39.89396667480469, "logps/rejected": -96.01268768310547, "loss": 0.1691, "losses/dpo": 0.20448313653469086, "losses/sft": 0.485235333442688, "losses/total": 0.20448313653469086, "ref_logps/chosen": -20.207096099853516, "ref_logps/rejected": -38.83936309814453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9686870574951172, "rewards/margins": 3.748645305633545, "rewards/rejected": -5.717332363128662, "step": 2662 }, { "epoch": 2.51, "grad_norm": 13.077290017610544, "learning_rate": 3.387666246325535e-08, "logps/chosen": -56.57677459716797, "logps/rejected": -85.63571166992188, "loss": 0.1403, "losses/dpo": 0.06933525949716568, "losses/sft": 2.4461424350738525, "losses/total": 0.06933525949716568, "ref_logps/chosen": -31.862773895263672, "ref_logps/rejected": -32.527099609375, "rewards/accuracies": 1.0, "rewards/chosen": -2.471400260925293, "rewards/margins": 2.839460849761963, "rewards/rejected": -5.310861110687256, "step": 2663 }, { "epoch": 2.51, "grad_norm": 14.951355630740586, "learning_rate": 3.374876687917233e-08, "logps/chosen": -58.67634582519531, "logps/rejected": -99.54553985595703, "loss": 0.1106, "losses/dpo": 0.0036805481649935246, "losses/sft": 1.9677447080612183, "losses/total": 0.0036805481649935246, "ref_logps/chosen": -29.467529296875, "ref_logps/rejected": -37.16237258911133, "rewards/accuracies": 1.0, "rewards/chosen": -2.920881986618042, "rewards/margins": 3.317434787750244, "rewards/rejected": -6.238317012786865, "step": 2664 }, { "epoch": 2.51, "grad_norm": 20.074824288199768, "learning_rate": 3.362109569855601e-08, "logps/chosen": -41.634117126464844, "logps/rejected": -74.7257080078125, "loss": 0.1804, "losses/dpo": 0.22685353457927704, "losses/sft": 0.468039870262146, "losses/total": 0.22685353457927704, "ref_logps/chosen": -21.92133903503418, "ref_logps/rejected": -28.974319458007812, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9712775945663452, "rewards/margins": 2.60386061668396, "rewards/rejected": -4.575138092041016, "step": 2665 }, { "epoch": 2.52, "grad_norm": 10.50880169066449, "learning_rate": 3.349364905389032e-08, "logps/chosen": -49.68889617919922, "logps/rejected": -73.45954895019531, "loss": 0.1319, "losses/dpo": 0.08537129312753677, "losses/sft": 1.3250240087509155, "losses/total": 0.08537129312753677, "ref_logps/chosen": -29.249366760253906, "ref_logps/rejected": -25.32094955444336, "rewards/accuracies": 1.0, "rewards/chosen": -2.0439529418945312, "rewards/margins": 2.7699074745178223, "rewards/rejected": -4.8138604164123535, "step": 2666 }, { "epoch": 2.52, "grad_norm": 7.922051015670692, "learning_rate": 3.3366427077426596e-08, "logps/chosen": -54.58648681640625, "logps/rejected": -113.03430938720703, "loss": 0.0587, "losses/dpo": 0.02879534289240837, "losses/sft": 1.9441862106323242, "losses/total": 0.02879534289240837, "ref_logps/chosen": -27.000410079956055, "ref_logps/rejected": -47.17523193359375, "rewards/accuracies": 1.0, "rewards/chosen": -2.758607864379883, "rewards/margins": 3.8273000717163086, "rewards/rejected": -6.585907936096191, "step": 2667 }, { "epoch": 2.52, "grad_norm": 8.414087329532034, "learning_rate": 3.3239429901182644e-08, "logps/chosen": -61.18140411376953, "logps/rejected": -89.89410400390625, "loss": 0.0675, "losses/dpo": 0.02761172503232956, "losses/sft": 1.2201838493347168, "losses/total": 0.02761172503232956, "ref_logps/chosen": -40.15367126464844, "ref_logps/rejected": -33.91158676147461, "rewards/accuracies": 1.0, "rewards/chosen": -2.102773427963257, "rewards/margins": 3.4954781532287598, "rewards/rejected": -5.5982513427734375, "step": 2668 }, { "epoch": 2.52, "grad_norm": 8.225102335878095, "learning_rate": 3.311265765694327e-08, "logps/chosen": -39.95682907104492, "logps/rejected": -84.28788757324219, "loss": 0.0976, "losses/dpo": 0.009277494624257088, "losses/sft": 0.8650946021080017, "losses/total": 0.009277494624257088, "ref_logps/chosen": -21.277137756347656, "ref_logps/rejected": -31.670948028564453, "rewards/accuracies": 1.0, "rewards/chosen": -1.867969274520874, "rewards/margins": 3.3937244415283203, "rewards/rejected": -5.261693477630615, "step": 2669 }, { "epoch": 2.52, "grad_norm": 7.0335077782735755, "learning_rate": 3.298611047625982e-08, "logps/chosen": -58.71772384643555, "logps/rejected": -93.85340118408203, "loss": 0.0627, "losses/dpo": 0.007819823920726776, "losses/sft": 1.6241166591644287, "losses/total": 0.007819823920726776, "ref_logps/chosen": -28.819847106933594, "ref_logps/rejected": -31.143352508544922, "rewards/accuracies": 1.0, "rewards/chosen": -2.9897875785827637, "rewards/margins": 3.281217098236084, "rewards/rejected": -6.271005153656006, "step": 2670 }, { "epoch": 2.52, "grad_norm": 12.07160200223332, "learning_rate": 3.285978849044996e-08, "logps/chosen": -51.39549255371094, "logps/rejected": -87.22091674804688, "loss": 0.0991, "losses/dpo": 0.275134801864624, "losses/sft": 1.90529465675354, "losses/total": 0.275134801864624, "ref_logps/chosen": -29.991291046142578, "ref_logps/rejected": -31.53472328186035, "rewards/accuracies": 1.0, "rewards/chosen": -2.140420436859131, "rewards/margins": 3.42819881439209, "rewards/rejected": -5.568619251251221, "step": 2671 }, { "epoch": 2.52, "grad_norm": 12.628522469501188, "learning_rate": 3.273369183059782e-08, "logps/chosen": -43.81079864501953, "logps/rejected": -78.60316467285156, "loss": 0.1065, "losses/dpo": 0.022272367030382156, "losses/sft": 1.0159107446670532, "losses/total": 0.022272367030382156, "ref_logps/chosen": -23.190814971923828, "ref_logps/rejected": -26.38561248779297, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0619983673095703, "rewards/margins": 3.159756898880005, "rewards/rejected": -5.221755027770996, "step": 2672 }, { "epoch": 2.52, "grad_norm": 9.238081281376356, "learning_rate": 3.260782062755355e-08, "logps/chosen": -55.383819580078125, "logps/rejected": -79.56793212890625, "loss": 0.1412, "losses/dpo": 0.00857403315603733, "losses/sft": 1.622836947441101, "losses/total": 0.00857403315603733, "ref_logps/chosen": -33.70185852050781, "ref_logps/rejected": -28.455242156982422, "rewards/accuracies": 1.0, "rewards/chosen": -2.1681957244873047, "rewards/margins": 2.9430737495422363, "rewards/rejected": -5.111269474029541, "step": 2673 }, { "epoch": 2.52, "grad_norm": 10.1346443000891, "learning_rate": 3.2482175011933595e-08, "logps/chosen": -45.40193176269531, "logps/rejected": -86.58841705322266, "loss": 0.0975, "losses/dpo": 0.034068964421749115, "losses/sft": 1.8732470273971558, "losses/total": 0.034068964421749115, "ref_logps/chosen": -23.55054473876953, "ref_logps/rejected": -28.733154296875, "rewards/accuracies": 1.0, "rewards/chosen": -2.185138702392578, "rewards/margins": 3.6003880500793457, "rewards/rejected": -5.785526752471924, "step": 2674 }, { "epoch": 2.52, "grad_norm": 29.235365617865554, "learning_rate": 3.235675511412e-08, "logps/chosen": -51.72361755371094, "logps/rejected": -94.26211547851562, "loss": 0.445, "losses/dpo": 0.07299412041902542, "losses/sft": 1.0211750268936157, "losses/total": 0.07299412041902542, "ref_logps/chosen": -22.104793548583984, "ref_logps/rejected": -33.69314956665039, "rewards/accuracies": 0.875, "rewards/chosen": -2.9618821144104004, "rewards/margins": 3.095014810562134, "rewards/rejected": -6.056897163391113, "step": 2675 }, { "epoch": 2.52, "grad_norm": 8.266671859744442, "learning_rate": 3.223156106426081e-08, "logps/chosen": -64.61402893066406, "logps/rejected": -97.20098876953125, "loss": 0.1377, "losses/dpo": 0.7045887112617493, "losses/sft": 2.032607316970825, "losses/total": 0.7045887112617493, "ref_logps/chosen": -37.593360900878906, "ref_logps/rejected": -35.73711395263672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7020668983459473, "rewards/margins": 3.4443211555480957, "rewards/rejected": -6.146388053894043, "step": 2676 }, { "epoch": 2.53, "grad_norm": 18.210453998320375, "learning_rate": 3.2106592992269696e-08, "logps/chosen": -62.56412887573242, "logps/rejected": -86.52461242675781, "loss": 0.1975, "losses/dpo": 0.040119558572769165, "losses/sft": 2.2894837856292725, "losses/total": 0.040119558572769165, "ref_logps/chosen": -33.64255905151367, "ref_logps/rejected": -32.8414192199707, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8921568393707275, "rewards/margins": 2.476162910461426, "rewards/rejected": -5.368319511413574, "step": 2677 }, { "epoch": 2.53, "grad_norm": 14.23365709423164, "learning_rate": 3.1981851027825665e-08, "logps/chosen": -48.554649353027344, "logps/rejected": -73.57992553710938, "loss": 0.167, "losses/dpo": 0.16636227071285248, "losses/sft": 1.247054100036621, "losses/total": 0.16636227071285248, "ref_logps/chosen": -27.374666213989258, "ref_logps/rejected": -30.000316619873047, "rewards/accuracies": 1.0, "rewards/chosen": -2.117997884750366, "rewards/margins": 2.2399630546569824, "rewards/rejected": -4.3579607009887695, "step": 2678 }, { "epoch": 2.53, "grad_norm": 19.469748649288864, "learning_rate": 3.18573353003733e-08, "logps/chosen": -48.91520690917969, "logps/rejected": -78.53504943847656, "loss": 0.1997, "losses/dpo": 0.024887725710868835, "losses/sft": 1.0411938428878784, "losses/total": 0.024887725710868835, "ref_logps/chosen": -23.769466400146484, "ref_logps/rejected": -27.353900909423828, "rewards/accuracies": 0.875, "rewards/chosen": -2.514573812484741, "rewards/margins": 2.6035408973693848, "rewards/rejected": -5.118114471435547, "step": 2679 }, { "epoch": 2.53, "grad_norm": 13.149566617841735, "learning_rate": 3.173304593912221e-08, "logps/chosen": -71.44758605957031, "logps/rejected": -87.43108367919922, "loss": 0.1211, "losses/dpo": 0.011515099555253983, "losses/sft": 1.946105718612671, "losses/total": 0.011515099555253983, "ref_logps/chosen": -42.259422302246094, "ref_logps/rejected": -31.87358856201172, "rewards/accuracies": 1.0, "rewards/chosen": -2.9188170433044434, "rewards/margins": 2.636932373046875, "rewards/rejected": -5.555749416351318, "step": 2680 }, { "epoch": 2.53, "grad_norm": 13.723027294524151, "learning_rate": 3.1608983073047354e-08, "logps/chosen": -54.914100646972656, "logps/rejected": -83.7470932006836, "loss": 0.1372, "losses/dpo": 0.058060966432094574, "losses/sft": 0.6299970149993896, "losses/total": 0.058060966432094574, "ref_logps/chosen": -32.46308135986328, "ref_logps/rejected": -33.64567184448242, "rewards/accuracies": 1.0, "rewards/chosen": -2.2451014518737793, "rewards/margins": 2.765040397644043, "rewards/rejected": -5.0101423263549805, "step": 2681 }, { "epoch": 2.53, "grad_norm": 16.323097766015863, "learning_rate": 3.148514683088835e-08, "logps/chosen": -51.28575897216797, "logps/rejected": -77.15762329101562, "loss": 0.1817, "losses/dpo": 0.21909256279468536, "losses/sft": 2.0758965015411377, "losses/total": 0.21909256279468536, "ref_logps/chosen": -26.160255432128906, "ref_logps/rejected": -23.538816452026367, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5125508308410645, "rewards/margins": 2.849330186843872, "rewards/rejected": -5.361880779266357, "step": 2682 }, { "epoch": 2.53, "grad_norm": 12.590380282381922, "learning_rate": 3.136153734114999e-08, "logps/chosen": -42.26861572265625, "logps/rejected": -70.1590805053711, "loss": 0.158, "losses/dpo": 0.24505701661109924, "losses/sft": 1.2080564498901367, "losses/total": 0.24505701661109924, "ref_logps/chosen": -24.045570373535156, "ref_logps/rejected": -24.451393127441406, "rewards/accuracies": 1.0, "rewards/chosen": -1.822304368019104, "rewards/margins": 2.748465061187744, "rewards/rejected": -4.570769309997559, "step": 2683 }, { "epoch": 2.53, "grad_norm": 14.506681247975314, "learning_rate": 3.123815473210142e-08, "logps/chosen": -59.094539642333984, "logps/rejected": -100.5949935913086, "loss": 0.1491, "losses/dpo": 0.0678325742483139, "losses/sft": 2.0020127296447754, "losses/total": 0.0678325742483139, "ref_logps/chosen": -33.16717529296875, "ref_logps/rejected": -41.989131927490234, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5927367210388184, "rewards/margins": 3.267849922180176, "rewards/rejected": -5.860586643218994, "step": 2684 }, { "epoch": 2.53, "grad_norm": 6.488393621082597, "learning_rate": 3.111499913177662e-08, "logps/chosen": -46.03663635253906, "logps/rejected": -85.55916595458984, "loss": 0.0864, "losses/dpo": 0.03031194396317005, "losses/sft": 2.611246109008789, "losses/total": 0.03031194396317005, "ref_logps/chosen": -24.6951847076416, "ref_logps/rejected": -32.85591125488281, "rewards/accuracies": 1.0, "rewards/chosen": -2.1341452598571777, "rewards/margins": 3.1361804008483887, "rewards/rejected": -5.270325660705566, "step": 2685 }, { "epoch": 2.53, "grad_norm": 26.93587361764815, "learning_rate": 3.0992070667973934e-08, "logps/chosen": -59.239830017089844, "logps/rejected": -76.68051147460938, "loss": 0.3748, "losses/dpo": 0.8081260323524475, "losses/sft": 1.4887402057647705, "losses/total": 0.8081260323524475, "ref_logps/chosen": -36.4951286315918, "ref_logps/rejected": -28.317481994628906, "rewards/accuracies": 0.8125, "rewards/chosen": -2.274470090866089, "rewards/margins": 2.561832904815674, "rewards/rejected": -4.836302757263184, "step": 2686 }, { "epoch": 2.53, "grad_norm": 10.360093429150263, "learning_rate": 3.086936946825591e-08, "logps/chosen": -48.293853759765625, "logps/rejected": -85.04978942871094, "loss": 0.1491, "losses/dpo": 0.04125676676630974, "losses/sft": 0.4483183026313782, "losses/total": 0.04125676676630974, "ref_logps/chosen": -26.692100524902344, "ref_logps/rejected": -35.131622314453125, "rewards/accuracies": 1.0, "rewards/chosen": -2.1601755619049072, "rewards/margins": 2.831641435623169, "rewards/rejected": -4.991817474365234, "step": 2687 }, { "epoch": 2.54, "grad_norm": 22.874785938822352, "learning_rate": 3.0746895659949396e-08, "logps/chosen": -48.676883697509766, "logps/rejected": -81.76156616210938, "loss": 0.286, "losses/dpo": 0.07053960114717484, "losses/sft": 1.6804752349853516, "losses/total": 0.07053960114717484, "ref_logps/chosen": -24.308734893798828, "ref_logps/rejected": -31.362125396728516, "rewards/accuracies": 0.875, "rewards/chosen": -2.436815023422241, "rewards/margins": 2.6031289100646973, "rewards/rejected": -5.039944171905518, "step": 2688 }, { "epoch": 2.54, "grad_norm": 9.005541228716254, "learning_rate": 3.062464937014514e-08, "logps/chosen": -53.96099090576172, "logps/rejected": -88.59765625, "loss": 0.0718, "losses/dpo": 0.10970473289489746, "losses/sft": 2.527672529220581, "losses/total": 0.10970473289489746, "ref_logps/chosen": -32.9727668762207, "ref_logps/rejected": -31.066421508789062, "rewards/accuracies": 1.0, "rewards/chosen": -2.098822593688965, "rewards/margins": 3.654301166534424, "rewards/rejected": -5.753123760223389, "step": 2689 }, { "epoch": 2.54, "grad_norm": 15.132805334875561, "learning_rate": 3.050263072569797e-08, "logps/chosen": -52.233978271484375, "logps/rejected": -92.39396667480469, "loss": 0.1592, "losses/dpo": 0.026102054864168167, "losses/sft": 2.8209633827209473, "losses/total": 0.026102054864168167, "ref_logps/chosen": -27.135976791381836, "ref_logps/rejected": -38.90884017944336, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5098001956939697, "rewards/margins": 2.838712453842163, "rewards/rejected": -5.348512649536133, "step": 2690 }, { "epoch": 2.54, "grad_norm": 12.255674045800797, "learning_rate": 3.038083985322626e-08, "logps/chosen": -56.177162170410156, "logps/rejected": -90.35353088378906, "loss": 0.1033, "losses/dpo": 0.11621439456939697, "losses/sft": 1.8772209882736206, "losses/total": 0.11621439456939697, "ref_logps/chosen": -28.5288028717041, "ref_logps/rejected": -30.730146408081055, "rewards/accuracies": 1.0, "rewards/chosen": -2.764836311340332, "rewards/margins": 3.197502613067627, "rewards/rejected": -5.962338924407959, "step": 2691 }, { "epoch": 2.54, "grad_norm": 13.615186532249233, "learning_rate": 3.0259276879112284e-08, "logps/chosen": -47.93488311767578, "logps/rejected": -79.57521057128906, "loss": 0.1188, "losses/dpo": 0.022493472322821617, "losses/sft": 0.9730085730552673, "losses/total": 0.022493472322821617, "ref_logps/chosen": -27.888568878173828, "ref_logps/rejected": -27.31626319885254, "rewards/accuracies": 0.9375, "rewards/chosen": -2.004631519317627, "rewards/margins": 3.2212631702423096, "rewards/rejected": -5.225894927978516, "step": 2692 }, { "epoch": 2.54, "grad_norm": 11.921835978488573, "learning_rate": 3.013794192950159e-08, "logps/chosen": -42.93629455566406, "logps/rejected": -82.664306640625, "loss": 0.1794, "losses/dpo": 0.07495870441198349, "losses/sft": 2.410172939300537, "losses/total": 0.07495870441198349, "ref_logps/chosen": -23.535430908203125, "ref_logps/rejected": -33.99427795410156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9400863647460938, "rewards/margins": 2.92691707611084, "rewards/rejected": -4.867002964019775, "step": 2693 }, { "epoch": 2.54, "grad_norm": 13.363804335490139, "learning_rate": 3.001683513030323e-08, "logps/chosen": -45.90353775024414, "logps/rejected": -73.8237075805664, "loss": 0.1594, "losses/dpo": 0.1401471197605133, "losses/sft": 2.07018780708313, "losses/total": 0.1401471197605133, "ref_logps/chosen": -25.324514389038086, "ref_logps/rejected": -26.278696060180664, "rewards/accuracies": 1.0, "rewards/chosen": -2.0579023361206055, "rewards/margins": 2.696598529815674, "rewards/rejected": -4.7545013427734375, "step": 2694 }, { "epoch": 2.54, "grad_norm": 11.001336427137566, "learning_rate": 2.989595660718955e-08, "logps/chosen": -59.00761413574219, "logps/rejected": -106.5263671875, "loss": 0.1243, "losses/dpo": 0.351075142621994, "losses/sft": 2.8525447845458984, "losses/total": 0.351075142621994, "ref_logps/chosen": -28.11004638671875, "ref_logps/rejected": -40.4289665222168, "rewards/accuracies": 1.0, "rewards/chosen": -3.089756488800049, "rewards/margins": 3.519984006881714, "rewards/rejected": -6.609740257263184, "step": 2695 }, { "epoch": 2.54, "grad_norm": 7.408535227743132, "learning_rate": 2.9775306485595868e-08, "logps/chosen": -52.47521209716797, "logps/rejected": -89.66690063476562, "loss": 0.072, "losses/dpo": 0.0799306258559227, "losses/sft": 1.7973060607910156, "losses/total": 0.0799306258559227, "ref_logps/chosen": -35.24913787841797, "ref_logps/rejected": -35.8529052734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7226073741912842, "rewards/margins": 3.6587929725646973, "rewards/rejected": -5.381400108337402, "step": 2696 }, { "epoch": 2.54, "grad_norm": 10.69773614798251, "learning_rate": 2.9654884890720656e-08, "logps/chosen": -58.039878845214844, "logps/rejected": -91.89281463623047, "loss": 0.0997, "losses/dpo": 0.34309422969818115, "losses/sft": 1.4983646869659424, "losses/total": 0.34309422969818115, "ref_logps/chosen": -34.311485290527344, "ref_logps/rejected": -34.07612609863281, "rewards/accuracies": 1.0, "rewards/chosen": -2.3728394508361816, "rewards/margins": 3.4088292121887207, "rewards/rejected": -5.781668663024902, "step": 2697 }, { "epoch": 2.55, "grad_norm": 15.252702780318195, "learning_rate": 2.9534691947525032e-08, "logps/chosen": -33.884178161621094, "logps/rejected": -66.91603088378906, "loss": 0.1873, "losses/dpo": 0.009148281998932362, "losses/sft": 1.3962152004241943, "losses/total": 0.009148281998932362, "ref_logps/chosen": -20.001296997070312, "ref_logps/rejected": -28.046518325805664, "rewards/accuracies": 0.9375, "rewards/chosen": -1.388288140296936, "rewards/margins": 2.498663902282715, "rewards/rejected": -3.8869519233703613, "step": 2698 }, { "epoch": 2.55, "grad_norm": 7.761351879891621, "learning_rate": 2.941472778073312e-08, "logps/chosen": -47.54792022705078, "logps/rejected": -80.66004180908203, "loss": 0.0763, "losses/dpo": 0.01705845072865486, "losses/sft": 1.0403242111206055, "losses/total": 0.01705845072865486, "ref_logps/chosen": -29.53575897216797, "ref_logps/rejected": -29.36387062072754, "rewards/accuracies": 1.0, "rewards/chosen": -1.8012163639068604, "rewards/margins": 3.3284010887145996, "rewards/rejected": -5.129617691040039, "step": 2699 }, { "epoch": 2.55, "grad_norm": 11.649445306191925, "learning_rate": 2.9294992514831363e-08, "logps/chosen": -54.10200500488281, "logps/rejected": -79.37782287597656, "loss": 0.1115, "losses/dpo": 0.04723985120654106, "losses/sft": 2.2116587162017822, "losses/total": 0.04723985120654106, "ref_logps/chosen": -30.048221588134766, "ref_logps/rejected": -28.271318435668945, "rewards/accuracies": 1.0, "rewards/chosen": -2.4053783416748047, "rewards/margins": 2.7052724361419678, "rewards/rejected": -5.110651016235352, "step": 2700 }, { "epoch": 2.55, "grad_norm": 15.075270004463635, "learning_rate": 2.9175486274068933e-08, "logps/chosen": -48.91619873046875, "logps/rejected": -93.76402282714844, "loss": 0.1815, "losses/dpo": 0.08068656176328659, "losses/sft": 1.1109594106674194, "losses/total": 0.08068656176328659, "ref_logps/chosen": -22.508342742919922, "ref_logps/rejected": -36.802093505859375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6407856941223145, "rewards/margins": 3.0554075241088867, "rewards/rejected": -5.696193695068359, "step": 2701 }, { "epoch": 2.55, "grad_norm": 20.19346111782461, "learning_rate": 2.9056209182457047e-08, "logps/chosen": -47.690940856933594, "logps/rejected": -92.07622528076172, "loss": 0.1963, "losses/dpo": 0.04291243478655815, "losses/sft": 1.7952395677566528, "losses/total": 0.04291243478655815, "ref_logps/chosen": -20.79499053955078, "ref_logps/rejected": -32.09994125366211, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6895947456359863, "rewards/margins": 3.3080332279205322, "rewards/rejected": -5.997628211975098, "step": 2702 }, { "epoch": 2.55, "grad_norm": 20.28563808377291, "learning_rate": 2.8937161363769418e-08, "logps/chosen": -48.21047592163086, "logps/rejected": -89.8565673828125, "loss": 0.1748, "losses/dpo": 0.004930257797241211, "losses/sft": 1.6970144510269165, "losses/total": 0.004930257797241211, "ref_logps/chosen": -29.775556564331055, "ref_logps/rejected": -34.682655334472656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.843492031097412, "rewards/margins": 3.6738994121551514, "rewards/rejected": -5.517391204833984, "step": 2703 }, { "epoch": 2.55, "grad_norm": 11.991116046072397, "learning_rate": 2.8818342941541757e-08, "logps/chosen": -57.6106071472168, "logps/rejected": -102.30088806152344, "loss": 0.1035, "losses/dpo": 0.0011599237332120538, "losses/sft": 0.8931546211242676, "losses/total": 0.0011599237332120538, "ref_logps/chosen": -32.334617614746094, "ref_logps/rejected": -42.526145935058594, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5275988578796387, "rewards/margins": 3.449875831604004, "rewards/rejected": -5.977474689483643, "step": 2704 }, { "epoch": 2.55, "grad_norm": 19.09982723928477, "learning_rate": 2.8699754039071598e-08, "logps/chosen": -57.86820983886719, "logps/rejected": -102.38932800292969, "loss": 0.146, "losses/dpo": 0.013680243864655495, "losses/sft": 2.446249008178711, "losses/total": 0.013680243864655495, "ref_logps/chosen": -30.076492309570312, "ref_logps/rejected": -38.28478240966797, "rewards/accuracies": 0.9375, "rewards/chosen": -2.779171943664551, "rewards/margins": 3.6312830448150635, "rewards/rejected": -6.410455226898193, "step": 2705 }, { "epoch": 2.55, "grad_norm": 13.551278782473734, "learning_rate": 2.8581394779418512e-08, "logps/chosen": -52.56938934326172, "logps/rejected": -80.56519317626953, "loss": 0.1165, "losses/dpo": 0.2087048590183258, "losses/sft": 3.4637160301208496, "losses/total": 0.2087048590183258, "ref_logps/chosen": -30.92573356628418, "ref_logps/rejected": -31.8363037109375, "rewards/accuracies": 1.0, "rewards/chosen": -2.164365530014038, "rewards/margins": 2.708523750305176, "rewards/rejected": -4.872889518737793, "step": 2706 }, { "epoch": 2.55, "grad_norm": 16.657867294006657, "learning_rate": 2.846326528540355e-08, "logps/chosen": -54.040611267089844, "logps/rejected": -83.82115173339844, "loss": 0.229, "losses/dpo": 0.247134268283844, "losses/sft": 1.6328058242797852, "losses/total": 0.247134268283844, "ref_logps/chosen": -30.335731506347656, "ref_logps/rejected": -34.531829833984375, "rewards/accuracies": 1.0, "rewards/chosen": -2.370487928390503, "rewards/margins": 2.558443784713745, "rewards/rejected": -4.928932189941406, "step": 2707 }, { "epoch": 2.55, "grad_norm": 8.146288862662182, "learning_rate": 2.8345365679609544e-08, "logps/chosen": -57.809940338134766, "logps/rejected": -106.45416259765625, "loss": 0.0614, "losses/dpo": 0.05934369936585426, "losses/sft": 2.194131374359131, "losses/total": 0.05934369936585426, "ref_logps/chosen": -28.908369064331055, "ref_logps/rejected": -37.27268981933594, "rewards/accuracies": 1.0, "rewards/chosen": -2.8901572227478027, "rewards/margins": 4.027990341186523, "rewards/rejected": -6.918148040771484, "step": 2708 }, { "epoch": 2.56, "grad_norm": 11.306651720195287, "learning_rate": 2.822769608438058e-08, "logps/chosen": -39.81816864013672, "logps/rejected": -79.62010192871094, "loss": 0.158, "losses/dpo": 0.0711982473731041, "losses/sft": 2.3317174911499023, "losses/total": 0.0711982473731041, "ref_logps/chosen": -21.597869873046875, "ref_logps/rejected": -31.849464416503906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8220295906066895, "rewards/margins": 2.955033302307129, "rewards/rejected": -4.777063369750977, "step": 2709 }, { "epoch": 2.56, "grad_norm": 16.446811535154403, "learning_rate": 2.8110256621822203e-08, "logps/chosen": -48.169578552246094, "logps/rejected": -82.61956024169922, "loss": 0.1573, "losses/dpo": 0.038867104798555374, "losses/sft": 0.7769805192947388, "losses/total": 0.038867104798555374, "ref_logps/chosen": -26.586992263793945, "ref_logps/rejected": -32.445640563964844, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1582584381103516, "rewards/margins": 2.85913348197937, "rewards/rejected": -5.017392158508301, "step": 2710 }, { "epoch": 2.56, "grad_norm": 12.865178708964445, "learning_rate": 2.7993047413801207e-08, "logps/chosen": -44.830116271972656, "logps/rejected": -81.82022857666016, "loss": 0.1547, "losses/dpo": 0.4177599549293518, "losses/sft": 0.20538929104804993, "losses/total": 0.4177599549293518, "ref_logps/chosen": -24.85140609741211, "ref_logps/rejected": -31.954404830932617, "rewards/accuracies": 1.0, "rewards/chosen": -1.9978711605072021, "rewards/margins": 2.98871111869812, "rewards/rejected": -4.986582279205322, "step": 2711 }, { "epoch": 2.56, "grad_norm": 16.013628766907004, "learning_rate": 2.7876068581945188e-08, "logps/chosen": -68.19851684570312, "logps/rejected": -104.36466217041016, "loss": 0.1353, "losses/dpo": 0.0013337924610823393, "losses/sft": 1.384867548942566, "losses/total": 0.0013337924610823393, "ref_logps/chosen": -39.49822998046875, "ref_logps/rejected": -41.513614654541016, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8700289726257324, "rewards/margins": 3.4150757789611816, "rewards/rejected": -6.285104751586914, "step": 2712 }, { "epoch": 2.56, "grad_norm": 8.147930230228035, "learning_rate": 2.7759320247642963e-08, "logps/chosen": -62.87496566772461, "logps/rejected": -101.90969848632812, "loss": 0.0783, "losses/dpo": 0.08853595703840256, "losses/sft": 2.0777528285980225, "losses/total": 0.08853595703840256, "ref_logps/chosen": -37.51312255859375, "ref_logps/rejected": -39.97563171386719, "rewards/accuracies": 1.0, "rewards/chosen": -2.536184072494507, "rewards/margins": 3.657222270965576, "rewards/rejected": -6.193406581878662, "step": 2713 }, { "epoch": 2.56, "grad_norm": 16.069367460647758, "learning_rate": 2.764280253204393e-08, "logps/chosen": -50.701377868652344, "logps/rejected": -87.47899627685547, "loss": 0.1597, "losses/dpo": 0.3790525197982788, "losses/sft": 2.830092668533325, "losses/total": 0.3790525197982788, "ref_logps/chosen": -24.655696868896484, "ref_logps/rejected": -32.716278076171875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6045680046081543, "rewards/margins": 2.8717033863067627, "rewards/rejected": -5.476271629333496, "step": 2714 }, { "epoch": 2.56, "grad_norm": 11.344598025363625, "learning_rate": 2.7526515556058422e-08, "logps/chosen": -65.46241760253906, "logps/rejected": -114.4560546875, "loss": 0.0804, "losses/dpo": 0.0015685048419982195, "losses/sft": 2.0056869983673096, "losses/total": 0.0015685048419982195, "ref_logps/chosen": -35.63660430908203, "ref_logps/rejected": -44.53301239013672, "rewards/accuracies": 1.0, "rewards/chosen": -2.982581377029419, "rewards/margins": 4.00972318649292, "rewards/rejected": -6.992304801940918, "step": 2715 }, { "epoch": 2.56, "grad_norm": 11.031259558574313, "learning_rate": 2.741045944035708e-08, "logps/chosen": -62.67218017578125, "logps/rejected": -116.79830932617188, "loss": 0.1409, "losses/dpo": 0.00012249669816810638, "losses/sft": 2.285620927810669, "losses/total": 0.00012249669816810638, "ref_logps/chosen": -33.28530502319336, "ref_logps/rejected": -49.539283752441406, "rewards/accuracies": 0.9375, "rewards/chosen": -2.938687801361084, "rewards/margins": 3.787214756011963, "rewards/rejected": -6.725902557373047, "step": 2716 }, { "epoch": 2.56, "grad_norm": 15.132076607458739, "learning_rate": 2.7294634305371183e-08, "logps/chosen": -59.31013870239258, "logps/rejected": -97.71548461914062, "loss": 0.1595, "losses/dpo": 0.0007349230581894517, "losses/sft": 2.1795923709869385, "losses/total": 0.0007349230581894517, "ref_logps/chosen": -34.715206146240234, "ref_logps/rejected": -37.78888702392578, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4594931602478027, "rewards/margins": 3.5331668853759766, "rewards/rejected": -5.992660045623779, "step": 2717 }, { "epoch": 2.56, "grad_norm": 9.532915627960222, "learning_rate": 2.717904027129214e-08, "logps/chosen": -44.97378158569336, "logps/rejected": -76.23070526123047, "loss": 0.1259, "losses/dpo": 0.12461420148611069, "losses/sft": 1.8308730125427246, "losses/total": 0.12461420148611069, "ref_logps/chosen": -22.542049407958984, "ref_logps/rejected": -24.04547882080078, "rewards/accuracies": 1.0, "rewards/chosen": -2.243173122406006, "rewards/margins": 2.9753494262695312, "rewards/rejected": -5.218522071838379, "step": 2718 }, { "epoch": 2.57, "grad_norm": 6.520689353713708, "learning_rate": 2.7063677458071725e-08, "logps/chosen": -44.80255889892578, "logps/rejected": -80.20112609863281, "loss": 0.0755, "losses/dpo": 0.013752452097833157, "losses/sft": 1.9033948183059692, "losses/total": 0.013752452097833157, "ref_logps/chosen": -23.508586883544922, "ref_logps/rejected": -27.598663330078125, "rewards/accuracies": 1.0, "rewards/chosen": -2.129397392272949, "rewards/margins": 3.1308484077453613, "rewards/rejected": -5.260246276855469, "step": 2719 }, { "epoch": 2.57, "grad_norm": 13.161618564985595, "learning_rate": 2.6948545985421732e-08, "logps/chosen": -41.26304626464844, "logps/rejected": -87.23516845703125, "loss": 0.2049, "losses/dpo": 0.030723772943019867, "losses/sft": 1.6778788566589355, "losses/total": 0.030723772943019867, "ref_logps/chosen": -20.302135467529297, "ref_logps/rejected": -35.167999267578125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0960915088653564, "rewards/margins": 3.1106252670288086, "rewards/rejected": -5.206716537475586, "step": 2720 }, { "epoch": 2.57, "grad_norm": 10.719889802708966, "learning_rate": 2.6833645972813733e-08, "logps/chosen": -68.1815185546875, "logps/rejected": -102.74978637695312, "loss": 0.0669, "losses/dpo": 0.05527889356017113, "losses/sft": 2.6650350093841553, "losses/total": 0.05527889356017113, "ref_logps/chosen": -38.10003662109375, "ref_logps/rejected": -38.14176559448242, "rewards/accuracies": 1.0, "rewards/chosen": -3.008148670196533, "rewards/margins": 3.452653169631958, "rewards/rejected": -6.46080207824707, "step": 2721 }, { "epoch": 2.57, "grad_norm": 13.720508724061649, "learning_rate": 2.6718977539479382e-08, "logps/chosen": -44.8494873046875, "logps/rejected": -74.35707092285156, "loss": 0.1414, "losses/dpo": 0.409992516040802, "losses/sft": 1.3384910821914673, "losses/total": 0.409992516040802, "ref_logps/chosen": -28.787193298339844, "ref_logps/rejected": -27.831199645996094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6062290668487549, "rewards/margins": 3.046358585357666, "rewards/rejected": -4.652587890625, "step": 2722 }, { "epoch": 2.57, "grad_norm": 6.868329068675198, "learning_rate": 2.660454080440977e-08, "logps/chosen": -59.313541412353516, "logps/rejected": -102.56784057617188, "loss": 0.0565, "losses/dpo": 0.055038049817085266, "losses/sft": 0.9103659391403198, "losses/total": 0.055038049817085266, "ref_logps/chosen": -35.85322570800781, "ref_logps/rejected": -41.011966705322266, "rewards/accuracies": 1.0, "rewards/chosen": -2.346031427383423, "rewards/margins": 3.809556007385254, "rewards/rejected": -6.155587673187256, "step": 2723 }, { "epoch": 2.57, "grad_norm": 10.312182235714262, "learning_rate": 2.6490335886355713e-08, "logps/chosen": -50.8269157409668, "logps/rejected": -99.95195770263672, "loss": 0.1098, "losses/dpo": 0.06317713856697083, "losses/sft": 1.2248775959014893, "losses/total": 0.06317713856697083, "ref_logps/chosen": -27.60997772216797, "ref_logps/rejected": -40.18260192871094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3216936588287354, "rewards/margins": 3.6552422046661377, "rewards/rejected": -5.976936340332031, "step": 2724 }, { "epoch": 2.57, "grad_norm": 5.658417840566704, "learning_rate": 2.6376362903827387e-08, "logps/chosen": -47.42815399169922, "logps/rejected": -94.9926986694336, "loss": 0.0419, "losses/dpo": 0.018004102632403374, "losses/sft": 1.2230923175811768, "losses/total": 0.018004102632403374, "ref_logps/chosen": -29.37811851501465, "ref_logps/rejected": -36.20184326171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8050038814544678, "rewards/margins": 4.074082374572754, "rewards/rejected": -5.879086017608643, "step": 2725 }, { "epoch": 2.57, "grad_norm": 19.607289630101647, "learning_rate": 2.6262621975094406e-08, "logps/chosen": -47.585968017578125, "logps/rejected": -85.18641662597656, "loss": 0.2042, "losses/dpo": 0.08130954205989838, "losses/sft": 2.1039092540740967, "losses/total": 0.08130954205989838, "ref_logps/chosen": -25.465354919433594, "ref_logps/rejected": -33.51008224487305, "rewards/accuracies": 0.875, "rewards/chosen": -2.2120614051818848, "rewards/margins": 2.9555716514587402, "rewards/rejected": -5.167633056640625, "step": 2726 }, { "epoch": 2.57, "grad_norm": 17.89391061000701, "learning_rate": 2.6149113218185388e-08, "logps/chosen": -49.46661376953125, "logps/rejected": -83.57937622070312, "loss": 0.1912, "losses/dpo": 0.00017180567374452949, "losses/sft": 2.451093912124634, "losses/total": 0.00017180567374452949, "ref_logps/chosen": -28.924091339111328, "ref_logps/rejected": -35.96635818481445, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0542526245117188, "rewards/margins": 2.70704984664917, "rewards/rejected": -4.761302471160889, "step": 2727 }, { "epoch": 2.57, "grad_norm": 15.977085907651233, "learning_rate": 2.603583675088819e-08, "logps/chosen": -44.147193908691406, "logps/rejected": -81.76335144042969, "loss": 0.2121, "losses/dpo": 0.013487046584486961, "losses/sft": 2.4093685150146484, "losses/total": 0.013487046584486961, "ref_logps/chosen": -25.63092803955078, "ref_logps/rejected": -29.678625106811523, "rewards/accuracies": 0.875, "rewards/chosen": -1.8516266345977783, "rewards/margins": 3.356846809387207, "rewards/rejected": -5.208473205566406, "step": 2728 }, { "epoch": 2.57, "grad_norm": 7.140099837649573, "learning_rate": 2.592279269074965e-08, "logps/chosen": -60.2972412109375, "logps/rejected": -103.70145416259766, "loss": 0.0512, "losses/dpo": 0.12934689223766327, "losses/sft": 2.6562435626983643, "losses/total": 0.12934689223766327, "ref_logps/chosen": -31.6436767578125, "ref_logps/rejected": -39.37200164794922, "rewards/accuracies": 1.0, "rewards/chosen": -2.865356683731079, "rewards/margins": 3.5675888061523438, "rewards/rejected": -6.432945251464844, "step": 2729 }, { "epoch": 2.58, "grad_norm": 14.985655892280851, "learning_rate": 2.580998115507524e-08, "logps/chosen": -48.17218017578125, "logps/rejected": -92.87737274169922, "loss": 0.2053, "losses/dpo": 0.11081431806087494, "losses/sft": 2.1071882247924805, "losses/total": 0.11081431806087494, "ref_logps/chosen": -23.973651885986328, "ref_logps/rejected": -40.07617950439453, "rewards/accuracies": 0.875, "rewards/chosen": -2.4198527336120605, "rewards/margins": 2.860267162322998, "rewards/rejected": -5.280120372772217, "step": 2730 }, { "epoch": 2.58, "grad_norm": 17.993053959662333, "learning_rate": 2.569740226092934e-08, "logps/chosen": -49.35478591918945, "logps/rejected": -86.54694366455078, "loss": 0.1925, "losses/dpo": 0.1665515899658203, "losses/sft": 2.154317855834961, "losses/total": 0.1665515899658203, "ref_logps/chosen": -27.086368560791016, "ref_logps/rejected": -30.8785400390625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.226841926574707, "rewards/margins": 3.339998960494995, "rewards/rejected": -5.566840171813965, "step": 2731 }, { "epoch": 2.58, "grad_norm": 9.548583394349185, "learning_rate": 2.5585056125134786e-08, "logps/chosen": -57.593196868896484, "logps/rejected": -100.49333953857422, "loss": 0.0781, "losses/dpo": 0.028623804450035095, "losses/sft": 1.4165736436843872, "losses/total": 0.028623804450035095, "ref_logps/chosen": -36.222660064697266, "ref_logps/rejected": -41.674251556396484, "rewards/accuracies": 1.0, "rewards/chosen": -2.1370537281036377, "rewards/margins": 3.7448554039001465, "rewards/rejected": -5.881909370422363, "step": 2732 }, { "epoch": 2.58, "grad_norm": 20.563750685877686, "learning_rate": 2.5472942864273013e-08, "logps/chosen": -49.07540512084961, "logps/rejected": -90.78097534179688, "loss": 0.1954, "losses/dpo": 0.03648596256971359, "losses/sft": 0.6336705684661865, "losses/total": 0.03648596256971359, "ref_logps/chosen": -25.651065826416016, "ref_logps/rejected": -32.907779693603516, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3424339294433594, "rewards/margins": 3.444885492324829, "rewards/rejected": -5.787319183349609, "step": 2733 }, { "epoch": 2.58, "grad_norm": 12.491824662551252, "learning_rate": 2.5361062594683596e-08, "logps/chosen": -36.84577941894531, "logps/rejected": -75.2706069946289, "loss": 0.1542, "losses/dpo": 0.03145650029182434, "losses/sft": 0.623892605304718, "losses/total": 0.03145650029182434, "ref_logps/chosen": -19.376449584960938, "ref_logps/rejected": -26.014415740966797, "rewards/accuracies": 1.0, "rewards/chosen": -1.746932864189148, "rewards/margins": 3.17868709564209, "rewards/rejected": -4.925620079040527, "step": 2734 }, { "epoch": 2.58, "grad_norm": 9.002737332967188, "learning_rate": 2.5249415432464616e-08, "logps/chosen": -48.74702835083008, "logps/rejected": -93.89508056640625, "loss": 0.0794, "losses/dpo": 0.13062068819999695, "losses/sft": 1.6463589668273926, "losses/total": 0.13062068819999695, "ref_logps/chosen": -26.421932220458984, "ref_logps/rejected": -36.683837890625, "rewards/accuracies": 1.0, "rewards/chosen": -2.2325096130371094, "rewards/margins": 3.488614559173584, "rewards/rejected": -5.721123695373535, "step": 2735 }, { "epoch": 2.58, "grad_norm": 17.577195393776623, "learning_rate": 2.513800149347198e-08, "logps/chosen": -38.264892578125, "logps/rejected": -84.64749145507812, "loss": 0.1884, "losses/dpo": 0.04315364733338356, "losses/sft": 1.7901108264923096, "losses/total": 0.04315364733338356, "ref_logps/chosen": -21.440492630004883, "ref_logps/rejected": -31.56580924987793, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6824396848678589, "rewards/margins": 3.625728130340576, "rewards/rejected": -5.308167457580566, "step": 2736 }, { "epoch": 2.58, "grad_norm": 10.394718636450767, "learning_rate": 2.5026820893319783e-08, "logps/chosen": -55.63432312011719, "logps/rejected": -84.04444885253906, "loss": 0.1156, "losses/dpo": 0.05547125265002251, "losses/sft": 1.5957084894180298, "losses/total": 0.05547125265002251, "ref_logps/chosen": -29.0841064453125, "ref_logps/rejected": -29.841045379638672, "rewards/accuracies": 1.0, "rewards/chosen": -2.6550216674804688, "rewards/margins": 2.7653186321258545, "rewards/rejected": -5.420340538024902, "step": 2737 }, { "epoch": 2.58, "grad_norm": 12.810395883878172, "learning_rate": 2.491587374737991e-08, "logps/chosen": -44.69795227050781, "logps/rejected": -82.16364288330078, "loss": 0.1512, "losses/dpo": 0.34142476320266724, "losses/sft": 2.1801207065582275, "losses/total": 0.34142476320266724, "ref_logps/chosen": -27.520336151123047, "ref_logps/rejected": -30.802642822265625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7177613973617554, "rewards/margins": 3.4183387756347656, "rewards/rejected": -5.136099815368652, "step": 2738 }, { "epoch": 2.58, "grad_norm": 14.856912065121834, "learning_rate": 2.4805160170781937e-08, "logps/chosen": -53.27427291870117, "logps/rejected": -96.34713745117188, "loss": 0.1264, "losses/dpo": 0.047903940081596375, "losses/sft": 1.5740584135055542, "losses/total": 0.047903940081596375, "ref_logps/chosen": -27.546405792236328, "ref_logps/rejected": -35.63115692138672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.572786808013916, "rewards/margins": 3.4988112449645996, "rewards/rejected": -6.071598052978516, "step": 2739 }, { "epoch": 2.58, "grad_norm": 11.346157429470606, "learning_rate": 2.4694680278413176e-08, "logps/chosen": -52.82317352294922, "logps/rejected": -87.27427673339844, "loss": 0.0978, "losses/dpo": 0.1755254566669464, "losses/sft": 0.9998778700828552, "losses/total": 0.1755254566669464, "ref_logps/chosen": -30.419170379638672, "ref_logps/rejected": -33.87396240234375, "rewards/accuracies": 1.0, "rewards/chosen": -2.2404003143310547, "rewards/margins": 3.0996310710906982, "rewards/rejected": -5.340031147003174, "step": 2740 }, { "epoch": 2.59, "grad_norm": 17.02914226443892, "learning_rate": 2.4584434184918317e-08, "logps/chosen": -38.370262145996094, "logps/rejected": -73.66921997070312, "loss": 0.2446, "losses/dpo": 0.08576661348342896, "losses/sft": 1.6133564710617065, "losses/total": 0.08576661348342896, "ref_logps/chosen": -22.3017578125, "ref_logps/rejected": -30.457727432250977, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6068503856658936, "rewards/margins": 2.714298725128174, "rewards/rejected": -4.3211493492126465, "step": 2741 }, { "epoch": 2.59, "grad_norm": 8.7503116418952, "learning_rate": 2.4474422004699597e-08, "logps/chosen": -47.017372131347656, "logps/rejected": -95.80741882324219, "loss": 0.0893, "losses/dpo": 0.024502446874976158, "losses/sft": 0.022280976176261902, "losses/total": 0.024502446874976158, "ref_logps/chosen": -26.588516235351562, "ref_logps/rejected": -36.702362060546875, "rewards/accuracies": 1.0, "rewards/chosen": -2.0428853034973145, "rewards/margins": 3.867619752883911, "rewards/rejected": -5.910505294799805, "step": 2742 }, { "epoch": 2.59, "grad_norm": 16.300358598666627, "learning_rate": 2.4364643851916362e-08, "logps/chosen": -49.69927215576172, "logps/rejected": -101.90428161621094, "loss": 0.121, "losses/dpo": 0.012251688167452812, "losses/sft": 1.1276808977127075, "losses/total": 0.012251688167452812, "ref_logps/chosen": -29.18274688720703, "ref_logps/rejected": -42.801780700683594, "rewards/accuracies": 0.9375, "rewards/chosen": -2.051652669906616, "rewards/margins": 3.858597755432129, "rewards/rejected": -5.910250663757324, "step": 2743 }, { "epoch": 2.59, "grad_norm": 12.414245547908806, "learning_rate": 2.4255099840485184e-08, "logps/chosen": -49.2210693359375, "logps/rejected": -85.08793640136719, "loss": 0.1138, "losses/dpo": 0.00035722777829505503, "losses/sft": 2.202641725540161, "losses/total": 0.00035722777829505503, "ref_logps/chosen": -27.47730255126953, "ref_logps/rejected": -33.261940002441406, "rewards/accuracies": 1.0, "rewards/chosen": -2.174376964569092, "rewards/margins": 3.008222818374634, "rewards/rejected": -5.182600021362305, "step": 2744 }, { "epoch": 2.59, "grad_norm": 21.36046167524447, "learning_rate": 2.4145790084079716e-08, "logps/chosen": -69.0560302734375, "logps/rejected": -88.95668029785156, "loss": 0.2122, "losses/dpo": 0.2950823903083801, "losses/sft": 2.0782341957092285, "losses/total": 0.2950823903083801, "ref_logps/chosen": -41.82060241699219, "ref_logps/rejected": -39.13457489013672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7235422134399414, "rewards/margins": 2.2586684226989746, "rewards/rejected": -4.982211112976074, "step": 2745 }, { "epoch": 2.59, "grad_norm": 11.360503528933908, "learning_rate": 2.403671469613039e-08, "logps/chosen": -49.77761459350586, "logps/rejected": -87.20838928222656, "loss": 0.1559, "losses/dpo": 0.33505505323410034, "losses/sft": 2.7232415676116943, "losses/total": 0.33505505323410034, "ref_logps/chosen": -29.052204132080078, "ref_logps/rejected": -35.258628845214844, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0725412368774414, "rewards/margins": 3.1224348545074463, "rewards/rejected": -5.194975852966309, "step": 2746 }, { "epoch": 2.59, "grad_norm": 10.69030450627124, "learning_rate": 2.392787378982458e-08, "logps/chosen": -48.41753387451172, "logps/rejected": -84.25342559814453, "loss": 0.1054, "losses/dpo": 0.0018194104777649045, "losses/sft": 0.5226593613624573, "losses/total": 0.0018194104777649045, "ref_logps/chosen": -30.636165618896484, "ref_logps/rejected": -33.34065628051758, "rewards/accuracies": 1.0, "rewards/chosen": -1.778136968612671, "rewards/margins": 3.3131399154663086, "rewards/rejected": -5.091277122497559, "step": 2747 }, { "epoch": 2.59, "grad_norm": 14.860194666258641, "learning_rate": 2.3819267478106254e-08, "logps/chosen": -50.12232971191406, "logps/rejected": -75.59242248535156, "loss": 0.1816, "losses/dpo": 1.1368775367736816, "losses/sft": 2.3216259479522705, "losses/total": 1.1368775367736816, "ref_logps/chosen": -30.003477096557617, "ref_logps/rejected": -25.19556999206543, "rewards/accuracies": 0.875, "rewards/chosen": -2.011885404586792, "rewards/margins": 3.0277998447418213, "rewards/rejected": -5.039685249328613, "step": 2748 }, { "epoch": 2.59, "grad_norm": 18.183035488365807, "learning_rate": 2.3710895873676e-08, "logps/chosen": -70.32432556152344, "logps/rejected": -97.92835998535156, "loss": 0.164, "losses/dpo": 0.0071369982324540615, "losses/sft": 1.8842030763626099, "losses/total": 0.0071369982324540615, "ref_logps/chosen": -39.68402099609375, "ref_logps/rejected": -38.99892807006836, "rewards/accuracies": 0.9375, "rewards/chosen": -3.064030647277832, "rewards/margins": 2.8289124965667725, "rewards/rejected": -5.892943382263184, "step": 2749 }, { "epoch": 2.59, "grad_norm": 10.202371050393344, "learning_rate": 2.3602759088990703e-08, "logps/chosen": -51.637916564941406, "logps/rejected": -102.19738006591797, "loss": 0.0692, "losses/dpo": 0.02169414982199669, "losses/sft": 2.6030116081237793, "losses/total": 0.02169414982199669, "ref_logps/chosen": -29.66993522644043, "ref_logps/rejected": -44.796295166015625, "rewards/accuracies": 1.0, "rewards/chosen": -2.196798086166382, "rewards/margins": 3.5433104038238525, "rewards/rejected": -5.740108489990234, "step": 2750 }, { "epoch": 2.6, "grad_norm": 17.449611494265916, "learning_rate": 2.349485723626385e-08, "logps/chosen": -38.924095153808594, "logps/rejected": -72.66438293457031, "loss": 0.1824, "losses/dpo": 7.460082997567952e-05, "losses/sft": 0.1278119534254074, "losses/total": 7.460082997567952e-05, "ref_logps/chosen": -20.97256088256836, "ref_logps/rejected": -27.24146270751953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7951538562774658, "rewards/margins": 2.747138261795044, "rewards/rejected": -4.54229211807251, "step": 2751 }, { "epoch": 2.6, "grad_norm": 7.804361968190996, "learning_rate": 2.3387190427464816e-08, "logps/chosen": -52.57478713989258, "logps/rejected": -107.67198181152344, "loss": 0.092, "losses/dpo": 0.03199414908885956, "losses/sft": 1.4597874879837036, "losses/total": 0.03199414908885956, "ref_logps/chosen": -25.267120361328125, "ref_logps/rejected": -43.19216537475586, "rewards/accuracies": 1.0, "rewards/chosen": -2.730766534805298, "rewards/margins": 3.717214584350586, "rewards/rejected": -6.447980880737305, "step": 2752 }, { "epoch": 2.6, "grad_norm": 9.328991252579302, "learning_rate": 2.327975877431934e-08, "logps/chosen": -51.641502380371094, "logps/rejected": -112.048583984375, "loss": 0.0795, "losses/dpo": 0.04838087037205696, "losses/sft": 1.6890509128570557, "losses/total": 0.04838087037205696, "ref_logps/chosen": -28.51938247680664, "ref_logps/rejected": -48.57217788696289, "rewards/accuracies": 1.0, "rewards/chosen": -2.3122119903564453, "rewards/margins": 4.035429000854492, "rewards/rejected": -6.3476409912109375, "step": 2753 }, { "epoch": 2.6, "grad_norm": 11.790184094972293, "learning_rate": 2.3172562388308918e-08, "logps/chosen": -49.109737396240234, "logps/rejected": -74.57487487792969, "loss": 0.1483, "losses/dpo": 0.022802947089076042, "losses/sft": 1.0280996561050415, "losses/total": 0.022802947089076042, "ref_logps/chosen": -29.337299346923828, "ref_logps/rejected": -27.397573471069336, "rewards/accuracies": 1.0, "rewards/chosen": -1.9772436618804932, "rewards/margins": 2.7404863834381104, "rewards/rejected": -4.7177300453186035, "step": 2754 }, { "epoch": 2.6, "grad_norm": 15.710244534723936, "learning_rate": 2.306560138067118e-08, "logps/chosen": -45.92095184326172, "logps/rejected": -82.61746215820312, "loss": 0.1467, "losses/dpo": 0.017784440889954567, "losses/sft": 1.3490129709243774, "losses/total": 0.017784440889954567, "ref_logps/chosen": -23.763818740844727, "ref_logps/rejected": -30.762815475463867, "rewards/accuracies": 1.0, "rewards/chosen": -2.2157132625579834, "rewards/margins": 2.9697511196136475, "rewards/rejected": -5.185464382171631, "step": 2755 }, { "epoch": 2.6, "grad_norm": 8.951945748225222, "learning_rate": 2.2958875862399197e-08, "logps/chosen": -46.644065856933594, "logps/rejected": -81.99259948730469, "loss": 0.0843, "losses/dpo": 0.015084178186953068, "losses/sft": 1.0725233554840088, "losses/total": 0.015084178186953068, "ref_logps/chosen": -25.588932037353516, "ref_logps/rejected": -29.46247100830078, "rewards/accuracies": 1.0, "rewards/chosen": -2.105513572692871, "rewards/margins": 3.147498607635498, "rewards/rejected": -5.253012180328369, "step": 2756 }, { "epoch": 2.6, "grad_norm": 9.147083255891694, "learning_rate": 2.285238594424188e-08, "logps/chosen": -45.37481689453125, "logps/rejected": -84.80319213867188, "loss": 0.0911, "losses/dpo": 0.0003177548642270267, "losses/sft": 0.8186232447624207, "losses/total": 0.0003177548642270267, "ref_logps/chosen": -21.700504302978516, "ref_logps/rejected": -28.357980728149414, "rewards/accuracies": 1.0, "rewards/chosen": -2.367431163787842, "rewards/margins": 3.277089834213257, "rewards/rejected": -5.6445207595825195, "step": 2757 }, { "epoch": 2.6, "grad_norm": 12.589971978232493, "learning_rate": 2.2746131736703643e-08, "logps/chosen": -65.60881042480469, "logps/rejected": -91.37013244628906, "loss": 0.1139, "losses/dpo": 0.0805017426609993, "losses/sft": 2.50421142578125, "losses/total": 0.0805017426609993, "ref_logps/chosen": -37.77596664428711, "ref_logps/rejected": -31.57980728149414, "rewards/accuracies": 1.0, "rewards/chosen": -2.7832841873168945, "rewards/margins": 3.1957483291625977, "rewards/rejected": -5.979032516479492, "step": 2758 }, { "epoch": 2.6, "grad_norm": 19.195727866953266, "learning_rate": 2.2640113350044203e-08, "logps/chosen": -63.36485290527344, "logps/rejected": -95.17207336425781, "loss": 0.1525, "losses/dpo": 0.7624515891075134, "losses/sft": 2.7412633895874023, "losses/total": 0.7624515891075134, "ref_logps/chosen": -33.62134552001953, "ref_logps/rejected": -36.29462432861328, "rewards/accuracies": 0.875, "rewards/chosen": -2.974350929260254, "rewards/margins": 2.9133946895599365, "rewards/rejected": -5.887744903564453, "step": 2759 }, { "epoch": 2.6, "grad_norm": 19.044899947357205, "learning_rate": 2.253433089427867e-08, "logps/chosen": -51.28593063354492, "logps/rejected": -86.18977355957031, "loss": 0.144, "losses/dpo": 0.14188635349273682, "losses/sft": 0.4510689377784729, "losses/total": 0.14188635349273682, "ref_logps/chosen": -29.82598876953125, "ref_logps/rejected": -32.511680603027344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1459944248199463, "rewards/margins": 3.221815586090088, "rewards/rejected": -5.367810249328613, "step": 2760 }, { "epoch": 2.6, "grad_norm": 12.714627531736546, "learning_rate": 2.242878447917723e-08, "logps/chosen": -42.17584228515625, "logps/rejected": -76.11283874511719, "loss": 0.1297, "losses/dpo": 0.7895907759666443, "losses/sft": 1.2678422927856445, "losses/total": 0.7895907759666443, "ref_logps/chosen": -22.367605209350586, "ref_logps/rejected": -27.30356216430664, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9808239936828613, "rewards/margins": 2.90010404586792, "rewards/rejected": -4.880928039550781, "step": 2761 }, { "epoch": 2.61, "grad_norm": 12.017192251456303, "learning_rate": 2.2323474214265252e-08, "logps/chosen": -45.67198181152344, "logps/rejected": -69.02965545654297, "loss": 0.1248, "losses/dpo": 0.01835092343389988, "losses/sft": 1.834346890449524, "losses/total": 0.01835092343389988, "ref_logps/chosen": -26.219120025634766, "ref_logps/rejected": -26.026023864746094, "rewards/accuracies": 1.0, "rewards/chosen": -1.9452863931655884, "rewards/margins": 2.355076789855957, "rewards/rejected": -4.300363540649414, "step": 2762 }, { "epoch": 2.61, "grad_norm": 9.066484987890515, "learning_rate": 2.221840020882293e-08, "logps/chosen": -55.85718536376953, "logps/rejected": -91.18468475341797, "loss": 0.1134, "losses/dpo": 0.023093445226550102, "losses/sft": 2.6087026596069336, "losses/total": 0.023093445226550102, "ref_logps/chosen": -34.45395278930664, "ref_logps/rejected": -34.54568099975586, "rewards/accuracies": 1.0, "rewards/chosen": -2.1403236389160156, "rewards/margins": 3.5235772132873535, "rewards/rejected": -5.663900375366211, "step": 2763 }, { "epoch": 2.61, "grad_norm": 6.738431978963111, "learning_rate": 2.2113562571885373e-08, "logps/chosen": -49.28642272949219, "logps/rejected": -106.45751190185547, "loss": 0.0437, "losses/dpo": 0.0029456615447998047, "losses/sft": 1.5021170377731323, "losses/total": 0.0029456615447998047, "ref_logps/chosen": -26.665626525878906, "ref_logps/rejected": -41.0621337890625, "rewards/accuracies": 1.0, "rewards/chosen": -2.2620797157287598, "rewards/margins": 4.277458190917969, "rewards/rejected": -6.5395379066467285, "step": 2764 }, { "epoch": 2.61, "grad_norm": 8.947967694834562, "learning_rate": 2.2008961412242346e-08, "logps/chosen": -49.2435302734375, "logps/rejected": -82.56488800048828, "loss": 0.0929, "losses/dpo": 0.10552024841308594, "losses/sft": 0.6198070049285889, "losses/total": 0.10552024841308594, "ref_logps/chosen": -29.273059844970703, "ref_logps/rejected": -32.256900787353516, "rewards/accuracies": 1.0, "rewards/chosen": -1.9970468282699585, "rewards/margins": 3.033752202987671, "rewards/rejected": -5.03079891204834, "step": 2765 }, { "epoch": 2.61, "grad_norm": 8.8796029947771, "learning_rate": 2.190459683843829e-08, "logps/chosen": -50.46037292480469, "logps/rejected": -94.95057678222656, "loss": 0.0937, "losses/dpo": 0.5766861438751221, "losses/sft": 1.1218425035476685, "losses/total": 0.5766861438751221, "ref_logps/chosen": -26.934589385986328, "ref_logps/rejected": -34.997005462646484, "rewards/accuracies": 1.0, "rewards/chosen": -2.3525781631469727, "rewards/margins": 3.642779588699341, "rewards/rejected": -5.995357513427734, "step": 2766 }, { "epoch": 2.61, "grad_norm": 18.54657907115748, "learning_rate": 2.180046895877216e-08, "logps/chosen": -44.374114990234375, "logps/rejected": -85.71937561035156, "loss": 0.2078, "losses/dpo": 0.02500331401824951, "losses/sft": 1.2326395511627197, "losses/total": 0.02500331401824951, "ref_logps/chosen": -21.07046890258789, "ref_logps/rejected": -31.986740112304688, "rewards/accuracies": 0.9375, "rewards/chosen": -2.330364465713501, "rewards/margins": 3.0428991317749023, "rewards/rejected": -5.373263835906982, "step": 2767 }, { "epoch": 2.61, "grad_norm": 16.283506679759554, "learning_rate": 2.1696577881297167e-08, "logps/chosen": -50.96017074584961, "logps/rejected": -100.02521514892578, "loss": 0.1471, "losses/dpo": 0.008866881020367146, "losses/sft": 1.6615039110183716, "losses/total": 0.008866881020367146, "ref_logps/chosen": -25.087421417236328, "ref_logps/rejected": -37.84906005859375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5872747898101807, "rewards/margins": 3.630340576171875, "rewards/rejected": -6.217615604400635, "step": 2768 }, { "epoch": 2.61, "grad_norm": 18.067775897025353, "learning_rate": 2.159292371382093e-08, "logps/chosen": -62.447410583496094, "logps/rejected": -106.92877197265625, "loss": 0.1605, "losses/dpo": 0.007000113371759653, "losses/sft": 1.6122372150421143, "losses/total": 0.007000113371759653, "ref_logps/chosen": -32.78302001953125, "ref_logps/rejected": -43.33043670654297, "rewards/accuracies": 0.9375, "rewards/chosen": -2.966439723968506, "rewards/margins": 3.3933937549591064, "rewards/rejected": -6.359833717346191, "step": 2769 }, { "epoch": 2.61, "grad_norm": 12.364422970283842, "learning_rate": 2.1489506563905108e-08, "logps/chosen": -47.29262924194336, "logps/rejected": -103.22120666503906, "loss": 0.0979, "losses/dpo": 0.01619880460202694, "losses/sft": 1.7186412811279297, "losses/total": 0.01619880460202694, "ref_logps/chosen": -24.523906707763672, "ref_logps/rejected": -44.270389556884766, "rewards/accuracies": 1.0, "rewards/chosen": -2.2768726348876953, "rewards/margins": 3.6182096004486084, "rewards/rejected": -5.895082473754883, "step": 2770 }, { "epoch": 2.61, "grad_norm": 9.781044825539798, "learning_rate": 2.1386326538865545e-08, "logps/chosen": -58.184906005859375, "logps/rejected": -93.72877502441406, "loss": 0.0772, "losses/dpo": 0.047234658151865005, "losses/sft": 1.4496526718139648, "losses/total": 0.047234658151865005, "ref_logps/chosen": -30.708112716674805, "ref_logps/rejected": -37.573028564453125, "rewards/accuracies": 1.0, "rewards/chosen": -2.7476792335510254, "rewards/margins": 2.8678951263427734, "rewards/rejected": -5.615574359893799, "step": 2771 }, { "epoch": 2.62, "grad_norm": 10.217788345913267, "learning_rate": 2.1283383745771848e-08, "logps/chosen": -49.739356994628906, "logps/rejected": -99.28373718261719, "loss": 0.0804, "losses/dpo": 0.04198518022894859, "losses/sft": 2.8240363597869873, "losses/total": 0.04198518022894859, "ref_logps/chosen": -24.305585861206055, "ref_logps/rejected": -37.56924819946289, "rewards/accuracies": 1.0, "rewards/chosen": -2.54337739944458, "rewards/margins": 3.6280717849731445, "rewards/rejected": -6.171449661254883, "step": 2772 }, { "epoch": 2.62, "grad_norm": 18.23948442582933, "learning_rate": 2.1180678291447652e-08, "logps/chosen": -59.924468994140625, "logps/rejected": -86.37973022460938, "loss": 0.2311, "losses/dpo": 0.8079334497451782, "losses/sft": 2.0099782943725586, "losses/total": 0.8079334497451782, "ref_logps/chosen": -34.71525955200195, "ref_logps/rejected": -32.1442756652832, "rewards/accuracies": 0.875, "rewards/chosen": -2.520921230316162, "rewards/margins": 2.9026246070861816, "rewards/rejected": -5.4235453605651855, "step": 2773 }, { "epoch": 2.62, "grad_norm": 10.198866435636322, "learning_rate": 2.1078210282470115e-08, "logps/chosen": -47.18974304199219, "logps/rejected": -90.71170043945312, "loss": 0.1104, "losses/dpo": 0.004251506179571152, "losses/sft": 1.0759092569351196, "losses/total": 0.004251506179571152, "ref_logps/chosen": -26.94364356994629, "ref_logps/rejected": -38.08863830566406, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0246095657348633, "rewards/margins": 3.2376956939697266, "rewards/rejected": -5.26230525970459, "step": 2774 }, { "epoch": 2.62, "grad_norm": 13.045352921961921, "learning_rate": 2.0975979825170076e-08, "logps/chosen": -63.89512252807617, "logps/rejected": -119.4297103881836, "loss": 0.0898, "losses/dpo": 0.002285171067342162, "losses/sft": 1.0636892318725586, "losses/total": 0.002285171067342162, "ref_logps/chosen": -30.951295852661133, "ref_logps/rejected": -43.17805480957031, "rewards/accuracies": 1.0, "rewards/chosen": -3.2943830490112305, "rewards/margins": 4.330782890319824, "rewards/rejected": -7.625165939331055, "step": 2775 }, { "epoch": 2.62, "grad_norm": 6.014139018063561, "learning_rate": 2.087398702563198e-08, "logps/chosen": -58.73792266845703, "logps/rejected": -97.0787353515625, "loss": 0.0532, "losses/dpo": 0.058259788900613785, "losses/sft": 1.7226417064666748, "losses/total": 0.058259788900613785, "ref_logps/chosen": -33.5751838684082, "ref_logps/rejected": -38.4234504699707, "rewards/accuracies": 1.0, "rewards/chosen": -2.5162742137908936, "rewards/margins": 3.3492541313171387, "rewards/rejected": -5.865528583526611, "step": 2776 }, { "epoch": 2.62, "grad_norm": 18.368141228252988, "learning_rate": 2.077223198969344e-08, "logps/chosen": -56.297760009765625, "logps/rejected": -90.841552734375, "loss": 0.1842, "losses/dpo": 0.060791976749897, "losses/sft": 1.8201543092727661, "losses/total": 0.060791976749897, "ref_logps/chosen": -33.343589782714844, "ref_logps/rejected": -36.067893981933594, "rewards/accuracies": 0.875, "rewards/chosen": -2.295417308807373, "rewards/margins": 3.181948184967041, "rewards/rejected": -5.477365493774414, "step": 2777 }, { "epoch": 2.62, "grad_norm": 13.531624925476876, "learning_rate": 2.0670714822945522e-08, "logps/chosen": -40.116371154785156, "logps/rejected": -68.57271575927734, "loss": 0.16, "losses/dpo": 0.546197235584259, "losses/sft": 0.05926438793540001, "losses/total": 0.546197235584259, "ref_logps/chosen": -23.82113265991211, "ref_logps/rejected": -25.566822052001953, "rewards/accuracies": 1.0, "rewards/chosen": -1.629523754119873, "rewards/margins": 2.6710658073425293, "rewards/rejected": -4.300589561462402, "step": 2778 }, { "epoch": 2.62, "grad_norm": 7.284691423131697, "learning_rate": 2.0569435630732295e-08, "logps/chosen": -45.748294830322266, "logps/rejected": -87.69154357910156, "loss": 0.06, "losses/dpo": 0.1462639421224594, "losses/sft": 2.1889920234680176, "losses/total": 0.1462639421224594, "ref_logps/chosen": -25.864696502685547, "ref_logps/rejected": -33.772216796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9883596897125244, "rewards/margins": 3.4035730361938477, "rewards/rejected": -5.391932487487793, "step": 2779 }, { "epoch": 2.62, "grad_norm": 5.9047041532782405, "learning_rate": 2.046839451815108e-08, "logps/chosen": -31.137479782104492, "logps/rejected": -77.61946105957031, "loss": 0.0779, "losses/dpo": 0.0007183171692304313, "losses/sft": 1.53215754032135, "losses/total": 0.0007183171692304313, "ref_logps/chosen": -17.13630485534668, "ref_logps/rejected": -26.48965835571289, "rewards/accuracies": 1.0, "rewards/chosen": -1.4001176357269287, "rewards/margins": 3.712862491607666, "rewards/rejected": -5.112979888916016, "step": 2780 }, { "epoch": 2.62, "grad_norm": 8.503579806456873, "learning_rate": 2.0367591590051942e-08, "logps/chosen": -50.14327621459961, "logps/rejected": -89.3668212890625, "loss": 0.1047, "losses/dpo": 0.004324531182646751, "losses/sft": 1.3345733880996704, "losses/total": 0.004324531182646751, "ref_logps/chosen": -24.816898345947266, "ref_logps/rejected": -32.75376510620117, "rewards/accuracies": 1.0, "rewards/chosen": -2.5326380729675293, "rewards/margins": 3.1286673545837402, "rewards/rejected": -5.6613054275512695, "step": 2781 }, { "epoch": 2.62, "grad_norm": 16.796037463544405, "learning_rate": 2.0267026951037906e-08, "logps/chosen": -47.99809265136719, "logps/rejected": -67.80964660644531, "loss": 0.254, "losses/dpo": 0.0203064177185297, "losses/sft": 2.0999362468719482, "losses/total": 0.0203064177185297, "ref_logps/chosen": -27.567428588867188, "ref_logps/rejected": -23.94832420349121, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0430667400360107, "rewards/margins": 2.343066692352295, "rewards/rejected": -4.386133193969727, "step": 2782 }, { "epoch": 2.63, "grad_norm": 18.73429148037673, "learning_rate": 2.016670070546478e-08, "logps/chosen": -52.295555114746094, "logps/rejected": -74.56727600097656, "loss": 0.1445, "losses/dpo": 0.9796704649925232, "losses/sft": 2.6465938091278076, "losses/total": 0.9796704649925232, "ref_logps/chosen": -31.012624740600586, "ref_logps/rejected": -27.679927825927734, "rewards/accuracies": 0.9375, "rewards/chosen": -2.128293037414551, "rewards/margins": 2.560441732406616, "rewards/rejected": -4.688735008239746, "step": 2783 }, { "epoch": 2.63, "grad_norm": 13.819340651005534, "learning_rate": 2.006661295744075e-08, "logps/chosen": -52.32939147949219, "logps/rejected": -76.89653015136719, "loss": 0.1288, "losses/dpo": 0.2601540684700012, "losses/sft": 1.962113380432129, "losses/total": 0.2601540684700012, "ref_logps/chosen": -30.89973258972168, "ref_logps/rejected": -28.34990692138672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1429662704467773, "rewards/margins": 2.7116966247558594, "rewards/rejected": -4.854662895202637, "step": 2784 }, { "epoch": 2.63, "grad_norm": 22.829794820543235, "learning_rate": 1.996676381082682e-08, "logps/chosen": -40.18866729736328, "logps/rejected": -68.18269348144531, "loss": 0.3033, "losses/dpo": 0.026705702766776085, "losses/sft": 0.9418706297874451, "losses/total": 0.026705702766776085, "ref_logps/chosen": -23.97208023071289, "ref_logps/rejected": -25.370376586914062, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6216585636138916, "rewards/margins": 2.6595726013183594, "rewards/rejected": -4.281230926513672, "step": 2785 }, { "epoch": 2.63, "grad_norm": 8.314952305697986, "learning_rate": 1.9867153369236112e-08, "logps/chosen": -61.22450637817383, "logps/rejected": -98.9183349609375, "loss": 0.1007, "losses/dpo": 0.007618012838065624, "losses/sft": 1.154832124710083, "losses/total": 0.007618012838065624, "ref_logps/chosen": -39.181880950927734, "ref_logps/rejected": -39.27879333496094, "rewards/accuracies": 1.0, "rewards/chosen": -2.2042620182037354, "rewards/margins": 3.759692907333374, "rewards/rejected": -5.963954925537109, "step": 2786 }, { "epoch": 2.63, "grad_norm": 12.781451066175077, "learning_rate": 1.97677817360343e-08, "logps/chosen": -47.691123962402344, "logps/rejected": -96.13882446289062, "loss": 0.1022, "losses/dpo": 0.1300014704465866, "losses/sft": 1.440125584602356, "losses/total": 0.1300014704465866, "ref_logps/chosen": -25.957965850830078, "ref_logps/rejected": -39.03749084472656, "rewards/accuracies": 1.0, "rewards/chosen": -2.1733155250549316, "rewards/margins": 3.536818504333496, "rewards/rejected": -5.710134506225586, "step": 2787 }, { "epoch": 2.63, "grad_norm": 15.759082484658805, "learning_rate": 1.966864901433901e-08, "logps/chosen": -46.907569885253906, "logps/rejected": -88.58041381835938, "loss": 0.1679, "losses/dpo": 0.036769889295101166, "losses/sft": 2.605133295059204, "losses/total": 0.036769889295101166, "ref_logps/chosen": -25.978891372680664, "ref_logps/rejected": -36.72311019897461, "rewards/accuracies": 0.9375, "rewards/chosen": -2.092867851257324, "rewards/margins": 3.0928618907928467, "rewards/rejected": -5.18572998046875, "step": 2788 }, { "epoch": 2.63, "grad_norm": 13.49236806457064, "learning_rate": 1.9569755307020208e-08, "logps/chosen": -66.5517807006836, "logps/rejected": -96.81708526611328, "loss": 0.1191, "losses/dpo": 0.015381070785224438, "losses/sft": 1.4063947200775146, "losses/total": 0.015381070785224438, "ref_logps/chosen": -34.59379577636719, "ref_logps/rejected": -37.88645935058594, "rewards/accuracies": 1.0, "rewards/chosen": -3.195798635482788, "rewards/margins": 2.6972641944885254, "rewards/rejected": -5.893062591552734, "step": 2789 }, { "epoch": 2.63, "grad_norm": 17.6450032218876, "learning_rate": 1.9471100716699523e-08, "logps/chosen": -47.754310607910156, "logps/rejected": -83.47128295898438, "loss": 0.1937, "losses/dpo": 0.7982841730117798, "losses/sft": 1.2709115743637085, "losses/total": 0.7982841730117798, "ref_logps/chosen": -24.730716705322266, "ref_logps/rejected": -30.86513900756836, "rewards/accuracies": 0.875, "rewards/chosen": -2.3023593425750732, "rewards/margins": 2.958254814147949, "rewards/rejected": -5.260613918304443, "step": 2790 }, { "epoch": 2.63, "grad_norm": 26.256619363834673, "learning_rate": 1.9372685345750762e-08, "logps/chosen": -49.73558044433594, "logps/rejected": -72.3668212890625, "loss": 0.4712, "losses/dpo": 0.3017767667770386, "losses/sft": 0.9928619265556335, "losses/total": 0.3017767667770386, "ref_logps/chosen": -27.740989685058594, "ref_logps/rejected": -26.50198745727539, "rewards/accuracies": 0.875, "rewards/chosen": -2.1994590759277344, "rewards/margins": 2.38702392578125, "rewards/rejected": -4.586483001708984, "step": 2791 }, { "epoch": 2.63, "grad_norm": 25.03069877765949, "learning_rate": 1.9274509296299313e-08, "logps/chosen": -68.91224670410156, "logps/rejected": -96.59869384765625, "loss": 0.2157, "losses/dpo": 0.11170407384634018, "losses/sft": 1.4747097492218018, "losses/total": 0.11170407384634018, "ref_logps/chosen": -38.23283386230469, "ref_logps/rejected": -37.81830978393555, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0679421424865723, "rewards/margins": 2.810096263885498, "rewards/rejected": -5.87803840637207, "step": 2792 }, { "epoch": 2.63, "grad_norm": 6.5837960847670765, "learning_rate": 1.917657267022224e-08, "logps/chosen": -49.02075958251953, "logps/rejected": -100.90135192871094, "loss": 0.0499, "losses/dpo": 0.013848483562469482, "losses/sft": 1.9582533836364746, "losses/total": 0.013848483562469482, "ref_logps/chosen": -22.933813095092773, "ref_logps/rejected": -34.84961700439453, "rewards/accuracies": 1.0, "rewards/chosen": -2.608694553375244, "rewards/margins": 3.9964792728424072, "rewards/rejected": -6.6051740646362305, "step": 2793 }, { "epoch": 2.64, "grad_norm": 8.865486529696618, "learning_rate": 1.9078875569148223e-08, "logps/chosen": -49.60749816894531, "logps/rejected": -87.40669250488281, "loss": 0.0846, "losses/dpo": 0.03583729267120361, "losses/sft": 0.5461621880531311, "losses/total": 0.03583729267120361, "ref_logps/chosen": -33.51853942871094, "ref_logps/rejected": -35.743621826171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.608896017074585, "rewards/margins": 3.557410478591919, "rewards/rejected": -5.166306495666504, "step": 2794 }, { "epoch": 2.64, "grad_norm": 12.275237920756474, "learning_rate": 1.8981418094457273e-08, "logps/chosen": -61.42280960083008, "logps/rejected": -90.388671875, "loss": 0.1145, "losses/dpo": 0.07341725379228592, "losses/sft": 2.5974156856536865, "losses/total": 0.07341725379228592, "ref_logps/chosen": -33.775333404541016, "ref_logps/rejected": -35.52915954589844, "rewards/accuracies": 1.0, "rewards/chosen": -2.7647480964660645, "rewards/margins": 2.721202850341797, "rewards/rejected": -5.485950469970703, "step": 2795 }, { "epoch": 2.64, "grad_norm": 17.19520973036019, "learning_rate": 1.8884200347280944e-08, "logps/chosen": -58.516441345214844, "logps/rejected": -78.54539489746094, "loss": 0.1793, "losses/dpo": 0.05865200608968735, "losses/sft": 2.401871919631958, "losses/total": 0.05865200608968735, "ref_logps/chosen": -31.00728416442871, "ref_logps/rejected": -29.87194061279297, "rewards/accuracies": 1.0, "rewards/chosen": -2.750915765762329, "rewards/margins": 2.116429090499878, "rewards/rejected": -4.867344856262207, "step": 2796 }, { "epoch": 2.64, "grad_norm": 9.095745584090947, "learning_rate": 1.8787222428501755e-08, "logps/chosen": -52.90055847167969, "logps/rejected": -105.30073547363281, "loss": 0.0782, "losses/dpo": 0.038164734840393066, "losses/sft": 0.8702307939529419, "losses/total": 0.038164734840393066, "ref_logps/chosen": -28.539430618286133, "ref_logps/rejected": -45.489864349365234, "rewards/accuracies": 1.0, "rewards/chosen": -2.436112880706787, "rewards/margins": 3.5449745655059814, "rewards/rejected": -5.9810872077941895, "step": 2797 }, { "epoch": 2.64, "grad_norm": 10.94848353595915, "learning_rate": 1.8690484438753596e-08, "logps/chosen": -55.06923294067383, "logps/rejected": -109.53988647460938, "loss": 0.0889, "losses/dpo": 0.0020082583650946617, "losses/sft": 1.2426941394805908, "losses/total": 0.0020082583650946617, "ref_logps/chosen": -25.831157684326172, "ref_logps/rejected": -41.60638427734375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9238076210021973, "rewards/margins": 3.869542121887207, "rewards/rejected": -6.793349266052246, "step": 2798 }, { "epoch": 2.64, "grad_norm": 8.258299698984725, "learning_rate": 1.85939864784212e-08, "logps/chosen": -53.41211700439453, "logps/rejected": -96.54610443115234, "loss": 0.0827, "losses/dpo": 0.013262515887618065, "losses/sft": 2.8368637561798096, "losses/total": 0.013262515887618065, "ref_logps/chosen": -29.957237243652344, "ref_logps/rejected": -36.29106903076172, "rewards/accuracies": 1.0, "rewards/chosen": -2.3454883098602295, "rewards/margins": 3.680016040802002, "rewards/rejected": -6.025504112243652, "step": 2799 }, { "epoch": 2.64, "grad_norm": 15.914942832858458, "learning_rate": 1.849772864764035e-08, "logps/chosen": -41.410057067871094, "logps/rejected": -83.02757263183594, "loss": 0.1729, "losses/dpo": 0.0014260528841987252, "losses/sft": 1.616995930671692, "losses/total": 0.0014260528841987252, "ref_logps/chosen": -20.717235565185547, "ref_logps/rejected": -29.854806900024414, "rewards/accuracies": 0.875, "rewards/chosen": -2.069282054901123, "rewards/margins": 3.2479944229125977, "rewards/rejected": -5.317276477813721, "step": 2800 }, { "epoch": 2.64, "grad_norm": 17.859322935435408, "learning_rate": 1.840171104629762e-08, "logps/chosen": -42.48439025878906, "logps/rejected": -67.95292663574219, "loss": 0.2383, "losses/dpo": 0.0411231704056263, "losses/sft": 1.461352825164795, "losses/total": 0.0411231704056263, "ref_logps/chosen": -20.531326293945312, "ref_logps/rejected": -23.354581832885742, "rewards/accuracies": 0.875, "rewards/chosen": -2.1953063011169434, "rewards/margins": 2.264528751373291, "rewards/rejected": -4.459835052490234, "step": 2801 }, { "epoch": 2.64, "grad_norm": 14.647495131746684, "learning_rate": 1.8305933774030252e-08, "logps/chosen": -51.805419921875, "logps/rejected": -77.70191192626953, "loss": 0.1469, "losses/dpo": 0.3088758587837219, "losses/sft": 1.784153938293457, "losses/total": 0.3088758587837219, "ref_logps/chosen": -30.62432861328125, "ref_logps/rejected": -28.343734741210938, "rewards/accuracies": 1.0, "rewards/chosen": -2.1181089878082275, "rewards/margins": 2.817708730697632, "rewards/rejected": -4.935817718505859, "step": 2802 }, { "epoch": 2.64, "grad_norm": 2.766268391907861, "learning_rate": 1.8210396930226164e-08, "logps/chosen": -58.942527770996094, "logps/rejected": -110.30023193359375, "loss": 0.0194, "losses/dpo": 0.009241566993296146, "losses/sft": 1.1629202365875244, "losses/total": 0.009241566993296146, "ref_logps/chosen": -34.665645599365234, "ref_logps/rejected": -39.39238357543945, "rewards/accuracies": 1.0, "rewards/chosen": -2.4276885986328125, "rewards/margins": 4.6630964279174805, "rewards/rejected": -7.090785026550293, "step": 2803 }, { "epoch": 2.65, "grad_norm": 18.882770252136567, "learning_rate": 1.811510061402366e-08, "logps/chosen": -46.472442626953125, "logps/rejected": -71.3446044921875, "loss": 0.1635, "losses/dpo": 0.07880403846502304, "losses/sft": 1.5317310094833374, "losses/total": 0.07880403846502304, "ref_logps/chosen": -26.476078033447266, "ref_logps/rejected": -26.322540283203125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9996364116668701, "rewards/margins": 2.5025696754455566, "rewards/rejected": -4.502206325531006, "step": 2804 }, { "epoch": 2.65, "grad_norm": 9.180100146897596, "learning_rate": 1.8020044924311667e-08, "logps/chosen": -52.34739685058594, "logps/rejected": -93.03675842285156, "loss": 0.0854, "losses/dpo": 0.039273008704185486, "losses/sft": 1.6164549589157104, "losses/total": 0.039273008704185486, "ref_logps/chosen": -28.314468383789062, "ref_logps/rejected": -35.2613639831543, "rewards/accuracies": 1.0, "rewards/chosen": -2.4032931327819824, "rewards/margins": 3.3742458820343018, "rewards/rejected": -5.777539253234863, "step": 2805 }, { "epoch": 2.65, "grad_norm": 15.54844807927381, "learning_rate": 1.7925229959729143e-08, "logps/chosen": -45.566162109375, "logps/rejected": -77.46324157714844, "loss": 0.2015, "losses/dpo": 0.003936342895030975, "losses/sft": 1.2413064241409302, "losses/total": 0.003936342895030975, "ref_logps/chosen": -20.306610107421875, "ref_logps/rejected": -25.013856887817383, "rewards/accuracies": 0.875, "rewards/chosen": -2.5259552001953125, "rewards/margins": 2.7189834117889404, "rewards/rejected": -5.244938850402832, "step": 2806 }, { "epoch": 2.65, "grad_norm": 14.60077851282485, "learning_rate": 1.7830655818665498e-08, "logps/chosen": -51.118438720703125, "logps/rejected": -91.32125854492188, "loss": 0.1903, "losses/dpo": 0.0008280183537863195, "losses/sft": 1.0369209051132202, "losses/total": 0.0008280183537863195, "ref_logps/chosen": -28.783084869384766, "ref_logps/rejected": -35.524593353271484, "rewards/accuracies": 0.875, "rewards/chosen": -2.2335352897644043, "rewards/margins": 3.3461318016052246, "rewards/rejected": -5.579667091369629, "step": 2807 }, { "epoch": 2.65, "grad_norm": 5.902302756062262, "learning_rate": 1.773632259926003e-08, "logps/chosen": -53.92626953125, "logps/rejected": -120.39421081542969, "loss": 0.038, "losses/dpo": 0.008504980243742466, "losses/sft": 0.9597277641296387, "losses/total": 0.008504980243742466, "ref_logps/chosen": -31.68640899658203, "ref_logps/rejected": -49.736148834228516, "rewards/accuracies": 1.0, "rewards/chosen": -2.2239861488342285, "rewards/margins": 4.841819763183594, "rewards/rejected": -7.065805912017822, "step": 2808 }, { "epoch": 2.65, "grad_norm": 9.259939938931932, "learning_rate": 1.7642230399402187e-08, "logps/chosen": -51.47581100463867, "logps/rejected": -99.86223602294922, "loss": 0.0804, "losses/dpo": 0.27831658720970154, "losses/sft": 1.4141430854797363, "losses/total": 0.27831658720970154, "ref_logps/chosen": -28.633926391601562, "ref_logps/rejected": -38.88172149658203, "rewards/accuracies": 1.0, "rewards/chosen": -2.2841885089874268, "rewards/margins": 3.8138628005981445, "rewards/rejected": -6.098052024841309, "step": 2809 }, { "epoch": 2.65, "grad_norm": 14.653133561016043, "learning_rate": 1.7548379316731283e-08, "logps/chosen": -61.024940490722656, "logps/rejected": -102.30985260009766, "loss": 0.1439, "losses/dpo": 0.09312650561332703, "losses/sft": 1.6006309986114502, "losses/total": 0.09312650561332703, "ref_logps/chosen": -33.73482131958008, "ref_logps/rejected": -42.90214538574219, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7290122509002686, "rewards/margins": 3.211759090423584, "rewards/rejected": -5.940771102905273, "step": 2810 }, { "epoch": 2.65, "grad_norm": 17.356105453326823, "learning_rate": 1.7454769448636303e-08, "logps/chosen": -47.698341369628906, "logps/rejected": -74.54415893554688, "loss": 0.1741, "losses/dpo": 0.0682608112692833, "losses/sft": 1.345615267753601, "losses/total": 0.0682608112692833, "ref_logps/chosen": -25.941452026367188, "ref_logps/rejected": -24.989511489868164, "rewards/accuracies": 1.0, "rewards/chosen": -2.175689220428467, "rewards/margins": 2.779775619506836, "rewards/rejected": -4.955464839935303, "step": 2811 }, { "epoch": 2.65, "grad_norm": 16.179799508202056, "learning_rate": 1.736140089225613e-08, "logps/chosen": -26.934890747070312, "logps/rejected": -79.48282623291016, "loss": 0.2028, "losses/dpo": 0.013190564699470997, "losses/sft": 1.28653883934021, "losses/total": 0.013190564699470997, "ref_logps/chosen": -13.415700912475586, "ref_logps/rejected": -32.41227340698242, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3519190549850464, "rewards/margins": 3.355135679244995, "rewards/rejected": -4.70705509185791, "step": 2812 }, { "epoch": 2.65, "grad_norm": 13.77300324125103, "learning_rate": 1.7268273744479006e-08, "logps/chosen": -47.329612731933594, "logps/rejected": -89.64637756347656, "loss": 0.1258, "losses/dpo": 0.8528265357017517, "losses/sft": 1.8794991970062256, "losses/total": 0.8528265357017517, "ref_logps/chosen": -26.589588165283203, "ref_logps/rejected": -30.40192413330078, "rewards/accuracies": 0.9375, "rewards/chosen": -2.074002504348755, "rewards/margins": 3.850443124771118, "rewards/rejected": -5.924445629119873, "step": 2813 }, { "epoch": 2.65, "grad_norm": 9.752858381679525, "learning_rate": 1.7175388101942885e-08, "logps/chosen": -48.241966247558594, "logps/rejected": -95.4462890625, "loss": 0.1139, "losses/dpo": 0.008726786822080612, "losses/sft": 1.3472671508789062, "losses/total": 0.008726786822080612, "ref_logps/chosen": -24.7725830078125, "ref_logps/rejected": -38.76769256591797, "rewards/accuracies": 1.0, "rewards/chosen": -2.3469386100769043, "rewards/margins": 3.320920467376709, "rewards/rejected": -5.667859077453613, "step": 2814 }, { "epoch": 2.66, "grad_norm": 9.590982973502335, "learning_rate": 1.7082744061034963e-08, "logps/chosen": -61.36180877685547, "logps/rejected": -91.30992126464844, "loss": 0.0839, "losses/dpo": 0.014681704342365265, "losses/sft": 1.9502456188201904, "losses/total": 0.014681704342365265, "ref_logps/chosen": -35.74493408203125, "ref_logps/rejected": -34.633140563964844, "rewards/accuracies": 1.0, "rewards/chosen": -2.561687707901001, "rewards/margins": 3.105990171432495, "rewards/rejected": -5.667677879333496, "step": 2815 }, { "epoch": 2.66, "grad_norm": 10.364722934794802, "learning_rate": 1.6990341717891738e-08, "logps/chosen": -63.07079315185547, "logps/rejected": -104.24777221679688, "loss": 0.0845, "losses/dpo": 0.0007236601668410003, "losses/sft": 1.6045522689819336, "losses/total": 0.0007236601668410003, "ref_logps/chosen": -35.30938720703125, "ref_logps/rejected": -43.48402404785156, "rewards/accuracies": 1.0, "rewards/chosen": -2.7761411666870117, "rewards/margins": 3.300233840942383, "rewards/rejected": -6.0763750076293945, "step": 2816 }, { "epoch": 2.66, "grad_norm": 13.489322584898098, "learning_rate": 1.6898181168399055e-08, "logps/chosen": -66.53965759277344, "logps/rejected": -99.47126770019531, "loss": 0.1208, "losses/dpo": 0.04874548688530922, "losses/sft": 3.0751006603240967, "losses/total": 0.04874548688530922, "ref_logps/chosen": -35.16459274291992, "ref_logps/rejected": -38.216026306152344, "rewards/accuracies": 0.9375, "rewards/chosen": -3.1375067234039307, "rewards/margins": 2.9880177974700928, "rewards/rejected": -6.125524520874023, "step": 2817 }, { "epoch": 2.66, "grad_norm": 7.504245019728615, "learning_rate": 1.6806262508191587e-08, "logps/chosen": -46.265838623046875, "logps/rejected": -87.17449951171875, "loss": 0.0904, "losses/dpo": 0.024305099621415138, "losses/sft": 1.8427212238311768, "losses/total": 0.024305099621415138, "ref_logps/chosen": -22.698129653930664, "ref_logps/rejected": -31.11077308654785, "rewards/accuracies": 1.0, "rewards/chosen": -2.356771230697632, "rewards/margins": 3.2496018409729004, "rewards/rejected": -5.606373310089111, "step": 2818 }, { "epoch": 2.66, "grad_norm": 9.632721636289803, "learning_rate": 1.6714585832653254e-08, "logps/chosen": -48.46437072753906, "logps/rejected": -89.90699768066406, "loss": 0.0938, "losses/dpo": 0.13515493273735046, "losses/sft": 1.398819088935852, "losses/total": 0.13515493273735046, "ref_logps/chosen": -25.489238739013672, "ref_logps/rejected": -30.069202423095703, "rewards/accuracies": 1.0, "rewards/chosen": -2.297513484954834, "rewards/margins": 3.6862661838531494, "rewards/rejected": -5.983779430389404, "step": 2819 }, { "epoch": 2.66, "grad_norm": 8.453531622403126, "learning_rate": 1.662315123691668e-08, "logps/chosen": -53.70713806152344, "logps/rejected": -101.06900787353516, "loss": 0.0924, "losses/dpo": 0.0716320052742958, "losses/sft": 2.294779062271118, "losses/total": 0.0716320052742958, "ref_logps/chosen": -28.88657569885254, "ref_logps/rejected": -41.520965576171875, "rewards/accuracies": 1.0, "rewards/chosen": -2.482056140899658, "rewards/margins": 3.4727485179901123, "rewards/rejected": -5.954804420471191, "step": 2820 }, { "epoch": 2.66, "grad_norm": 20.12241050045552, "learning_rate": 1.6531958815863417e-08, "logps/chosen": -41.16617202758789, "logps/rejected": -73.81826782226562, "loss": 0.2529, "losses/dpo": 1.6139299869537354, "losses/sft": 3.103987693786621, "losses/total": 1.6139299869537354, "ref_logps/chosen": -26.090408325195312, "ref_logps/rejected": -32.77730178833008, "rewards/accuracies": 0.875, "rewards/chosen": -1.5075764656066895, "rewards/margins": 2.596520185470581, "rewards/rejected": -4.104096412658691, "step": 2821 }, { "epoch": 2.66, "grad_norm": 9.408507200289522, "learning_rate": 1.6441008664123595e-08, "logps/chosen": -55.764434814453125, "logps/rejected": -92.21015167236328, "loss": 0.0726, "losses/dpo": 0.0010207192972302437, "losses/sft": 1.4202988147735596, "losses/total": 0.0010207192972302437, "ref_logps/chosen": -30.147777557373047, "ref_logps/rejected": -31.179035186767578, "rewards/accuracies": 1.0, "rewards/chosen": -2.5616655349731445, "rewards/margins": 3.5414462089538574, "rewards/rejected": -6.103111743927002, "step": 2822 }, { "epoch": 2.66, "grad_norm": 13.272500309952548, "learning_rate": 1.6350300876076056e-08, "logps/chosen": -50.03733825683594, "logps/rejected": -92.8902587890625, "loss": 0.1597, "losses/dpo": 0.0028709254693239927, "losses/sft": 1.8538126945495605, "losses/total": 0.0028709254693239927, "ref_logps/chosen": -23.639450073242188, "ref_logps/rejected": -33.99414825439453, "rewards/accuracies": 1.0, "rewards/chosen": -2.639788866043091, "rewards/margins": 3.2498221397399902, "rewards/rejected": -5.88961124420166, "step": 2823 }, { "epoch": 2.66, "grad_norm": 12.462671503146598, "learning_rate": 1.6259835545848034e-08, "logps/chosen": -56.62602996826172, "logps/rejected": -96.43949890136719, "loss": 0.13, "losses/dpo": 0.005246830638498068, "losses/sft": 1.6841330528259277, "losses/total": 0.005246830638498068, "ref_logps/chosen": -31.555946350097656, "ref_logps/rejected": -37.83412170410156, "rewards/accuracies": 1.0, "rewards/chosen": -2.5070080757141113, "rewards/margins": 3.3535304069519043, "rewards/rejected": -5.860538482666016, "step": 2824 }, { "epoch": 2.67, "grad_norm": 6.283619954008635, "learning_rate": 1.6169612767315266e-08, "logps/chosen": -43.59754180908203, "logps/rejected": -87.22843933105469, "loss": 0.0447, "losses/dpo": 0.04967693239450455, "losses/sft": 2.7539899349212646, "losses/total": 0.04967693239450455, "ref_logps/chosen": -25.274906158447266, "ref_logps/rejected": -29.527584075927734, "rewards/accuracies": 1.0, "rewards/chosen": -1.832263708114624, "rewards/margins": 3.937821388244629, "rewards/rejected": -5.770085334777832, "step": 2825 }, { "epoch": 2.67, "grad_norm": 8.153659335873753, "learning_rate": 1.6079632634101753e-08, "logps/chosen": -55.52337646484375, "logps/rejected": -106.68479919433594, "loss": 0.0854, "losses/dpo": 0.09941048920154572, "losses/sft": 2.331650495529175, "losses/total": 0.09941048920154572, "ref_logps/chosen": -27.051109313964844, "ref_logps/rejected": -41.40109634399414, "rewards/accuracies": 1.0, "rewards/chosen": -2.847226142883301, "rewards/margins": 3.6811442375183105, "rewards/rejected": -6.528370380401611, "step": 2826 }, { "epoch": 2.67, "grad_norm": 11.831454681084063, "learning_rate": 1.5989895239579665e-08, "logps/chosen": -51.01895523071289, "logps/rejected": -116.04660034179688, "loss": 0.0779, "losses/dpo": 0.1077575758099556, "losses/sft": 2.27634334564209, "losses/total": 0.1077575758099556, "ref_logps/chosen": -23.92596435546875, "ref_logps/rejected": -43.02252960205078, "rewards/accuracies": 0.9375, "rewards/chosen": -2.709299087524414, "rewards/margins": 4.593108177185059, "rewards/rejected": -7.302407264709473, "step": 2827 }, { "epoch": 2.67, "grad_norm": 10.616512928142756, "learning_rate": 1.5900400676869347e-08, "logps/chosen": -54.854820251464844, "logps/rejected": -89.3753662109375, "loss": 0.1014, "losses/dpo": 0.35202786326408386, "losses/sft": 1.6465380191802979, "losses/total": 0.35202786326408386, "ref_logps/chosen": -30.85869598388672, "ref_logps/rejected": -33.978675842285156, "rewards/accuracies": 1.0, "rewards/chosen": -2.3996124267578125, "rewards/margins": 3.1400563716888428, "rewards/rejected": -5.539669036865234, "step": 2828 }, { "epoch": 2.67, "grad_norm": 13.27604250635405, "learning_rate": 1.5811149038839038e-08, "logps/chosen": -55.8221435546875, "logps/rejected": -91.17778015136719, "loss": 0.1947, "losses/dpo": 0.12854258716106415, "losses/sft": 3.0231337547302246, "losses/total": 0.12854258716106415, "ref_logps/chosen": -30.731853485107422, "ref_logps/rejected": -36.0888671875, "rewards/accuracies": 0.875, "rewards/chosen": -2.509028911590576, "rewards/margins": 2.9998626708984375, "rewards/rejected": -5.5088911056518555, "step": 2829 }, { "epoch": 2.67, "grad_norm": 22.70708942613047, "learning_rate": 1.572214041810513e-08, "logps/chosen": -54.56840515136719, "logps/rejected": -79.3572998046875, "loss": 0.2197, "losses/dpo": 0.33177652955055237, "losses/sft": 1.4276703596115112, "losses/total": 0.33177652955055237, "ref_logps/chosen": -30.567705154418945, "ref_logps/rejected": -30.814956665039062, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4000699520111084, "rewards/margins": 2.4541640281677246, "rewards/rejected": -4.854233741760254, "step": 2830 }, { "epoch": 2.67, "grad_norm": 21.151471559510373, "learning_rate": 1.563337490703154e-08, "logps/chosen": -46.9577522277832, "logps/rejected": -73.68101501464844, "loss": 0.2577, "losses/dpo": 0.021885527297854424, "losses/sft": 1.156499981880188, "losses/total": 0.021885527297854424, "ref_logps/chosen": -23.73055648803711, "ref_logps/rejected": -24.481159210205078, "rewards/accuracies": 0.875, "rewards/chosen": -2.3227195739746094, "rewards/margins": 2.597266435623169, "rewards/rejected": -4.919985771179199, "step": 2831 }, { "epoch": 2.67, "grad_norm": 12.104896635925428, "learning_rate": 1.5544852597730167e-08, "logps/chosen": -48.095314025878906, "logps/rejected": -85.69065856933594, "loss": 0.1413, "losses/dpo": 0.2104538381099701, "losses/sft": 0.12857858836650848, "losses/total": 0.2104538381099701, "ref_logps/chosen": -24.947067260742188, "ref_logps/rejected": -31.70560073852539, "rewards/accuracies": 1.0, "rewards/chosen": -2.3148248195648193, "rewards/margins": 3.083681106567383, "rewards/rejected": -5.398506164550781, "step": 2832 }, { "epoch": 2.67, "grad_norm": 9.718407369063774, "learning_rate": 1.5456573582060372e-08, "logps/chosen": -51.2730712890625, "logps/rejected": -86.31947326660156, "loss": 0.0773, "losses/dpo": 0.19249127805233002, "losses/sft": 0.16010436415672302, "losses/total": 0.19249127805233002, "ref_logps/chosen": -26.95941162109375, "ref_logps/rejected": -31.209861755371094, "rewards/accuracies": 1.0, "rewards/chosen": -2.431365966796875, "rewards/margins": 3.0795950889587402, "rewards/rejected": -5.510961055755615, "step": 2833 }, { "epoch": 2.67, "grad_norm": 12.03216166866427, "learning_rate": 1.5368537951629123e-08, "logps/chosen": -53.357662200927734, "logps/rejected": -95.70645904541016, "loss": 0.11, "losses/dpo": 0.05409887433052063, "losses/sft": 0.4967217743396759, "losses/total": 0.05409887433052063, "ref_logps/chosen": -27.496498107910156, "ref_logps/rejected": -34.61152648925781, "rewards/accuracies": 0.9375, "rewards/chosen": -2.586116313934326, "rewards/margins": 3.523376703262329, "rewards/rejected": -6.109493255615234, "step": 2834 }, { "epoch": 2.67, "grad_norm": 9.097918072913965, "learning_rate": 1.5280745797790843e-08, "logps/chosen": -42.50740051269531, "logps/rejected": -93.31790161132812, "loss": 0.081, "losses/dpo": 0.003283426631242037, "losses/sft": 3.235013246536255, "losses/total": 0.003283426631242037, "ref_logps/chosen": -24.826770782470703, "ref_logps/rejected": -33.871543884277344, "rewards/accuracies": 1.0, "rewards/chosen": -1.7680628299713135, "rewards/margins": 4.176572799682617, "rewards/rejected": -5.944635391235352, "step": 2835 }, { "epoch": 2.68, "grad_norm": 7.855530520110218, "learning_rate": 1.5193197211647218e-08, "logps/chosen": -50.95653533935547, "logps/rejected": -99.20956420898438, "loss": 0.0707, "losses/dpo": 0.005761385895311832, "losses/sft": 1.8886492252349854, "losses/total": 0.005761385895311832, "ref_logps/chosen": -28.070512771606445, "ref_logps/rejected": -39.599849700927734, "rewards/accuracies": 1.0, "rewards/chosen": -2.288602113723755, "rewards/margins": 3.6723694801330566, "rewards/rejected": -5.960971832275391, "step": 2836 }, { "epoch": 2.68, "grad_norm": 7.675577582126389, "learning_rate": 1.5105892284047322e-08, "logps/chosen": -49.420318603515625, "logps/rejected": -96.48568725585938, "loss": 0.0788, "losses/dpo": 0.005915814079344273, "losses/sft": 2.795821189880371, "losses/total": 0.005915814079344273, "ref_logps/chosen": -27.972320556640625, "ref_logps/rejected": -38.0633659362793, "rewards/accuracies": 1.0, "rewards/chosen": -2.1447997093200684, "rewards/margins": 3.697432518005371, "rewards/rejected": -5.842232704162598, "step": 2837 }, { "epoch": 2.68, "grad_norm": 11.122668828952447, "learning_rate": 1.5018831105587216e-08, "logps/chosen": -61.794830322265625, "logps/rejected": -119.93282318115234, "loss": 0.1105, "losses/dpo": 0.005420593079179525, "losses/sft": 2.6616153717041016, "losses/total": 0.005420593079179525, "ref_logps/chosen": -35.59931182861328, "ref_logps/rejected": -49.88977813720703, "rewards/accuracies": 1.0, "rewards/chosen": -2.619551658630371, "rewards/margins": 4.384753227233887, "rewards/rejected": -7.004304885864258, "step": 2838 }, { "epoch": 2.68, "grad_norm": 15.791642521778758, "learning_rate": 1.493201376661016e-08, "logps/chosen": -63.725223541259766, "logps/rejected": -103.16880798339844, "loss": 0.0991, "losses/dpo": 0.033112429082393646, "losses/sft": 1.186898112297058, "losses/total": 0.033112429082393646, "ref_logps/chosen": -36.538143157958984, "ref_logps/rejected": -40.094810485839844, "rewards/accuracies": 1.0, "rewards/chosen": -2.7187085151672363, "rewards/margins": 3.588690757751465, "rewards/rejected": -6.307399272918701, "step": 2839 }, { "epoch": 2.68, "grad_norm": 8.194494530480082, "learning_rate": 1.4845440357206279e-08, "logps/chosen": -44.15911102294922, "logps/rejected": -85.79733276367188, "loss": 0.0853, "losses/dpo": 0.02067658118903637, "losses/sft": 1.688472867012024, "losses/total": 0.02067658118903637, "ref_logps/chosen": -23.265106201171875, "ref_logps/rejected": -32.755374908447266, "rewards/accuracies": 1.0, "rewards/chosen": -2.089400291442871, "rewards/margins": 3.2147955894470215, "rewards/rejected": -5.304195880889893, "step": 2840 }, { "epoch": 2.68, "grad_norm": 10.953404884321003, "learning_rate": 1.4759110967212718e-08, "logps/chosen": -52.511375427246094, "logps/rejected": -87.5341796875, "loss": 0.1157, "losses/dpo": 0.0003920833405572921, "losses/sft": 0.3422554135322571, "losses/total": 0.0003920833405572921, "ref_logps/chosen": -30.63216209411621, "ref_logps/rejected": -32.049461364746094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1879212856292725, "rewards/margins": 3.3605499267578125, "rewards/rejected": -5.548471450805664, "step": 2841 }, { "epoch": 2.68, "grad_norm": 28.40111703078128, "learning_rate": 1.4673025686213204e-08, "logps/chosen": -63.26511764526367, "logps/rejected": -82.4879150390625, "loss": 0.3305, "losses/dpo": 0.42105165123939514, "losses/sft": 1.978668212890625, "losses/total": 0.42105165123939514, "ref_logps/chosen": -34.495460510253906, "ref_logps/rejected": -33.183082580566406, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8769657611846924, "rewards/margins": 2.053518295288086, "rewards/rejected": -4.930483818054199, "step": 2842 }, { "epoch": 2.68, "grad_norm": 11.218115632112259, "learning_rate": 1.4587184603538304e-08, "logps/chosen": -67.41361236572266, "logps/rejected": -101.72064971923828, "loss": 0.1239, "losses/dpo": 0.06529197841882706, "losses/sft": 3.0805039405822754, "losses/total": 0.06529197841882706, "ref_logps/chosen": -38.012596130371094, "ref_logps/rejected": -43.404296875, "rewards/accuracies": 1.0, "rewards/chosen": -2.9401016235351562, "rewards/margins": 2.891533851623535, "rewards/rejected": -5.83163595199585, "step": 2843 }, { "epoch": 2.68, "grad_norm": 12.089528632899027, "learning_rate": 1.4501587808265131e-08, "logps/chosen": -54.98820495605469, "logps/rejected": -97.18350219726562, "loss": 0.1092, "losses/dpo": 1.2536797523498535, "losses/sft": 2.4683852195739746, "losses/total": 1.2536797523498535, "ref_logps/chosen": -31.95380973815918, "ref_logps/rejected": -36.60350036621094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3034398555755615, "rewards/margins": 3.7545604705810547, "rewards/rejected": -6.058000564575195, "step": 2844 }, { "epoch": 2.68, "grad_norm": 8.881945638405377, "learning_rate": 1.4416235389217379e-08, "logps/chosen": -47.33195877075195, "logps/rejected": -89.97486877441406, "loss": 0.1325, "losses/dpo": 0.029565947130322456, "losses/sft": 1.434861421585083, "losses/total": 0.029565947130322456, "ref_logps/chosen": -26.36478614807129, "ref_logps/rejected": -36.048213958740234, "rewards/accuracies": 0.9375, "rewards/chosen": -2.096717357635498, "rewards/margins": 3.2959482669830322, "rewards/rejected": -5.392665386199951, "step": 2845 }, { "epoch": 2.68, "grad_norm": 15.368594484472753, "learning_rate": 1.4331127434964951e-08, "logps/chosen": -51.26707077026367, "logps/rejected": -76.73634338378906, "loss": 0.153, "losses/dpo": 0.5841065049171448, "losses/sft": 2.8146655559539795, "losses/total": 0.5841065049171448, "ref_logps/chosen": -25.7127742767334, "ref_logps/rejected": -25.677526473999023, "rewards/accuracies": 1.0, "rewards/chosen": -2.555429458618164, "rewards/margins": 2.5504517555236816, "rewards/rejected": -5.105881214141846, "step": 2846 }, { "epoch": 2.69, "grad_norm": 21.581923912093192, "learning_rate": 1.4246264033824284e-08, "logps/chosen": -61.81499481201172, "logps/rejected": -84.83018493652344, "loss": 0.2397, "losses/dpo": 0.2505514919757843, "losses/sft": 2.7463629245758057, "losses/total": 0.2505514919757843, "ref_logps/chosen": -33.430233001708984, "ref_logps/rejected": -30.664228439331055, "rewards/accuracies": 0.875, "rewards/chosen": -2.8384764194488525, "rewards/margins": 2.578120231628418, "rewards/rejected": -5.416596412658691, "step": 2847 }, { "epoch": 2.69, "grad_norm": 17.778608681097413, "learning_rate": 1.4161645273857963e-08, "logps/chosen": -54.53549575805664, "logps/rejected": -94.1983642578125, "loss": 0.1857, "losses/dpo": 0.14470572769641876, "losses/sft": 1.6628787517547607, "losses/total": 0.14470572769641876, "ref_logps/chosen": -27.57988166809082, "ref_logps/rejected": -41.42209243774414, "rewards/accuracies": 0.9375, "rewards/chosen": -2.695561170578003, "rewards/margins": 2.5820655822753906, "rewards/rejected": -5.277626991271973, "step": 2848 }, { "epoch": 2.69, "grad_norm": 7.435760653352009, "learning_rate": 1.4077271242874639e-08, "logps/chosen": -46.57875442504883, "logps/rejected": -90.34742736816406, "loss": 0.0806, "losses/dpo": 0.030564261600375175, "losses/sft": 2.0329408645629883, "losses/total": 0.030564261600375175, "ref_logps/chosen": -26.526731491088867, "ref_logps/rejected": -36.621063232421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.005202531814575, "rewards/margins": 3.367434024810791, "rewards/rejected": -5.372636795043945, "step": 2849 }, { "epoch": 2.69, "grad_norm": 9.758076582303332, "learning_rate": 1.3993142028429133e-08, "logps/chosen": -47.202369689941406, "logps/rejected": -75.2608871459961, "loss": 0.1257, "losses/dpo": 0.007947532460093498, "losses/sft": 1.041975975036621, "losses/total": 0.007947532460093498, "ref_logps/chosen": -24.031814575195312, "ref_logps/rejected": -23.742019653320312, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3170557022094727, "rewards/margins": 2.8348307609558105, "rewards/rejected": -5.151885986328125, "step": 2850 }, { "epoch": 2.69, "grad_norm": 13.444347283444094, "learning_rate": 1.3909257717822114e-08, "logps/chosen": -46.90399932861328, "logps/rejected": -88.60507202148438, "loss": 0.1446, "losses/dpo": 0.011996475979685783, "losses/sft": 0.33353766798973083, "losses/total": 0.011996475979685783, "ref_logps/chosen": -24.708955764770508, "ref_logps/rejected": -33.70659637451172, "rewards/accuracies": 1.0, "rewards/chosen": -2.2195043563842773, "rewards/margins": 3.2703428268432617, "rewards/rejected": -5.489847183227539, "step": 2851 }, { "epoch": 2.69, "grad_norm": 14.364693700382446, "learning_rate": 1.382561839810023e-08, "logps/chosen": -53.76738357543945, "logps/rejected": -89.23169708251953, "loss": 0.1387, "losses/dpo": 0.07196628302335739, "losses/sft": 1.5133062601089478, "losses/total": 0.07196628302335739, "ref_logps/chosen": -30.036775588989258, "ref_logps/rejected": -34.19063949584961, "rewards/accuracies": 1.0, "rewards/chosen": -2.373060703277588, "rewards/margins": 3.131044864654541, "rewards/rejected": -5.504105567932129, "step": 2852 }, { "epoch": 2.69, "grad_norm": 14.891986440108543, "learning_rate": 1.3742224156055749e-08, "logps/chosen": -51.13638687133789, "logps/rejected": -83.52445983886719, "loss": 0.181, "losses/dpo": 0.698430597782135, "losses/sft": 0.6015021204948425, "losses/total": 0.698430597782135, "ref_logps/chosen": -29.86198616027832, "ref_logps/rejected": -30.299407958984375, "rewards/accuracies": 0.875, "rewards/chosen": -2.1274399757385254, "rewards/margins": 3.195065498352051, "rewards/rejected": -5.322505950927734, "step": 2853 }, { "epoch": 2.69, "grad_norm": 9.15025616142102, "learning_rate": 1.3659075078226751e-08, "logps/chosen": -60.25452423095703, "logps/rejected": -103.57296752929688, "loss": 0.1002, "losses/dpo": 0.03795241937041283, "losses/sft": 0.7473779916763306, "losses/total": 0.03795241937041283, "ref_logps/chosen": -30.138338088989258, "ref_logps/rejected": -40.35694122314453, "rewards/accuracies": 1.0, "rewards/chosen": -3.0116186141967773, "rewards/margins": 3.309983730316162, "rewards/rejected": -6.321602821350098, "step": 2854 }, { "epoch": 2.69, "grad_norm": 12.362528490594975, "learning_rate": 1.3576171250896883e-08, "logps/chosen": -40.78608703613281, "logps/rejected": -72.38230895996094, "loss": 0.1329, "losses/dpo": 0.02457747608423233, "losses/sft": 1.0352545976638794, "losses/total": 0.02457747608423233, "ref_logps/chosen": -22.63672637939453, "ref_logps/rejected": -27.754356384277344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8149361610412598, "rewards/margins": 2.6478590965270996, "rewards/rejected": -4.462795257568359, "step": 2855 }, { "epoch": 2.69, "grad_norm": 15.514631744081921, "learning_rate": 1.3493512760095221e-08, "logps/chosen": -54.852508544921875, "logps/rejected": -92.40373229980469, "loss": 0.1344, "losses/dpo": 0.04313939809799194, "losses/sft": 2.992354154586792, "losses/total": 0.04313939809799194, "ref_logps/chosen": -25.774749755859375, "ref_logps/rejected": -32.06303787231445, "rewards/accuracies": 0.9375, "rewards/chosen": -2.90777587890625, "rewards/margins": 3.1262941360473633, "rewards/rejected": -6.034070014953613, "step": 2856 }, { "epoch": 2.7, "grad_norm": 8.812472756314714, "learning_rate": 1.34110996915964e-08, "logps/chosen": -40.747764587402344, "logps/rejected": -88.8678207397461, "loss": 0.0852, "losses/dpo": 0.047950901091098785, "losses/sft": 2.114847183227539, "losses/total": 0.047950901091098785, "ref_logps/chosen": -21.709733963012695, "ref_logps/rejected": -30.09357452392578, "rewards/accuracies": 1.0, "rewards/chosen": -1.9038031101226807, "rewards/margins": 3.9736218452453613, "rewards/rejected": -5.877424716949463, "step": 2857 }, { "epoch": 2.7, "grad_norm": 19.080697528603903, "learning_rate": 1.3328932130920206e-08, "logps/chosen": -66.84607696533203, "logps/rejected": -93.009033203125, "loss": 0.1907, "losses/dpo": 0.007801214698702097, "losses/sft": 1.0152010917663574, "losses/total": 0.007801214698702097, "ref_logps/chosen": -38.21744155883789, "ref_logps/rejected": -36.15122985839844, "rewards/accuracies": 1.0, "rewards/chosen": -2.862863302230835, "rewards/margins": 2.822916269302368, "rewards/rejected": -5.685779571533203, "step": 2858 }, { "epoch": 2.7, "grad_norm": 5.955020634075913, "learning_rate": 1.3247010163331824e-08, "logps/chosen": -66.10136413574219, "logps/rejected": -104.71983337402344, "loss": 0.0454, "losses/dpo": 0.12738864123821259, "losses/sft": 1.09098219871521, "losses/total": 0.12738864123821259, "ref_logps/chosen": -37.5471076965332, "ref_logps/rejected": -40.31610870361328, "rewards/accuracies": 1.0, "rewards/chosen": -2.8554258346557617, "rewards/margins": 3.584946632385254, "rewards/rejected": -6.440372943878174, "step": 2859 }, { "epoch": 2.7, "grad_norm": 13.998475630258207, "learning_rate": 1.3165333873841445e-08, "logps/chosen": -51.90514373779297, "logps/rejected": -92.48672485351562, "loss": 0.0907, "losses/dpo": 0.01224607601761818, "losses/sft": 2.3578076362609863, "losses/total": 0.01224607601761818, "ref_logps/chosen": -28.78662109375, "ref_logps/rejected": -33.56285095214844, "rewards/accuracies": 1.0, "rewards/chosen": -2.311851978302002, "rewards/margins": 3.580535888671875, "rewards/rejected": -5.892387866973877, "step": 2860 }, { "epoch": 2.7, "grad_norm": 6.222380450628904, "learning_rate": 1.3083903347204467e-08, "logps/chosen": -34.49250793457031, "logps/rejected": -84.96420288085938, "loss": 0.1083, "losses/dpo": 0.030994519591331482, "losses/sft": 1.479899287223816, "losses/total": 0.030994519591331482, "ref_logps/chosen": -17.559375762939453, "ref_logps/rejected": -28.8740177154541, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6933131217956543, "rewards/margins": 3.9157049655914307, "rewards/rejected": -5.609018325805664, "step": 2861 }, { "epoch": 2.7, "grad_norm": 30.749445123032917, "learning_rate": 1.3002718667921076e-08, "logps/chosen": -61.41925048828125, "logps/rejected": -87.84774017333984, "loss": 0.2581, "losses/dpo": 0.01558623369783163, "losses/sft": 0.8223040103912354, "losses/total": 0.01558623369783163, "ref_logps/chosen": -34.45378875732422, "ref_logps/rejected": -36.98064422607422, "rewards/accuracies": 0.875, "rewards/chosen": -2.6965463161468506, "rewards/margins": 2.3901638984680176, "rewards/rejected": -5.086710453033447, "step": 2862 }, { "epoch": 2.7, "grad_norm": 17.487611123337505, "learning_rate": 1.2921779920236548e-08, "logps/chosen": -47.56700134277344, "logps/rejected": -88.84326934814453, "loss": 0.2086, "losses/dpo": 0.05239977315068245, "losses/sft": 1.0720182657241821, "losses/total": 0.05239977315068245, "ref_logps/chosen": -23.804035186767578, "ref_logps/rejected": -35.020179748535156, "rewards/accuracies": 0.875, "rewards/chosen": -2.3762965202331543, "rewards/margins": 3.006012201309204, "rewards/rejected": -5.3823089599609375, "step": 2863 }, { "epoch": 2.7, "grad_norm": 19.595642800811184, "learning_rate": 1.2841087188140836e-08, "logps/chosen": -50.37160110473633, "logps/rejected": -74.31666564941406, "loss": 0.2327, "losses/dpo": 0.034029971808195114, "losses/sft": 2.1152029037475586, "losses/total": 0.034029971808195114, "ref_logps/chosen": -29.129241943359375, "ref_logps/rejected": -28.853580474853516, "rewards/accuracies": 0.875, "rewards/chosen": -2.1242361068725586, "rewards/margins": 2.422072649002075, "rewards/rejected": -4.546308517456055, "step": 2864 }, { "epoch": 2.7, "grad_norm": 14.089429247514733, "learning_rate": 1.2760640555368574e-08, "logps/chosen": -44.63612365722656, "logps/rejected": -81.33868408203125, "loss": 0.1265, "losses/dpo": 0.15311971306800842, "losses/sft": 2.0811686515808105, "losses/total": 0.15311971306800842, "ref_logps/chosen": -25.019840240478516, "ref_logps/rejected": -31.388944625854492, "rewards/accuracies": 1.0, "rewards/chosen": -1.9616284370422363, "rewards/margins": 3.033345937728882, "rewards/rejected": -4.994974136352539, "step": 2865 }, { "epoch": 2.7, "grad_norm": 10.725904645086285, "learning_rate": 1.268044010539912e-08, "logps/chosen": -54.64808654785156, "logps/rejected": -89.21377563476562, "loss": 0.1012, "losses/dpo": 0.015550479292869568, "losses/sft": 2.7954580783843994, "losses/total": 0.015550479292869568, "ref_logps/chosen": -30.29691505432129, "ref_logps/rejected": -34.75553894042969, "rewards/accuracies": 1.0, "rewards/chosen": -2.435116767883301, "rewards/margins": 3.010707378387451, "rewards/rejected": -5.445824146270752, "step": 2866 }, { "epoch": 2.7, "grad_norm": 6.310267669688943, "learning_rate": 1.2600485921456294e-08, "logps/chosen": -54.28863525390625, "logps/rejected": -103.05197143554688, "loss": 0.0471, "losses/dpo": 0.08783111721277237, "losses/sft": 1.1045044660568237, "losses/total": 0.08783111721277237, "ref_logps/chosen": -29.714553833007812, "ref_logps/rejected": -40.20824432373047, "rewards/accuracies": 1.0, "rewards/chosen": -2.4574086666107178, "rewards/margins": 3.826964855194092, "rewards/rejected": -6.284373760223389, "step": 2867 }, { "epoch": 2.71, "grad_norm": 8.165282144991531, "learning_rate": 1.2520778086508448e-08, "logps/chosen": -60.2156867980957, "logps/rejected": -99.82496643066406, "loss": 0.071, "losses/dpo": 0.032458674162626266, "losses/sft": 1.8305315971374512, "losses/total": 0.032458674162626266, "ref_logps/chosen": -32.650794982910156, "ref_logps/rejected": -39.19825744628906, "rewards/accuracies": 1.0, "rewards/chosen": -2.756488800048828, "rewards/margins": 3.306182384490967, "rewards/rejected": -6.062670707702637, "step": 2868 }, { "epoch": 2.71, "grad_norm": 25.138996324769412, "learning_rate": 1.2441316683268143e-08, "logps/chosen": -58.6508903503418, "logps/rejected": -80.96015930175781, "loss": 0.2462, "losses/dpo": 0.09810163080692291, "losses/sft": 0.5373082756996155, "losses/total": 0.09810163080692291, "ref_logps/chosen": -27.802692413330078, "ref_logps/rejected": -28.00699234008789, "rewards/accuracies": 0.875, "rewards/chosen": -3.084820032119751, "rewards/margins": 2.210496425628662, "rewards/rejected": -5.295316696166992, "step": 2869 }, { "epoch": 2.71, "grad_norm": 12.942696906303816, "learning_rate": 1.2362101794192393e-08, "logps/chosen": -56.80162811279297, "logps/rejected": -96.78016662597656, "loss": 0.0974, "losses/dpo": 0.0860036313533783, "losses/sft": 2.716254949569702, "losses/total": 0.0860036313533783, "ref_logps/chosen": -30.722856521606445, "ref_logps/rejected": -36.60301971435547, "rewards/accuracies": 1.0, "rewards/chosen": -2.607877254486084, "rewards/margins": 3.4098377227783203, "rewards/rejected": -6.017714977264404, "step": 2870 }, { "epoch": 2.71, "grad_norm": 15.794393570797459, "learning_rate": 1.2283133501482274e-08, "logps/chosen": -59.4975471496582, "logps/rejected": -80.92585754394531, "loss": 0.2024, "losses/dpo": 0.18022863566875458, "losses/sft": 2.082608699798584, "losses/total": 0.18022863566875458, "ref_logps/chosen": -34.176944732666016, "ref_logps/rejected": -28.817739486694336, "rewards/accuracies": 0.875, "rewards/chosen": -2.532060384750366, "rewards/margins": 2.678750991821289, "rewards/rejected": -5.210811614990234, "step": 2871 }, { "epoch": 2.71, "grad_norm": 14.231429141119225, "learning_rate": 1.2204411887083072e-08, "logps/chosen": -52.005741119384766, "logps/rejected": -99.00135803222656, "loss": 0.1336, "losses/dpo": 0.0015699425712227821, "losses/sft": 1.3706027269363403, "losses/total": 0.0015699425712227821, "ref_logps/chosen": -30.160619735717773, "ref_logps/rejected": -39.745758056640625, "rewards/accuracies": 1.0, "rewards/chosen": -2.184512138366699, "rewards/margins": 3.7410478591918945, "rewards/rejected": -5.925559997558594, "step": 2872 }, { "epoch": 2.71, "grad_norm": 11.303289295455427, "learning_rate": 1.2125937032684052e-08, "logps/chosen": -48.072105407714844, "logps/rejected": -96.21719360351562, "loss": 0.1006, "losses/dpo": 0.02882535010576248, "losses/sft": 1.9826335906982422, "losses/total": 0.02882535010576248, "ref_logps/chosen": -26.34575653076172, "ref_logps/rejected": -38.613555908203125, "rewards/accuracies": 1.0, "rewards/chosen": -2.172635555267334, "rewards/margins": 3.587728500366211, "rewards/rejected": -5.760364532470703, "step": 2873 }, { "epoch": 2.71, "grad_norm": 13.605255779616176, "learning_rate": 1.2047709019718328e-08, "logps/chosen": -68.11811065673828, "logps/rejected": -90.87627410888672, "loss": 0.115, "losses/dpo": 0.06891480088233948, "losses/sft": 2.9875571727752686, "losses/total": 0.06891480088233948, "ref_logps/chosen": -36.011348724365234, "ref_logps/rejected": -30.60083770751953, "rewards/accuracies": 1.0, "rewards/chosen": -3.210676670074463, "rewards/margins": 2.8168673515319824, "rewards/rejected": -6.027543544769287, "step": 2874 }, { "epoch": 2.71, "grad_norm": 7.351921120910635, "learning_rate": 1.1969727929363072e-08, "logps/chosen": -56.233673095703125, "logps/rejected": -103.75616455078125, "loss": 0.0504, "losses/dpo": 0.008681925013661385, "losses/sft": 1.8979880809783936, "losses/total": 0.008681925013661385, "ref_logps/chosen": -36.21833038330078, "ref_logps/rejected": -39.05774688720703, "rewards/accuracies": 1.0, "rewards/chosen": -2.0015344619750977, "rewards/margins": 4.4683074951171875, "rewards/rejected": -6.469841957092285, "step": 2875 }, { "epoch": 2.71, "grad_norm": 9.293849615984639, "learning_rate": 1.1891993842538972e-08, "logps/chosen": -57.87008285522461, "logps/rejected": -106.35627746582031, "loss": 0.0687, "losses/dpo": 0.23052653670310974, "losses/sft": 1.777092695236206, "losses/total": 0.23052653670310974, "ref_logps/chosen": -30.646217346191406, "ref_logps/rejected": -45.10546875, "rewards/accuracies": 1.0, "rewards/chosen": -2.722386360168457, "rewards/margins": 3.4026951789855957, "rewards/rejected": -6.125081539154053, "step": 2876 }, { "epoch": 2.71, "grad_norm": 12.75858781491196, "learning_rate": 1.181450683991067e-08, "logps/chosen": -44.374385833740234, "logps/rejected": -74.1337890625, "loss": 0.1244, "losses/dpo": 0.289078027009964, "losses/sft": 1.5943197011947632, "losses/total": 0.289078027009964, "ref_logps/chosen": -24.225189208984375, "ref_logps/rejected": -26.332054138183594, "rewards/accuracies": 1.0, "rewards/chosen": -2.0149197578430176, "rewards/margins": 2.7652535438537598, "rewards/rejected": -4.780173301696777, "step": 2877 }, { "epoch": 2.72, "grad_norm": 10.847010957451538, "learning_rate": 1.1737267001886152e-08, "logps/chosen": -54.573875427246094, "logps/rejected": -92.37704467773438, "loss": 0.0919, "losses/dpo": 0.007006396073848009, "losses/sft": 1.3759276866912842, "losses/total": 0.007006396073848009, "ref_logps/chosen": -33.609046936035156, "ref_logps/rejected": -37.64482116699219, "rewards/accuracies": 1.0, "rewards/chosen": -2.096482276916504, "rewards/margins": 3.376739501953125, "rewards/rejected": -5.473221778869629, "step": 2878 }, { "epoch": 2.72, "grad_norm": 14.702179088462145, "learning_rate": 1.1660274408617137e-08, "logps/chosen": -51.296234130859375, "logps/rejected": -101.96958923339844, "loss": 0.1405, "losses/dpo": 0.05252404138445854, "losses/sft": 2.301892042160034, "losses/total": 0.05252404138445854, "ref_logps/chosen": -25.611156463623047, "ref_logps/rejected": -42.70488739013672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5685079097747803, "rewards/margins": 3.3579633235931396, "rewards/rejected": -5.92647123336792, "step": 2879 }, { "epoch": 2.72, "grad_norm": 12.036695944109981, "learning_rate": 1.1583529139998577e-08, "logps/chosen": -34.942501068115234, "logps/rejected": -74.81184387207031, "loss": 0.1472, "losses/dpo": 0.0009010893991217017, "losses/sft": 0.43277060985565186, "losses/total": 0.0009010893991217017, "ref_logps/chosen": -18.460657119750977, "ref_logps/rejected": -29.29147720336914, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6481845378875732, "rewards/margins": 2.9038524627685547, "rewards/rejected": -4.552036762237549, "step": 2880 }, { "epoch": 2.72, "grad_norm": 9.358996627719153, "learning_rate": 1.150703127566896e-08, "logps/chosen": -41.55085754394531, "logps/rejected": -90.53712463378906, "loss": 0.0961, "losses/dpo": 0.02346923016011715, "losses/sft": 1.8408198356628418, "losses/total": 0.02346923016011715, "ref_logps/chosen": -22.015321731567383, "ref_logps/rejected": -37.78960418701172, "rewards/accuracies": 1.0, "rewards/chosen": -1.9535534381866455, "rewards/margins": 3.321197986602783, "rewards/rejected": -5.27475118637085, "step": 2881 }, { "epoch": 2.72, "grad_norm": 15.580731875002158, "learning_rate": 1.1430780895009985e-08, "logps/chosen": -52.761268615722656, "logps/rejected": -84.56764221191406, "loss": 0.1477, "losses/dpo": 0.001280146068893373, "losses/sft": 2.231870174407959, "losses/total": 0.001280146068893373, "ref_logps/chosen": -29.626169204711914, "ref_logps/rejected": -28.62251853942871, "rewards/accuracies": 1.0, "rewards/chosen": -2.313509941101074, "rewards/margins": 3.2810025215148926, "rewards/rejected": -5.594512462615967, "step": 2882 }, { "epoch": 2.72, "grad_norm": 10.625567587125225, "learning_rate": 1.1354778077146437e-08, "logps/chosen": -64.63533782958984, "logps/rejected": -103.08334350585938, "loss": 0.0729, "losses/dpo": 0.022934021428227425, "losses/sft": 1.1269910335540771, "losses/total": 0.022934021428227425, "ref_logps/chosen": -36.854530334472656, "ref_logps/rejected": -37.418731689453125, "rewards/accuracies": 1.0, "rewards/chosen": -2.778080940246582, "rewards/margins": 3.7883799076080322, "rewards/rejected": -6.566461086273193, "step": 2883 }, { "epoch": 2.72, "grad_norm": 14.43124342495325, "learning_rate": 1.1279022900946373e-08, "logps/chosen": -64.0099868774414, "logps/rejected": -102.17241668701172, "loss": 0.123, "losses/dpo": 0.05720913037657738, "losses/sft": 2.0619730949401855, "losses/total": 0.05720913037657738, "ref_logps/chosen": -34.70280075073242, "ref_logps/rejected": -38.470916748046875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9307186603546143, "rewards/margins": 3.4394309520721436, "rewards/rejected": -6.370149612426758, "step": 2884 }, { "epoch": 2.72, "grad_norm": 18.879724930784565, "learning_rate": 1.1203515445020711e-08, "logps/chosen": -50.861995697021484, "logps/rejected": -78.02462768554688, "loss": 0.1774, "losses/dpo": 0.16806751489639282, "losses/sft": 1.793616533279419, "losses/total": 0.16806751489639282, "ref_logps/chosen": -31.456146240234375, "ref_logps/rejected": -27.50904655456543, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9405851364135742, "rewards/margins": 3.1109731197357178, "rewards/rejected": -5.051558494567871, "step": 2885 }, { "epoch": 2.72, "grad_norm": 6.866399015028786, "learning_rate": 1.1128255787723418e-08, "logps/chosen": -46.47187042236328, "logps/rejected": -99.26995849609375, "loss": 0.0647, "losses/dpo": 0.021562570706009865, "losses/sft": 2.5056400299072266, "losses/total": 0.021562570706009865, "ref_logps/chosen": -27.57830047607422, "ref_logps/rejected": -41.34366989135742, "rewards/accuracies": 1.0, "rewards/chosen": -1.8893569707870483, "rewards/margins": 3.9032726287841797, "rewards/rejected": -5.792630195617676, "step": 2886 }, { "epoch": 2.72, "grad_norm": 15.377181077829976, "learning_rate": 1.1053244007151297e-08, "logps/chosen": -56.475677490234375, "logps/rejected": -91.62783813476562, "loss": 0.1626, "losses/dpo": 0.06300060451030731, "losses/sft": 2.1857070922851562, "losses/total": 0.06300060451030731, "ref_logps/chosen": -34.05097961425781, "ref_logps/rejected": -40.75542449951172, "rewards/accuracies": 1.0, "rewards/chosen": -2.2424697875976562, "rewards/margins": 2.844771385192871, "rewards/rejected": -5.087241172790527, "step": 2887 }, { "epoch": 2.72, "grad_norm": 18.688723844955746, "learning_rate": 1.0978480181143862e-08, "logps/chosen": -57.92671203613281, "logps/rejected": -88.22919464111328, "loss": 0.1804, "losses/dpo": 0.009113596752285957, "losses/sft": 0.7643386125564575, "losses/total": 0.009113596752285957, "ref_logps/chosen": -30.808799743652344, "ref_logps/rejected": -38.83740234375, "rewards/accuracies": 1.0, "rewards/chosen": -2.711791515350342, "rewards/margins": 2.2273876667022705, "rewards/rejected": -4.939178943634033, "step": 2888 }, { "epoch": 2.73, "grad_norm": 10.356390766710392, "learning_rate": 1.0903964387283499e-08, "logps/chosen": -58.413352966308594, "logps/rejected": -93.03396606445312, "loss": 0.1116, "losses/dpo": 0.5979110598564148, "losses/sft": 1.506881833076477, "losses/total": 0.5979110598564148, "ref_logps/chosen": -30.16204261779785, "ref_logps/rejected": -35.09925842285156, "rewards/accuracies": 1.0, "rewards/chosen": -2.8251309394836426, "rewards/margins": 2.968338966369629, "rewards/rejected": -5.79347038269043, "step": 2889 }, { "epoch": 2.73, "grad_norm": 14.302399841288251, "learning_rate": 1.082969670289499e-08, "logps/chosen": -45.588111877441406, "logps/rejected": -79.18229675292969, "loss": 0.1718, "losses/dpo": 0.041024476289749146, "losses/sft": 1.3998796939849854, "losses/total": 0.041024476289749146, "ref_logps/chosen": -27.502544403076172, "ref_logps/rejected": -31.283660888671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8085570335388184, "rewards/margins": 2.981306791305542, "rewards/rejected": -4.789863586425781, "step": 2890 }, { "epoch": 2.73, "grad_norm": 10.526499063378475, "learning_rate": 1.0755677205045832e-08, "logps/chosen": -47.65642166137695, "logps/rejected": -97.4150390625, "loss": 0.1185, "losses/dpo": 0.004025958478450775, "losses/sft": 2.1952404975891113, "losses/total": 0.004025958478450775, "ref_logps/chosen": -24.41338348388672, "ref_logps/rejected": -38.4092903137207, "rewards/accuracies": 1.0, "rewards/chosen": -2.3243041038513184, "rewards/margins": 3.5762712955474854, "rewards/rejected": -5.900575637817383, "step": 2891 }, { "epoch": 2.73, "grad_norm": 13.040625090684927, "learning_rate": 1.068190597054583e-08, "logps/chosen": -59.755558013916016, "logps/rejected": -101.86265563964844, "loss": 0.1159, "losses/dpo": 0.051963839679956436, "losses/sft": 1.8657768964767456, "losses/total": 0.051963839679956436, "ref_logps/chosen": -29.40241241455078, "ref_logps/rejected": -37.19486999511719, "rewards/accuracies": 1.0, "rewards/chosen": -3.0353143215179443, "rewards/margins": 3.4314639568328857, "rewards/rejected": -6.46677827835083, "step": 2892 }, { "epoch": 2.73, "grad_norm": 8.026726040633282, "learning_rate": 1.0608383075947357e-08, "logps/chosen": -45.57089614868164, "logps/rejected": -104.32669830322266, "loss": 0.0648, "losses/dpo": 0.0013630345929414034, "losses/sft": 1.3255599737167358, "losses/total": 0.0013630345929414034, "ref_logps/chosen": -23.55187225341797, "ref_logps/rejected": -42.23603820800781, "rewards/accuracies": 1.0, "rewards/chosen": -2.2019026279449463, "rewards/margins": 4.007163047790527, "rewards/rejected": -6.209065914154053, "step": 2893 }, { "epoch": 2.73, "grad_norm": 18.96885812486849, "learning_rate": 1.0535108597544833e-08, "logps/chosen": -46.446842193603516, "logps/rejected": -75.80978393554688, "loss": 0.2557, "losses/dpo": 0.05272752046585083, "losses/sft": 1.8571085929870605, "losses/total": 0.05272752046585083, "ref_logps/chosen": -24.370628356933594, "ref_logps/rejected": -28.405044555664062, "rewards/accuracies": 0.875, "rewards/chosen": -2.2076213359832764, "rewards/margins": 2.532853126525879, "rewards/rejected": -4.740474700927734, "step": 2894 }, { "epoch": 2.73, "grad_norm": 11.565539438213245, "learning_rate": 1.0462082611375156e-08, "logps/chosen": -56.06581115722656, "logps/rejected": -84.13351440429688, "loss": 0.0985, "losses/dpo": 0.02222086675465107, "losses/sft": 1.6693438291549683, "losses/total": 0.02222086675465107, "ref_logps/chosen": -32.645477294921875, "ref_logps/rejected": -29.538236618041992, "rewards/accuracies": 1.0, "rewards/chosen": -2.342033624649048, "rewards/margins": 3.1174936294555664, "rewards/rejected": -5.459527015686035, "step": 2895 }, { "epoch": 2.73, "grad_norm": 21.04389686517066, "learning_rate": 1.038930519321718e-08, "logps/chosen": -56.55517578125, "logps/rejected": -79.29573059082031, "loss": 0.2363, "losses/dpo": 0.41420307755470276, "losses/sft": 0.6713619232177734, "losses/total": 0.41420307755470276, "ref_logps/chosen": -31.573429107666016, "ref_logps/rejected": -28.848764419555664, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4981746673583984, "rewards/margins": 2.5465221405029297, "rewards/rejected": -5.044696807861328, "step": 2896 }, { "epoch": 2.73, "grad_norm": 19.520999340685403, "learning_rate": 1.0316776418591882e-08, "logps/chosen": -46.5657844543457, "logps/rejected": -78.52531433105469, "loss": 0.2175, "losses/dpo": 0.3076520562171936, "losses/sft": 1.8507276773452759, "losses/total": 0.3076520562171936, "ref_logps/chosen": -24.22399139404297, "ref_logps/rejected": -30.81273651123047, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2341794967651367, "rewards/margins": 2.5370779037475586, "rewards/rejected": -4.771257400512695, "step": 2897 }, { "epoch": 2.73, "grad_norm": 5.716362940381063, "learning_rate": 1.0244496362762284e-08, "logps/chosen": -55.62094497680664, "logps/rejected": -102.42710876464844, "loss": 0.0497, "losses/dpo": 0.02983049303293228, "losses/sft": 2.110854387283325, "losses/total": 0.02983049303293228, "ref_logps/chosen": -27.871471405029297, "ref_logps/rejected": -36.261634826660156, "rewards/accuracies": 1.0, "rewards/chosen": -2.774947166442871, "rewards/margins": 3.841599941253662, "rewards/rejected": -6.616546630859375, "step": 2898 }, { "epoch": 2.73, "grad_norm": 8.102034832531121, "learning_rate": 1.0172465100733163e-08, "logps/chosen": -53.43672180175781, "logps/rejected": -84.566162109375, "loss": 0.0831, "losses/dpo": 0.004674369934946299, "losses/sft": 1.527561068534851, "losses/total": 0.004674369934946299, "ref_logps/chosen": -29.775468826293945, "ref_logps/rejected": -31.456317901611328, "rewards/accuracies": 1.0, "rewards/chosen": -2.3661253452301025, "rewards/margins": 2.944859027862549, "rewards/rejected": -5.3109846115112305, "step": 2899 }, { "epoch": 2.74, "grad_norm": 9.05045475378091, "learning_rate": 1.0100682707251312e-08, "logps/chosen": -66.43649291992188, "logps/rejected": -109.62818908691406, "loss": 0.0749, "losses/dpo": 0.03526006266474724, "losses/sft": 2.289003372192383, "losses/total": 0.03526006266474724, "ref_logps/chosen": -37.617706298828125, "ref_logps/rejected": -46.80097198486328, "rewards/accuracies": 1.0, "rewards/chosen": -2.881878137588501, "rewards/margins": 3.400844097137451, "rewards/rejected": -6.282722473144531, "step": 2900 }, { "epoch": 2.74, "grad_norm": 9.25899365868247, "learning_rate": 1.0029149256805091e-08, "logps/chosen": -49.99055480957031, "logps/rejected": -93.07919311523438, "loss": 0.0713, "losses/dpo": 0.03136264532804489, "losses/sft": 1.0720698833465576, "losses/total": 0.03136264532804489, "ref_logps/chosen": -30.42355728149414, "ref_logps/rejected": -39.249916076660156, "rewards/accuracies": 1.0, "rewards/chosen": -1.9566996097564697, "rewards/margins": 3.4262285232543945, "rewards/rejected": -5.382927894592285, "step": 2901 }, { "epoch": 2.74, "grad_norm": 6.151584416381057, "learning_rate": 9.957864823624679e-09, "logps/chosen": -43.4630126953125, "logps/rejected": -100.0662841796875, "loss": 0.0476, "losses/dpo": 0.004968002904206514, "losses/sft": 1.9551312923431396, "losses/total": 0.004968002904206514, "ref_logps/chosen": -21.557968139648438, "ref_logps/rejected": -38.26072692871094, "rewards/accuracies": 1.0, "rewards/chosen": -2.190504312515259, "rewards/margins": 3.99005126953125, "rewards/rejected": -6.180555820465088, "step": 2902 }, { "epoch": 2.74, "grad_norm": 6.531391721615535, "learning_rate": 9.886829481681736e-09, "logps/chosen": -53.357093811035156, "logps/rejected": -100.23283386230469, "loss": 0.0691, "losses/dpo": 0.0007046578684821725, "losses/sft": 2.4460089206695557, "losses/total": 0.0007046578684821725, "ref_logps/chosen": -28.327463150024414, "ref_logps/rejected": -39.98008728027344, "rewards/accuracies": 1.0, "rewards/chosen": -2.502963066101074, "rewards/margins": 3.522311210632324, "rewards/rejected": -6.025274276733398, "step": 2903 }, { "epoch": 2.74, "grad_norm": 13.74971878922192, "learning_rate": 9.816043304689519e-09, "logps/chosen": -48.20646667480469, "logps/rejected": -82.63427734375, "loss": 0.1443, "losses/dpo": 0.18098990619182587, "losses/sft": 1.734058141708374, "losses/total": 0.18098990619182587, "ref_logps/chosen": -28.15145492553711, "ref_logps/rejected": -32.64268493652344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0055010318756104, "rewards/margins": 2.9936587810516357, "rewards/rejected": -4.999159812927246, "step": 2904 }, { "epoch": 2.74, "grad_norm": 14.662973679232055, "learning_rate": 9.745506366102717e-09, "logps/chosen": -42.497371673583984, "logps/rejected": -73.82939910888672, "loss": 0.1452, "losses/dpo": 0.05336759239435196, "losses/sft": 2.044562578201294, "losses/total": 0.05336759239435196, "ref_logps/chosen": -23.924880981445312, "ref_logps/rejected": -27.72870635986328, "rewards/accuracies": 0.9375, "rewards/chosen": -1.85724937915802, "rewards/margins": 2.7528200149536133, "rewards/rejected": -4.610069751739502, "step": 2905 }, { "epoch": 2.74, "grad_norm": 8.468064164400674, "learning_rate": 9.675218739117307e-09, "logps/chosen": -49.61390686035156, "logps/rejected": -79.93238830566406, "loss": 0.1099, "losses/dpo": 0.1672113686800003, "losses/sft": 1.2803272008895874, "losses/total": 0.1672113686800003, "ref_logps/chosen": -29.191364288330078, "ref_logps/rejected": -32.0498046875, "rewards/accuracies": 1.0, "rewards/chosen": -2.0422539710998535, "rewards/margins": 2.746004581451416, "rewards/rejected": -4.7882585525512695, "step": 2906 }, { "epoch": 2.74, "grad_norm": 10.764424759378976, "learning_rate": 9.605180496670722e-09, "logps/chosen": -57.347930908203125, "logps/rejected": -98.46735382080078, "loss": 0.079, "losses/dpo": 0.016987601295113564, "losses/sft": 1.7887076139450073, "losses/total": 0.016987601295113564, "ref_logps/chosen": -30.917909622192383, "ref_logps/rejected": -40.855430603027344, "rewards/accuracies": 1.0, "rewards/chosen": -2.643002510070801, "rewards/margins": 3.118190050125122, "rewards/rejected": -5.761192321777344, "step": 2907 }, { "epoch": 2.74, "grad_norm": 9.987682950810576, "learning_rate": 9.53539171144141e-09, "logps/chosen": -60.2633056640625, "logps/rejected": -99.16165161132812, "loss": 0.0887, "losses/dpo": 0.019694006070494652, "losses/sft": 1.0204119682312012, "losses/total": 0.019694006070494652, "ref_logps/chosen": -33.33181381225586, "ref_logps/rejected": -36.764617919921875, "rewards/accuracies": 1.0, "rewards/chosen": -2.6931490898132324, "rewards/margins": 3.5465545654296875, "rewards/rejected": -6.239704132080078, "step": 2908 }, { "epoch": 2.74, "grad_norm": 11.784255215062247, "learning_rate": 9.465852455849105e-09, "logps/chosen": -51.40300369262695, "logps/rejected": -76.19452667236328, "loss": 0.1213, "losses/dpo": 0.02463644929230213, "losses/sft": 1.8898158073425293, "losses/total": 0.02463644929230213, "ref_logps/chosen": -28.281309127807617, "ref_logps/rejected": -22.81805419921875, "rewards/accuracies": 1.0, "rewards/chosen": -2.3121695518493652, "rewards/margins": 3.025477170944214, "rewards/rejected": -5.337646961212158, "step": 2909 }, { "epoch": 2.75, "grad_norm": 10.088499045759342, "learning_rate": 9.396562802054503e-09, "logps/chosen": -47.12004089355469, "logps/rejected": -86.42637634277344, "loss": 0.1308, "losses/dpo": 0.04728517681360245, "losses/sft": 2.5326039791107178, "losses/total": 0.04728517681360245, "ref_logps/chosen": -27.079078674316406, "ref_logps/rejected": -34.53654479980469, "rewards/accuracies": 0.9375, "rewards/chosen": -2.004096031188965, "rewards/margins": 3.184887409210205, "rewards/rejected": -5.188983917236328, "step": 2910 }, { "epoch": 2.75, "grad_norm": 7.374460471520232, "learning_rate": 9.327522821959393e-09, "logps/chosen": -46.771392822265625, "logps/rejected": -94.4814453125, "loss": 0.0748, "losses/dpo": 0.11859419196844101, "losses/sft": 1.2350984811782837, "losses/total": 0.11859419196844101, "ref_logps/chosen": -23.46113395690918, "ref_logps/rejected": -34.23462677001953, "rewards/accuracies": 1.0, "rewards/chosen": -2.3310256004333496, "rewards/margins": 3.6936566829681396, "rewards/rejected": -6.02468204498291, "step": 2911 }, { "epoch": 2.75, "grad_norm": 12.647819492641073, "learning_rate": 9.258732587206358e-09, "logps/chosen": -52.67634582519531, "logps/rejected": -101.70606994628906, "loss": 0.1489, "losses/dpo": 0.536206841468811, "losses/sft": 1.053963541984558, "losses/total": 0.536206841468811, "ref_logps/chosen": -29.530094146728516, "ref_logps/rejected": -43.199100494384766, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3146252632141113, "rewards/margins": 3.536071538925171, "rewards/rejected": -5.850696563720703, "step": 2912 }, { "epoch": 2.75, "grad_norm": 18.494185353465067, "learning_rate": 9.190192169178962e-09, "logps/chosen": -51.95834732055664, "logps/rejected": -79.19059753417969, "loss": 0.1537, "losses/dpo": 0.12562914192676544, "losses/sft": 2.174992561340332, "losses/total": 0.12562914192676544, "ref_logps/chosen": -28.732681274414062, "ref_logps/rejected": -26.68280792236328, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3225667476654053, "rewards/margins": 2.928212881088257, "rewards/rejected": -5.250779151916504, "step": 2913 }, { "epoch": 2.75, "grad_norm": 16.483341682736647, "learning_rate": 9.121901639001394e-09, "logps/chosen": -44.221343994140625, "logps/rejected": -82.14362335205078, "loss": 0.1623, "losses/dpo": 0.004985205363482237, "losses/sft": 0.49232903122901917, "losses/total": 0.004985205363482237, "ref_logps/chosen": -22.943344116210938, "ref_logps/rejected": -29.85675048828125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1277999877929688, "rewards/margins": 3.1008870601654053, "rewards/rejected": -5.228686809539795, "step": 2914 }, { "epoch": 2.75, "grad_norm": 10.117379128587041, "learning_rate": 9.05386106753861e-09, "logps/chosen": -54.76373291015625, "logps/rejected": -106.77265167236328, "loss": 0.074, "losses/dpo": 0.0076336790807545185, "losses/sft": 2.3188905715942383, "losses/total": 0.0076336790807545185, "ref_logps/chosen": -29.114028930664062, "ref_logps/rejected": -38.21709060668945, "rewards/accuracies": 1.0, "rewards/chosen": -2.5649704933166504, "rewards/margins": 4.290585517883301, "rewards/rejected": -6.855556488037109, "step": 2915 }, { "epoch": 2.75, "grad_norm": 11.338446572178336, "learning_rate": 8.98607052539624e-09, "logps/chosen": -45.45170593261719, "logps/rejected": -86.27145385742188, "loss": 0.1197, "losses/dpo": 0.2767471969127655, "losses/sft": 0.8248845338821411, "losses/total": 0.2767471969127655, "ref_logps/chosen": -27.22047233581543, "ref_logps/rejected": -35.05369186401367, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8231236934661865, "rewards/margins": 3.298652172088623, "rewards/rejected": -5.1217756271362305, "step": 2916 }, { "epoch": 2.75, "grad_norm": 14.051877861659323, "learning_rate": 8.91853008292029e-09, "logps/chosen": -55.37767791748047, "logps/rejected": -108.55623626708984, "loss": 0.1491, "losses/dpo": 0.046734169125556946, "losses/sft": 1.1644030809402466, "losses/total": 0.046734169125556946, "ref_logps/chosen": -28.79140853881836, "ref_logps/rejected": -43.35768508911133, "rewards/accuracies": 0.875, "rewards/chosen": -2.6586270332336426, "rewards/margins": 3.8612279891967773, "rewards/rejected": -6.519855499267578, "step": 2917 }, { "epoch": 2.75, "grad_norm": 11.359271096671224, "learning_rate": 8.85123981019742e-09, "logps/chosen": -68.83587646484375, "logps/rejected": -113.67874908447266, "loss": 0.0825, "losses/dpo": 0.032433345913887024, "losses/sft": 1.576927661895752, "losses/total": 0.032433345913887024, "ref_logps/chosen": -40.442176818847656, "ref_logps/rejected": -45.08629608154297, "rewards/accuracies": 1.0, "rewards/chosen": -2.839369773864746, "rewards/margins": 4.0198750495910645, "rewards/rejected": -6.859245300292969, "step": 2918 }, { "epoch": 2.75, "grad_norm": 8.844630792862429, "learning_rate": 8.784199777054551e-09, "logps/chosen": -55.13768768310547, "logps/rejected": -108.72185516357422, "loss": 0.071, "losses/dpo": 0.00020503182895481586, "losses/sft": 0.9272692799568176, "losses/total": 0.00020503182895481586, "ref_logps/chosen": -28.47830581665039, "ref_logps/rejected": -41.99987030029297, "rewards/accuracies": 0.9375, "rewards/chosen": -2.665938377380371, "rewards/margins": 4.006259918212891, "rewards/rejected": -6.672198295593262, "step": 2919 }, { "epoch": 2.75, "grad_norm": 12.384521441449193, "learning_rate": 8.717410053059038e-09, "logps/chosen": -54.97055435180664, "logps/rejected": -83.525634765625, "loss": 0.1176, "losses/dpo": 0.39982008934020996, "losses/sft": 1.853311538696289, "losses/total": 0.39982008934020996, "ref_logps/chosen": -33.88381576538086, "ref_logps/rejected": -31.5878963470459, "rewards/accuracies": 1.0, "rewards/chosen": -2.108673572540283, "rewards/margins": 3.085099697113037, "rewards/rejected": -5.19377326965332, "step": 2920 }, { "epoch": 2.76, "grad_norm": 10.49558362542325, "learning_rate": 8.65087070751841e-09, "logps/chosen": -50.317352294921875, "logps/rejected": -85.3694076538086, "loss": 0.1492, "losses/dpo": 0.09884389489889145, "losses/sft": 0.7846667766571045, "losses/total": 0.09884389489889145, "ref_logps/chosen": -26.377464294433594, "ref_logps/rejected": -33.25958251953125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.393988847732544, "rewards/margins": 2.8169937133789062, "rewards/rejected": -5.210982799530029, "step": 2921 }, { "epoch": 2.76, "grad_norm": 8.63838537557285, "learning_rate": 8.584581809480407e-09, "logps/chosen": -50.27284240722656, "logps/rejected": -93.25723266601562, "loss": 0.0772, "losses/dpo": 0.0005339249037206173, "losses/sft": 2.5442371368408203, "losses/total": 0.0005339249037206173, "ref_logps/chosen": -30.25333023071289, "ref_logps/rejected": -35.29228591918945, "rewards/accuracies": 1.0, "rewards/chosen": -2.001950979232788, "rewards/margins": 3.794543504714966, "rewards/rejected": -5.796494483947754, "step": 2922 }, { "epoch": 2.76, "grad_norm": 8.245717586442453, "learning_rate": 8.518543427732949e-09, "logps/chosen": -52.54399490356445, "logps/rejected": -87.67829895019531, "loss": 0.0804, "losses/dpo": 0.15826034545898438, "losses/sft": 2.838545083999634, "losses/total": 0.15826034545898438, "ref_logps/chosen": -27.07329750061035, "ref_logps/rejected": -31.1475887298584, "rewards/accuracies": 1.0, "rewards/chosen": -2.547069549560547, "rewards/margins": 3.106001377105713, "rewards/rejected": -5.65307092666626, "step": 2923 }, { "epoch": 2.76, "grad_norm": 9.650119892990242, "learning_rate": 8.452755630803832e-09, "logps/chosen": -43.82130432128906, "logps/rejected": -98.08641815185547, "loss": 0.0785, "losses/dpo": 0.42738476395606995, "losses/sft": 2.9894180297851562, "losses/total": 0.42738476395606995, "ref_logps/chosen": -23.84458351135254, "ref_logps/rejected": -36.932373046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9976720809936523, "rewards/margins": 4.117733001708984, "rewards/rejected": -6.115405082702637, "step": 2924 }, { "epoch": 2.76, "grad_norm": 9.895816716341375, "learning_rate": 8.387218486961029e-09, "logps/chosen": -36.570377349853516, "logps/rejected": -77.92410278320312, "loss": 0.1006, "losses/dpo": 0.10898467898368835, "losses/sft": 0.9105882048606873, "losses/total": 0.10898467898368835, "ref_logps/chosen": -21.68875503540039, "ref_logps/rejected": -29.63178825378418, "rewards/accuracies": 1.0, "rewards/chosen": -1.4881622791290283, "rewards/margins": 3.341069221496582, "rewards/rejected": -4.8292317390441895, "step": 2925 }, { "epoch": 2.76, "grad_norm": 11.188888312871171, "learning_rate": 8.321932064212194e-09, "logps/chosen": -49.03590393066406, "logps/rejected": -89.14581298828125, "loss": 0.0909, "losses/dpo": 0.08444342762231827, "losses/sft": 1.7460744380950928, "losses/total": 0.08444342762231827, "ref_logps/chosen": -26.685256958007812, "ref_logps/rejected": -33.559234619140625, "rewards/accuracies": 1.0, "rewards/chosen": -2.235064744949341, "rewards/margins": 3.3235933780670166, "rewards/rejected": -5.558657646179199, "step": 2926 }, { "epoch": 2.76, "grad_norm": 8.610406824883515, "learning_rate": 8.256896430304972e-09, "logps/chosen": -49.128692626953125, "logps/rejected": -96.68344116210938, "loss": 0.081, "losses/dpo": 0.3240838944911957, "losses/sft": 2.38773250579834, "losses/total": 0.3240838944911957, "ref_logps/chosen": -25.393741607666016, "ref_logps/rejected": -37.79845428466797, "rewards/accuracies": 1.0, "rewards/chosen": -2.37349534034729, "rewards/margins": 3.515003204345703, "rewards/rejected": -5.888498783111572, "step": 2927 }, { "epoch": 2.76, "grad_norm": 13.205499706766682, "learning_rate": 8.192111652726708e-09, "logps/chosen": -45.46384811401367, "logps/rejected": -92.11479949951172, "loss": 0.1216, "losses/dpo": 0.005958180874586105, "losses/sft": 1.6966544389724731, "losses/total": 0.005958180874586105, "ref_logps/chosen": -26.33543586730957, "ref_logps/rejected": -36.971641540527344, "rewards/accuracies": 1.0, "rewards/chosen": -1.9128413200378418, "rewards/margins": 3.6014745235443115, "rewards/rejected": -5.514315605163574, "step": 2928 }, { "epoch": 2.76, "grad_norm": 13.017111187581493, "learning_rate": 8.127577798704433e-09, "logps/chosen": -53.77728271484375, "logps/rejected": -87.63262176513672, "loss": 0.1152, "losses/dpo": 0.03203286603093147, "losses/sft": 1.1570167541503906, "losses/total": 0.03203286603093147, "ref_logps/chosen": -31.55791473388672, "ref_logps/rejected": -33.295135498046875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2219364643096924, "rewards/margins": 3.2118124961853027, "rewards/rejected": -5.433749198913574, "step": 2929 }, { "epoch": 2.76, "grad_norm": 11.498580701828377, "learning_rate": 8.063294935204773e-09, "logps/chosen": -51.91254425048828, "logps/rejected": -80.6965560913086, "loss": 0.1486, "losses/dpo": 0.03962376341223717, "losses/sft": 2.675844192504883, "losses/total": 0.03962376341223717, "ref_logps/chosen": -29.587610244750977, "ref_logps/rejected": -30.353965759277344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2324936389923096, "rewards/margins": 2.8017654418945312, "rewards/rejected": -5.03425931930542, "step": 2930 }, { "epoch": 2.77, "grad_norm": 12.940403247416029, "learning_rate": 7.999263128933925e-09, "logps/chosen": -45.743106842041016, "logps/rejected": -98.63470458984375, "loss": 0.098, "losses/dpo": 0.003336163004860282, "losses/sft": 1.7333507537841797, "losses/total": 0.003336163004860282, "ref_logps/chosen": -23.042388916015625, "ref_logps/rejected": -37.95845031738281, "rewards/accuracies": 1.0, "rewards/chosen": -2.2700719833374023, "rewards/margins": 3.7975540161132812, "rewards/rejected": -6.067625522613525, "step": 2931 }, { "epoch": 2.77, "grad_norm": 18.757446693375698, "learning_rate": 7.935482446337627e-09, "logps/chosen": -52.89741897583008, "logps/rejected": -78.44029235839844, "loss": 0.2202, "losses/dpo": 0.05179992690682411, "losses/sft": 1.79423987865448, "losses/total": 0.05179992690682411, "ref_logps/chosen": -29.798839569091797, "ref_logps/rejected": -31.26614761352539, "rewards/accuracies": 0.875, "rewards/chosen": -2.3098583221435547, "rewards/margins": 2.4075560569763184, "rewards/rejected": -4.717413902282715, "step": 2932 }, { "epoch": 2.77, "grad_norm": 16.50727907798509, "learning_rate": 7.871952953600907e-09, "logps/chosen": -48.652835845947266, "logps/rejected": -87.84122467041016, "loss": 0.2114, "losses/dpo": 0.0022075918968766928, "losses/sft": 3.4156246185302734, "losses/total": 0.0022075918968766928, "ref_logps/chosen": -23.1940860748291, "ref_logps/rejected": -32.59026336669922, "rewards/accuracies": 0.9375, "rewards/chosen": -2.545875072479248, "rewards/margins": 2.9792206287384033, "rewards/rejected": -5.525095462799072, "step": 2933 }, { "epoch": 2.77, "grad_norm": 10.306715660144564, "learning_rate": 7.8086747166482e-09, "logps/chosen": -45.96955108642578, "logps/rejected": -89.16915130615234, "loss": 0.1233, "losses/dpo": 0.004719047341495752, "losses/sft": 1.715898871421814, "losses/total": 0.004719047341495752, "ref_logps/chosen": -20.421875, "ref_logps/rejected": -31.977737426757812, "rewards/accuracies": 1.0, "rewards/chosen": -2.5547680854797363, "rewards/margins": 3.1643729209899902, "rewards/rejected": -5.719141006469727, "step": 2934 }, { "epoch": 2.77, "grad_norm": 20.878660786901747, "learning_rate": 7.7456478011432e-09, "logps/chosen": -53.13145446777344, "logps/rejected": -94.80911254882812, "loss": 0.194, "losses/dpo": 0.15560372173786163, "losses/sft": 0.943413257598877, "losses/total": 0.15560372173786163, "ref_logps/chosen": -30.27311897277832, "ref_logps/rejected": -37.00516128540039, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2858335971832275, "rewards/margins": 3.4945621490478516, "rewards/rejected": -5.7803955078125, "step": 2935 }, { "epoch": 2.77, "grad_norm": 17.301037469496055, "learning_rate": 7.682872272488844e-09, "logps/chosen": -50.152923583984375, "logps/rejected": -78.23987579345703, "loss": 0.1455, "losses/dpo": 0.27391958236694336, "losses/sft": 1.6378743648529053, "losses/total": 0.27391958236694336, "ref_logps/chosen": -27.110280990600586, "ref_logps/rejected": -28.276782989501953, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3042640686035156, "rewards/margins": 2.692045211791992, "rewards/rejected": -4.996309280395508, "step": 2936 }, { "epoch": 2.77, "grad_norm": 10.618989653779748, "learning_rate": 7.620348195827104e-09, "logps/chosen": -50.41096115112305, "logps/rejected": -90.24430847167969, "loss": 0.1035, "losses/dpo": 0.19145119190216064, "losses/sft": 2.126065731048584, "losses/total": 0.19145119190216064, "ref_logps/chosen": -29.29900360107422, "ref_logps/rejected": -37.254432678222656, "rewards/accuracies": 1.0, "rewards/chosen": -2.1111958026885986, "rewards/margins": 3.1877920627593994, "rewards/rejected": -5.298987865447998, "step": 2937 }, { "epoch": 2.77, "grad_norm": 11.521130555483715, "learning_rate": 7.558075636039162e-09, "logps/chosen": -40.07830810546875, "logps/rejected": -84.06982421875, "loss": 0.1291, "losses/dpo": 3.335663859616034e-05, "losses/sft": 0.46514081954956055, "losses/total": 3.335663859616034e-05, "ref_logps/chosen": -22.365203857421875, "ref_logps/rejected": -30.158483505249023, "rewards/accuracies": 1.0, "rewards/chosen": -1.771310567855835, "rewards/margins": 3.6198229789733887, "rewards/rejected": -5.3911333084106445, "step": 2938 }, { "epoch": 2.77, "grad_norm": 13.536493178559308, "learning_rate": 7.496054657745082e-09, "logps/chosen": -53.777740478515625, "logps/rejected": -83.24807739257812, "loss": 0.1193, "losses/dpo": 0.13761812448501587, "losses/sft": 3.138676643371582, "losses/total": 0.13761812448501587, "ref_logps/chosen": -29.511110305786133, "ref_logps/rejected": -29.301496505737305, "rewards/accuracies": 1.0, "rewards/chosen": -2.426663398742676, "rewards/margins": 2.9679946899414062, "rewards/rejected": -5.394658088684082, "step": 2939 }, { "epoch": 2.77, "grad_norm": 20.968493779719303, "learning_rate": 7.434285325303907e-09, "logps/chosen": -63.70805358886719, "logps/rejected": -110.30714416503906, "loss": 0.2451, "losses/dpo": 0.2624583840370178, "losses/sft": 1.0816072225570679, "losses/total": 0.2624583840370178, "ref_logps/chosen": -34.27699279785156, "ref_logps/rejected": -40.99261474609375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.943105936050415, "rewards/margins": 3.9883475303649902, "rewards/rejected": -6.931453227996826, "step": 2940 }, { "epoch": 2.77, "grad_norm": 8.739343710353573, "learning_rate": 7.372767702813559e-09, "logps/chosen": -53.601600646972656, "logps/rejected": -84.13319396972656, "loss": 0.0715, "losses/dpo": 0.0099215442314744, "losses/sft": 2.8428733348846436, "losses/total": 0.0099215442314744, "ref_logps/chosen": -32.123722076416016, "ref_logps/rejected": -28.15727996826172, "rewards/accuracies": 1.0, "rewards/chosen": -2.147787570953369, "rewards/margins": 3.4498040676116943, "rewards/rejected": -5.597591400146484, "step": 2941 }, { "epoch": 2.78, "grad_norm": 9.898477364938161, "learning_rate": 7.3115018541107795e-09, "logps/chosen": -41.27419662475586, "logps/rejected": -86.06768035888672, "loss": 0.0803, "losses/dpo": 0.03209796920418739, "losses/sft": 3.2429311275482178, "losses/total": 0.03209796920418739, "ref_logps/chosen": -20.322418212890625, "ref_logps/rejected": -32.217464447021484, "rewards/accuracies": 1.0, "rewards/chosen": -2.0951778888702393, "rewards/margins": 3.2898435592651367, "rewards/rejected": -5.385021209716797, "step": 2942 }, { "epoch": 2.78, "grad_norm": 25.606654409911272, "learning_rate": 7.250487842770964e-09, "logps/chosen": -45.378631591796875, "logps/rejected": -76.30484008789062, "loss": 0.2972, "losses/dpo": 0.07765273749828339, "losses/sft": 1.8442484140396118, "losses/total": 0.07765273749828339, "ref_logps/chosen": -24.935901641845703, "ref_logps/rejected": -30.867610931396484, "rewards/accuracies": 0.875, "rewards/chosen": -2.0442733764648438, "rewards/margins": 2.4994497299194336, "rewards/rejected": -4.543723106384277, "step": 2943 }, { "epoch": 2.78, "grad_norm": 9.408583371322916, "learning_rate": 7.189725732108243e-09, "logps/chosen": -48.59496307373047, "logps/rejected": -103.92691040039062, "loss": 0.0912, "losses/dpo": 0.2998805046081543, "losses/sft": 3.4137558937072754, "losses/total": 0.2998805046081543, "ref_logps/chosen": -23.56805419921875, "ref_logps/rejected": -41.10039520263672, "rewards/accuracies": 1.0, "rewards/chosen": -2.5026912689208984, "rewards/margins": 3.7799601554870605, "rewards/rejected": -6.282651424407959, "step": 2944 }, { "epoch": 2.78, "grad_norm": 14.785986790457281, "learning_rate": 7.129215585175375e-09, "logps/chosen": -49.37919616699219, "logps/rejected": -72.74447631835938, "loss": 0.2066, "losses/dpo": 0.22139526903629303, "losses/sft": 2.124589443206787, "losses/total": 0.22139526903629303, "ref_logps/chosen": -30.930387496948242, "ref_logps/rejected": -28.84986114501953, "rewards/accuracies": 0.875, "rewards/chosen": -1.8448805809020996, "rewards/margins": 2.544581890106201, "rewards/rejected": -4.389462471008301, "step": 2945 }, { "epoch": 2.78, "grad_norm": 12.332042092678392, "learning_rate": 7.068957464763548e-09, "logps/chosen": -41.59635543823242, "logps/rejected": -79.25413513183594, "loss": 0.1601, "losses/dpo": 0.002378988079726696, "losses/sft": 1.0753297805786133, "losses/total": 0.002378988079726696, "ref_logps/chosen": -21.448123931884766, "ref_logps/rejected": -27.437347412109375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0148234367370605, "rewards/margins": 3.1668553352355957, "rewards/rejected": -5.181678771972656, "step": 2946 }, { "epoch": 2.78, "grad_norm": 10.039066345641608, "learning_rate": 7.008951433402577e-09, "logps/chosen": -51.44623565673828, "logps/rejected": -97.59432983398438, "loss": 0.0801, "losses/dpo": 0.007868560031056404, "losses/sft": 2.0483598709106445, "losses/total": 0.007868560031056404, "ref_logps/chosen": -30.637258529663086, "ref_logps/rejected": -35.64677810668945, "rewards/accuracies": 1.0, "rewards/chosen": -2.080897569656372, "rewards/margins": 4.113857746124268, "rewards/rejected": -6.194755554199219, "step": 2947 }, { "epoch": 2.78, "grad_norm": 18.710625124667065, "learning_rate": 6.949197553360514e-09, "logps/chosen": -47.119022369384766, "logps/rejected": -75.71910095214844, "loss": 0.2388, "losses/dpo": 0.5768417119979858, "losses/sft": 2.4791259765625, "losses/total": 0.5768417119979858, "ref_logps/chosen": -25.54513931274414, "ref_logps/rejected": -31.350841522216797, "rewards/accuracies": 1.0, "rewards/chosen": -2.157388210296631, "rewards/margins": 2.27943754196167, "rewards/rejected": -4.436825752258301, "step": 2948 }, { "epoch": 2.78, "grad_norm": 20.289402245336436, "learning_rate": 6.889695886643926e-09, "logps/chosen": -53.07735061645508, "logps/rejected": -76.45563507080078, "loss": 0.2448, "losses/dpo": 0.08742141723632812, "losses/sft": 1.287448525428772, "losses/total": 0.08742141723632812, "ref_logps/chosen": -28.95462417602539, "ref_logps/rejected": -25.500211715698242, "rewards/accuracies": 0.875, "rewards/chosen": -2.4122729301452637, "rewards/margins": 2.68326997756958, "rewards/rejected": -5.095542907714844, "step": 2949 }, { "epoch": 2.78, "grad_norm": 10.322347525778396, "learning_rate": 6.830446494997477e-09, "logps/chosen": -64.174560546875, "logps/rejected": -93.49050903320312, "loss": 0.1017, "losses/dpo": 0.016456449404358864, "losses/sft": 2.350837230682373, "losses/total": 0.016456449404358864, "ref_logps/chosen": -35.245567321777344, "ref_logps/rejected": -36.49077606201172, "rewards/accuracies": 1.0, "rewards/chosen": -2.892899990081787, "rewards/margins": 2.8070733547210693, "rewards/rejected": -5.6999735832214355, "step": 2950 }, { "epoch": 2.78, "grad_norm": 9.484049468029342, "learning_rate": 6.7714494399042086e-09, "logps/chosen": -57.43333053588867, "logps/rejected": -91.9375, "loss": 0.0919, "losses/dpo": 0.029873015359044075, "losses/sft": 1.3883802890777588, "losses/total": 0.029873015359044075, "ref_logps/chosen": -32.29127502441406, "ref_logps/rejected": -31.39759635925293, "rewards/accuracies": 1.0, "rewards/chosen": -2.51420521736145, "rewards/margins": 3.5397846698760986, "rewards/rejected": -6.053989887237549, "step": 2951 }, { "epoch": 2.78, "grad_norm": 8.840756747705546, "learning_rate": 6.712704782585205e-09, "logps/chosen": -39.63132095336914, "logps/rejected": -73.45793914794922, "loss": 0.1113, "losses/dpo": 0.06681264936923981, "losses/sft": 1.8372191190719604, "losses/total": 0.06681264936923981, "ref_logps/chosen": -19.502132415771484, "ref_logps/rejected": -25.337038040161133, "rewards/accuracies": 1.0, "rewards/chosen": -2.0129189491271973, "rewards/margins": 2.799170970916748, "rewards/rejected": -4.812089920043945, "step": 2952 }, { "epoch": 2.79, "grad_norm": 24.88906867116877, "learning_rate": 6.654212583999702e-09, "logps/chosen": -53.873695373535156, "logps/rejected": -70.44425201416016, "loss": 0.3288, "losses/dpo": 0.17412590980529785, "losses/sft": 2.7505223751068115, "losses/total": 0.17412590980529785, "ref_logps/chosen": -29.607452392578125, "ref_logps/rejected": -24.25558090209961, "rewards/accuracies": 0.875, "rewards/chosen": -2.4266245365142822, "rewards/margins": 2.1922426223754883, "rewards/rejected": -4.61886739730835, "step": 2953 }, { "epoch": 2.79, "grad_norm": 11.504947398216983, "learning_rate": 6.5959729048449544e-09, "logps/chosen": -42.60639953613281, "logps/rejected": -67.34687805175781, "loss": 0.1544, "losses/dpo": 0.07891879975795746, "losses/sft": 1.4601597785949707, "losses/total": 0.07891879975795746, "ref_logps/chosen": -22.67746353149414, "ref_logps/rejected": -21.355972290039062, "rewards/accuracies": 1.0, "rewards/chosen": -1.9928936958312988, "rewards/margins": 2.606196880340576, "rewards/rejected": -4.599090576171875, "step": 2954 }, { "epoch": 2.79, "grad_norm": 17.918902422238745, "learning_rate": 6.5379858055561176e-09, "logps/chosen": -53.74408721923828, "logps/rejected": -74.53982543945312, "loss": 0.1884, "losses/dpo": 0.025888504460453987, "losses/sft": 1.6763933897018433, "losses/total": 0.025888504460453987, "ref_logps/chosen": -31.936176300048828, "ref_logps/rejected": -25.551719665527344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1807913780212402, "rewards/margins": 2.718019485473633, "rewards/rejected": -4.898810386657715, "step": 2955 }, { "epoch": 2.79, "grad_norm": 10.524942376812557, "learning_rate": 6.480251346306309e-09, "logps/chosen": -42.444881439208984, "logps/rejected": -96.26366424560547, "loss": 0.0955, "losses/dpo": 0.0016559921205043793, "losses/sft": 2.222790002822876, "losses/total": 0.0016559921205043793, "ref_logps/chosen": -21.247631072998047, "ref_logps/rejected": -37.096431732177734, "rewards/accuracies": 1.0, "rewards/chosen": -2.119725227355957, "rewards/margins": 3.7969985008239746, "rewards/rejected": -5.916723728179932, "step": 2956 }, { "epoch": 2.79, "grad_norm": 10.359469105281512, "learning_rate": 6.422769587006438e-09, "logps/chosen": -51.28647232055664, "logps/rejected": -93.08845520019531, "loss": 0.0968, "losses/dpo": 0.1821354776620865, "losses/sft": 1.1926517486572266, "losses/total": 0.1821354776620865, "ref_logps/chosen": -24.970149993896484, "ref_logps/rejected": -36.120235443115234, "rewards/accuracies": 1.0, "rewards/chosen": -2.6316323280334473, "rewards/margins": 3.065189838409424, "rewards/rejected": -5.696822166442871, "step": 2957 }, { "epoch": 2.79, "grad_norm": 18.85418456548163, "learning_rate": 6.365540587305262e-09, "logps/chosen": -52.108394622802734, "logps/rejected": -84.08563232421875, "loss": 0.2504, "losses/dpo": 0.07428350299596786, "losses/sft": 1.9270075559616089, "losses/total": 0.07428350299596786, "ref_logps/chosen": -30.283832550048828, "ref_logps/rejected": -35.99604797363281, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1824560165405273, "rewards/margins": 2.626502513885498, "rewards/rejected": -4.808958530426025, "step": 2958 }, { "epoch": 2.79, "grad_norm": 12.953981162731925, "learning_rate": 6.308564406589139e-09, "logps/chosen": -50.20587158203125, "logps/rejected": -85.36734008789062, "loss": 0.1306, "losses/dpo": 0.27373871207237244, "losses/sft": 1.8725160360336304, "losses/total": 0.27373871207237244, "ref_logps/chosen": -28.657703399658203, "ref_logps/rejected": -33.2384147644043, "rewards/accuracies": 1.0, "rewards/chosen": -2.1548168659210205, "rewards/margins": 3.058075428009033, "rewards/rejected": -5.212892532348633, "step": 2959 }, { "epoch": 2.79, "grad_norm": 14.062854650858652, "learning_rate": 6.251841103982164e-09, "logps/chosen": -50.01112747192383, "logps/rejected": -82.20979309082031, "loss": 0.1165, "losses/dpo": 0.05099281296133995, "losses/sft": 2.2444543838500977, "losses/total": 0.05099281296133995, "ref_logps/chosen": -26.813276290893555, "ref_logps/rejected": -29.160768508911133, "rewards/accuracies": 1.0, "rewards/chosen": -2.3197851181030273, "rewards/margins": 2.9851179122924805, "rewards/rejected": -5.304903030395508, "step": 2960 }, { "epoch": 2.79, "grad_norm": 13.406243421924868, "learning_rate": 6.1953707383460005e-09, "logps/chosen": -57.319644927978516, "logps/rejected": -102.87249755859375, "loss": 0.1063, "losses/dpo": 0.00039244594518095255, "losses/sft": 2.2773115634918213, "losses/total": 0.00039244594518095255, "ref_logps/chosen": -27.448822021484375, "ref_logps/rejected": -39.18036651611328, "rewards/accuracies": 1.0, "rewards/chosen": -2.987082004547119, "rewards/margins": 3.382131576538086, "rewards/rejected": -6.369213104248047, "step": 2961 }, { "epoch": 2.79, "grad_norm": 10.968928670277743, "learning_rate": 6.139153368279804e-09, "logps/chosen": -65.32650756835938, "logps/rejected": -96.43431854248047, "loss": 0.1058, "losses/dpo": 0.07029494643211365, "losses/sft": 2.6483373641967773, "losses/total": 0.07029494643211365, "ref_logps/chosen": -36.085792541503906, "ref_logps/rejected": -38.092533111572266, "rewards/accuracies": 1.0, "rewards/chosen": -2.924072504043579, "rewards/margins": 2.9101061820983887, "rewards/rejected": -5.834178447723389, "step": 2962 }, { "epoch": 2.8, "grad_norm": 12.541314171558987, "learning_rate": 6.083189052120269e-09, "logps/chosen": -38.95207977294922, "logps/rejected": -61.44675064086914, "loss": 0.2004, "losses/dpo": 0.05940040200948715, "losses/sft": 2.065178871154785, "losses/total": 0.05940040200948715, "ref_logps/chosen": -23.72837257385254, "ref_logps/rejected": -22.034297943115234, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5223710536956787, "rewards/margins": 2.4188740253448486, "rewards/rejected": -3.9412453174591064, "step": 2963 }, { "epoch": 2.8, "grad_norm": 12.722056737691744, "learning_rate": 6.027477847941415e-09, "logps/chosen": -67.2991714477539, "logps/rejected": -87.27599334716797, "loss": 0.1375, "losses/dpo": 0.048647601157426834, "losses/sft": 2.08318829536438, "losses/total": 0.048647601157426834, "ref_logps/chosen": -37.94415283203125, "ref_logps/rejected": -30.328311920166016, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9355015754699707, "rewards/margins": 2.7592666149139404, "rewards/rejected": -5.694767951965332, "step": 2964 }, { "epoch": 2.8, "grad_norm": 23.712378008509166, "learning_rate": 5.972019813554663e-09, "logps/chosen": -45.922576904296875, "logps/rejected": -85.64988708496094, "loss": 0.287, "losses/dpo": 2.062307834625244, "losses/sft": 1.584159255027771, "losses/total": 2.062307834625244, "ref_logps/chosen": -21.348876953125, "ref_logps/rejected": -29.758071899414062, "rewards/accuracies": 0.875, "rewards/chosen": -2.4573702812194824, "rewards/margins": 3.1318118572235107, "rewards/rejected": -5.589181900024414, "step": 2965 }, { "epoch": 2.8, "grad_norm": 9.650411755967248, "learning_rate": 5.916815006508702e-09, "logps/chosen": -53.99803161621094, "logps/rejected": -88.87811279296875, "loss": 0.1044, "losses/dpo": 0.12049560248851776, "losses/sft": 1.8263710737228394, "losses/total": 0.12049560248851776, "ref_logps/chosen": -30.525775909423828, "ref_logps/rejected": -31.830318450927734, "rewards/accuracies": 0.9375, "rewards/chosen": -2.347226142883301, "rewards/margins": 3.357553005218506, "rewards/rejected": -5.704778671264648, "step": 2966 }, { "epoch": 2.8, "grad_norm": 14.52478051605104, "learning_rate": 5.861863484089457e-09, "logps/chosen": -44.347259521484375, "logps/rejected": -84.28527069091797, "loss": 0.1634, "losses/dpo": 0.019577961415052414, "losses/sft": 1.7986332178115845, "losses/total": 0.019577961415052414, "ref_logps/chosen": -21.579151153564453, "ref_logps/rejected": -31.253873825073242, "rewards/accuracies": 0.9375, "rewards/chosen": -2.276810646057129, "rewards/margins": 3.0263290405273438, "rewards/rejected": -5.303139686584473, "step": 2967 }, { "epoch": 2.8, "grad_norm": 15.30525260047202, "learning_rate": 5.807165303319956e-09, "logps/chosen": -52.727901458740234, "logps/rejected": -86.798828125, "loss": 0.1501, "losses/dpo": 0.04701437056064606, "losses/sft": 1.9954566955566406, "losses/total": 0.04701437056064606, "ref_logps/chosen": -30.27233123779297, "ref_logps/rejected": -32.88179016113281, "rewards/accuracies": 1.0, "rewards/chosen": -2.2455570697784424, "rewards/margins": 3.1461472511291504, "rewards/rejected": -5.391704559326172, "step": 2968 }, { "epoch": 2.8, "grad_norm": 9.51804476330101, "learning_rate": 5.752720520960458e-09, "logps/chosen": -52.2918586730957, "logps/rejected": -90.00909423828125, "loss": 0.0832, "losses/dpo": 0.22555318474769592, "losses/sft": 2.1538822650909424, "losses/total": 0.22555318474769592, "ref_logps/chosen": -27.146860122680664, "ref_logps/rejected": -32.29396057128906, "rewards/accuracies": 1.0, "rewards/chosen": -2.5144996643066406, "rewards/margins": 3.2570137977600098, "rewards/rejected": -5.77151346206665, "step": 2969 }, { "epoch": 2.8, "grad_norm": 14.750797490466855, "learning_rate": 5.698529193508189e-09, "logps/chosen": -62.122440338134766, "logps/rejected": -97.83304595947266, "loss": 0.1148, "losses/dpo": 0.014143439941108227, "losses/sft": 1.7064329385757446, "losses/total": 0.014143439941108227, "ref_logps/chosen": -33.098419189453125, "ref_logps/rejected": -35.48191833496094, "rewards/accuracies": 1.0, "rewards/chosen": -2.90240216255188, "rewards/margins": 3.3327109813690186, "rewards/rejected": -6.23511266708374, "step": 2970 }, { "epoch": 2.8, "grad_norm": 16.55420490335175, "learning_rate": 5.644591377197361e-09, "logps/chosen": -43.359806060791016, "logps/rejected": -83.37974548339844, "loss": 0.1602, "losses/dpo": 0.025264237076044083, "losses/sft": 2.164590358734131, "losses/total": 0.025264237076044083, "ref_logps/chosen": -21.859161376953125, "ref_logps/rejected": -32.61882400512695, "rewards/accuracies": 0.9375, "rewards/chosen": -2.150064706802368, "rewards/margins": 2.926027774810791, "rewards/rejected": -5.076091766357422, "step": 2971 }, { "epoch": 2.8, "grad_norm": 8.34987949356676, "learning_rate": 5.5909071279991724e-09, "logps/chosen": -44.97702407836914, "logps/rejected": -88.94004821777344, "loss": 0.0624, "losses/dpo": 0.03441740199923515, "losses/sft": 1.8476622104644775, "losses/total": 0.03441740199923515, "ref_logps/chosen": -25.526920318603516, "ref_logps/rejected": -32.15320587158203, "rewards/accuracies": 1.0, "rewards/chosen": -1.9450106620788574, "rewards/margins": 3.733673095703125, "rewards/rejected": -5.678684234619141, "step": 2972 }, { "epoch": 2.8, "grad_norm": 16.488462404151438, "learning_rate": 5.537476501621591e-09, "logps/chosen": -58.17576217651367, "logps/rejected": -95.56924438476562, "loss": 0.2157, "losses/dpo": 0.02470828965306282, "losses/sft": 2.274742364883423, "losses/total": 0.02470828965306282, "ref_logps/chosen": -31.368074417114258, "ref_logps/rejected": -39.280670166015625, "rewards/accuracies": 1.0, "rewards/chosen": -2.6807689666748047, "rewards/margins": 2.9480881690979004, "rewards/rejected": -5.628857612609863, "step": 2973 }, { "epoch": 2.81, "grad_norm": 14.35982579391655, "learning_rate": 5.484299553509569e-09, "logps/chosen": -63.33268737792969, "logps/rejected": -111.6799545288086, "loss": 0.1706, "losses/dpo": 0.0006897123530507088, "losses/sft": 0.871490478515625, "losses/total": 0.0006897123530507088, "ref_logps/chosen": -35.235198974609375, "ref_logps/rejected": -45.239070892333984, "rewards/accuracies": 0.9375, "rewards/chosen": -2.809748411178589, "rewards/margins": 3.8343396186828613, "rewards/rejected": -6.644087791442871, "step": 2974 }, { "epoch": 2.81, "grad_norm": 13.475256839128203, "learning_rate": 5.431376338844607e-09, "logps/chosen": -45.91913604736328, "logps/rejected": -86.17117309570312, "loss": 0.1341, "losses/dpo": 0.16061654686927795, "losses/sft": 0.8173127770423889, "losses/total": 0.16061654686927795, "ref_logps/chosen": -26.160938262939453, "ref_logps/rejected": -34.279998779296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9758200645446777, "rewards/margins": 3.2132973670959473, "rewards/rejected": -5.189117431640625, "step": 2975 }, { "epoch": 2.81, "grad_norm": 14.971891456335648, "learning_rate": 5.378706912545105e-09, "logps/chosen": -61.99713134765625, "logps/rejected": -92.17497253417969, "loss": 0.1035, "losses/dpo": 0.008451242931187153, "losses/sft": 1.8083500862121582, "losses/total": 0.008451242931187153, "ref_logps/chosen": -31.86258888244629, "ref_logps/rejected": -31.909452438354492, "rewards/accuracies": 1.0, "rewards/chosen": -3.013453960418701, "rewards/margins": 3.0130977630615234, "rewards/rejected": -6.026551723480225, "step": 2976 }, { "epoch": 2.81, "grad_norm": 10.31166807600871, "learning_rate": 5.326291329265958e-09, "logps/chosen": -66.02063751220703, "logps/rejected": -98.71532440185547, "loss": 0.133, "losses/dpo": 0.017624638974666595, "losses/sft": 2.785581111907959, "losses/total": 0.017624638974666595, "ref_logps/chosen": -38.251914978027344, "ref_logps/rejected": -40.97486877441406, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7768726348876953, "rewards/margins": 2.9971730709075928, "rewards/rejected": -5.774045944213867, "step": 2977 }, { "epoch": 2.81, "grad_norm": 9.564360946741333, "learning_rate": 5.2741296433987395e-09, "logps/chosen": -32.32965087890625, "logps/rejected": -80.4990234375, "loss": 0.106, "losses/dpo": 0.011890966445207596, "losses/sft": 0.02496829256415367, "losses/total": 0.011890966445207596, "ref_logps/chosen": -18.183685302734375, "ref_logps/rejected": -27.892160415649414, "rewards/accuracies": 1.0, "rewards/chosen": -1.4145967960357666, "rewards/margins": 3.8460898399353027, "rewards/rejected": -5.260686874389648, "step": 2978 }, { "epoch": 2.81, "grad_norm": 14.017812393798755, "learning_rate": 5.2222219090715665e-09, "logps/chosen": -50.46388626098633, "logps/rejected": -90.81988525390625, "loss": 0.1137, "losses/dpo": 0.49322694540023804, "losses/sft": 0.8779363036155701, "losses/total": 0.49322694540023804, "ref_logps/chosen": -27.2247371673584, "ref_logps/rejected": -34.65003204345703, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3239147663116455, "rewards/margins": 3.293069839477539, "rewards/rejected": -5.616984844207764, "step": 2979 }, { "epoch": 2.81, "grad_norm": 9.88260349356317, "learning_rate": 5.1705681801489095e-09, "logps/chosen": -43.22724914550781, "logps/rejected": -75.583740234375, "loss": 0.0883, "losses/dpo": 0.01919245719909668, "losses/sft": 1.211371898651123, "losses/total": 0.01919245719909668, "ref_logps/chosen": -28.03325080871582, "ref_logps/rejected": -30.081026077270508, "rewards/accuracies": 1.0, "rewards/chosen": -1.5194001197814941, "rewards/margins": 3.030871629714966, "rewards/rejected": -4.550271511077881, "step": 2980 }, { "epoch": 2.81, "grad_norm": 12.68342580278823, "learning_rate": 5.11916851023178e-09, "logps/chosen": -67.14790344238281, "logps/rejected": -98.89556884765625, "loss": 0.122, "losses/dpo": 0.03194049373269081, "losses/sft": 0.8391532897949219, "losses/total": 0.03194049373269081, "ref_logps/chosen": -36.48619079589844, "ref_logps/rejected": -39.15556335449219, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0661706924438477, "rewards/margins": 2.9078307151794434, "rewards/rejected": -5.974001884460449, "step": 2981 }, { "epoch": 2.81, "grad_norm": 8.79027092212944, "learning_rate": 5.068022952657514e-09, "logps/chosen": -44.65581512451172, "logps/rejected": -81.80258178710938, "loss": 0.1133, "losses/dpo": 0.016429636627435684, "losses/sft": 1.4586756229400635, "losses/total": 0.016429636627435684, "ref_logps/chosen": -25.513526916503906, "ref_logps/rejected": -30.31407928466797, "rewards/accuracies": 0.9375, "rewards/chosen": -1.914229154586792, "rewards/margins": 3.234622001647949, "rewards/rejected": -5.148850440979004, "step": 2982 }, { "epoch": 2.81, "grad_norm": 12.405667156814276, "learning_rate": 5.017131560499765e-09, "logps/chosen": -49.07786560058594, "logps/rejected": -78.44114685058594, "loss": 0.1249, "losses/dpo": 0.08944921940565109, "losses/sft": 1.794183373451233, "losses/total": 0.08944921940565109, "ref_logps/chosen": -29.167823791503906, "ref_logps/rejected": -28.535606384277344, "rewards/accuracies": 1.0, "rewards/chosen": -1.9910039901733398, "rewards/margins": 2.9995503425598145, "rewards/rejected": -4.990554332733154, "step": 2983 }, { "epoch": 2.82, "grad_norm": 18.037539542461758, "learning_rate": 4.966494386568376e-09, "logps/chosen": -52.666690826416016, "logps/rejected": -70.0419921875, "loss": 0.2362, "losses/dpo": 0.23625542223453522, "losses/sft": 3.438595771789551, "losses/total": 0.23625542223453522, "ref_logps/chosen": -32.32910919189453, "ref_logps/rejected": -25.081295013427734, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0337581634521484, "rewards/margins": 2.4623115062713623, "rewards/rejected": -4.49606990814209, "step": 2984 }, { "epoch": 2.82, "grad_norm": 12.021045661262914, "learning_rate": 4.916111483409507e-09, "logps/chosen": -60.25165557861328, "logps/rejected": -102.42935943603516, "loss": 0.0829, "losses/dpo": 0.044849175959825516, "losses/sft": 2.0814294815063477, "losses/total": 0.044849175959825516, "ref_logps/chosen": -35.11140441894531, "ref_logps/rejected": -44.8964729309082, "rewards/accuracies": 1.0, "rewards/chosen": -2.5140252113342285, "rewards/margins": 3.2392637729644775, "rewards/rejected": -5.753288745880127, "step": 2985 }, { "epoch": 2.82, "grad_norm": 16.33551764064282, "learning_rate": 4.8659829033053375e-09, "logps/chosen": -47.955848693847656, "logps/rejected": -69.04281616210938, "loss": 0.2033, "losses/dpo": 0.3321808874607086, "losses/sft": 3.0817103385925293, "losses/total": 0.3321808874607086, "ref_logps/chosen": -23.33855628967285, "ref_logps/rejected": -21.826351165771484, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4617295265197754, "rewards/margins": 2.2599167823791504, "rewards/rejected": -4.721646308898926, "step": 2986 }, { "epoch": 2.82, "grad_norm": 5.743563510565193, "learning_rate": 4.81610869827423e-09, "logps/chosen": -50.81134033203125, "logps/rejected": -90.83492279052734, "loss": 0.0634, "losses/dpo": 0.03496156632900238, "losses/sft": 1.7906919717788696, "losses/total": 0.03496156632900238, "ref_logps/chosen": -27.795589447021484, "ref_logps/rejected": -30.256088256835938, "rewards/accuracies": 1.0, "rewards/chosen": -2.301575183868408, "rewards/margins": 3.756308078765869, "rewards/rejected": -6.057883262634277, "step": 2987 }, { "epoch": 2.82, "grad_norm": 20.37812916640525, "learning_rate": 4.7664889200705605e-09, "logps/chosen": -63.60095977783203, "logps/rejected": -89.52256774902344, "loss": 0.2139, "losses/dpo": 0.11206984519958496, "losses/sft": 2.0226359367370605, "losses/total": 0.11206984519958496, "ref_logps/chosen": -33.56333923339844, "ref_logps/rejected": -33.07855987548828, "rewards/accuracies": 0.875, "rewards/chosen": -3.0037617683410645, "rewards/margins": 2.640638828277588, "rewards/rejected": -5.644400596618652, "step": 2988 }, { "epoch": 2.82, "grad_norm": 11.188543155629183, "learning_rate": 4.717123620184587e-09, "logps/chosen": -48.833030700683594, "logps/rejected": -83.27880859375, "loss": 0.134, "losses/dpo": 0.3049444854259491, "losses/sft": 2.22892689704895, "losses/total": 0.3049444854259491, "ref_logps/chosen": -29.290393829345703, "ref_logps/rejected": -33.56953430175781, "rewards/accuracies": 1.0, "rewards/chosen": -1.9542635679244995, "rewards/margins": 3.016664505004883, "rewards/rejected": -4.970927715301514, "step": 2989 }, { "epoch": 2.82, "grad_norm": 10.64812376847862, "learning_rate": 4.668012849842695e-09, "logps/chosen": -55.45071792602539, "logps/rejected": -101.55300903320312, "loss": 0.072, "losses/dpo": 0.002444530837237835, "losses/sft": 1.3159278631210327, "losses/total": 0.002444530837237835, "ref_logps/chosen": -31.857738494873047, "ref_logps/rejected": -37.698524475097656, "rewards/accuracies": 1.0, "rewards/chosen": -2.35929799079895, "rewards/margins": 4.026150703430176, "rewards/rejected": -6.385448455810547, "step": 2990 }, { "epoch": 2.82, "grad_norm": 5.6482899465553125, "learning_rate": 4.619156660006951e-09, "logps/chosen": -65.34595489501953, "logps/rejected": -114.13906860351562, "loss": 0.0399, "losses/dpo": 0.0009135915315710008, "losses/sft": 1.6799944639205933, "losses/total": 0.0009135915315710008, "ref_logps/chosen": -38.61480712890625, "ref_logps/rejected": -41.04083251953125, "rewards/accuracies": 1.0, "rewards/chosen": -2.673114538192749, "rewards/margins": 4.636710166931152, "rewards/rejected": -7.309823989868164, "step": 2991 }, { "epoch": 2.82, "grad_norm": 10.735181108779893, "learning_rate": 4.570555101375356e-09, "logps/chosen": -67.56083679199219, "logps/rejected": -98.686279296875, "loss": 0.0981, "losses/dpo": 0.0012269392609596252, "losses/sft": 0.8264875411987305, "losses/total": 0.0012269392609596252, "ref_logps/chosen": -40.91302490234375, "ref_logps/rejected": -38.32350158691406, "rewards/accuracies": 1.0, "rewards/chosen": -2.664781332015991, "rewards/margins": 3.3714969158172607, "rewards/rejected": -6.036278247833252, "step": 2992 }, { "epoch": 2.82, "grad_norm": 10.769414443334659, "learning_rate": 4.522208224381624e-09, "logps/chosen": -56.40092468261719, "logps/rejected": -110.20799255371094, "loss": 0.0826, "losses/dpo": 0.00327697885222733, "losses/sft": 1.6342766284942627, "losses/total": 0.00327697885222733, "ref_logps/chosen": -29.841062545776367, "ref_logps/rejected": -41.86363220214844, "rewards/accuracies": 1.0, "rewards/chosen": -2.6559863090515137, "rewards/margins": 4.178450584411621, "rewards/rejected": -6.834436893463135, "step": 2993 }, { "epoch": 2.82, "grad_norm": 6.764432960636464, "learning_rate": 4.474116079195234e-09, "logps/chosen": -55.80851745605469, "logps/rejected": -100.58486938476562, "loss": 0.0656, "losses/dpo": 0.026256464421749115, "losses/sft": 1.1427849531173706, "losses/total": 0.026256464421749115, "ref_logps/chosen": -29.408370971679688, "ref_logps/rejected": -38.310401916503906, "rewards/accuracies": 1.0, "rewards/chosen": -2.640015125274658, "rewards/margins": 3.5874319076538086, "rewards/rejected": -6.227446556091309, "step": 2994 }, { "epoch": 2.83, "grad_norm": 35.46742382160116, "learning_rate": 4.426278715721321e-09, "logps/chosen": -56.887855529785156, "logps/rejected": -80.81572723388672, "loss": 0.3028, "losses/dpo": 0.058131348341703415, "losses/sft": 0.9216777682304382, "losses/total": 0.058131348341703415, "ref_logps/chosen": -27.40162467956543, "ref_logps/rejected": -29.3204402923584, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9486231803894043, "rewards/margins": 2.2009055614471436, "rewards/rejected": -5.149528980255127, "step": 2995 }, { "epoch": 2.83, "grad_norm": 9.772150759306077, "learning_rate": 4.378696183600567e-09, "logps/chosen": -52.536651611328125, "logps/rejected": -84.94515991210938, "loss": 0.094, "losses/dpo": 0.185294970870018, "losses/sft": 2.8950119018554688, "losses/total": 0.185294970870018, "ref_logps/chosen": -29.780338287353516, "ref_logps/rejected": -33.43125915527344, "rewards/accuracies": 1.0, "rewards/chosen": -2.2756311893463135, "rewards/margins": 2.8757591247558594, "rewards/rejected": -5.151390075683594, "step": 2996 }, { "epoch": 2.83, "grad_norm": 24.459211780951517, "learning_rate": 4.331368532209362e-09, "logps/chosen": -47.89764404296875, "logps/rejected": -98.49110412597656, "loss": 0.1799, "losses/dpo": 0.04916500300168991, "losses/sft": 1.554333209991455, "losses/total": 0.04916500300168991, "ref_logps/chosen": -22.167177200317383, "ref_logps/rejected": -35.68844223022461, "rewards/accuracies": 0.9375, "rewards/chosen": -2.573046922683716, "rewards/margins": 3.707219123840332, "rewards/rejected": -6.280265808105469, "step": 2997 }, { "epoch": 2.83, "grad_norm": 18.452954612305405, "learning_rate": 4.284295810659394e-09, "logps/chosen": -61.5440673828125, "logps/rejected": -97.50553894042969, "loss": 0.1242, "losses/dpo": 0.10628178715705872, "losses/sft": 2.017195463180542, "losses/total": 0.10628178715705872, "ref_logps/chosen": -34.039119720458984, "ref_logps/rejected": -35.57096481323242, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7504944801330566, "rewards/margins": 3.442963123321533, "rewards/rejected": -6.19345760345459, "step": 2998 }, { "epoch": 2.83, "grad_norm": 14.983341479313655, "learning_rate": 4.237478067798062e-09, "logps/chosen": -47.334224700927734, "logps/rejected": -89.55912780761719, "loss": 0.2134, "losses/dpo": 0.030352646484971046, "losses/sft": 2.0532915592193604, "losses/total": 0.030352646484971046, "ref_logps/chosen": -26.856975555419922, "ref_logps/rejected": -34.442771911621094, "rewards/accuracies": 0.875, "rewards/chosen": -2.047725200653076, "rewards/margins": 3.4639105796813965, "rewards/rejected": -5.511635780334473, "step": 2999 }, { "epoch": 2.83, "grad_norm": 12.394850811838186, "learning_rate": 4.190915352207919e-09, "logps/chosen": -49.14397048950195, "logps/rejected": -76.74310302734375, "loss": 0.1589, "losses/dpo": 0.1026623547077179, "losses/sft": 1.5188080072402954, "losses/total": 0.1026623547077179, "ref_logps/chosen": -29.819772720336914, "ref_logps/rejected": -30.286670684814453, "rewards/accuracies": 1.0, "rewards/chosen": -1.932419776916504, "rewards/margins": 2.713224411010742, "rewards/rejected": -4.645644187927246, "step": 3000 }, { "epoch": 2.83, "grad_norm": 9.856110828973916, "learning_rate": 4.144607712207093e-09, "logps/chosen": -67.52684783935547, "logps/rejected": -101.78369140625, "loss": 0.0845, "losses/dpo": 0.018618330359458923, "losses/sft": 1.830625057220459, "losses/total": 0.018618330359458923, "ref_logps/chosen": -39.71387481689453, "ref_logps/rejected": -39.05012512207031, "rewards/accuracies": 1.0, "rewards/chosen": -2.781297206878662, "rewards/margins": 3.4920601844787598, "rewards/rejected": -6.273357391357422, "step": 3001 }, { "epoch": 2.83, "grad_norm": 11.369166150198977, "learning_rate": 4.098555195848841e-09, "logps/chosen": -49.20844268798828, "logps/rejected": -92.06135559082031, "loss": 0.1178, "losses/dpo": 0.02345520816743374, "losses/sft": 2.219454050064087, "losses/total": 0.02345520816743374, "ref_logps/chosen": -27.137115478515625, "ref_logps/rejected": -32.974365234375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2071330547332764, "rewards/margins": 3.701566457748413, "rewards/rejected": -5.9086995124816895, "step": 3002 }, { "epoch": 2.83, "grad_norm": 11.295872280363715, "learning_rate": 4.052757850921851e-09, "logps/chosen": -37.75860595703125, "logps/rejected": -77.1427001953125, "loss": 0.1169, "losses/dpo": 0.026923267170786858, "losses/sft": 0.7394262552261353, "losses/total": 0.026923267170786858, "ref_logps/chosen": -20.375247955322266, "ref_logps/rejected": -27.675188064575195, "rewards/accuracies": 1.0, "rewards/chosen": -1.7383359670639038, "rewards/margins": 3.2084155082702637, "rewards/rejected": -4.946751594543457, "step": 3003 }, { "epoch": 2.83, "grad_norm": 8.576091485912851, "learning_rate": 4.007215724949914e-09, "logps/chosen": -65.2431869506836, "logps/rejected": -121.46746826171875, "loss": 0.0481, "losses/dpo": 0.1530638039112091, "losses/sft": 1.5167455673217773, "losses/total": 0.1530638039112091, "ref_logps/chosen": -35.50630187988281, "ref_logps/rejected": -45.99356460571289, "rewards/accuracies": 1.0, "rewards/chosen": -2.9736886024475098, "rewards/margins": 4.573702812194824, "rewards/rejected": -7.547390937805176, "step": 3004 }, { "epoch": 2.83, "grad_norm": 8.899214279563743, "learning_rate": 3.961928865191949e-09, "logps/chosen": -43.18684005737305, "logps/rejected": -88.17889404296875, "loss": 0.0632, "losses/dpo": 0.14419636130332947, "losses/sft": 1.264927625656128, "losses/total": 0.14419636130332947, "ref_logps/chosen": -23.774738311767578, "ref_logps/rejected": -30.505313873291016, "rewards/accuracies": 1.0, "rewards/chosen": -1.941210150718689, "rewards/margins": 3.82614803314209, "rewards/rejected": -5.76735782623291, "step": 3005 }, { "epoch": 2.84, "grad_norm": 11.364291602200456, "learning_rate": 3.916897318642115e-09, "logps/chosen": -54.38560485839844, "logps/rejected": -86.4685287475586, "loss": 0.0825, "losses/dpo": 0.018700864166021347, "losses/sft": 2.2473556995391846, "losses/total": 0.018700864166021347, "ref_logps/chosen": -36.01399230957031, "ref_logps/rejected": -35.400238037109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.8371613025665283, "rewards/margins": 3.269667387008667, "rewards/rejected": -5.106828689575195, "step": 3006 }, { "epoch": 2.84, "grad_norm": 7.533442090849537, "learning_rate": 3.872121132029504e-09, "logps/chosen": -52.33091735839844, "logps/rejected": -97.07683563232422, "loss": 0.0826, "losses/dpo": 0.2021254003047943, "losses/sft": 2.2115330696105957, "losses/total": 0.2021254003047943, "ref_logps/chosen": -26.88910484313965, "ref_logps/rejected": -35.4443359375, "rewards/accuracies": 1.0, "rewards/chosen": -2.5441818237304688, "rewards/margins": 3.6190686225891113, "rewards/rejected": -6.163249969482422, "step": 3007 }, { "epoch": 2.84, "grad_norm": 25.142579832513416, "learning_rate": 3.8276003518183105e-09, "logps/chosen": -53.452911376953125, "logps/rejected": -79.42109680175781, "loss": 0.2503, "losses/dpo": 0.2564602196216583, "losses/sft": 1.9910284280776978, "losses/total": 0.2564602196216583, "ref_logps/chosen": -29.144004821777344, "ref_logps/rejected": -28.90114974975586, "rewards/accuracies": 0.875, "rewards/chosen": -2.4308905601501465, "rewards/margins": 2.6211037635803223, "rewards/rejected": -5.051994323730469, "step": 3008 }, { "epoch": 2.84, "grad_norm": 14.568698419685319, "learning_rate": 3.783335024207607e-09, "logps/chosen": -46.002227783203125, "logps/rejected": -107.35443878173828, "loss": 0.0768, "losses/dpo": 0.0012917546555399895, "losses/sft": 1.0320597887039185, "losses/total": 0.0012917546555399895, "ref_logps/chosen": -22.055877685546875, "ref_logps/rejected": -41.626548767089844, "rewards/accuracies": 1.0, "rewards/chosen": -2.3946352005004883, "rewards/margins": 4.178153991699219, "rewards/rejected": -6.572789192199707, "step": 3009 }, { "epoch": 2.84, "grad_norm": 13.022595048976513, "learning_rate": 3.739325195131482e-09, "logps/chosen": -47.43189239501953, "logps/rejected": -82.84133911132812, "loss": 0.1201, "losses/dpo": 0.024473685771226883, "losses/sft": 1.2006020545959473, "losses/total": 0.024473685771226883, "ref_logps/chosen": -26.73711395263672, "ref_logps/rejected": -29.40743637084961, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0694780349731445, "rewards/margins": 3.273912191390991, "rewards/rejected": -5.343390464782715, "step": 3010 }, { "epoch": 2.84, "grad_norm": 10.226148922291788, "learning_rate": 3.695570910258794e-09, "logps/chosen": -64.974853515625, "logps/rejected": -93.16159057617188, "loss": 0.0911, "losses/dpo": 0.0038495881017297506, "losses/sft": 2.227541923522949, "losses/total": 0.0038495881017297506, "ref_logps/chosen": -39.495487213134766, "ref_logps/rejected": -33.91644287109375, "rewards/accuracies": 1.0, "rewards/chosen": -2.54793643951416, "rewards/margins": 3.3765780925750732, "rewards/rejected": -5.9245147705078125, "step": 3011 }, { "epoch": 2.84, "grad_norm": 13.870487451444973, "learning_rate": 3.652072214993335e-09, "logps/chosen": -33.53938293457031, "logps/rejected": -58.30128479003906, "loss": 0.188, "losses/dpo": 0.30861029028892517, "losses/sft": 1.1313132047653198, "losses/total": 0.30861029028892517, "ref_logps/chosen": -18.017501831054688, "ref_logps/rejected": -20.636676788330078, "rewards/accuracies": 1.0, "rewards/chosen": -1.5521878004074097, "rewards/margins": 2.2142727375030518, "rewards/rejected": -3.766460418701172, "step": 3012 }, { "epoch": 2.84, "grad_norm": 10.167874740949692, "learning_rate": 3.608829154473608e-09, "logps/chosen": -46.9948844909668, "logps/rejected": -92.0848388671875, "loss": 0.0867, "losses/dpo": 0.026909325271844864, "losses/sft": 2.711970806121826, "losses/total": 0.026909325271844864, "ref_logps/chosen": -23.06595802307129, "ref_logps/rejected": -33.84601593017578, "rewards/accuracies": 1.0, "rewards/chosen": -2.392892599105835, "rewards/margins": 3.430990219116211, "rewards/rejected": -5.823883056640625, "step": 3013 }, { "epoch": 2.84, "grad_norm": 7.239060429405684, "learning_rate": 3.565841773572803e-09, "logps/chosen": -64.33511352539062, "logps/rejected": -105.58467102050781, "loss": 0.0564, "losses/dpo": 0.10429718345403671, "losses/sft": 0.5524404644966125, "losses/total": 0.10429718345403671, "ref_logps/chosen": -35.71358108520508, "ref_logps/rejected": -41.62045669555664, "rewards/accuracies": 1.0, "rewards/chosen": -2.8621535301208496, "rewards/margins": 3.5342676639556885, "rewards/rejected": -6.396420955657959, "step": 3014 }, { "epoch": 2.84, "grad_norm": 14.414240871110938, "learning_rate": 3.5231101168988754e-09, "logps/chosen": -44.4501953125, "logps/rejected": -87.53152465820312, "loss": 0.158, "losses/dpo": 0.04221241921186447, "losses/sft": 1.81611967086792, "losses/total": 0.04221241921186447, "ref_logps/chosen": -25.511287689208984, "ref_logps/rejected": -35.99733352661133, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8938912153244019, "rewards/margins": 3.2595279216766357, "rewards/rejected": -5.153419494628906, "step": 3015 }, { "epoch": 2.85, "grad_norm": 6.590079022628426, "learning_rate": 3.4806342287943824e-09, "logps/chosen": -43.803470611572266, "logps/rejected": -97.51197814941406, "loss": 0.0546, "losses/dpo": 0.005897137336432934, "losses/sft": 1.1737291812896729, "losses/total": 0.005897137336432934, "ref_logps/chosen": -24.434757232666016, "ref_logps/rejected": -35.04584503173828, "rewards/accuracies": 1.0, "rewards/chosen": -1.9368712902069092, "rewards/margins": 4.309741973876953, "rewards/rejected": -6.246613502502441, "step": 3016 }, { "epoch": 2.85, "grad_norm": 13.993659655737178, "learning_rate": 3.4384141533364276e-09, "logps/chosen": -52.43189239501953, "logps/rejected": -90.02592468261719, "loss": 0.1671, "losses/dpo": 0.09978903830051422, "losses/sft": 1.3705974817276, "losses/total": 0.09978903830051422, "ref_logps/chosen": -27.548166275024414, "ref_logps/rejected": -36.55899429321289, "rewards/accuracies": 0.9375, "rewards/chosen": -2.488372564315796, "rewards/margins": 2.858320951461792, "rewards/rejected": -5.34669303894043, "step": 3017 }, { "epoch": 2.85, "grad_norm": 14.105740307273537, "learning_rate": 3.3964499343367426e-09, "logps/chosen": -61.83318328857422, "logps/rejected": -86.1053695678711, "loss": 0.1331, "losses/dpo": 0.269369512796402, "losses/sft": 1.1796821355819702, "losses/total": 0.269369512796402, "ref_logps/chosen": -33.269775390625, "ref_logps/rejected": -32.60493087768555, "rewards/accuracies": 1.0, "rewards/chosen": -2.856340169906616, "rewards/margins": 2.493703842163086, "rewards/rejected": -5.350043773651123, "step": 3018 }, { "epoch": 2.85, "grad_norm": 10.134215219985194, "learning_rate": 3.3547416153414943e-09, "logps/chosen": -42.07174301147461, "logps/rejected": -83.23526000976562, "loss": 0.1078, "losses/dpo": 0.08801452815532684, "losses/sft": 1.2579821348190308, "losses/total": 0.08801452815532684, "ref_logps/chosen": -26.143978118896484, "ref_logps/rejected": -32.595008850097656, "rewards/accuracies": 1.0, "rewards/chosen": -1.5927765369415283, "rewards/margins": 3.4712486267089844, "rewards/rejected": -5.064024925231934, "step": 3019 }, { "epoch": 2.85, "grad_norm": 11.991157286501094, "learning_rate": 3.313289239631284e-09, "logps/chosen": -57.16379165649414, "logps/rejected": -93.25111389160156, "loss": 0.0849, "losses/dpo": 0.0009646282414905727, "losses/sft": 2.7413032054901123, "losses/total": 0.0009646282414905727, "ref_logps/chosen": -29.412151336669922, "ref_logps/rejected": -30.666553497314453, "rewards/accuracies": 1.0, "rewards/chosen": -2.7751643657684326, "rewards/margins": 3.4832916259765625, "rewards/rejected": -6.258455753326416, "step": 3020 }, { "epoch": 2.85, "grad_norm": 7.471439223849296, "learning_rate": 3.272092850221203e-09, "logps/chosen": -50.24284362792969, "logps/rejected": -96.28688049316406, "loss": 0.0476, "losses/dpo": 0.008101118728518486, "losses/sft": 1.42106294631958, "losses/total": 0.008101118728518486, "ref_logps/chosen": -28.257341384887695, "ref_logps/rejected": -34.79439926147461, "rewards/accuracies": 1.0, "rewards/chosen": -2.198550224304199, "rewards/margins": 3.950698137283325, "rewards/rejected": -6.149248123168945, "step": 3021 }, { "epoch": 2.85, "grad_norm": 16.536963679181383, "learning_rate": 3.2311524898606946e-09, "logps/chosen": -48.798370361328125, "logps/rejected": -85.6186752319336, "loss": 0.1658, "losses/dpo": 0.06907983124256134, "losses/sft": 1.0860321521759033, "losses/total": 0.06907983124256134, "ref_logps/chosen": -22.187768936157227, "ref_logps/rejected": -31.203514099121094, "rewards/accuracies": 1.0, "rewards/chosen": -2.661060094833374, "rewards/margins": 2.780456066131592, "rewards/rejected": -5.441515922546387, "step": 3022 }, { "epoch": 2.85, "grad_norm": 4.268852170040526, "learning_rate": 3.1904682010333863e-09, "logps/chosen": -56.29779815673828, "logps/rejected": -110.99871826171875, "loss": 0.0354, "losses/dpo": 0.03702012449502945, "losses/sft": 2.6262319087982178, "losses/total": 0.03702012449502945, "ref_logps/chosen": -26.513004302978516, "ref_logps/rejected": -37.64812469482422, "rewards/accuracies": 1.0, "rewards/chosen": -2.9784793853759766, "rewards/margins": 4.35658073425293, "rewards/rejected": -7.335060119628906, "step": 3023 }, { "epoch": 2.85, "grad_norm": 8.551300264753811, "learning_rate": 3.1500400259573954e-09, "logps/chosen": -65.79023742675781, "logps/rejected": -116.01136016845703, "loss": 0.0477, "losses/dpo": 0.010401234962046146, "losses/sft": 1.4689441919326782, "losses/total": 0.010401234962046146, "ref_logps/chosen": -36.483062744140625, "ref_logps/rejected": -38.198402404785156, "rewards/accuracies": 1.0, "rewards/chosen": -2.930717945098877, "rewards/margins": 4.850578308105469, "rewards/rejected": -7.781296253204346, "step": 3024 }, { "epoch": 2.85, "grad_norm": 10.940960808558904, "learning_rate": 3.1098680065848593e-09, "logps/chosen": -40.830909729003906, "logps/rejected": -71.30525207519531, "loss": 0.1638, "losses/dpo": 0.05089828744530678, "losses/sft": 2.6958231925964355, "losses/total": 0.05089828744530678, "ref_logps/chosen": -21.53544044494629, "ref_logps/rejected": -27.42683219909668, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9295473098754883, "rewards/margins": 2.45829439163208, "rewards/rejected": -4.387841701507568, "step": 3025 }, { "epoch": 2.85, "grad_norm": 12.599923479956574, "learning_rate": 3.0699521846022645e-09, "logps/chosen": -59.162113189697266, "logps/rejected": -82.27177429199219, "loss": 0.1509, "losses/dpo": 0.0030325872357934713, "losses/sft": 3.6161673069000244, "losses/total": 0.0030325872357934713, "ref_logps/chosen": -34.298675537109375, "ref_logps/rejected": -29.672155380249023, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4863438606262207, "rewards/margins": 2.773618221282959, "rewards/rejected": -5.25996208190918, "step": 3026 }, { "epoch": 2.86, "grad_norm": 10.301806299123841, "learning_rate": 3.0302926014301455e-09, "logps/chosen": -39.43010711669922, "logps/rejected": -84.05865478515625, "loss": 0.1778, "losses/dpo": 0.7328090667724609, "losses/sft": 0.6672323346138, "losses/total": 0.7328090667724609, "ref_logps/chosen": -20.60599136352539, "ref_logps/rejected": -35.00446701049805, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8824117183685303, "rewards/margins": 3.023007392883301, "rewards/rejected": -4.905418872833252, "step": 3027 }, { "epoch": 2.86, "grad_norm": 7.677456973052364, "learning_rate": 2.9908892982231925e-09, "logps/chosen": -48.949073791503906, "logps/rejected": -92.08863830566406, "loss": 0.1047, "losses/dpo": 0.02520478516817093, "losses/sft": 1.736657738685608, "losses/total": 0.02520478516817093, "ref_logps/chosen": -24.892702102661133, "ref_logps/rejected": -35.41941452026367, "rewards/accuracies": 0.9375, "rewards/chosen": -2.40563702583313, "rewards/margins": 3.2612853050231934, "rewards/rejected": -5.666922569274902, "step": 3028 }, { "epoch": 2.86, "grad_norm": 16.3916667218022, "learning_rate": 2.9517423158701428e-09, "logps/chosen": -48.41082000732422, "logps/rejected": -81.58915710449219, "loss": 0.1421, "losses/dpo": 0.15461939573287964, "losses/sft": 0.4562636613845825, "losses/total": 0.15461939573287964, "ref_logps/chosen": -26.37908172607422, "ref_logps/rejected": -28.99889373779297, "rewards/accuracies": 1.0, "rewards/chosen": -2.203174114227295, "rewards/margins": 3.055852174758911, "rewards/rejected": -5.259026050567627, "step": 3029 }, { "epoch": 2.86, "grad_norm": 13.44987624735905, "learning_rate": 2.9128516949936954e-09, "logps/chosen": -57.21836853027344, "logps/rejected": -87.43299865722656, "loss": 0.1655, "losses/dpo": 0.018283488228917122, "losses/sft": 2.5238099098205566, "losses/total": 0.018283488228917122, "ref_logps/chosen": -32.11560821533203, "ref_logps/rejected": -36.5997314453125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5102758407592773, "rewards/margins": 2.5730504989624023, "rewards/rejected": -5.08332633972168, "step": 3030 }, { "epoch": 2.86, "grad_norm": 18.257629594523365, "learning_rate": 2.8742174759506245e-09, "logps/chosen": -48.459228515625, "logps/rejected": -80.99302673339844, "loss": 0.2278, "losses/dpo": 0.022764643654227257, "losses/sft": 1.4359664916992188, "losses/total": 0.022764643654227257, "ref_logps/chosen": -27.06201171875, "ref_logps/rejected": -31.76248550415039, "rewards/accuracies": 0.875, "rewards/chosen": -2.139721393585205, "rewards/margins": 2.7833328247070312, "rewards/rejected": -4.923053741455078, "step": 3031 }, { "epoch": 2.86, "grad_norm": 19.20571539017457, "learning_rate": 2.835839698831527e-09, "logps/chosen": -40.78425979614258, "logps/rejected": -79.81642150878906, "loss": 0.2142, "losses/dpo": 0.010726843029260635, "losses/sft": 0.7691444158554077, "losses/total": 0.010726843029260635, "ref_logps/chosen": -23.067745208740234, "ref_logps/rejected": -31.1234130859375, "rewards/accuracies": 0.875, "rewards/chosen": -1.7716515064239502, "rewards/margins": 3.097649574279785, "rewards/rejected": -4.8693013191223145, "step": 3032 }, { "epoch": 2.86, "grad_norm": 25.881805439373505, "learning_rate": 2.797718403460991e-09, "logps/chosen": -59.12655258178711, "logps/rejected": -103.67294311523438, "loss": 0.1911, "losses/dpo": 0.007835803553462029, "losses/sft": 1.3782459497451782, "losses/total": 0.007835803553462029, "ref_logps/chosen": -31.009811401367188, "ref_logps/rejected": -41.93743133544922, "rewards/accuracies": 0.875, "rewards/chosen": -2.811674118041992, "rewards/margins": 3.361876964569092, "rewards/rejected": -6.173551559448242, "step": 3033 }, { "epoch": 2.86, "grad_norm": 10.179302119262235, "learning_rate": 2.759853629397374e-09, "logps/chosen": -44.825103759765625, "logps/rejected": -70.74267578125, "loss": 0.1047, "losses/dpo": 0.2082468718290329, "losses/sft": 1.8107064962387085, "losses/total": 0.2082468718290329, "ref_logps/chosen": -25.046310424804688, "ref_logps/rejected": -25.088489532470703, "rewards/accuracies": 1.0, "rewards/chosen": -1.977879524230957, "rewards/margins": 2.587538957595825, "rewards/rejected": -4.565418243408203, "step": 3034 }, { "epoch": 2.86, "grad_norm": 13.940136394214809, "learning_rate": 2.722245415932911e-09, "logps/chosen": -62.21529769897461, "logps/rejected": -107.7183837890625, "loss": 0.1395, "losses/dpo": 0.050812434405088425, "losses/sft": 2.353217124938965, "losses/total": 0.050812434405088425, "ref_logps/chosen": -34.468780517578125, "ref_logps/rejected": -46.20569610595703, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7746520042419434, "rewards/margins": 3.3766167163848877, "rewards/rejected": -6.151268482208252, "step": 3035 }, { "epoch": 2.86, "grad_norm": 15.158238346386401, "learning_rate": 2.6848938020934963e-09, "logps/chosen": -37.0377311706543, "logps/rejected": -76.75149536132812, "loss": 0.156, "losses/dpo": 0.005352106876671314, "losses/sft": 0.740353524684906, "losses/total": 0.005352106876671314, "ref_logps/chosen": -21.63066864013672, "ref_logps/rejected": -27.814865112304688, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5407061576843262, "rewards/margins": 3.352956771850586, "rewards/rejected": -4.893662929534912, "step": 3036 }, { "epoch": 2.87, "grad_norm": 7.366893820064069, "learning_rate": 2.647798826638903e-09, "logps/chosen": -49.44518280029297, "logps/rejected": -91.38569641113281, "loss": 0.0764, "losses/dpo": 0.18081925809383392, "losses/sft": 1.0340782403945923, "losses/total": 0.18081925809383392, "ref_logps/chosen": -28.58285903930664, "ref_logps/rejected": -33.833045959472656, "rewards/accuracies": 1.0, "rewards/chosen": -2.0862321853637695, "rewards/margins": 3.669031858444214, "rewards/rejected": -5.7552642822265625, "step": 3037 }, { "epoch": 2.87, "grad_norm": 8.326231856188986, "learning_rate": 2.610960528062478e-09, "logps/chosen": -43.49235534667969, "logps/rejected": -85.54916381835938, "loss": 0.098, "losses/dpo": 0.010531334206461906, "losses/sft": 1.2653228044509888, "losses/total": 0.010531334206461906, "ref_logps/chosen": -21.372297286987305, "ref_logps/rejected": -31.754377365112305, "rewards/accuracies": 1.0, "rewards/chosen": -2.212005615234375, "rewards/margins": 3.167473077774048, "rewards/rejected": -5.379478454589844, "step": 3038 }, { "epoch": 2.87, "grad_norm": 10.213787649789834, "learning_rate": 2.5743789445912823e-09, "logps/chosen": -51.192108154296875, "logps/rejected": -89.8642807006836, "loss": 0.1109, "losses/dpo": 0.1368105262517929, "losses/sft": 2.2732250690460205, "losses/total": 0.1368105262517929, "ref_logps/chosen": -28.321617126464844, "ref_logps/rejected": -33.839847564697266, "rewards/accuracies": 1.0, "rewards/chosen": -2.2870490550994873, "rewards/margins": 3.315394878387451, "rewards/rejected": -5.602443695068359, "step": 3039 }, { "epoch": 2.87, "grad_norm": 14.711799159804794, "learning_rate": 2.538054114185895e-09, "logps/chosen": -47.94084167480469, "logps/rejected": -94.07597351074219, "loss": 0.1385, "losses/dpo": 0.015771644189953804, "losses/sft": 2.245201587677002, "losses/total": 0.015771644189953804, "ref_logps/chosen": -19.917266845703125, "ref_logps/rejected": -33.42072296142578, "rewards/accuracies": 1.0, "rewards/chosen": -2.8023571968078613, "rewards/margins": 3.2631676197052, "rewards/rejected": -6.065525054931641, "step": 3040 }, { "epoch": 2.87, "grad_norm": 13.152211857884462, "learning_rate": 2.501986074540552e-09, "logps/chosen": -49.51601028442383, "logps/rejected": -88.3946533203125, "loss": 0.1318, "losses/dpo": 0.0013743848539888859, "losses/sft": 1.6969006061553955, "losses/total": 0.0013743848539888859, "ref_logps/chosen": -30.670459747314453, "ref_logps/rejected": -36.23668670654297, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8845551013946533, "rewards/margins": 3.3312416076660156, "rewards/rejected": -5.21579647064209, "step": 3041 }, { "epoch": 2.87, "grad_norm": 14.76872441093649, "learning_rate": 2.4661748630829817e-09, "logps/chosen": -54.33005905151367, "logps/rejected": -86.2127685546875, "loss": 0.1521, "losses/dpo": 0.01294008269906044, "losses/sft": 0.4962697923183441, "losses/total": 0.01294008269906044, "ref_logps/chosen": -34.19427490234375, "ref_logps/rejected": -29.877788543701172, "rewards/accuracies": 0.9375, "rewards/chosen": -2.013578414916992, "rewards/margins": 3.619920015335083, "rewards/rejected": -5.633498191833496, "step": 3042 }, { "epoch": 2.87, "grad_norm": 17.092536513541976, "learning_rate": 2.430620516974402e-09, "logps/chosen": -57.105098724365234, "logps/rejected": -97.03224182128906, "loss": 0.1331, "losses/dpo": 6.426440086215734e-05, "losses/sft": 1.0636271238327026, "losses/total": 6.426440086215734e-05, "ref_logps/chosen": -31.729827880859375, "ref_logps/rejected": -36.45747756958008, "rewards/accuracies": 0.9375, "rewards/chosen": -2.537527084350586, "rewards/margins": 3.5199496746063232, "rewards/rejected": -6.05747652053833, "step": 3043 }, { "epoch": 2.87, "grad_norm": 11.776334513492628, "learning_rate": 2.3953230731094954e-09, "logps/chosen": -54.555755615234375, "logps/rejected": -91.06858825683594, "loss": 0.1133, "losses/dpo": 0.005854746792465448, "losses/sft": 2.096259117126465, "losses/total": 0.005854746792465448, "ref_logps/chosen": -32.40919876098633, "ref_logps/rejected": -34.06385040283203, "rewards/accuracies": 1.0, "rewards/chosen": -2.2146553993225098, "rewards/margins": 3.48581862449646, "rewards/rejected": -5.700473785400391, "step": 3044 }, { "epoch": 2.87, "grad_norm": 12.553092008408042, "learning_rate": 2.360282568116323e-09, "logps/chosen": -51.85920715332031, "logps/rejected": -88.66854095458984, "loss": 0.1375, "losses/dpo": 0.35001835227012634, "losses/sft": 0.4562891125679016, "losses/total": 0.35001835227012634, "ref_logps/chosen": -28.213130950927734, "ref_logps/rejected": -36.758270263671875, "rewards/accuracies": 1.0, "rewards/chosen": -2.364607334136963, "rewards/margins": 2.8264193534851074, "rewards/rejected": -5.19102668762207, "step": 3045 }, { "epoch": 2.87, "grad_norm": 25.923140004589172, "learning_rate": 2.3254990383563823e-09, "logps/chosen": -52.41242980957031, "logps/rejected": -68.67882537841797, "loss": 0.3529, "losses/dpo": 0.25528889894485474, "losses/sft": 1.9282546043395996, "losses/total": 0.25528889894485474, "ref_logps/chosen": -32.066566467285156, "ref_logps/rejected": -27.486663818359375, "rewards/accuracies": 0.8125, "rewards/chosen": -2.03458571434021, "rewards/margins": 2.084630012512207, "rewards/rejected": -4.119215488433838, "step": 3046 }, { "epoch": 2.87, "grad_norm": 10.121012603339246, "learning_rate": 2.2909725199244388e-09, "logps/chosen": -50.854637145996094, "logps/rejected": -108.18753051757812, "loss": 0.0879, "losses/dpo": 3.9363715131912613e-07, "losses/sft": 0.5962079167366028, "losses/total": 3.9363715131912613e-07, "ref_logps/chosen": -25.559005737304688, "ref_logps/rejected": -44.96372985839844, "rewards/accuracies": 1.0, "rewards/chosen": -2.5295629501342773, "rewards/margins": 3.7928168773651123, "rewards/rejected": -6.3223795890808105, "step": 3047 }, { "epoch": 2.88, "grad_norm": 7.602341904912744, "learning_rate": 2.2567030486486115e-09, "logps/chosen": -54.90392303466797, "logps/rejected": -96.31109619140625, "loss": 0.0521, "losses/dpo": 0.07658592611551285, "losses/sft": 0.529507577419281, "losses/total": 0.07658592611551285, "ref_logps/chosen": -30.769428253173828, "ref_logps/rejected": -34.34912109375, "rewards/accuracies": 1.0, "rewards/chosen": -2.413449764251709, "rewards/margins": 3.782747745513916, "rewards/rejected": -6.196197509765625, "step": 3048 }, { "epoch": 2.88, "grad_norm": 15.948661422785916, "learning_rate": 2.2226906600902594e-09, "logps/chosen": -52.68550109863281, "logps/rejected": -77.92951965332031, "loss": 0.1208, "losses/dpo": 0.0009735487401485443, "losses/sft": 2.0137975215911865, "losses/total": 0.0009735487401485443, "ref_logps/chosen": -28.406017303466797, "ref_logps/rejected": -25.234210968017578, "rewards/accuracies": 1.0, "rewards/chosen": -2.427948474884033, "rewards/margins": 2.841582775115967, "rewards/rejected": -5.26953125, "step": 3049 }, { "epoch": 2.88, "grad_norm": 12.871181541861542, "learning_rate": 2.1889353895439555e-09, "logps/chosen": -54.69073486328125, "logps/rejected": -94.90828704833984, "loss": 0.0817, "losses/dpo": 0.0035718725994229317, "losses/sft": 1.7842650413513184, "losses/total": 0.0035718725994229317, "ref_logps/chosen": -30.484580993652344, "ref_logps/rejected": -34.92890930175781, "rewards/accuracies": 1.0, "rewards/chosen": -2.4206156730651855, "rewards/margins": 3.577322483062744, "rewards/rejected": -5.99793815612793, "step": 3050 }, { "epoch": 2.88, "grad_norm": 7.6938485948488, "learning_rate": 2.155437272037486e-09, "logps/chosen": -44.32317352294922, "logps/rejected": -99.69464111328125, "loss": 0.0584, "losses/dpo": 0.008318991400301456, "losses/sft": 1.057355523109436, "losses/total": 0.008318991400301456, "ref_logps/chosen": -22.097339630126953, "ref_logps/rejected": -40.56995391845703, "rewards/accuracies": 1.0, "rewards/chosen": -2.222583770751953, "rewards/margins": 3.689885139465332, "rewards/rejected": -5.912468910217285, "step": 3051 }, { "epoch": 2.88, "grad_norm": 9.762868109774065, "learning_rate": 2.122196342331767e-09, "logps/chosen": -58.74087905883789, "logps/rejected": -97.58432006835938, "loss": 0.0824, "losses/dpo": 9.883384336717427e-05, "losses/sft": 2.2530226707458496, "losses/total": 9.883384336717427e-05, "ref_logps/chosen": -35.84435272216797, "ref_logps/rejected": -40.247230529785156, "rewards/accuracies": 1.0, "rewards/chosen": -2.289652109146118, "rewards/margins": 3.444056987762451, "rewards/rejected": -5.733709335327148, "step": 3052 }, { "epoch": 2.88, "grad_norm": 10.873933814436253, "learning_rate": 2.0892126349208726e-09, "logps/chosen": -50.924781799316406, "logps/rejected": -91.60363006591797, "loss": 0.0877, "losses/dpo": 0.02071487158536911, "losses/sft": 1.4094833135604858, "losses/total": 0.02071487158536911, "ref_logps/chosen": -32.00244140625, "ref_logps/rejected": -36.970794677734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.8922338485717773, "rewards/margins": 3.571049928665161, "rewards/rejected": -5.463283538818359, "step": 3053 }, { "epoch": 2.88, "grad_norm": 15.283746350896289, "learning_rate": 2.056486184031897e-09, "logps/chosen": -46.58061218261719, "logps/rejected": -79.53279113769531, "loss": 0.151, "losses/dpo": 0.21165287494659424, "losses/sft": 1.3004807233810425, "losses/total": 0.21165287494659424, "ref_logps/chosen": -22.069215774536133, "ref_logps/rejected": -28.135055541992188, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4511399269104004, "rewards/margins": 2.688633918762207, "rewards/rejected": -5.139773368835449, "step": 3054 }, { "epoch": 2.88, "grad_norm": 15.851437965954878, "learning_rate": 2.0240170236250352e-09, "logps/chosen": -58.15641403198242, "logps/rejected": -82.94425201416016, "loss": 0.1618, "losses/dpo": 0.001529618981294334, "losses/sft": 2.0901801586151123, "losses/total": 0.001529618981294334, "ref_logps/chosen": -33.204559326171875, "ref_logps/rejected": -30.02216148376465, "rewards/accuracies": 1.0, "rewards/chosen": -2.4951860904693604, "rewards/margins": 2.797023057937622, "rewards/rejected": -5.292208671569824, "step": 3055 }, { "epoch": 2.88, "grad_norm": 13.58878942666134, "learning_rate": 1.991805187393447e-09, "logps/chosen": -47.705162048339844, "logps/rejected": -83.42250061035156, "loss": 0.1384, "losses/dpo": 0.6886730194091797, "losses/sft": 1.4493515491485596, "losses/total": 0.6886730194091797, "ref_logps/chosen": -27.307811737060547, "ref_logps/rejected": -31.846038818359375, "rewards/accuracies": 1.0, "rewards/chosen": -2.0397348403930664, "rewards/margins": 3.1179113388061523, "rewards/rejected": -5.157646179199219, "step": 3056 }, { "epoch": 2.88, "grad_norm": 10.940004388306242, "learning_rate": 1.959850708763311e-09, "logps/chosen": -60.836021423339844, "logps/rejected": -93.96884155273438, "loss": 0.0952, "losses/dpo": 0.1509896218776703, "losses/sft": 1.4367409944534302, "losses/total": 0.1509896218776703, "ref_logps/chosen": -37.44738006591797, "ref_logps/rejected": -38.22657775878906, "rewards/accuracies": 1.0, "rewards/chosen": -2.338864326477051, "rewards/margins": 3.2353618144989014, "rewards/rejected": -5.574226379394531, "step": 3057 }, { "epoch": 2.88, "grad_norm": 15.903483568923427, "learning_rate": 1.928153620893741e-09, "logps/chosen": -53.06943893432617, "logps/rejected": -92.86547088623047, "loss": 0.2213, "losses/dpo": 2.6203091144561768, "losses/sft": 2.3552136421203613, "losses/total": 2.6203091144561768, "ref_logps/chosen": -26.097759246826172, "ref_logps/rejected": -33.661590576171875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6971676349639893, "rewards/margins": 3.2232203483581543, "rewards/rejected": -5.920388221740723, "step": 3058 }, { "epoch": 2.89, "grad_norm": 19.226195446899386, "learning_rate": 1.8967139566766766e-09, "logps/chosen": -40.60166931152344, "logps/rejected": -86.35872650146484, "loss": 0.1985, "losses/dpo": 0.05345580354332924, "losses/sft": 0.4119003415107727, "losses/total": 0.05345580354332924, "ref_logps/chosen": -19.434799194335938, "ref_logps/rejected": -33.43555450439453, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1166868209838867, "rewards/margins": 3.175630569458008, "rewards/rejected": -5.2923173904418945, "step": 3059 }, { "epoch": 2.89, "grad_norm": 15.232550028265804, "learning_rate": 1.865531748737076e-09, "logps/chosen": -53.32837677001953, "logps/rejected": -94.78460693359375, "loss": 0.1632, "losses/dpo": 0.5143664479255676, "losses/sft": 0.8330796360969543, "losses/total": 0.5143664479255676, "ref_logps/chosen": -27.591102600097656, "ref_logps/rejected": -40.234153747558594, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5737271308898926, "rewards/margins": 2.8813180923461914, "rewards/rejected": -5.455044746398926, "step": 3060 }, { "epoch": 2.89, "grad_norm": 11.59765004778897, "learning_rate": 1.8346070294325832e-09, "logps/chosen": -53.95292663574219, "logps/rejected": -86.05854797363281, "loss": 0.1151, "losses/dpo": 0.08142589777708054, "losses/sft": 2.2176434993743896, "losses/total": 0.08142589777708054, "ref_logps/chosen": -31.593725204467773, "ref_logps/rejected": -32.289363861083984, "rewards/accuracies": 0.9375, "rewards/chosen": -2.235919952392578, "rewards/margins": 3.140998363494873, "rewards/rejected": -5.376918315887451, "step": 3061 }, { "epoch": 2.89, "grad_norm": 9.5126426320151, "learning_rate": 1.8039398308537513e-09, "logps/chosen": -63.001564025878906, "logps/rejected": -108.23310852050781, "loss": 0.0787, "losses/dpo": 0.0035768591333180666, "losses/sft": 1.2599103450775146, "losses/total": 0.0035768591333180666, "ref_logps/chosen": -34.87248611450195, "ref_logps/rejected": -45.67564392089844, "rewards/accuracies": 1.0, "rewards/chosen": -2.8129074573516846, "rewards/margins": 3.4428391456604004, "rewards/rejected": -6.255746364593506, "step": 3062 }, { "epoch": 2.89, "grad_norm": 9.105578099213952, "learning_rate": 1.773530184823846e-09, "logps/chosen": -57.081947326660156, "logps/rejected": -101.158203125, "loss": 0.0888, "losses/dpo": 0.0002875407226383686, "losses/sft": 0.9084807634353638, "losses/total": 0.0002875407226383686, "ref_logps/chosen": -33.40345001220703, "ref_logps/rejected": -40.90632629394531, "rewards/accuracies": 1.0, "rewards/chosen": -2.367849826812744, "rewards/margins": 3.6573381423950195, "rewards/rejected": -6.025187969207764, "step": 3063 }, { "epoch": 2.89, "grad_norm": 8.508819616811708, "learning_rate": 1.743378122898931e-09, "logps/chosen": -55.757144927978516, "logps/rejected": -99.83523559570312, "loss": 0.0562, "losses/dpo": 0.002782501745969057, "losses/sft": 2.387275218963623, "losses/total": 0.002782501745969057, "ref_logps/chosen": -28.188983917236328, "ref_logps/rejected": -36.04817581176758, "rewards/accuracies": 1.0, "rewards/chosen": -2.7568163871765137, "rewards/margins": 3.62188982963562, "rewards/rejected": -6.378705978393555, "step": 3064 }, { "epoch": 2.89, "grad_norm": 6.9354724682681725, "learning_rate": 1.7134836763677273e-09, "logps/chosen": -40.49652862548828, "logps/rejected": -93.9666976928711, "loss": 0.0598, "losses/dpo": 0.0022440196480602026, "losses/sft": 0.7687010765075684, "losses/total": 0.0022440196480602026, "ref_logps/chosen": -23.261646270751953, "ref_logps/rejected": -34.882930755615234, "rewards/accuracies": 1.0, "rewards/chosen": -1.7234885692596436, "rewards/margins": 4.184887886047363, "rewards/rejected": -5.908376693725586, "step": 3065 }, { "epoch": 2.89, "grad_norm": 11.562501404263092, "learning_rate": 1.6838468762516156e-09, "logps/chosen": -56.86506652832031, "logps/rejected": -92.08357238769531, "loss": 0.1202, "losses/dpo": 0.0006711797323077917, "losses/sft": 0.4717066287994385, "losses/total": 0.0006711797323077917, "ref_logps/chosen": -29.940738677978516, "ref_logps/rejected": -33.923118591308594, "rewards/accuracies": 1.0, "rewards/chosen": -2.6924326419830322, "rewards/margins": 3.123612642288208, "rewards/rejected": -5.81604528427124, "step": 3066 }, { "epoch": 2.89, "grad_norm": 16.932458873290578, "learning_rate": 1.6544677533046892e-09, "logps/chosen": -50.35433578491211, "logps/rejected": -103.34260559082031, "loss": 0.0961, "losses/dpo": 1.4873542568238918e-05, "losses/sft": 1.013974905014038, "losses/total": 1.4873542568238918e-05, "ref_logps/chosen": -30.423927307128906, "ref_logps/rejected": -41.726783752441406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.993040919303894, "rewards/margins": 4.168540954589844, "rewards/rejected": -6.161581993103027, "step": 3067 }, { "epoch": 2.89, "grad_norm": 9.657353074085169, "learning_rate": 1.6253463380135613e-09, "logps/chosen": -41.41581344604492, "logps/rejected": -89.57865905761719, "loss": 0.096, "losses/dpo": 0.008785446174442768, "losses/sft": 0.8761202096939087, "losses/total": 0.008785446174442768, "ref_logps/chosen": -22.866804122924805, "ref_logps/rejected": -38.86484909057617, "rewards/accuracies": 1.0, "rewards/chosen": -1.8549009561538696, "rewards/margins": 3.216480016708374, "rewards/rejected": -5.071381092071533, "step": 3068 }, { "epoch": 2.9, "grad_norm": 12.940650855509986, "learning_rate": 1.5964826605975035e-09, "logps/chosen": -46.363365173339844, "logps/rejected": -77.8629150390625, "loss": 0.1276, "losses/dpo": 0.18864621222019196, "losses/sft": 0.7934114336967468, "losses/total": 0.18864621222019196, "ref_logps/chosen": -25.43160629272461, "ref_logps/rejected": -28.790767669677734, "rewards/accuracies": 1.0, "rewards/chosen": -2.0931758880615234, "rewards/margins": 2.8140382766723633, "rewards/rejected": -4.907214164733887, "step": 3069 }, { "epoch": 2.9, "grad_norm": 13.329201148250837, "learning_rate": 1.5678767510082512e-09, "logps/chosen": -55.03471755981445, "logps/rejected": -100.99493408203125, "loss": 0.1133, "losses/dpo": 0.011857420206069946, "losses/sft": 2.0141379833221436, "losses/total": 0.011857420206069946, "ref_logps/chosen": -29.51630401611328, "ref_logps/rejected": -37.610015869140625, "rewards/accuracies": 1.0, "rewards/chosen": -2.5518412590026855, "rewards/margins": 3.786651134490967, "rewards/rejected": -6.338492393493652, "step": 3070 }, { "epoch": 2.9, "grad_norm": 9.041729767244354, "learning_rate": 1.539528638930143e-09, "logps/chosen": -54.756019592285156, "logps/rejected": -97.21527099609375, "loss": 0.0759, "losses/dpo": 0.0004297502455301583, "losses/sft": 1.779070258140564, "losses/total": 0.0004297502455301583, "ref_logps/chosen": -31.549518585205078, "ref_logps/rejected": -36.15790939331055, "rewards/accuracies": 1.0, "rewards/chosen": -2.320650100708008, "rewards/margins": 3.7850866317749023, "rewards/rejected": -6.10573673248291, "step": 3071 }, { "epoch": 2.9, "grad_norm": 15.920279073954436, "learning_rate": 1.511438353779898e-09, "logps/chosen": -52.70277404785156, "logps/rejected": -86.32106018066406, "loss": 0.1269, "losses/dpo": 0.1367662250995636, "losses/sft": 1.0793484449386597, "losses/total": 0.1367662250995636, "ref_logps/chosen": -29.668445587158203, "ref_logps/rejected": -31.402767181396484, "rewards/accuracies": 1.0, "rewards/chosen": -2.3034329414367676, "rewards/margins": 3.188396453857422, "rewards/rejected": -5.491828918457031, "step": 3072 }, { "epoch": 2.9, "grad_norm": 14.741691927774719, "learning_rate": 1.4836059247068378e-09, "logps/chosen": -52.901920318603516, "logps/rejected": -88.28768920898438, "loss": 0.1676, "losses/dpo": 0.6463176608085632, "losses/sft": 1.754916787147522, "losses/total": 0.6463176608085632, "ref_logps/chosen": -26.958251953125, "ref_logps/rejected": -33.3049430847168, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5943667888641357, "rewards/margins": 2.9039082527160645, "rewards/rejected": -5.498274803161621, "step": 3073 }, { "epoch": 2.9, "grad_norm": 20.129887509470382, "learning_rate": 1.4560313805925262e-09, "logps/chosen": -48.770477294921875, "logps/rejected": -78.1597900390625, "loss": 0.2432, "losses/dpo": 0.07087081670761108, "losses/sft": 1.2835760116577148, "losses/total": 0.07087081670761108, "ref_logps/chosen": -25.31640625, "ref_logps/rejected": -24.996557235717773, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3454065322875977, "rewards/margins": 2.970917224884033, "rewards/rejected": -5.316324234008789, "step": 3074 }, { "epoch": 2.9, "grad_norm": 16.393530058474166, "learning_rate": 1.4287147500510743e-09, "logps/chosen": -48.5106201171875, "logps/rejected": -83.18060302734375, "loss": 0.1881, "losses/dpo": 0.04708848148584366, "losses/sft": 1.715989112854004, "losses/total": 0.04708848148584366, "ref_logps/chosen": -27.617841720581055, "ref_logps/rejected": -30.044374465942383, "rewards/accuracies": 0.875, "rewards/chosen": -2.0892772674560547, "rewards/margins": 3.224346160888672, "rewards/rejected": -5.313623428344727, "step": 3075 }, { "epoch": 2.9, "grad_norm": 12.196214083280857, "learning_rate": 1.401656061428863e-09, "logps/chosen": -61.5443000793457, "logps/rejected": -79.95785522460938, "loss": 0.0949, "losses/dpo": 0.04446649178862572, "losses/sft": 1.5054166316986084, "losses/total": 0.04446649178862572, "ref_logps/chosen": -36.579261779785156, "ref_logps/rejected": -26.90842628479004, "rewards/accuracies": 1.0, "rewards/chosen": -2.496504306793213, "rewards/margins": 2.808438777923584, "rewards/rejected": -5.304943084716797, "step": 3076 }, { "epoch": 2.9, "grad_norm": 7.709550332309665, "learning_rate": 1.374855342804654e-09, "logps/chosen": -58.8314323425293, "logps/rejected": -113.72611999511719, "loss": 0.0693, "losses/dpo": 0.05178442969918251, "losses/sft": 1.7908174991607666, "losses/total": 0.05178442969918251, "ref_logps/chosen": -29.11687660217285, "ref_logps/rejected": -45.01181411743164, "rewards/accuracies": 1.0, "rewards/chosen": -2.9714558124542236, "rewards/margins": 3.899974822998047, "rewards/rejected": -6.87143087387085, "step": 3077 }, { "epoch": 2.9, "grad_norm": 12.616241218365934, "learning_rate": 1.3483126219895058e-09, "logps/chosen": -47.130516052246094, "logps/rejected": -74.64703369140625, "loss": 0.1351, "losses/dpo": 0.18794751167297363, "losses/sft": 1.5949382781982422, "losses/total": 0.18794751167297363, "ref_logps/chosen": -26.201984405517578, "ref_logps/rejected": -26.843399047851562, "rewards/accuracies": 0.9375, "rewards/chosen": -2.09285306930542, "rewards/margins": 2.6875100135803223, "rewards/rejected": -4.780363082885742, "step": 3078 }, { "epoch": 2.9, "grad_norm": 7.782254380836153, "learning_rate": 1.3220279265267198e-09, "logps/chosen": -63.329288482666016, "logps/rejected": -102.37330627441406, "loss": 0.0673, "losses/dpo": 0.013243462890386581, "losses/sft": 1.1585981845855713, "losses/total": 0.013243462890386581, "ref_logps/chosen": -41.82634735107422, "ref_logps/rejected": -38.780460357666016, "rewards/accuracies": 1.0, "rewards/chosen": -2.150294065475464, "rewards/margins": 4.208990573883057, "rewards/rejected": -6.359284400939941, "step": 3079 }, { "epoch": 2.91, "grad_norm": 8.44734565544999, "learning_rate": 1.2960012836918944e-09, "logps/chosen": -47.55170440673828, "logps/rejected": -92.84144592285156, "loss": 0.0744, "losses/dpo": 0.045876167714595795, "losses/sft": 1.4188343286514282, "losses/total": 0.045876167714595795, "ref_logps/chosen": -26.061492919921875, "ref_logps/rejected": -36.37382888793945, "rewards/accuracies": 1.0, "rewards/chosen": -2.1490211486816406, "rewards/margins": 3.4977400302886963, "rewards/rejected": -5.646761417388916, "step": 3080 }, { "epoch": 2.91, "grad_norm": 9.083720323844696, "learning_rate": 1.2702327204928143e-09, "logps/chosen": -43.42580032348633, "logps/rejected": -93.48927307128906, "loss": 0.0782, "losses/dpo": 0.0580456517636776, "losses/sft": 3.4090089797973633, "losses/total": 0.0580456517636776, "ref_logps/chosen": -19.095516204833984, "ref_logps/rejected": -31.88750457763672, "rewards/accuracies": 1.0, "rewards/chosen": -2.433028221130371, "rewards/margins": 3.7271482944488525, "rewards/rejected": -6.160176753997803, "step": 3081 }, { "epoch": 2.91, "grad_norm": 23.890304032602966, "learning_rate": 1.244722263669451e-09, "logps/chosen": -61.36216735839844, "logps/rejected": -81.4760513305664, "loss": 0.2467, "losses/dpo": 0.08262497186660767, "losses/sft": 2.617098331451416, "losses/total": 0.08262497186660767, "ref_logps/chosen": -32.05269241333008, "ref_logps/rejected": -31.823116302490234, "rewards/accuracies": 0.875, "rewards/chosen": -2.9309473037719727, "rewards/margins": 2.034346580505371, "rewards/rejected": -4.965293884277344, "step": 3082 }, { "epoch": 2.91, "grad_norm": 11.445668681122442, "learning_rate": 1.2194699396939623e-09, "logps/chosen": -52.858516693115234, "logps/rejected": -87.3686294555664, "loss": 0.0924, "losses/dpo": 0.060975395143032074, "losses/sft": 0.9088894128799438, "losses/total": 0.060975395143032074, "ref_logps/chosen": -29.953495025634766, "ref_logps/rejected": -31.576669692993164, "rewards/accuracies": 1.0, "rewards/chosen": -2.2905020713806152, "rewards/margins": 3.288694143295288, "rewards/rejected": -5.579196453094482, "step": 3083 }, { "epoch": 2.91, "grad_norm": 11.54223280603011, "learning_rate": 1.1944757747706647e-09, "logps/chosen": -42.15313720703125, "logps/rejected": -73.68999481201172, "loss": 0.1716, "losses/dpo": 0.0033069064375013113, "losses/sft": 0.6454547643661499, "losses/total": 0.0033069064375013113, "ref_logps/chosen": -21.02020263671875, "ref_logps/rejected": -26.649803161621094, "rewards/accuracies": 1.0, "rewards/chosen": -2.1132936477661133, "rewards/margins": 2.5907254219055176, "rewards/rejected": -4.704019069671631, "step": 3084 }, { "epoch": 2.91, "grad_norm": 11.900356984241204, "learning_rate": 1.1697397948359223e-09, "logps/chosen": -50.51007080078125, "logps/rejected": -91.52340698242188, "loss": 0.1665, "losses/dpo": 1.0175074338912964, "losses/sft": 1.1600542068481445, "losses/total": 1.0175074338912964, "ref_logps/chosen": -26.15643310546875, "ref_logps/rejected": -36.31172180175781, "rewards/accuracies": 0.9375, "rewards/chosen": -2.43536376953125, "rewards/margins": 3.0858054161071777, "rewards/rejected": -5.5211687088012695, "step": 3085 }, { "epoch": 2.91, "grad_norm": 9.283807622358793, "learning_rate": 1.1452620255582024e-09, "logps/chosen": -52.027809143066406, "logps/rejected": -99.48974609375, "loss": 0.0877, "losses/dpo": 0.007790949661284685, "losses/sft": 1.6523523330688477, "losses/total": 0.007790949661284685, "ref_logps/chosen": -26.5601806640625, "ref_logps/rejected": -40.72835922241211, "rewards/accuracies": 1.0, "rewards/chosen": -2.5467634201049805, "rewards/margins": 3.329375982284546, "rewards/rejected": -5.8761396408081055, "step": 3086 }, { "epoch": 2.91, "grad_norm": 15.797306345716153, "learning_rate": 1.1210424923380756e-09, "logps/chosen": -50.489524841308594, "logps/rejected": -90.57789611816406, "loss": 0.1226, "losses/dpo": 0.38755232095718384, "losses/sft": 1.2859848737716675, "losses/total": 0.38755232095718384, "ref_logps/chosen": -29.275211334228516, "ref_logps/rejected": -38.269508361816406, "rewards/accuracies": 1.0, "rewards/chosen": -2.121431350708008, "rewards/margins": 3.109407424926758, "rewards/rejected": -5.230838775634766, "step": 3087 }, { "epoch": 2.91, "grad_norm": 21.159903061852177, "learning_rate": 1.0970812203080493e-09, "logps/chosen": -59.67591857910156, "logps/rejected": -99.23196411132812, "loss": 0.169, "losses/dpo": 0.1569477915763855, "losses/sft": 1.528821587562561, "losses/total": 0.1569477915763855, "ref_logps/chosen": -33.972869873046875, "ref_logps/rejected": -42.11613845825195, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5703046321868896, "rewards/margins": 3.1412785053253174, "rewards/rejected": -5.711583137512207, "step": 3088 }, { "epoch": 2.91, "grad_norm": 9.514378798622069, "learning_rate": 1.0733782343327336e-09, "logps/chosen": -44.01835250854492, "logps/rejected": -85.7056884765625, "loss": 0.1019, "losses/dpo": 0.18788690865039825, "losses/sft": 1.5851428508758545, "losses/total": 0.18788690865039825, "ref_logps/chosen": -20.25539779663086, "ref_logps/rejected": -30.988370895385742, "rewards/accuracies": 1.0, "rewards/chosen": -2.376295566558838, "rewards/margins": 3.0954365730285645, "rewards/rejected": -5.471732139587402, "step": 3089 }, { "epoch": 2.92, "grad_norm": 11.508303399346993, "learning_rate": 1.0499335590086479e-09, "logps/chosen": -43.23910140991211, "logps/rejected": -78.1702880859375, "loss": 0.1367, "losses/dpo": 0.0002855450729839504, "losses/sft": 1.4316691160202026, "losses/total": 0.0002855450729839504, "ref_logps/chosen": -19.83020782470703, "ref_logps/rejected": -26.461889266967773, "rewards/accuracies": 1.0, "rewards/chosen": -2.3408894538879395, "rewards/margins": 2.8299498558044434, "rewards/rejected": -5.170839309692383, "step": 3090 }, { "epoch": 2.92, "grad_norm": 7.720155886044047, "learning_rate": 1.0267472186642756e-09, "logps/chosen": -41.42892074584961, "logps/rejected": -85.20281982421875, "loss": 0.0899, "losses/dpo": 0.001205127569846809, "losses/sft": 1.6721642017364502, "losses/total": 0.001205127569846809, "ref_logps/chosen": -24.691862106323242, "ref_logps/rejected": -33.509517669677734, "rewards/accuracies": 1.0, "rewards/chosen": -1.673705816268921, "rewards/margins": 3.4956247806549072, "rewards/rejected": -5.169330596923828, "step": 3091 }, { "epoch": 2.92, "grad_norm": 9.102042407246259, "learning_rate": 1.0038192373600652e-09, "logps/chosen": -52.600502014160156, "logps/rejected": -94.72779083251953, "loss": 0.0901, "losses/dpo": 0.03529021143913269, "losses/sft": 1.7008278369903564, "losses/total": 0.03529021143913269, "ref_logps/chosen": -30.991004943847656, "ref_logps/rejected": -35.472511291503906, "rewards/accuracies": 1.0, "rewards/chosen": -2.16094970703125, "rewards/margins": 3.764577627182007, "rewards/rejected": -5.925527572631836, "step": 3092 }, { "epoch": 2.92, "grad_norm": 11.256151762519467, "learning_rate": 9.811496388883178e-10, "logps/chosen": -41.28453826904297, "logps/rejected": -95.53227233886719, "loss": 0.1422, "losses/dpo": 0.6931471824645996, "losses/sft": 1.1441277265548706, "losses/total": 0.6931471824645996, "ref_logps/chosen": -17.957735061645508, "ref_logps/rejected": -35.38795852661133, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3326807022094727, "rewards/margins": 3.6817502975463867, "rewards/rejected": -6.014430999755859, "step": 3093 }, { "epoch": 2.92, "grad_norm": 7.93900082620356, "learning_rate": 9.587384467732163e-10, "logps/chosen": -51.64521026611328, "logps/rejected": -96.7646484375, "loss": 0.1024, "losses/dpo": 0.015651367604732513, "losses/sft": 2.036304235458374, "losses/total": 0.015651367604732513, "ref_logps/chosen": -27.858489990234375, "ref_logps/rejected": -37.4537467956543, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3786721229553223, "rewards/margins": 3.552417516708374, "rewards/rejected": -5.931089401245117, "step": 3094 }, { "epoch": 2.92, "grad_norm": 11.549232700714839, "learning_rate": 9.365856842708242e-10, "logps/chosen": -50.21731948852539, "logps/rejected": -86.1357192993164, "loss": 0.1476, "losses/dpo": 0.0008805005345493555, "losses/sft": 1.7524375915527344, "losses/total": 0.0008805005345493555, "ref_logps/chosen": -26.623701095581055, "ref_logps/rejected": -30.392250061035156, "rewards/accuracies": 1.0, "rewards/chosen": -2.3593621253967285, "rewards/margins": 3.214984893798828, "rewards/rejected": -5.574347019195557, "step": 3095 }, { "epoch": 2.92, "grad_norm": 14.281145633201971, "learning_rate": 9.14691374369031e-10, "logps/chosen": -52.61360168457031, "logps/rejected": -76.14762878417969, "loss": 0.2173, "losses/dpo": 0.6931472420692444, "losses/sft": 0.5219011902809143, "losses/total": 0.6931472420692444, "ref_logps/chosen": -32.151947021484375, "ref_logps/rejected": -28.29889488220215, "rewards/accuracies": 0.875, "rewards/chosen": -2.0461654663085938, "rewards/margins": 2.738708019256592, "rewards/rejected": -4.7848734855651855, "step": 3096 }, { "epoch": 2.92, "grad_norm": 6.196720172313073, "learning_rate": 8.930555397874962e-10, "logps/chosen": -63.76982879638672, "logps/rejected": -113.31640625, "loss": 0.0559, "losses/dpo": 0.0026103383861482143, "losses/sft": 2.146095037460327, "losses/total": 0.0026103383861482143, "ref_logps/chosen": -35.851436614990234, "ref_logps/rejected": -45.96903991699219, "rewards/accuracies": 1.0, "rewards/chosen": -2.7918388843536377, "rewards/margins": 3.9428980350494385, "rewards/rejected": -6.734736442565918, "step": 3097 }, { "epoch": 2.92, "grad_norm": 10.038710776268815, "learning_rate": 8.716782029777048e-10, "logps/chosen": -42.45858383178711, "logps/rejected": -95.52516174316406, "loss": 0.0981, "losses/dpo": 0.12221632152795792, "losses/sft": 0.06692328304052353, "losses/total": 0.12221632152795792, "ref_logps/chosen": -24.90179443359375, "ref_logps/rejected": -39.25814437866211, "rewards/accuracies": 1.0, "rewards/chosen": -1.7556787729263306, "rewards/margins": 3.871023178100586, "rewards/rejected": -5.626702308654785, "step": 3098 }, { "epoch": 2.92, "grad_norm": 13.225125219484136, "learning_rate": 8.505593861228289e-10, "logps/chosen": -49.34916687011719, "logps/rejected": -88.17595672607422, "loss": 0.1309, "losses/dpo": 0.30918657779693604, "losses/sft": 2.397432565689087, "losses/total": 0.30918657779693604, "ref_logps/chosen": -26.792137145996094, "ref_logps/rejected": -33.875022888183594, "rewards/accuracies": 1.0, "rewards/chosen": -2.2557029724121094, "rewards/margins": 3.1743905544281006, "rewards/rejected": -5.430093765258789, "step": 3099 }, { "epoch": 2.92, "grad_norm": 7.52814091589389, "learning_rate": 8.296991111378659e-10, "logps/chosen": -46.19431686401367, "logps/rejected": -96.86152648925781, "loss": 0.0573, "losses/dpo": 0.010937673039734364, "losses/sft": 0.5375294089317322, "losses/total": 0.010937673039734364, "ref_logps/chosen": -26.71916961669922, "ref_logps/rejected": -39.19416046142578, "rewards/accuracies": 1.0, "rewards/chosen": -1.9475150108337402, "rewards/margins": 3.8192222118377686, "rewards/rejected": -5.76673698425293, "step": 3100 }, { "epoch": 2.93, "grad_norm": 9.515003819768014, "learning_rate": 8.090973996694728e-10, "logps/chosen": -44.442630767822266, "logps/rejected": -84.91495513916016, "loss": 0.0947, "losses/dpo": 0.046680301427841187, "losses/sft": 1.9518659114837646, "losses/total": 0.046680301427841187, "ref_logps/chosen": -23.587017059326172, "ref_logps/rejected": -32.37657165527344, "rewards/accuracies": 1.0, "rewards/chosen": -2.0855612754821777, "rewards/margins": 3.1682770252227783, "rewards/rejected": -5.253838539123535, "step": 3101 }, { "epoch": 2.93, "grad_norm": 24.348887006631465, "learning_rate": 7.887542730959929e-10, "logps/chosen": -53.6943244934082, "logps/rejected": -91.88190460205078, "loss": 0.219, "losses/dpo": 0.9874597787857056, "losses/sft": 2.9442760944366455, "losses/total": 0.9874597787857056, "ref_logps/chosen": -29.10352897644043, "ref_logps/rejected": -33.72401428222656, "rewards/accuracies": 0.875, "rewards/chosen": -2.4590795040130615, "rewards/margins": 3.3567094802856445, "rewards/rejected": -5.815789222717285, "step": 3102 }, { "epoch": 2.93, "grad_norm": 12.409434880862557, "learning_rate": 7.686697525274843e-10, "logps/chosen": -44.759559631347656, "logps/rejected": -75.91706085205078, "loss": 0.1272, "losses/dpo": 0.06347016245126724, "losses/sft": 1.6113433837890625, "losses/total": 0.06347016245126724, "ref_logps/chosen": -23.79364585876465, "ref_logps/rejected": -25.970252990722656, "rewards/accuracies": 1.0, "rewards/chosen": -2.0965914726257324, "rewards/margins": 2.89808988571167, "rewards/rejected": -4.994681358337402, "step": 3103 }, { "epoch": 2.93, "grad_norm": 14.02880983763606, "learning_rate": 7.488438588055535e-10, "logps/chosen": -40.926300048828125, "logps/rejected": -84.37028503417969, "loss": 0.1766, "losses/dpo": 0.002048453548923135, "losses/sft": 1.5997190475463867, "losses/total": 0.002048453548923135, "ref_logps/chosen": -20.918563842773438, "ref_logps/rejected": -31.73322868347168, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0007736682891846, "rewards/margins": 3.262932300567627, "rewards/rejected": -5.263706207275391, "step": 3104 }, { "epoch": 2.93, "grad_norm": 9.552898595438345, "learning_rate": 7.29276612503521e-10, "logps/chosen": -36.426483154296875, "logps/rejected": -83.49224853515625, "loss": 0.0862, "losses/dpo": 0.06382700800895691, "losses/sft": 1.6532275676727295, "losses/total": 0.06382700800895691, "ref_logps/chosen": -17.715145111083984, "ref_logps/rejected": -28.985126495361328, "rewards/accuracies": 1.0, "rewards/chosen": -1.8711339235305786, "rewards/margins": 3.579577922821045, "rewards/rejected": -5.450712203979492, "step": 3105 }, { "epoch": 2.93, "grad_norm": 10.432243468250078, "learning_rate": 7.099680339262837e-10, "logps/chosen": -46.74140930175781, "logps/rejected": -80.75563049316406, "loss": 0.1052, "losses/dpo": 0.0776323452591896, "losses/sft": 1.6501401662826538, "losses/total": 0.0776323452591896, "ref_logps/chosen": -24.364837646484375, "ref_logps/rejected": -28.580408096313477, "rewards/accuracies": 1.0, "rewards/chosen": -2.237657070159912, "rewards/margins": 2.979865550994873, "rewards/rejected": -5.217522621154785, "step": 3106 }, { "epoch": 2.93, "grad_norm": 12.878035557320835, "learning_rate": 6.909181431103139e-10, "logps/chosen": -46.45759201049805, "logps/rejected": -81.6104736328125, "loss": 0.1501, "losses/dpo": 0.02801256813108921, "losses/sft": 2.381244659423828, "losses/total": 0.02801256813108921, "ref_logps/chosen": -23.477346420288086, "ref_logps/rejected": -30.61754035949707, "rewards/accuracies": 1.0, "rewards/chosen": -2.2980244159698486, "rewards/margins": 2.801269769668579, "rewards/rejected": -5.099294185638428, "step": 3107 }, { "epoch": 2.93, "grad_norm": 4.823946862630092, "learning_rate": 6.721269598236323e-10, "logps/chosen": -45.77821350097656, "logps/rejected": -98.46870422363281, "loss": 0.0426, "losses/dpo": 0.03533608838915825, "losses/sft": 1.5695457458496094, "losses/total": 0.03533608838915825, "ref_logps/chosen": -23.16857147216797, "ref_logps/rejected": -35.56703186035156, "rewards/accuracies": 1.0, "rewards/chosen": -2.2609646320343018, "rewards/margins": 4.029202938079834, "rewards/rejected": -6.290167808532715, "step": 3108 }, { "epoch": 2.93, "grad_norm": 25.823841651939688, "learning_rate": 6.535945035658075e-10, "logps/chosen": -45.5894660949707, "logps/rejected": -75.44552612304688, "loss": 0.263, "losses/dpo": 0.3470650017261505, "losses/sft": 2.3768677711486816, "losses/total": 0.3470650017261505, "ref_logps/chosen": -19.53087043762207, "ref_logps/rejected": -26.537857055664062, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6058595180511475, "rewards/margins": 2.284907579421997, "rewards/rejected": -4.8907670974731445, "step": 3109 }, { "epoch": 2.93, "grad_norm": 15.585116522803844, "learning_rate": 6.353207935679561e-10, "logps/chosen": -50.516021728515625, "logps/rejected": -89.5556411743164, "loss": 0.1303, "losses/dpo": 0.021938582882285118, "losses/sft": 1.060173511505127, "losses/total": 0.021938582882285118, "ref_logps/chosen": -23.8960018157959, "ref_logps/rejected": -33.68707275390625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6620023250579834, "rewards/margins": 2.924854040145874, "rewards/rejected": -5.586856842041016, "step": 3110 }, { "epoch": 2.93, "grad_norm": 14.568475173117282, "learning_rate": 6.173058487926597e-10, "logps/chosen": -41.063480377197266, "logps/rejected": -77.99103546142578, "loss": 0.1663, "losses/dpo": 0.11596086621284485, "losses/sft": 0.36518678069114685, "losses/total": 0.11596086621284485, "ref_logps/chosen": -24.21432113647461, "ref_logps/rejected": -29.30951690673828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6849159002304077, "rewards/margins": 3.1832354068756104, "rewards/rejected": -4.868151664733887, "step": 3111 }, { "epoch": 2.94, "grad_norm": 14.383211703870625, "learning_rate": 5.995496879339923e-10, "logps/chosen": -56.369110107421875, "logps/rejected": -88.01588439941406, "loss": 0.1394, "losses/dpo": 0.06479404121637344, "losses/sft": 0.9153762459754944, "losses/total": 0.06479404121637344, "ref_logps/chosen": -34.416542053222656, "ref_logps/rejected": -34.72166442871094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1952571868896484, "rewards/margins": 3.1341657638549805, "rewards/rejected": -5.329422473907471, "step": 3112 }, { "epoch": 2.94, "grad_norm": 11.625098367380417, "learning_rate": 5.820523294174651e-10, "logps/chosen": -56.387939453125, "logps/rejected": -101.36428833007812, "loss": 0.1416, "losses/dpo": 0.18842832744121552, "losses/sft": 1.4944359064102173, "losses/total": 0.18842832744121552, "ref_logps/chosen": -26.76778793334961, "ref_logps/rejected": -39.45599365234375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.96201491355896, "rewards/margins": 3.2288143634796143, "rewards/rejected": -6.190829277038574, "step": 3113 }, { "epoch": 2.94, "grad_norm": 17.404120895687026, "learning_rate": 5.648137914000816e-10, "logps/chosen": -64.16130065917969, "logps/rejected": -95.42835235595703, "loss": 0.1391, "losses/dpo": 0.2273445427417755, "losses/sft": 1.9892845153808594, "losses/total": 0.2273445427417755, "ref_logps/chosen": -36.54061508178711, "ref_logps/rejected": -37.978851318359375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.762068748474121, "rewards/margins": 2.982882022857666, "rewards/rejected": -5.744950294494629, "step": 3114 }, { "epoch": 2.94, "grad_norm": 15.016420882047303, "learning_rate": 5.478340917701996e-10, "logps/chosen": -47.485103607177734, "logps/rejected": -94.25184631347656, "loss": 0.1072, "losses/dpo": 0.0022635492496192455, "losses/sft": 2.9719464778900146, "losses/total": 0.0022635492496192455, "ref_logps/chosen": -24.43424415588379, "ref_logps/rejected": -35.81635284423828, "rewards/accuracies": 0.9375, "rewards/chosen": -2.305086135864258, "rewards/margins": 3.538463592529297, "rewards/rejected": -5.8435492515563965, "step": 3115 }, { "epoch": 2.94, "grad_norm": 15.367641230598737, "learning_rate": 5.311132481476688e-10, "logps/chosen": -48.80437469482422, "logps/rejected": -100.32748413085938, "loss": 0.1006, "losses/dpo": 0.008312759920954704, "losses/sft": 1.3267948627471924, "losses/total": 0.008312759920954704, "ref_logps/chosen": -27.507963180541992, "ref_logps/rejected": -40.635459899902344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.129641056060791, "rewards/margins": 3.839561939239502, "rewards/rejected": -5.969202995300293, "step": 3116 }, { "epoch": 2.94, "grad_norm": 11.113843460823919, "learning_rate": 5.146512778836376e-10, "logps/chosen": -38.63996505737305, "logps/rejected": -74.83114624023438, "loss": 0.1491, "losses/dpo": 0.07727693766355515, "losses/sft": 1.0723071098327637, "losses/total": 0.07727693766355515, "ref_logps/chosen": -23.41539764404297, "ref_logps/rejected": -30.6960391998291, "rewards/accuracies": 1.0, "rewards/chosen": -1.5224568843841553, "rewards/margins": 2.8910536766052246, "rewards/rejected": -4.413510322570801, "step": 3117 }, { "epoch": 2.94, "grad_norm": 11.957967723657262, "learning_rate": 4.984481980606636e-10, "logps/chosen": -56.609954833984375, "logps/rejected": -83.7254867553711, "loss": 0.1394, "losses/dpo": 0.3684772849082947, "losses/sft": 0.9566918015480042, "losses/total": 0.3684772849082947, "ref_logps/chosen": -30.005836486816406, "ref_logps/rejected": -29.667465209960938, "rewards/accuracies": 1.0, "rewards/chosen": -2.660411834716797, "rewards/margins": 2.7453904151916504, "rewards/rejected": -5.4058027267456055, "step": 3118 }, { "epoch": 2.94, "grad_norm": 15.13085849304753, "learning_rate": 4.825040254926582e-10, "logps/chosen": -45.9720573425293, "logps/rejected": -83.86991882324219, "loss": 0.1555, "losses/dpo": 0.2376764565706253, "losses/sft": 0.2506118714809418, "losses/total": 0.2376764565706253, "ref_logps/chosen": -26.19831085205078, "ref_logps/rejected": -31.395566940307617, "rewards/accuracies": 0.9375, "rewards/chosen": -1.97737455368042, "rewards/margins": 3.2700600624084473, "rewards/rejected": -5.247434616088867, "step": 3119 }, { "epoch": 2.94, "grad_norm": 8.056401197103472, "learning_rate": 4.668187767248866e-10, "logps/chosen": -37.54590606689453, "logps/rejected": -83.03231811523438, "loss": 0.0933, "losses/dpo": 0.005013502202928066, "losses/sft": 0.44703713059425354, "losses/total": 0.005013502202928066, "ref_logps/chosen": -21.234844207763672, "ref_logps/rejected": -31.88240623474121, "rewards/accuracies": 1.0, "rewards/chosen": -1.6311062574386597, "rewards/margins": 3.4838852882385254, "rewards/rejected": -5.114991664886475, "step": 3120 }, { "epoch": 2.94, "grad_norm": 12.513240013189993, "learning_rate": 4.5139246803382877e-10, "logps/chosen": -38.742652893066406, "logps/rejected": -80.49407958984375, "loss": 0.1493, "losses/dpo": 0.019788924604654312, "losses/sft": 1.9016536474227905, "losses/total": 0.019788924604654312, "ref_logps/chosen": -21.892993927001953, "ref_logps/rejected": -31.97351837158203, "rewards/accuracies": 1.0, "rewards/chosen": -1.6849660873413086, "rewards/margins": 3.1670899391174316, "rewards/rejected": -4.852056503295898, "step": 3121 }, { "epoch": 2.95, "grad_norm": 11.430806052869444, "learning_rate": 4.3622511542740216e-10, "logps/chosen": -48.667625427246094, "logps/rejected": -108.26673889160156, "loss": 0.0955, "losses/dpo": 0.017555134370923042, "losses/sft": 1.8051636219024658, "losses/total": 0.017555134370923042, "ref_logps/chosen": -26.76537322998047, "ref_logps/rejected": -45.69499969482422, "rewards/accuracies": 1.0, "rewards/chosen": -2.19022536277771, "rewards/margins": 4.066948890686035, "rewards/rejected": -6.257174491882324, "step": 3122 }, { "epoch": 2.95, "grad_norm": 18.36887342923156, "learning_rate": 4.213167346446833e-10, "logps/chosen": -50.57246398925781, "logps/rejected": -77.40685272216797, "loss": 0.2388, "losses/dpo": 0.23648393154144287, "losses/sft": 1.774614691734314, "losses/total": 0.23648393154144287, "ref_logps/chosen": -25.882122039794922, "ref_logps/rejected": -24.3785343170166, "rewards/accuracies": 0.875, "rewards/chosen": -2.469034194946289, "rewards/margins": 2.8337979316711426, "rewards/rejected": -5.302832126617432, "step": 3123 }, { "epoch": 2.95, "grad_norm": 6.903193511749161, "learning_rate": 4.0666734115613053e-10, "logps/chosen": -46.92401123046875, "logps/rejected": -92.44146728515625, "loss": 0.1003, "losses/dpo": 0.0007400172762572765, "losses/sft": 1.1120785474777222, "losses/total": 0.0007400172762572765, "ref_logps/chosen": -22.818668365478516, "ref_logps/rejected": -32.68048858642578, "rewards/accuracies": 1.0, "rewards/chosen": -2.410534143447876, "rewards/margins": 3.5655641555786133, "rewards/rejected": -5.97609806060791, "step": 3124 }, { "epoch": 2.95, "grad_norm": 11.30811910634549, "learning_rate": 3.9227695016333387e-10, "logps/chosen": -46.09933090209961, "logps/rejected": -74.62057495117188, "loss": 0.1108, "losses/dpo": 0.00033219935721717775, "losses/sft": 0.7548304796218872, "losses/total": 0.00033219935721717775, "ref_logps/chosen": -28.865474700927734, "ref_logps/rejected": -27.818281173706055, "rewards/accuracies": 1.0, "rewards/chosen": -1.7233855724334717, "rewards/margins": 2.9568440914154053, "rewards/rejected": -4.680229663848877, "step": 3125 }, { "epoch": 2.95, "grad_norm": 9.838492112967213, "learning_rate": 3.7814557659918147e-10, "logps/chosen": -42.734989166259766, "logps/rejected": -85.60396575927734, "loss": 0.0863, "losses/dpo": 0.024101022630929947, "losses/sft": 2.837454080581665, "losses/total": 0.024101022630929947, "ref_logps/chosen": -25.07318878173828, "ref_logps/rejected": -32.433807373046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7661800384521484, "rewards/margins": 3.550835609436035, "rewards/rejected": -5.317015647888184, "step": 3126 }, { "epoch": 2.95, "grad_norm": 13.189178398975026, "learning_rate": 3.642732351278044e-10, "logps/chosen": -54.4596061706543, "logps/rejected": -91.73567962646484, "loss": 0.138, "losses/dpo": 0.20726712048053741, "losses/sft": 2.460529327392578, "losses/total": 0.20726712048053741, "ref_logps/chosen": -27.827816009521484, "ref_logps/rejected": -34.388031005859375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6631789207458496, "rewards/margins": 3.071585178375244, "rewards/rejected": -5.734764099121094, "step": 3127 }, { "epoch": 2.95, "grad_norm": 7.822508751019039, "learning_rate": 3.5065994014446544e-10, "logps/chosen": -56.628211975097656, "logps/rejected": -119.82569885253906, "loss": 0.0637, "losses/dpo": 0.04370784014463425, "losses/sft": 0.6949189901351929, "losses/total": 0.04370784014463425, "ref_logps/chosen": -32.78208541870117, "ref_logps/rejected": -48.3472900390625, "rewards/accuracies": 1.0, "rewards/chosen": -2.3846120834350586, "rewards/margins": 4.763228416442871, "rewards/rejected": -7.14784049987793, "step": 3128 }, { "epoch": 2.95, "grad_norm": 8.917033822256856, "learning_rate": 3.373057057756701e-10, "logps/chosen": -59.475990295410156, "logps/rejected": -98.73926544189453, "loss": 0.0889, "losses/dpo": 0.018601685762405396, "losses/sft": 1.6132152080535889, "losses/total": 0.018601685762405396, "ref_logps/chosen": -28.200740814208984, "ref_logps/rejected": -36.85673904418945, "rewards/accuracies": 1.0, "rewards/chosen": -3.1275253295898438, "rewards/margins": 3.060727119445801, "rewards/rejected": -6.1882524490356445, "step": 3129 }, { "epoch": 2.95, "grad_norm": 16.71971195900923, "learning_rate": 3.2421054587908336e-10, "logps/chosen": -40.102874755859375, "logps/rejected": -73.79554748535156, "loss": 0.1914, "losses/dpo": 0.15487881004810333, "losses/sft": 0.8543293476104736, "losses/total": 0.15487881004810333, "ref_logps/chosen": -23.534957885742188, "ref_logps/rejected": -31.662212371826172, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6567919254302979, "rewards/margins": 2.5565414428710938, "rewards/rejected": -4.2133331298828125, "step": 3130 }, { "epoch": 2.95, "grad_norm": 11.081032087387959, "learning_rate": 3.113744740435298e-10, "logps/chosen": -43.492000579833984, "logps/rejected": -88.13764953613281, "loss": 0.098, "losses/dpo": 0.31390219926834106, "losses/sft": 0.551375150680542, "losses/total": 0.31390219926834106, "ref_logps/chosen": -21.83401870727539, "ref_logps/rejected": -34.12187576293945, "rewards/accuracies": 1.0, "rewards/chosen": -2.1657981872558594, "rewards/margins": 3.235779285430908, "rewards/rejected": -5.401577472686768, "step": 3131 }, { "epoch": 2.95, "grad_norm": 8.586141622811086, "learning_rate": 2.9879750358896563e-10, "logps/chosen": -67.01643371582031, "logps/rejected": -101.99954223632812, "loss": 0.0717, "losses/dpo": 0.043894004076719284, "losses/sft": 1.5613597631454468, "losses/total": 0.043894004076719284, "ref_logps/chosen": -36.86115264892578, "ref_logps/rejected": -38.94523620605469, "rewards/accuracies": 1.0, "rewards/chosen": -3.0155282020568848, "rewards/margins": 3.289902448654175, "rewards/rejected": -6.3054304122924805, "step": 3132 }, { "epoch": 2.96, "grad_norm": 7.241595175094147, "learning_rate": 2.864796475664788e-10, "logps/chosen": -49.08814239501953, "logps/rejected": -87.54273986816406, "loss": 0.0611, "losses/dpo": 0.28190043568611145, "losses/sft": 0.8557358980178833, "losses/total": 0.28190043568611145, "ref_logps/chosen": -29.445663452148438, "ref_logps/rejected": -35.29151916503906, "rewards/accuracies": 1.0, "rewards/chosen": -1.9642479419708252, "rewards/margins": 3.2608742713928223, "rewards/rejected": -5.225122451782227, "step": 3133 }, { "epoch": 2.96, "grad_norm": 11.836302701584566, "learning_rate": 2.7442091875831684e-10, "logps/chosen": -49.426029205322266, "logps/rejected": -82.61817932128906, "loss": 0.1271, "losses/dpo": 0.004734512884169817, "losses/sft": 1.5016310214996338, "losses/total": 0.004734512884169817, "ref_logps/chosen": -26.498998641967773, "ref_logps/rejected": -31.233510971069336, "rewards/accuracies": 1.0, "rewards/chosen": -2.2927029132843018, "rewards/margins": 2.845763921737671, "rewards/rejected": -5.138466835021973, "step": 3134 }, { "epoch": 2.96, "grad_norm": 14.281622494721601, "learning_rate": 2.626213296777757e-10, "logps/chosen": -49.61192321777344, "logps/rejected": -82.36347198486328, "loss": 0.146, "losses/dpo": 0.14734342694282532, "losses/sft": 1.098231554031372, "losses/total": 0.14734342694282532, "ref_logps/chosen": -24.154470443725586, "ref_logps/rejected": -29.308446884155273, "rewards/accuracies": 1.0, "rewards/chosen": -2.545745372772217, "rewards/margins": 2.7597579956054688, "rewards/rejected": -5.3055033683776855, "step": 3135 }, { "epoch": 2.96, "grad_norm": 9.425667167967317, "learning_rate": 2.5108089256925536e-10, "logps/chosen": -40.05361557006836, "logps/rejected": -74.85436248779297, "loss": 0.1308, "losses/dpo": 0.1109946221113205, "losses/sft": 2.641425371170044, "losses/total": 0.1109946221113205, "ref_logps/chosen": -21.08489227294922, "ref_logps/rejected": -26.077072143554688, "rewards/accuracies": 1.0, "rewards/chosen": -1.8968724012374878, "rewards/margins": 2.9808568954467773, "rewards/rejected": -4.8777289390563965, "step": 3136 }, { "epoch": 2.96, "grad_norm": 7.319219358377187, "learning_rate": 2.397996194082319e-10, "logps/chosen": -63.33928680419922, "logps/rejected": -105.04176330566406, "loss": 0.0646, "losses/dpo": 0.013110775500535965, "losses/sft": 1.1530338525772095, "losses/total": 0.013110775500535965, "ref_logps/chosen": -37.7128791809082, "ref_logps/rejected": -38.329742431640625, "rewards/accuracies": 1.0, "rewards/chosen": -2.562641143798828, "rewards/margins": 4.108560562133789, "rewards/rejected": -6.671201705932617, "step": 3137 }, { "epoch": 2.96, "grad_norm": 15.474657170776725, "learning_rate": 2.2877752190131328e-10, "logps/chosen": -37.01011657714844, "logps/rejected": -61.58271408081055, "loss": 0.1861, "losses/dpo": 0.0900132954120636, "losses/sft": 2.1748530864715576, "losses/total": 0.0900132954120636, "ref_logps/chosen": -20.873653411865234, "ref_logps/rejected": -23.514183044433594, "rewards/accuracies": 1.0, "rewards/chosen": -1.6136462688446045, "rewards/margins": 2.193207025527954, "rewards/rejected": -3.8068532943725586, "step": 3138 }, { "epoch": 2.96, "grad_norm": 7.3686626108606115, "learning_rate": 2.1801461148607259e-10, "logps/chosen": -53.79985046386719, "logps/rejected": -97.44644165039062, "loss": 0.0553, "losses/dpo": 0.002509345533326268, "losses/sft": 2.7029690742492676, "losses/total": 0.002509345533326268, "ref_logps/chosen": -30.119169235229492, "ref_logps/rejected": -33.93689727783203, "rewards/accuracies": 1.0, "rewards/chosen": -2.3680684566497803, "rewards/margins": 3.9828855991363525, "rewards/rejected": -6.350954055786133, "step": 3139 }, { "epoch": 2.96, "grad_norm": 8.212015232913613, "learning_rate": 2.0751089933115917e-10, "logps/chosen": -41.94115447998047, "logps/rejected": -90.17085266113281, "loss": 0.0899, "losses/dpo": 0.07482998818159103, "losses/sft": 0.74053555727005, "losses/total": 0.07482998818159103, "ref_logps/chosen": -22.984718322753906, "ref_logps/rejected": -36.17500686645508, "rewards/accuracies": 1.0, "rewards/chosen": -1.8956435918807983, "rewards/margins": 3.503941297531128, "rewards/rejected": -5.399584770202637, "step": 3140 }, { "epoch": 2.96, "grad_norm": 8.486120872310423, "learning_rate": 1.972663963362431e-10, "logps/chosen": -66.74613952636719, "logps/rejected": -100.75192260742188, "loss": 0.0936, "losses/dpo": 0.15820522606372833, "losses/sft": 2.8478918075561523, "losses/total": 0.15820522606372833, "ref_logps/chosen": -41.40361785888672, "ref_logps/rejected": -42.29313659667969, "rewards/accuracies": 1.0, "rewards/chosen": -2.5342516899108887, "rewards/margins": 3.311626434326172, "rewards/rejected": -5.845879077911377, "step": 3141 }, { "epoch": 2.96, "grad_norm": 14.14007918555633, "learning_rate": 1.8728111313204286e-10, "logps/chosen": -42.54011154174805, "logps/rejected": -72.26756286621094, "loss": 0.1323, "losses/dpo": 0.11329139024019241, "losses/sft": 1.1105269193649292, "losses/total": 0.11329139024019241, "ref_logps/chosen": -24.184650421142578, "ref_logps/rejected": -25.622499465942383, "rewards/accuracies": 1.0, "rewards/chosen": -1.8355460166931152, "rewards/margins": 2.828960418701172, "rewards/rejected": -4.664506435394287, "step": 3142 }, { "epoch": 2.97, "grad_norm": 7.824127772433812, "learning_rate": 1.7755506008029775e-10, "logps/chosen": -53.68363571166992, "logps/rejected": -100.23954772949219, "loss": 0.0583, "losses/dpo": 0.0187838077545166, "losses/sft": 0.8954954743385315, "losses/total": 0.0187838077545166, "ref_logps/chosen": -32.00776672363281, "ref_logps/rejected": -41.16106414794922, "rewards/accuracies": 1.0, "rewards/chosen": -2.1675870418548584, "rewards/margins": 3.7402613162994385, "rewards/rejected": -5.907848834991455, "step": 3143 }, { "epoch": 2.97, "grad_norm": 7.34346254644282, "learning_rate": 1.6808824727365667e-10, "logps/chosen": -46.78154754638672, "logps/rejected": -95.27367401123047, "loss": 0.0607, "losses/dpo": 0.006987278815358877, "losses/sft": 1.4784088134765625, "losses/total": 0.006987278815358877, "ref_logps/chosen": -25.721389770507812, "ref_logps/rejected": -35.63569641113281, "rewards/accuracies": 1.0, "rewards/chosen": -2.106015682220459, "rewards/margins": 3.8577823638916016, "rewards/rejected": -5.9637980461120605, "step": 3144 }, { "epoch": 2.97, "grad_norm": 5.783823286346337, "learning_rate": 1.5888068453581705e-10, "logps/chosen": -57.6567497253418, "logps/rejected": -101.75475311279297, "loss": 0.0493, "losses/dpo": 0.09981254488229752, "losses/sft": 2.685333728790283, "losses/total": 0.09981254488229752, "ref_logps/chosen": -31.552982330322266, "ref_logps/rejected": -37.41337203979492, "rewards/accuracies": 1.0, "rewards/chosen": -2.610377311706543, "rewards/margins": 3.823760747909546, "rewards/rejected": -6.434138298034668, "step": 3145 }, { "epoch": 2.97, "grad_norm": 10.479935748621266, "learning_rate": 1.49932381421497e-10, "logps/chosen": -70.39659118652344, "logps/rejected": -87.91748046875, "loss": 0.0929, "losses/dpo": 0.0665484294295311, "losses/sft": 1.1997476816177368, "losses/total": 0.0665484294295311, "ref_logps/chosen": -47.08997344970703, "ref_logps/rejected": -33.00654220581055, "rewards/accuracies": 1.0, "rewards/chosen": -2.3306617736816406, "rewards/margins": 3.1604318618774414, "rewards/rejected": -5.491093635559082, "step": 3146 }, { "epoch": 2.97, "grad_norm": 4.799526908040814, "learning_rate": 1.4124334721626885e-10, "logps/chosen": -42.28208923339844, "logps/rejected": -79.29411315917969, "loss": 0.0436, "losses/dpo": 0.13588009774684906, "losses/sft": 1.5150580406188965, "losses/total": 0.13588009774684906, "ref_logps/chosen": -26.628902435302734, "ref_logps/rejected": -29.512226104736328, "rewards/accuracies": 1.0, "rewards/chosen": -1.5653187036514282, "rewards/margins": 3.4128706455230713, "rewards/rejected": -4.978188991546631, "step": 3147 }, { "epoch": 2.97, "grad_norm": 11.014915352608257, "learning_rate": 1.3281359093678114e-10, "logps/chosen": -53.10375213623047, "logps/rejected": -79.98255157470703, "loss": 0.1211, "losses/dpo": 0.13978001475334167, "losses/sft": 1.0048158168792725, "losses/total": 0.13978001475334167, "ref_logps/chosen": -31.075481414794922, "ref_logps/rejected": -30.68716812133789, "rewards/accuracies": 1.0, "rewards/chosen": -2.202827215194702, "rewards/margins": 2.7267112731933594, "rewards/rejected": -4.929538726806641, "step": 3148 }, { "epoch": 2.97, "grad_norm": 16.475216672559476, "learning_rate": 1.246431213305643e-10, "logps/chosen": -59.69072341918945, "logps/rejected": -94.06878662109375, "loss": 0.1969, "losses/dpo": 0.1373084932565689, "losses/sft": 2.3379769325256348, "losses/total": 0.1373084932565689, "ref_logps/chosen": -29.589157104492188, "ref_logps/rejected": -36.80241394042969, "rewards/accuracies": 0.875, "rewards/chosen": -3.0101566314697266, "rewards/margins": 2.7164807319641113, "rewards/rejected": -5.726637363433838, "step": 3149 }, { "epoch": 2.97, "grad_norm": 10.900139874058098, "learning_rate": 1.1673194687605858e-10, "logps/chosen": -53.52466583251953, "logps/rejected": -88.89054107666016, "loss": 0.1124, "losses/dpo": 0.06163720414042473, "losses/sft": 1.631263256072998, "losses/total": 0.06163720414042473, "ref_logps/chosen": -29.098270416259766, "ref_logps/rejected": -34.241737365722656, "rewards/accuracies": 1.0, "rewards/chosen": -2.4426403045654297, "rewards/margins": 3.0222396850585938, "rewards/rejected": -5.464879989624023, "step": 3150 }, { "epoch": 2.97, "grad_norm": 18.64697692303331, "learning_rate": 1.0908007578272482e-10, "logps/chosen": -54.43238067626953, "logps/rejected": -103.1353530883789, "loss": 0.1335, "losses/dpo": 0.9932347536087036, "losses/sft": 2.6936416625976562, "losses/total": 0.9932347536087036, "ref_logps/chosen": -31.96935272216797, "ref_logps/rejected": -43.44456100463867, "rewards/accuracies": 0.9375, "rewards/chosen": -2.246303081512451, "rewards/margins": 3.722776412963867, "rewards/rejected": -5.969079971313477, "step": 3151 }, { "epoch": 2.97, "grad_norm": 8.279327364909372, "learning_rate": 1.0168751599085035e-10, "logps/chosen": -47.87211227416992, "logps/rejected": -91.85086059570312, "loss": 0.0742, "losses/dpo": 0.053359609097242355, "losses/sft": 0.6053342819213867, "losses/total": 0.053359609097242355, "ref_logps/chosen": -28.577919006347656, "ref_logps/rejected": -36.101688385009766, "rewards/accuracies": 1.0, "rewards/chosen": -1.9294195175170898, "rewards/margins": 3.645498275756836, "rewards/rejected": -5.574917316436768, "step": 3152 }, { "epoch": 2.97, "grad_norm": 12.065042731016035, "learning_rate": 9.455427517168769e-11, "logps/chosen": -43.10607147216797, "logps/rejected": -90.0599136352539, "loss": 0.1245, "losses/dpo": 0.024039510637521744, "losses/sft": 1.6388399600982666, "losses/total": 0.024039510637521744, "ref_logps/chosen": -19.903385162353516, "ref_logps/rejected": -33.60740661621094, "rewards/accuracies": 1.0, "rewards/chosen": -2.320268392562866, "rewards/margins": 3.3249824047088623, "rewards/rejected": -5.6452507972717285, "step": 3153 }, { "epoch": 2.98, "grad_norm": 14.600784971671885, "learning_rate": 8.76803607274268e-11, "logps/chosen": -51.02659606933594, "logps/rejected": -89.9635009765625, "loss": 0.1196, "losses/dpo": 0.32255688309669495, "losses/sft": 1.389981985092163, "losses/total": 0.32255688309669495, "ref_logps/chosen": -26.825576782226562, "ref_logps/rejected": -30.275625228881836, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4201016426086426, "rewards/margins": 3.5486860275268555, "rewards/rejected": -5.968788146972656, "step": 3154 }, { "epoch": 2.98, "grad_norm": 12.193935861207672, "learning_rate": 8.106577979108409e-11, "logps/chosen": -51.59403610229492, "logps/rejected": -78.88963317871094, "loss": 0.1115, "losses/dpo": 0.05023208633065224, "losses/sft": 0.41254881024360657, "losses/total": 0.05023208633065224, "ref_logps/chosen": -29.664649963378906, "ref_logps/rejected": -27.28243064880371, "rewards/accuracies": 1.0, "rewards/chosen": -2.192938804626465, "rewards/margins": 2.9677815437316895, "rewards/rejected": -5.160720348358154, "step": 3155 }, { "epoch": 2.98, "grad_norm": 15.049916551842445, "learning_rate": 7.471053922658566e-11, "logps/chosen": -56.6778678894043, "logps/rejected": -111.060546875, "loss": 0.1529, "losses/dpo": 0.8334082365036011, "losses/sft": 0.9624126553535461, "losses/total": 0.8334082365036011, "ref_logps/chosen": -30.74213218688965, "ref_logps/rejected": -48.506168365478516, "rewards/accuracies": 0.875, "rewards/chosen": -2.593573570251465, "rewards/margins": 3.661865234375, "rewards/rejected": -6.255438327789307, "step": 3156 }, { "epoch": 2.98, "grad_norm": 8.427555238677652, "learning_rate": 6.861464562876729e-11, "logps/chosen": -46.62464141845703, "logps/rejected": -97.7607421875, "loss": 0.0765, "losses/dpo": 0.1950957179069519, "losses/sft": 0.38599056005477905, "losses/total": 0.1950957179069519, "ref_logps/chosen": -25.482938766479492, "ref_logps/rejected": -39.852073669433594, "rewards/accuracies": 1.0, "rewards/chosen": -2.114170551300049, "rewards/margins": 3.67669677734375, "rewards/rejected": -5.790866851806641, "step": 3157 }, { "epoch": 2.98, "grad_norm": 15.333727640926933, "learning_rate": 6.277810532331895e-11, "logps/chosen": -72.78181457519531, "logps/rejected": -91.38802337646484, "loss": 0.1715, "losses/dpo": 0.01049103308469057, "losses/sft": 1.7789393663406372, "losses/total": 0.01049103308469057, "ref_logps/chosen": -43.30706024169922, "ref_logps/rejected": -35.07415771484375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9474754333496094, "rewards/margins": 2.6839115619659424, "rewards/rejected": -5.631386756896973, "step": 3158 }, { "epoch": 2.98, "grad_norm": 19.93479778558518, "learning_rate": 5.720092436681256e-11, "logps/chosen": -50.777706146240234, "logps/rejected": -71.40351867675781, "loss": 0.249, "losses/dpo": 0.0075319185853004456, "losses/sft": 0.857665479183197, "losses/total": 0.0075319185853004456, "ref_logps/chosen": -27.674421310424805, "ref_logps/rejected": -24.643978118896484, "rewards/accuracies": 0.875, "rewards/chosen": -2.310328483581543, "rewards/margins": 2.3656258583068848, "rewards/rejected": -4.675954341888428, "step": 3159 }, { "epoch": 2.98, "grad_norm": 15.110006955172093, "learning_rate": 5.188310854670197e-11, "logps/chosen": -52.21847152709961, "logps/rejected": -91.76683807373047, "loss": 0.1575, "losses/dpo": 0.0017660652520135045, "losses/sft": 2.182222604751587, "losses/total": 0.0017660652520135045, "ref_logps/chosen": -30.692256927490234, "ref_logps/rejected": -37.1625862121582, "rewards/accuracies": 0.9375, "rewards/chosen": -2.152621269226074, "rewards/margins": 3.3078041076660156, "rewards/rejected": -5.460424900054932, "step": 3160 }, { "epoch": 2.98, "grad_norm": 15.838770790492998, "learning_rate": 4.682466338121194e-11, "logps/chosen": -52.18991470336914, "logps/rejected": -98.00503540039062, "loss": 0.1688, "losses/dpo": 0.03336622193455696, "losses/sft": 2.4367048740386963, "losses/total": 0.03336622193455696, "ref_logps/chosen": -30.15101432800293, "ref_logps/rejected": -40.354408264160156, "rewards/accuracies": 0.875, "rewards/chosen": -2.203889846801758, "rewards/margins": 3.5611729621887207, "rewards/rejected": -5.765063285827637, "step": 3161 }, { "epoch": 2.98, "grad_norm": 7.8374512632346045, "learning_rate": 4.202559411953244e-11, "logps/chosen": -53.67498016357422, "logps/rejected": -94.2636489868164, "loss": 0.0675, "losses/dpo": 0.12138114124536514, "losses/sft": 1.2621102333068848, "losses/total": 0.12138114124536514, "ref_logps/chosen": -33.18479919433594, "ref_logps/rejected": -36.48705291748047, "rewards/accuracies": 1.0, "rewards/chosen": -2.049018144607544, "rewards/margins": 3.7286415100097656, "rewards/rejected": -5.777659893035889, "step": 3162 }, { "epoch": 2.98, "grad_norm": 18.937471011807343, "learning_rate": 3.748590574162436e-11, "logps/chosen": -49.43524169921875, "logps/rejected": -79.18400573730469, "loss": 0.2389, "losses/dpo": 1.5158665180206299, "losses/sft": 1.999355435371399, "losses/total": 1.5158665180206299, "ref_logps/chosen": -25.511831283569336, "ref_logps/rejected": -27.9150447845459, "rewards/accuracies": 0.9375, "rewards/chosen": -2.392341136932373, "rewards/margins": 2.734555244445801, "rewards/rejected": -5.126896381378174, "step": 3163 }, { "epoch": 2.98, "grad_norm": 16.95474480149686, "learning_rate": 3.320560295833053e-11, "logps/chosen": -62.164180755615234, "logps/rejected": -99.73114776611328, "loss": 0.197, "losses/dpo": 0.15556009113788605, "losses/sft": 2.5440621376037598, "losses/total": 0.15556009113788605, "ref_logps/chosen": -34.27838134765625, "ref_logps/rejected": -37.69365692138672, "rewards/accuracies": 0.875, "rewards/chosen": -2.7885804176330566, "rewards/margins": 3.4151690006256104, "rewards/rejected": -6.203749179840088, "step": 3164 }, { "epoch": 2.99, "grad_norm": 13.967775525158462, "learning_rate": 2.918469021129244e-11, "logps/chosen": -48.72654342651367, "logps/rejected": -83.84039306640625, "loss": 0.1529, "losses/dpo": 0.2057482749223709, "losses/sft": 2.0623793601989746, "losses/total": 0.2057482749223709, "ref_logps/chosen": -29.642913818359375, "ref_logps/rejected": -31.814481735229492, "rewards/accuracies": 1.0, "rewards/chosen": -1.9083629846572876, "rewards/margins": 3.2942283153533936, "rewards/rejected": -5.2025909423828125, "step": 3165 }, { "epoch": 2.99, "grad_norm": 23.7503373905011, "learning_rate": 2.542317167303354e-11, "logps/chosen": -44.97050857543945, "logps/rejected": -63.15156936645508, "loss": 0.3274, "losses/dpo": 0.14451876282691956, "losses/sft": 2.105726718902588, "losses/total": 0.14451876282691956, "ref_logps/chosen": -25.316715240478516, "ref_logps/rejected": -23.464324951171875, "rewards/accuracies": 0.875, "rewards/chosen": -1.9653794765472412, "rewards/margins": 2.003345489501953, "rewards/rejected": -3.9687247276306152, "step": 3166 }, { "epoch": 2.99, "grad_norm": 23.20443795042307, "learning_rate": 2.1921051246820422e-11, "logps/chosen": -47.402191162109375, "logps/rejected": -72.6341552734375, "loss": 0.284, "losses/dpo": 0.21798302233219147, "losses/sft": 1.5327709913253784, "losses/total": 0.21798302233219147, "ref_logps/chosen": -26.277761459350586, "ref_logps/rejected": -27.271148681640625, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1124427318573, "rewards/margins": 2.4238574504852295, "rewards/rejected": -4.536300182342529, "step": 3167 }, { "epoch": 2.99, "grad_norm": 20.75228036313803, "learning_rate": 1.8678332566857135e-11, "logps/chosen": -47.354434967041016, "logps/rejected": -87.38253021240234, "loss": 0.214, "losses/dpo": 0.008510376326739788, "losses/sft": 2.3990514278411865, "losses/total": 0.008510376326739788, "ref_logps/chosen": -25.63875961303711, "ref_logps/rejected": -36.642494201660156, "rewards/accuracies": 0.9375, "rewards/chosen": -2.171567440032959, "rewards/margins": 2.9024362564086914, "rewards/rejected": -5.07400369644165, "step": 3168 }, { "epoch": 2.99, "grad_norm": 11.28495311682124, "learning_rate": 1.5695018998063136e-11, "logps/chosen": -45.81100845336914, "logps/rejected": -89.8382568359375, "loss": 0.1195, "losses/dpo": 0.011251207441091537, "losses/sft": 1.2584232091903687, "losses/total": 0.011251207441091537, "ref_logps/chosen": -24.177602767944336, "ref_logps/rejected": -35.44628143310547, "rewards/accuracies": 1.0, "rewards/chosen": -2.1633405685424805, "rewards/margins": 3.275857448577881, "rewards/rejected": -5.439197540283203, "step": 3169 }, { "epoch": 2.99, "grad_norm": 13.317797115650682, "learning_rate": 1.2971113636239816e-11, "logps/chosen": -60.836326599121094, "logps/rejected": -91.1651611328125, "loss": 0.1316, "losses/dpo": 0.1641066074371338, "losses/sft": 1.6308459043502808, "losses/total": 0.1641066074371338, "ref_logps/chosen": -32.462188720703125, "ref_logps/rejected": -31.13603973388672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.837413787841797, "rewards/margins": 3.1654984951019287, "rewards/rejected": -6.002912521362305, "step": 3170 }, { "epoch": 2.99, "grad_norm": 7.7015617168381905, "learning_rate": 1.0506619307987241e-11, "logps/chosen": -50.70697021484375, "logps/rejected": -86.97694396972656, "loss": 0.0682, "losses/dpo": 0.04812197759747505, "losses/sft": 1.6509325504302979, "losses/total": 0.04812197759747505, "ref_logps/chosen": -29.303659439086914, "ref_logps/rejected": -32.567752838134766, "rewards/accuracies": 1.0, "rewards/chosen": -2.1403307914733887, "rewards/margins": 3.300588846206665, "rewards/rejected": -5.440919399261475, "step": 3171 }, { "epoch": 2.99, "grad_norm": 15.072374614655352, "learning_rate": 8.301538570676392e-12, "logps/chosen": -43.09785079956055, "logps/rejected": -95.66876983642578, "loss": 0.1168, "losses/dpo": 0.0023556549567729235, "losses/sft": 2.033229112625122, "losses/total": 0.0023556549567729235, "ref_logps/chosen": -19.945167541503906, "ref_logps/rejected": -33.89905548095703, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3152685165405273, "rewards/margins": 3.8617029190063477, "rewards/rejected": -6.176971435546875, "step": 3172 }, { "epoch": 2.99, "grad_norm": 7.958812437321352, "learning_rate": 6.355873712504678e-12, "logps/chosen": -46.93206787109375, "logps/rejected": -78.31072235107422, "loss": 0.0851, "losses/dpo": 0.2621162533760071, "losses/sft": 1.559199333190918, "losses/total": 0.2621162533760071, "ref_logps/chosen": -28.931743621826172, "ref_logps/rejected": -27.329151153564453, "rewards/accuracies": 1.0, "rewards/chosen": -1.800032377243042, "rewards/margins": 3.2981247901916504, "rewards/rejected": -5.098156929016113, "step": 3173 }, { "epoch": 2.99, "grad_norm": 16.158117532508193, "learning_rate": 4.669626752551448e-12, "logps/chosen": -53.429039001464844, "logps/rejected": -80.16108703613281, "loss": 0.1898, "losses/dpo": 0.1697525531053543, "losses/sft": 1.1368672847747803, "losses/total": 0.1697525531053543, "ref_logps/chosen": -26.649822235107422, "ref_logps/rejected": -28.162742614746094, "rewards/accuracies": 0.875, "rewards/chosen": -2.6779212951660156, "rewards/margins": 2.521912097930908, "rewards/rejected": -5.199833869934082, "step": 3174 }, { "epoch": 3.0, "grad_norm": 16.21332078603673, "learning_rate": 3.242799440555943e-12, "logps/chosen": -54.65606689453125, "logps/rejected": -82.68156433105469, "loss": 0.1313, "losses/dpo": 0.6201815009117126, "losses/sft": 1.7058987617492676, "losses/total": 0.6201815009117126, "ref_logps/chosen": -29.649023056030273, "ref_logps/rejected": -29.347694396972656, "rewards/accuracies": 1.0, "rewards/chosen": -2.500704526901245, "rewards/margins": 2.832683563232422, "rewards/rejected": -5.333388328552246, "step": 3175 }, { "epoch": 3.0, "grad_norm": 16.40945793523134, "learning_rate": 2.0753932571948573e-12, "logps/chosen": -53.31110382080078, "logps/rejected": -91.76263427734375, "loss": 0.1799, "losses/dpo": 0.11933598667383194, "losses/sft": 1.97722589969635, "losses/total": 0.11933598667383194, "ref_logps/chosen": -23.894775390625, "ref_logps/rejected": -33.52750778198242, "rewards/accuracies": 1.0, "rewards/chosen": -2.9416327476501465, "rewards/margins": 2.881880760192871, "rewards/rejected": -5.823513031005859, "step": 3176 }, { "epoch": 3.0, "grad_norm": 18.143190029119616, "learning_rate": 1.1674094138325319e-12, "logps/chosen": -48.61560821533203, "logps/rejected": -95.16450500488281, "loss": 0.2002, "losses/dpo": 0.10549107939004898, "losses/sft": 0.9001356959342957, "losses/total": 0.10549107939004898, "ref_logps/chosen": -23.85279083251953, "ref_logps/rejected": -33.61205291748047, "rewards/accuracies": 0.875, "rewards/chosen": -2.4762814044952393, "rewards/margins": 3.678964138031006, "rewards/rejected": -6.155245304107666, "step": 3177 }, { "epoch": 3.0, "grad_norm": 11.36593749864929, "learning_rate": 5.188488527152479e-13, "logps/chosen": -53.51325225830078, "logps/rejected": -88.14250183105469, "loss": 0.1245, "losses/dpo": 0.39563506841659546, "losses/sft": 1.112668514251709, "losses/total": 0.39563506841659546, "ref_logps/chosen": -28.669391632080078, "ref_logps/rejected": -30.61825180053711, "rewards/accuracies": 1.0, "rewards/chosen": -2.484386444091797, "rewards/margins": 3.2680392265319824, "rewards/rejected": -5.752425193786621, "step": 3178 }, { "epoch": 3.0, "grad_norm": 10.139666322160975, "learning_rate": 1.2971224683244742e-13, "logps/chosen": -61.493385314941406, "logps/rejected": -95.36381530761719, "loss": 0.0973, "losses/dpo": 0.01611722633242607, "losses/sft": 2.460395574569702, "losses/total": 0.01611722633242607, "ref_logps/chosen": -38.65428161621094, "ref_logps/rejected": -34.034393310546875, "rewards/accuracies": 1.0, "rewards/chosen": -2.2839105129241943, "rewards/margins": 3.849031925201416, "rewards/rejected": -6.132942199707031, "step": 3179 }, { "epoch": 3.0, "grad_norm": 8.216123484738292, "learning_rate": 0.0, "logps/chosen": -44.144859313964844, "logps/rejected": -86.30203247070312, "loss": 0.0684, "losses/dpo": 0.05449850112199783, "losses/sft": 2.0605883598327637, "losses/total": 0.05449850112199783, "ref_logps/chosen": -29.65682601928711, "ref_logps/rejected": -33.78363800048828, "rewards/accuracies": 1.0, "rewards/chosen": -1.4488036632537842, "rewards/margins": 3.8030354976654053, "rewards/rejected": -5.2518391609191895, "step": 3180 }, { "epoch": 3.0, "step": 3180, "total_flos": 0.0, "train_loss": 0.3416744293984752, "train_runtime": 27533.1223, "train_samples_per_second": 1.848, "train_steps_per_second": 0.115 } ], "logging_steps": 1.0, "max_steps": 3180, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1500, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }