{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.988679245283019, "eval_steps": 500, "global_step": 396, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 8.418300067215691, "learning_rate": 1.25e-08, "logps/chosen": -39.02219009399414, "logps/rejected": -45.12399673461914, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 1.552122950553894, "losses/total": 0.6931471824645996, "ref_logps/chosen": -39.02219009399414, "ref_logps/rejected": -45.12399673461914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 7.6721075942113535, "learning_rate": 2.5e-08, "logps/chosen": -37.21428680419922, "logps/rejected": -44.4819221496582, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 1.6663763523101807, "losses/total": 0.6931471824645996, "ref_logps/chosen": -37.21428680419922, "ref_logps/rejected": -44.4819221496582, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.02, "grad_norm": 7.948340327346078, "learning_rate": 3.75e-08, "logps/chosen": -41.46142578125, "logps/rejected": -52.18663024902344, "loss": 0.6926, "losses/dpo": 0.6867616176605225, "losses/sft": 1.7890703678131104, "losses/total": 0.6867616176605225, "ref_logps/chosen": -41.46522903442383, "ref_logps/rejected": -52.1768798828125, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0003804098814725876, "rewards/margins": 0.0013556077610701323, "rewards/rejected": -0.0009751979378052056, "step": 3 }, { "epoch": 0.03, "grad_norm": 8.038993728778431, "learning_rate": 5e-08, "logps/chosen": -39.45478057861328, "logps/rejected": -45.85334014892578, "loss": 0.6936, "losses/dpo": 0.6930198073387146, "losses/sft": 1.6549196243286133, "losses/total": 0.6930198073387146, "ref_logps/chosen": -39.42698287963867, "ref_logps/rejected": -45.83390426635742, "rewards/accuracies": 0.484375, "rewards/chosen": -0.002779680071398616, "rewards/margins": -0.0008358716731891036, "rewards/rejected": -0.001943808514624834, "step": 4 }, { "epoch": 0.04, "grad_norm": 7.749366133023054, "learning_rate": 6.25e-08, "logps/chosen": -44.29286193847656, "logps/rejected": -51.08875274658203, "loss": 0.695, "losses/dpo": 0.6971508264541626, "losses/sft": 1.3629276752471924, "losses/total": 0.6971508264541626, "ref_logps/chosen": -44.301361083984375, "ref_logps/rejected": -51.132286071777344, "rewards/accuracies": 0.4453125, "rewards/chosen": 0.0008498989045619965, "rewards/margins": -0.003503247397020459, "rewards/rejected": 0.004353146068751812, "step": 5 }, { "epoch": 0.05, "grad_norm": 7.545034174633802, "learning_rate": 7.5e-08, "logps/chosen": -37.192138671875, "logps/rejected": -44.56536865234375, "loss": 0.693, "losses/dpo": 0.6901252269744873, "losses/sft": 1.235260248184204, "losses/total": 0.6901252269744873, "ref_logps/chosen": -37.197486877441406, "ref_logps/rejected": -44.56662368774414, "rewards/accuracies": 0.46875, "rewards/chosen": 0.000535178929567337, "rewards/margins": 0.000409391475841403, "rewards/rejected": 0.00012578748282976449, "step": 6 }, { "epoch": 0.05, "grad_norm": 7.645026557219469, "learning_rate": 8.75e-08, "logps/chosen": -40.067909240722656, "logps/rejected": -46.251487731933594, "loss": 0.6941, "losses/dpo": 0.695063054561615, "losses/sft": 1.8211989402770996, "losses/total": 0.695063054561615, "ref_logps/chosen": -40.05988311767578, "ref_logps/rejected": -46.26015090942383, "rewards/accuracies": 0.453125, "rewards/chosen": -0.0008021063404157758, "rewards/margins": -0.001668928423896432, "rewards/rejected": 0.000866822199895978, "step": 7 }, { "epoch": 0.06, "grad_norm": 8.091132576285203, "learning_rate": 1e-07, "logps/chosen": -44.73344802856445, "logps/rejected": -46.818824768066406, "loss": 0.6949, "losses/dpo": 0.6943204402923584, "losses/sft": 1.470657229423523, "losses/total": 0.6943204402923584, "ref_logps/chosen": -44.7131233215332, "ref_logps/rejected": -46.83186340332031, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0020323917269706726, "rewards/margins": -0.0033366940915584564, "rewards/rejected": 0.0013043024810031056, "step": 8 }, { "epoch": 0.07, "grad_norm": 7.987567478058529, "learning_rate": 1.125e-07, "logps/chosen": -40.069969177246094, "logps/rejected": -50.12396240234375, "loss": 0.6944, "losses/dpo": 0.6880265474319458, "losses/sft": 1.1660749912261963, "losses/total": 0.6880265474319458, "ref_logps/chosen": -40.05424118041992, "ref_logps/rejected": -50.13201904296875, "rewards/accuracies": 0.4765625, "rewards/chosen": -0.0015729822916910052, "rewards/margins": -0.002378995530307293, "rewards/rejected": 0.000806013063993305, "step": 9 }, { "epoch": 0.08, "grad_norm": 7.458190702935405, "learning_rate": 1.25e-07, "logps/chosen": -36.18950271606445, "logps/rejected": -45.11402130126953, "loss": 0.694, "losses/dpo": 0.6928867697715759, "losses/sft": 1.605255365371704, "losses/total": 0.6928867697715759, "ref_logps/chosen": -36.18606185913086, "ref_logps/rejected": -45.12499237060547, "rewards/accuracies": 0.4765625, "rewards/chosen": -0.00034411592059768736, "rewards/margins": -0.0014411872252821922, "rewards/rejected": 0.0010970717994496226, "step": 10 }, { "epoch": 0.08, "grad_norm": 7.498431737676318, "learning_rate": 1.375e-07, "logps/chosen": -38.895912170410156, "logps/rejected": -44.2772216796875, "loss": 0.6922, "losses/dpo": 0.6932664513587952, "losses/sft": 1.4097728729248047, "losses/total": 0.6932664513587952, "ref_logps/chosen": -38.896759033203125, "ref_logps/rejected": -44.25825881958008, "rewards/accuracies": 0.53125, "rewards/chosen": 8.490856271237135e-05, "rewards/margins": 0.0019812812097370625, "rewards/rejected": -0.001896372647024691, "step": 11 }, { "epoch": 0.09, "grad_norm": 7.699329532824058, "learning_rate": 1.5e-07, "logps/chosen": -41.140281677246094, "logps/rejected": -45.357364654541016, "loss": 0.6914, "losses/dpo": 0.6933699250221252, "losses/sft": 1.6783134937286377, "losses/total": 0.6933699250221252, "ref_logps/chosen": -41.16625213623047, "ref_logps/rejected": -45.34600830078125, "rewards/accuracies": 0.5625, "rewards/chosen": 0.002597447484731674, "rewards/margins": 0.0037334603257477283, "rewards/rejected": -0.0011360126081854105, "step": 12 }, { "epoch": 0.1, "grad_norm": 7.69326977444187, "learning_rate": 1.625e-07, "logps/chosen": -40.31540298461914, "logps/rejected": -50.180397033691406, "loss": 0.6925, "losses/dpo": 0.6910371780395508, "losses/sft": 1.366438865661621, "losses/total": 0.6910371780395508, "ref_logps/chosen": -40.30924987792969, "ref_logps/rejected": -50.16073989868164, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0006153000867925584, "rewards/margins": 0.0013502361252903938, "rewards/rejected": -0.0019655367359519005, "step": 13 }, { "epoch": 0.11, "grad_norm": 8.114372207024019, "learning_rate": 1.75e-07, "logps/chosen": -37.29108428955078, "logps/rejected": -44.525848388671875, "loss": 0.6911, "losses/dpo": 0.6899442076683044, "losses/sft": 1.4768216609954834, "losses/total": 0.6899442076683044, "ref_logps/chosen": -37.311187744140625, "ref_logps/rejected": -44.50410079956055, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.0020105522125959396, "rewards/margins": 0.004185608588159084, "rewards/rejected": -0.0021750556770712137, "step": 14 }, { "epoch": 0.11, "grad_norm": 7.835230950103234, "learning_rate": 1.875e-07, "logps/chosen": -38.33734893798828, "logps/rejected": -43.93443298339844, "loss": 0.6932, "losses/dpo": 0.6937546133995056, "losses/sft": 1.4263617992401123, "losses/total": 0.6937546133995056, "ref_logps/chosen": -38.31421661376953, "ref_logps/rejected": -43.90996170043945, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.0023131906054913998, "rewards/margins": 0.00013422727352008224, "rewards/rejected": -0.002447417937219143, "step": 15 }, { "epoch": 0.12, "grad_norm": 7.58949645380321, "learning_rate": 2e-07, "logps/chosen": -40.26841735839844, "logps/rejected": -43.40159225463867, "loss": 0.6934, "losses/dpo": 0.7032474279403687, "losses/sft": 1.5701673030853271, "losses/total": 0.7032474279403687, "ref_logps/chosen": -40.24923324584961, "ref_logps/rejected": -43.38625717163086, "rewards/accuracies": 0.4921875, "rewards/chosen": -0.0019177356734871864, "rewards/margins": -0.0003846373874694109, "rewards/rejected": -0.0015330985188484192, "step": 16 }, { "epoch": 0.13, "grad_norm": 7.829467009021115, "learning_rate": 2.1249999999999998e-07, "logps/chosen": -41.63703536987305, "logps/rejected": -46.70919418334961, "loss": 0.6952, "losses/dpo": 0.6850873231887817, "losses/sft": 1.4479947090148926, "losses/total": 0.6850873231887817, "ref_logps/chosen": -41.58295822143555, "ref_logps/rejected": -46.69389343261719, "rewards/accuracies": 0.4375, "rewards/chosen": -0.005406979937106371, "rewards/margins": -0.003876863745972514, "rewards/rejected": -0.0015301161911338568, "step": 17 }, { "epoch": 0.14, "grad_norm": 7.485609028305383, "learning_rate": 2.25e-07, "logps/chosen": -40.4469108581543, "logps/rejected": -44.94635009765625, "loss": 0.6927, "losses/dpo": 0.6907854080200195, "losses/sft": 1.1833800077438354, "losses/total": 0.6907854080200195, "ref_logps/chosen": -40.407257080078125, "ref_logps/rejected": -44.894813537597656, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.003965577110648155, "rewards/margins": 0.0011882353574037552, "rewards/rejected": -0.005153812933713198, "step": 18 }, { "epoch": 0.14, "grad_norm": 7.404661140565325, "learning_rate": 2.3749999999999998e-07, "logps/chosen": -35.739524841308594, "logps/rejected": -46.330265045166016, "loss": 0.6923, "losses/dpo": 0.6935074329376221, "losses/sft": 1.8608835935592651, "losses/total": 0.6935074329376221, "ref_logps/chosen": -35.708274841308594, "ref_logps/rejected": -46.27949523925781, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.003125070594251156, "rewards/margins": 0.001951692276634276, "rewards/rejected": -0.00507676275447011, "step": 19 }, { "epoch": 0.15, "grad_norm": 7.514252016049334, "learning_rate": 2.5e-07, "logps/chosen": -40.884029388427734, "logps/rejected": -47.1005859375, "loss": 0.6939, "losses/dpo": 0.6919451355934143, "losses/sft": 1.3290549516677856, "losses/total": 0.6919451355934143, "ref_logps/chosen": -40.82601547241211, "ref_logps/rejected": -47.054588317871094, "rewards/accuracies": 0.515625, "rewards/chosen": -0.0058018057607114315, "rewards/margins": -0.001202343963086605, "rewards/rejected": -0.004599461797624826, "step": 20 }, { "epoch": 0.16, "grad_norm": 11.774765376903847, "learning_rate": 2.625e-07, "logps/chosen": -40.38345718383789, "logps/rejected": -51.66474914550781, "loss": 0.6922, "losses/dpo": 0.6949824690818787, "losses/sft": 1.548210620880127, "losses/total": 0.6949824690818787, "ref_logps/chosen": -40.31751251220703, "ref_logps/rejected": -51.57673645019531, "rewards/accuracies": 0.515625, "rewards/chosen": -0.006594239268451929, "rewards/margins": 0.002207120880484581, "rewards/rejected": -0.008801360614597797, "step": 21 }, { "epoch": 0.17, "grad_norm": 7.542951860866026, "learning_rate": 2.75e-07, "logps/chosen": -38.18585205078125, "logps/rejected": -47.13993835449219, "loss": 0.6924, "losses/dpo": 0.6899946928024292, "losses/sft": 1.1579391956329346, "losses/total": 0.6899946928024292, "ref_logps/chosen": -38.091922760009766, "ref_logps/rejected": -47.029541015625, "rewards/accuracies": 0.4765625, "rewards/chosen": -0.009392979554831982, "rewards/margins": 0.0016468917019665241, "rewards/rejected": -0.011039872653782368, "step": 22 }, { "epoch": 0.17, "grad_norm": 7.876385596657873, "learning_rate": 2.8749999999999995e-07, "logps/chosen": -38.30439376831055, "logps/rejected": -49.61843490600586, "loss": 0.6895, "losses/dpo": 0.6851339936256409, "losses/sft": 1.5843318700790405, "losses/total": 0.6851339936256409, "ref_logps/chosen": -38.242801666259766, "ref_logps/rejected": -49.4824104309082, "rewards/accuracies": 0.578125, "rewards/chosen": -0.006159077398478985, "rewards/margins": 0.007443387992680073, "rewards/rejected": -0.013602466322481632, "step": 23 }, { "epoch": 0.18, "grad_norm": 7.668314504955077, "learning_rate": 3e-07, "logps/chosen": -40.48552322387695, "logps/rejected": -46.48503494262695, "loss": 0.6895, "losses/dpo": 0.6920894980430603, "losses/sft": 1.4189947843551636, "losses/total": 0.6920894980430603, "ref_logps/chosen": -40.395687103271484, "ref_logps/rejected": -46.31945037841797, "rewards/accuracies": 0.609375, "rewards/chosen": -0.008983338251709938, "rewards/margins": 0.007574939634650946, "rewards/rejected": -0.01655827835202217, "step": 24 }, { "epoch": 0.19, "grad_norm": 7.344435357199023, "learning_rate": 3.1249999999999997e-07, "logps/chosen": -39.81501388549805, "logps/rejected": -45.954071044921875, "loss": 0.6914, "losses/dpo": 0.6927582621574402, "losses/sft": 1.6129871606826782, "losses/total": 0.6927582621574402, "ref_logps/chosen": -39.700050354003906, "ref_logps/rejected": -45.80275344848633, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.011496355757117271, "rewards/margins": 0.0036351331509649754, "rewards/rejected": -0.01513148844242096, "step": 25 }, { "epoch": 0.2, "grad_norm": 7.977738556938539, "learning_rate": 3.25e-07, "logps/chosen": -39.41866683959961, "logps/rejected": -47.94341278076172, "loss": 0.6919, "losses/dpo": 0.6823984384536743, "losses/sft": 1.1218098402023315, "losses/total": 0.6823984384536743, "ref_logps/chosen": -39.244632720947266, "ref_logps/rejected": -47.742191314697266, "rewards/accuracies": 0.5625, "rewards/chosen": -0.017403149977326393, "rewards/margins": 0.002718748524785042, "rewards/rejected": -0.020121898502111435, "step": 26 }, { "epoch": 0.2, "grad_norm": 7.137045291395353, "learning_rate": 3.375e-07, "logps/chosen": -36.09528350830078, "logps/rejected": -43.786441802978516, "loss": 0.6934, "losses/dpo": 0.7064226269721985, "losses/sft": 1.185333251953125, "losses/total": 0.7064226269721985, "ref_logps/chosen": -35.91387939453125, "ref_logps/rejected": -43.607757568359375, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.01814102753996849, "rewards/margins": -0.0002719040203373879, "rewards/rejected": -0.017869124189019203, "step": 27 }, { "epoch": 0.21, "grad_norm": 7.379899822037054, "learning_rate": 3.5e-07, "logps/chosen": -44.72355270385742, "logps/rejected": -47.34676742553711, "loss": 0.6909, "losses/dpo": 0.6887847185134888, "losses/sft": 1.6178021430969238, "losses/total": 0.6887847185134888, "ref_logps/chosen": -44.496578216552734, "ref_logps/rejected": -47.07260513305664, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02269744500517845, "rewards/margins": 0.004718274809420109, "rewards/rejected": -0.027415720745921135, "step": 28 }, { "epoch": 0.22, "grad_norm": 8.0485668325091, "learning_rate": 3.6249999999999997e-07, "logps/chosen": -41.700931549072266, "logps/rejected": -50.03131103515625, "loss": 0.6893, "losses/dpo": 0.6823500394821167, "losses/sft": 1.4876271486282349, "losses/total": 0.6823500394821167, "ref_logps/chosen": -41.42716979980469, "ref_logps/rejected": -49.678436279296875, "rewards/accuracies": 0.546875, "rewards/chosen": -0.027376368641853333, "rewards/margins": 0.007911860011518002, "rewards/rejected": -0.03528822585940361, "step": 29 }, { "epoch": 0.23, "grad_norm": 7.828639871809106, "learning_rate": 3.75e-07, "logps/chosen": -41.79233932495117, "logps/rejected": -48.521629333496094, "loss": 0.6884, "losses/dpo": 0.6861717700958252, "losses/sft": 1.3226033449172974, "losses/total": 0.6861717700958252, "ref_logps/chosen": -41.542991638183594, "ref_logps/rejected": -48.17402648925781, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.024935012683272362, "rewards/margins": 0.009825671091675758, "rewards/rejected": -0.03476068750023842, "step": 30 }, { "epoch": 0.23, "grad_norm": 8.365578824729102, "learning_rate": 3.875e-07, "logps/chosen": -41.142433166503906, "logps/rejected": -48.93161392211914, "loss": 0.6954, "losses/dpo": 0.6871266961097717, "losses/sft": 1.0435420274734497, "losses/total": 0.6871266961097717, "ref_logps/chosen": -40.73477554321289, "ref_logps/rejected": -48.56393814086914, "rewards/accuracies": 0.484375, "rewards/chosen": -0.04076562076807022, "rewards/margins": -0.00399819714948535, "rewards/rejected": -0.03676741570234299, "step": 31 }, { "epoch": 0.24, "grad_norm": 7.2403699625390585, "learning_rate": 4e-07, "logps/chosen": -37.81801223754883, "logps/rejected": -46.747371673583984, "loss": 0.6899, "losses/dpo": 0.696927011013031, "losses/sft": 1.7572060823440552, "losses/total": 0.696927011013031, "ref_logps/chosen": -37.4744758605957, "ref_logps/rejected": -46.330135345458984, "rewards/accuracies": 0.578125, "rewards/chosen": -0.03435356542468071, "rewards/margins": 0.007370149716734886, "rewards/rejected": -0.04172371327877045, "step": 32 }, { "epoch": 0.25, "grad_norm": 7.310694198303384, "learning_rate": 4.1249999999999997e-07, "logps/chosen": -35.633243560791016, "logps/rejected": -41.00613021850586, "loss": 0.6852, "losses/dpo": 0.6834661960601807, "losses/sft": 1.3767448663711548, "losses/total": 0.6834661960601807, "ref_logps/chosen": -35.318153381347656, "ref_logps/rejected": -40.5264892578125, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.03150848299264908, "rewards/margins": 0.016455503180623055, "rewards/rejected": -0.04796398803591728, "step": 33 }, { "epoch": 0.26, "grad_norm": 7.510894622614614, "learning_rate": 4.2499999999999995e-07, "logps/chosen": -42.22370529174805, "logps/rejected": -48.45228576660156, "loss": 0.688, "losses/dpo": 0.6784626841545105, "losses/sft": 1.7890020608901978, "losses/total": 0.6784626841545105, "ref_logps/chosen": -41.76036834716797, "ref_logps/rejected": -47.876914978027344, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.046334076672792435, "rewards/margins": 0.011203275993466377, "rewards/rejected": -0.05753735080361366, "step": 34 }, { "epoch": 0.26, "grad_norm": 7.524260365013066, "learning_rate": 4.375e-07, "logps/chosen": -41.117164611816406, "logps/rejected": -47.30539321899414, "loss": 0.6874, "losses/dpo": 0.6890352368354797, "losses/sft": 1.9127196073532104, "losses/total": 0.6890352368354797, "ref_logps/chosen": -40.611358642578125, "ref_logps/rejected": -46.675148010253906, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.0505804568529129, "rewards/margins": 0.01244389358907938, "rewards/rejected": -0.0630243569612503, "step": 35 }, { "epoch": 0.27, "grad_norm": 7.835502878067996, "learning_rate": 4.5e-07, "logps/chosen": -42.31553649902344, "logps/rejected": -48.77557373046875, "loss": 0.6846, "losses/dpo": 0.6801737546920776, "losses/sft": 1.0419285297393799, "losses/total": 0.6801737546920776, "ref_logps/chosen": -41.76920700073242, "ref_logps/rejected": -48.04461669921875, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.054632995277643204, "rewards/margins": 0.0184622872620821, "rewards/rejected": -0.07309528440237045, "step": 36 }, { "epoch": 0.28, "grad_norm": 10.089436620525504, "learning_rate": 4.625e-07, "logps/chosen": -40.23670196533203, "logps/rejected": -44.72167205810547, "loss": 0.6856, "losses/dpo": 0.6820752620697021, "losses/sft": 1.6919959783554077, "losses/total": 0.6820752620697021, "ref_logps/chosen": -39.70112991333008, "ref_logps/rejected": -44.0257682800293, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05355698987841606, "rewards/margins": 0.016033286228775978, "rewards/rejected": -0.06959027796983719, "step": 37 }, { "epoch": 0.29, "grad_norm": 7.4705752998877974, "learning_rate": 4.7499999999999995e-07, "logps/chosen": -40.608951568603516, "logps/rejected": -46.77935791015625, "loss": 0.6846, "losses/dpo": 0.7083909511566162, "losses/sft": 1.3596407175064087, "losses/total": 0.7083909511566162, "ref_logps/chosen": -40.02342987060547, "ref_logps/rejected": -46.008670806884766, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.05855226144194603, "rewards/margins": 0.018516112118959427, "rewards/rejected": -0.07706836611032486, "step": 38 }, { "epoch": 0.29, "grad_norm": 7.4834962101051445, "learning_rate": 4.875e-07, "logps/chosen": -38.924591064453125, "logps/rejected": -44.15880584716797, "loss": 0.6806, "losses/dpo": 0.6712931394577026, "losses/sft": 1.4741321802139282, "losses/total": 0.6712931394577026, "ref_logps/chosen": -38.32358932495117, "ref_logps/rejected": -43.294708251953125, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.060100097209215164, "rewards/margins": 0.02630985900759697, "rewards/rejected": -0.08640995621681213, "step": 39 }, { "epoch": 0.3, "grad_norm": 7.593289226892247, "learning_rate": 5e-07, "logps/chosen": -36.14179611206055, "logps/rejected": -43.69697952270508, "loss": 0.6893, "losses/dpo": 0.6905455589294434, "losses/sft": 1.8340303897857666, "losses/total": 0.6905455589294434, "ref_logps/chosen": -35.46891784667969, "ref_logps/rejected": -42.935638427734375, "rewards/accuracies": 0.609375, "rewards/chosen": -0.06728792935609818, "rewards/margins": 0.008846651762723923, "rewards/rejected": -0.0761345773935318, "step": 40 }, { "epoch": 0.31, "grad_norm": 7.184019422431451, "learning_rate": 4.985955056179775e-07, "logps/chosen": -36.04156494140625, "logps/rejected": -44.501773834228516, "loss": 0.6765, "losses/dpo": 0.6632527112960815, "losses/sft": 1.562534213066101, "losses/total": 0.6632527112960815, "ref_logps/chosen": -35.38131332397461, "ref_logps/rejected": -43.47880935668945, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06602565199136734, "rewards/margins": 0.036271147429943085, "rewards/rejected": -0.10229679197072983, "step": 41 }, { "epoch": 0.32, "grad_norm": 7.817004450242549, "learning_rate": 4.97191011235955e-07, "logps/chosen": -40.78254699707031, "logps/rejected": -48.181861877441406, "loss": 0.6803, "losses/dpo": 0.7099467515945435, "losses/sft": 1.8783167600631714, "losses/total": 0.7099467515945435, "ref_logps/chosen": -40.004154205322266, "ref_logps/rejected": -47.115760803222656, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.07783940434455872, "rewards/margins": 0.028770849108695984, "rewards/rejected": -0.1066102534532547, "step": 42 }, { "epoch": 0.32, "grad_norm": 7.433654746121009, "learning_rate": 4.957865168539325e-07, "logps/chosen": -40.71480941772461, "logps/rejected": -47.88724136352539, "loss": 0.6803, "losses/dpo": 0.7072566151618958, "losses/sft": 1.6432607173919678, "losses/total": 0.7072566151618958, "ref_logps/chosen": -39.739933013916016, "ref_logps/rejected": -46.61668014526367, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.09748795628547668, "rewards/margins": 0.029567349702119827, "rewards/rejected": -0.1270553022623062, "step": 43 }, { "epoch": 0.33, "grad_norm": 7.64577773243097, "learning_rate": 4.943820224719101e-07, "logps/chosen": -36.590328216552734, "logps/rejected": -45.61329650878906, "loss": 0.6824, "losses/dpo": 0.6843121647834778, "losses/sft": 2.019310235977173, "losses/total": 0.6843121647834778, "ref_logps/chosen": -35.54472732543945, "ref_logps/rejected": -44.32211685180664, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.10456002503633499, "rewards/margins": 0.024558255448937416, "rewards/rejected": -0.12911829352378845, "step": 44 }, { "epoch": 0.34, "grad_norm": 7.451674659941506, "learning_rate": 4.929775280898877e-07, "logps/chosen": -39.82986068725586, "logps/rejected": -44.68933868408203, "loss": 0.6767, "losses/dpo": 0.6984357833862305, "losses/sft": 1.321048617362976, "losses/total": 0.6984357833862305, "ref_logps/chosen": -38.77911376953125, "ref_logps/rejected": -43.251163482666016, "rewards/accuracies": 0.578125, "rewards/chosen": -0.10507487505674362, "rewards/margins": 0.038742441684007645, "rewards/rejected": -0.14381732046604156, "step": 45 }, { "epoch": 0.35, "grad_norm": 7.481580488082584, "learning_rate": 4.915730337078651e-07, "logps/chosen": -40.547637939453125, "logps/rejected": -48.37934112548828, "loss": 0.6784, "losses/dpo": 0.682788610458374, "losses/sft": 1.305440902709961, "losses/total": 0.682788610458374, "ref_logps/chosen": -39.39177703857422, "ref_logps/rejected": -46.888671875, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.11558566987514496, "rewards/margins": 0.03348149359226227, "rewards/rejected": -0.14906716346740723, "step": 46 }, { "epoch": 0.35, "grad_norm": 7.503690704863454, "learning_rate": 4.901685393258427e-07, "logps/chosen": -43.58844757080078, "logps/rejected": -46.27735137939453, "loss": 0.6796, "losses/dpo": 0.6725805997848511, "losses/sft": 1.7845996618270874, "losses/total": 0.6725805997848511, "ref_logps/chosen": -42.20465850830078, "ref_logps/rejected": -44.57255554199219, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.13837924599647522, "rewards/margins": 0.03210053965449333, "rewards/rejected": -0.17047978937625885, "step": 47 }, { "epoch": 0.36, "grad_norm": 7.6005277154002275, "learning_rate": 4.887640449438202e-07, "logps/chosen": -40.19657897949219, "logps/rejected": -46.2965087890625, "loss": 0.6806, "losses/dpo": 0.6915363669395447, "losses/sft": 1.4134502410888672, "losses/total": 0.6915363669395447, "ref_logps/chosen": -38.93235778808594, "ref_logps/rejected": -44.731178283691406, "rewards/accuracies": 0.546875, "rewards/chosen": -0.12642225623130798, "rewards/margins": 0.030110429972410202, "rewards/rejected": -0.15653270483016968, "step": 48 }, { "epoch": 0.37, "grad_norm": 7.342034157176263, "learning_rate": 4.873595505617978e-07, "logps/chosen": -35.246055603027344, "logps/rejected": -45.013092041015625, "loss": 0.671, "losses/dpo": 0.6273987293243408, "losses/sft": 1.2200208902359009, "losses/total": 0.6273987293243408, "ref_logps/chosen": -33.989261627197266, "ref_logps/rejected": -43.26504898071289, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.12567944824695587, "rewards/margins": 0.04912441223859787, "rewards/rejected": -0.17480388283729553, "step": 49 }, { "epoch": 0.38, "grad_norm": 7.768856876362336, "learning_rate": 4.859550561797752e-07, "logps/chosen": -41.647483825683594, "logps/rejected": -49.59557342529297, "loss": 0.6668, "losses/dpo": 0.6410457491874695, "losses/sft": 2.0004844665527344, "losses/total": 0.6410457491874695, "ref_logps/chosen": -40.150108337402344, "ref_logps/rejected": -47.4974365234375, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.1497381180524826, "rewards/margins": 0.06007564440369606, "rewards/rejected": -0.20981375873088837, "step": 50 }, { "epoch": 0.38, "grad_norm": 7.482808682633612, "learning_rate": 4.845505617977528e-07, "logps/chosen": -42.28547668457031, "logps/rejected": -46.57417297363281, "loss": 0.6686, "losses/dpo": 0.6821735501289368, "losses/sft": 1.643945336341858, "losses/total": 0.6821735501289368, "ref_logps/chosen": -40.781097412109375, "ref_logps/rejected": -44.47871017456055, "rewards/accuracies": 0.625, "rewards/chosen": -0.15043821930885315, "rewards/margins": 0.05910744518041611, "rewards/rejected": -0.20954564213752747, "step": 51 }, { "epoch": 0.39, "grad_norm": 7.444552223013138, "learning_rate": 4.831460674157303e-07, "logps/chosen": -38.653770446777344, "logps/rejected": -47.96025848388672, "loss": 0.6696, "losses/dpo": 0.653758704662323, "losses/sft": 1.9075889587402344, "losses/total": 0.653758704662323, "ref_logps/chosen": -37.04439163208008, "ref_logps/rejected": -45.76567840576172, "rewards/accuracies": 0.5625, "rewards/chosen": -0.16093730926513672, "rewards/margins": 0.05852021649479866, "rewards/rejected": -0.2194575071334839, "step": 52 }, { "epoch": 0.4, "grad_norm": 7.051622049892774, "learning_rate": 4.817415730337078e-07, "logps/chosen": -36.511940002441406, "logps/rejected": -42.634193420410156, "loss": 0.672, "losses/dpo": 0.6566299200057983, "losses/sft": 1.6063774824142456, "losses/total": 0.6566299200057983, "ref_logps/chosen": -34.99435806274414, "ref_logps/rejected": -40.612342834472656, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.15175840258598328, "rewards/margins": 0.05042674392461777, "rewards/rejected": -0.20218515396118164, "step": 53 }, { "epoch": 0.41, "grad_norm": 7.752968590362967, "learning_rate": 4.803370786516854e-07, "logps/chosen": -43.374481201171875, "logps/rejected": -46.1808967590332, "loss": 0.664, "losses/dpo": 0.5915548801422119, "losses/sft": 1.5764846801757812, "losses/total": 0.5915548801422119, "ref_logps/chosen": -41.619239807128906, "ref_logps/rejected": -43.757720947265625, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.17552456259727478, "rewards/margins": 0.06679282337427139, "rewards/rejected": -0.24231737852096558, "step": 54 }, { "epoch": 0.42, "grad_norm": 7.375135165111918, "learning_rate": 4.789325842696629e-07, "logps/chosen": -40.775726318359375, "logps/rejected": -45.556365966796875, "loss": 0.6783, "losses/dpo": 0.6768916845321655, "losses/sft": 1.3732706308364868, "losses/total": 0.6768916845321655, "ref_logps/chosen": -39.01034927368164, "ref_logps/rejected": -43.436553955078125, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.17653760313987732, "rewards/margins": 0.03544352203607559, "rewards/rejected": -0.21198111772537231, "step": 55 }, { "epoch": 0.42, "grad_norm": 7.238231049231885, "learning_rate": 4.775280898876405e-07, "logps/chosen": -39.11316680908203, "logps/rejected": -45.04530334472656, "loss": 0.6642, "losses/dpo": 0.6528148651123047, "losses/sft": 1.159528136253357, "losses/total": 0.6528148651123047, "ref_logps/chosen": -37.2640266418457, "ref_logps/rejected": -42.495338439941406, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.18491369485855103, "rewards/margins": 0.07008323073387146, "rewards/rejected": -0.2549969553947449, "step": 56 }, { "epoch": 0.43, "grad_norm": 7.652181625277677, "learning_rate": 4.7612359550561797e-07, "logps/chosen": -43.244380950927734, "logps/rejected": -49.415409088134766, "loss": 0.6679, "losses/dpo": 0.6176864504814148, "losses/sft": 1.796196460723877, "losses/total": 0.6176864504814148, "ref_logps/chosen": -41.167381286621094, "ref_logps/rejected": -46.70917892456055, "rewards/accuracies": 0.609375, "rewards/chosen": -0.20770025253295898, "rewards/margins": 0.06292243301868439, "rewards/rejected": -0.2706226706504822, "step": 57 }, { "epoch": 0.44, "grad_norm": 8.012193852457372, "learning_rate": 4.747191011235955e-07, "logps/chosen": -38.95054626464844, "logps/rejected": -45.45573043823242, "loss": 0.6545, "losses/dpo": 0.721019983291626, "losses/sft": 1.6278411149978638, "losses/total": 0.721019983291626, "ref_logps/chosen": -37.17967987060547, "ref_logps/rejected": -42.77497100830078, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17708644270896912, "rewards/margins": 0.09098967909812927, "rewards/rejected": -0.2680761218070984, "step": 58 }, { "epoch": 0.45, "grad_norm": 7.745152630727936, "learning_rate": 4.7331460674157303e-07, "logps/chosen": -41.84577560424805, "logps/rejected": -54.23434066772461, "loss": 0.654, "losses/dpo": 0.601816713809967, "losses/sft": 1.5886242389678955, "losses/total": 0.601816713809967, "ref_logps/chosen": -39.931434631347656, "ref_logps/rejected": -51.34343719482422, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19143418967723846, "rewards/margins": 0.09765592962503433, "rewards/rejected": -0.2890901267528534, "step": 59 }, { "epoch": 0.45, "grad_norm": 8.104292528879256, "learning_rate": 4.7191011235955054e-07, "logps/chosen": -40.5402717590332, "logps/rejected": -48.11115264892578, "loss": 0.6612, "losses/dpo": 0.6307883858680725, "losses/sft": 1.6475903987884521, "losses/total": 0.6307883858680725, "ref_logps/chosen": -38.59111022949219, "ref_logps/rejected": -45.39031219482422, "rewards/accuracies": 0.671875, "rewards/chosen": -0.19491644203662872, "rewards/margins": 0.07716768234968185, "rewards/rejected": -0.27208411693573, "step": 60 }, { "epoch": 0.46, "grad_norm": 8.24076971848322, "learning_rate": 4.705056179775281e-07, "logps/chosen": -41.55039978027344, "logps/rejected": -51.43959426879883, "loss": 0.6677, "losses/dpo": 0.7012457251548767, "losses/sft": 2.175475597381592, "losses/total": 0.7012457251548767, "ref_logps/chosen": -39.29528045654297, "ref_logps/rejected": -48.50824737548828, "rewards/accuracies": 0.65625, "rewards/chosen": -0.22551202774047852, "rewards/margins": 0.06762254983186722, "rewards/rejected": -0.2931345999240875, "step": 61 }, { "epoch": 0.47, "grad_norm": 8.35481489409821, "learning_rate": 4.691011235955056e-07, "logps/chosen": -43.0926399230957, "logps/rejected": -47.44728088378906, "loss": 0.6462, "losses/dpo": 0.6053961515426636, "losses/sft": 1.459052324295044, "losses/total": 0.6053961515426636, "ref_logps/chosen": -40.96197509765625, "ref_logps/rejected": -44.2166633605957, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.2130661904811859, "rewards/margins": 0.1099955290555954, "rewards/rejected": -0.3230617344379425, "step": 62 }, { "epoch": 0.48, "grad_norm": 7.797774561384425, "learning_rate": 4.6769662921348315e-07, "logps/chosen": -38.055320739746094, "logps/rejected": -47.66813659667969, "loss": 0.652, "losses/dpo": 0.632691502571106, "losses/sft": 1.4375559091567993, "losses/total": 0.632691502571106, "ref_logps/chosen": -36.05341720581055, "ref_logps/rejected": -44.64914321899414, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20018979907035828, "rewards/margins": 0.10170910507440567, "rewards/rejected": -0.30189892649650574, "step": 63 }, { "epoch": 0.48, "grad_norm": 7.28638958987082, "learning_rate": 4.662921348314606e-07, "logps/chosen": -39.81006622314453, "logps/rejected": -46.78810501098633, "loss": 0.662, "losses/dpo": 0.6003807783126831, "losses/sft": 1.3374682664871216, "losses/total": 0.6003807783126831, "ref_logps/chosen": -37.332698822021484, "ref_logps/rejected": -43.507659912109375, "rewards/accuracies": 0.625, "rewards/chosen": -0.247736856341362, "rewards/margins": 0.0803074836730957, "rewards/rejected": -0.3280443251132965, "step": 64 }, { "epoch": 0.49, "grad_norm": 7.798097069372596, "learning_rate": 4.6488764044943816e-07, "logps/chosen": -45.76461410522461, "logps/rejected": -50.199825286865234, "loss": 0.6814, "losses/dpo": 0.6105685234069824, "losses/sft": 1.8878819942474365, "losses/total": 0.6105685234069824, "ref_logps/chosen": -42.697021484375, "ref_logps/rejected": -46.67133331298828, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.30675944685935974, "rewards/margins": 0.046089351177215576, "rewards/rejected": -0.35284876823425293, "step": 65 }, { "epoch": 0.5, "grad_norm": 7.825603469401775, "learning_rate": 4.634831460674157e-07, "logps/chosen": -43.35639953613281, "logps/rejected": -53.741355895996094, "loss": 0.6436, "losses/dpo": 0.6293699741363525, "losses/sft": 1.4026882648468018, "losses/total": 0.6293699741363525, "ref_logps/chosen": -40.74187469482422, "ref_logps/rejected": -49.880897521972656, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.2614526152610779, "rewards/margins": 0.12459328025579453, "rewards/rejected": -0.3860458731651306, "step": 66 }, { "epoch": 0.51, "grad_norm": 7.635556989764748, "learning_rate": 4.620786516853932e-07, "logps/chosen": -40.68864440917969, "logps/rejected": -46.91835021972656, "loss": 0.6539, "losses/dpo": 0.7432792782783508, "losses/sft": 1.5986056327819824, "losses/total": 0.7432792782783508, "ref_logps/chosen": -38.21084213256836, "ref_logps/rejected": -43.418067932128906, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.24778038263320923, "rewards/margins": 0.10224790126085281, "rewards/rejected": -0.35002827644348145, "step": 67 }, { "epoch": 0.51, "grad_norm": 7.6861933707333305, "learning_rate": 4.606741573033708e-07, "logps/chosen": -42.49998092651367, "logps/rejected": -49.80910873413086, "loss": 0.6241, "losses/dpo": 0.5949782133102417, "losses/sft": 1.2951277494430542, "losses/total": 0.5949782133102417, "ref_logps/chosen": -40.09019088745117, "ref_logps/rejected": -45.69980239868164, "rewards/accuracies": 0.703125, "rewards/chosen": -0.24097900092601776, "rewards/margins": 0.16995173692703247, "rewards/rejected": -0.4109307527542114, "step": 68 }, { "epoch": 0.52, "grad_norm": 7.5671933847566555, "learning_rate": 4.592696629213483e-07, "logps/chosen": -42.11725616455078, "logps/rejected": -52.82745361328125, "loss": 0.6438, "losses/dpo": 0.6235805749893188, "losses/sft": 1.4768122434616089, "losses/total": 0.6235805749893188, "ref_logps/chosen": -39.234642028808594, "ref_logps/rejected": -48.657501220703125, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.2882614731788635, "rewards/margins": 0.12873350083827972, "rewards/rejected": -0.41699495911598206, "step": 69 }, { "epoch": 0.53, "grad_norm": 7.380869348497186, "learning_rate": 4.5786516853932584e-07, "logps/chosen": -40.33063507080078, "logps/rejected": -47.34225845336914, "loss": 0.6519, "losses/dpo": 0.6829323768615723, "losses/sft": 1.6434234380722046, "losses/total": 0.6829323768615723, "ref_logps/chosen": -37.294776916503906, "ref_logps/rejected": -43.10985565185547, "rewards/accuracies": 0.640625, "rewards/chosen": -0.30358612537384033, "rewards/margins": 0.11965445429086685, "rewards/rejected": -0.4232405722141266, "step": 70 }, { "epoch": 0.54, "grad_norm": 7.505680485493994, "learning_rate": 4.5646067415730334e-07, "logps/chosen": -40.74094009399414, "logps/rejected": -49.552616119384766, "loss": 0.6423, "losses/dpo": 0.6549758315086365, "losses/sft": 1.5461335182189941, "losses/total": 0.6549758315086365, "ref_logps/chosen": -37.58620071411133, "ref_logps/rejected": -45.091209411621094, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.3154744505882263, "rewards/margins": 0.1306663304567337, "rewards/rejected": -0.4461407959461212, "step": 71 }, { "epoch": 0.54, "grad_norm": 7.805459051678804, "learning_rate": 4.550561797752809e-07, "logps/chosen": -42.94220733642578, "logps/rejected": -54.635963439941406, "loss": 0.6384, "losses/dpo": 0.589752733707428, "losses/sft": 1.55972421169281, "losses/total": 0.589752733707428, "ref_logps/chosen": -39.39100646972656, "ref_logps/rejected": -49.64848709106445, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.35512006282806396, "rewards/margins": 0.1436270773410797, "rewards/rejected": -0.49874716997146606, "step": 72 }, { "epoch": 0.55, "grad_norm": 8.048352395094511, "learning_rate": 4.536516853932584e-07, "logps/chosen": -41.640235900878906, "logps/rejected": -53.20794677734375, "loss": 0.621, "losses/dpo": 0.6062641143798828, "losses/sft": 1.0970079898834229, "losses/total": 0.6062641143798828, "ref_logps/chosen": -38.45201873779297, "ref_logps/rejected": -48.13432312011719, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.3188212513923645, "rewards/margins": 0.18854106962680817, "rewards/rejected": -0.5073623061180115, "step": 73 }, { "epoch": 0.56, "grad_norm": 12.072882817554818, "learning_rate": 4.522471910112359e-07, "logps/chosen": -43.404823303222656, "logps/rejected": -50.17894744873047, "loss": 0.6527, "losses/dpo": 0.5960186719894409, "losses/sft": 1.444412112236023, "losses/total": 0.5960186719894409, "ref_logps/chosen": -39.74496841430664, "ref_logps/rejected": -45.32371139526367, "rewards/accuracies": 0.625, "rewards/chosen": -0.3659852147102356, "rewards/margins": 0.11953801661729813, "rewards/rejected": -0.4855232238769531, "step": 74 }, { "epoch": 0.57, "grad_norm": 7.650328731492573, "learning_rate": 4.5084269662921347e-07, "logps/chosen": -43.5896110534668, "logps/rejected": -51.086971282958984, "loss": 0.6409, "losses/dpo": 0.6057982444763184, "losses/sft": 1.4658453464508057, "losses/total": 0.6057982444763184, "ref_logps/chosen": -39.98381423950195, "ref_logps/rejected": -45.86697769165039, "rewards/accuracies": 0.578125, "rewards/chosen": -0.3605796992778778, "rewards/margins": 0.16141945123672485, "rewards/rejected": -0.521999180316925, "step": 75 }, { "epoch": 0.57, "grad_norm": 7.945574535694735, "learning_rate": 4.4943820224719097e-07, "logps/chosen": -44.07162094116211, "logps/rejected": -50.66339874267578, "loss": 0.6556, "losses/dpo": 0.6170323491096497, "losses/sft": 1.8739807605743408, "losses/total": 0.6170323491096497, "ref_logps/chosen": -40.06593322753906, "ref_logps/rejected": -45.41100311279297, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.4005686044692993, "rewards/margins": 0.12467078864574432, "rewards/rejected": -0.5252394080162048, "step": 76 }, { "epoch": 0.58, "grad_norm": 9.606133876539282, "learning_rate": 4.4803370786516853e-07, "logps/chosen": -42.20341110229492, "logps/rejected": -50.49468231201172, "loss": 0.6503, "losses/dpo": 0.586646556854248, "losses/sft": 1.5989439487457275, "losses/total": 0.586646556854248, "ref_logps/chosen": -38.3419075012207, "ref_logps/rejected": -45.38350296020508, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.3861507773399353, "rewards/margins": 0.12496703863143921, "rewards/rejected": -0.5111178159713745, "step": 77 }, { "epoch": 0.59, "grad_norm": 8.32429224169226, "learning_rate": 4.4662921348314603e-07, "logps/chosen": -42.94104766845703, "logps/rejected": -54.27581787109375, "loss": 0.6335, "losses/dpo": 0.629318356513977, "losses/sft": 1.5925976037979126, "losses/total": 0.629318356513977, "ref_logps/chosen": -38.90857696533203, "ref_logps/rejected": -48.47710037231445, "rewards/accuracies": 0.671875, "rewards/chosen": -0.40324676036834717, "rewards/margins": 0.1766246110200882, "rewards/rejected": -0.5798712968826294, "step": 78 }, { "epoch": 0.6, "grad_norm": 7.898667752271484, "learning_rate": 4.452247191011236e-07, "logps/chosen": -42.00642395019531, "logps/rejected": -51.58437728881836, "loss": 0.6326, "losses/dpo": 0.6813949346542358, "losses/sft": 1.5958709716796875, "losses/total": 0.6813949346542358, "ref_logps/chosen": -38.2513427734375, "ref_logps/rejected": -46.194374084472656, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.3755083978176117, "rewards/margins": 0.16349197924137115, "rewards/rejected": -0.5390004515647888, "step": 79 }, { "epoch": 0.6, "grad_norm": 8.4984309671094, "learning_rate": 4.438202247191011e-07, "logps/chosen": -47.29401397705078, "logps/rejected": -54.29883575439453, "loss": 0.642, "losses/dpo": 0.6930491924285889, "losses/sft": 1.8281772136688232, "losses/total": 0.6930491924285889, "ref_logps/chosen": -42.54518127441406, "ref_logps/rejected": -47.80082321166992, "rewards/accuracies": 0.578125, "rewards/chosen": -0.47488299012184143, "rewards/margins": 0.17491832375526428, "rewards/rejected": -0.6498013138771057, "step": 80 }, { "epoch": 0.61, "grad_norm": 8.385277776330648, "learning_rate": 4.4241573033707865e-07, "logps/chosen": -48.636085510253906, "logps/rejected": -54.089210510253906, "loss": 0.6105, "losses/dpo": 0.5880630612373352, "losses/sft": 1.647892951965332, "losses/total": 0.5880630612373352, "ref_logps/chosen": -43.907169342041016, "ref_logps/rejected": -47.025569915771484, "rewards/accuracies": 0.671875, "rewards/chosen": -0.472891628742218, "rewards/margins": 0.2334723323583603, "rewards/rejected": -0.7063639760017395, "step": 81 }, { "epoch": 0.62, "grad_norm": 10.98551780733234, "learning_rate": 4.410112359550562e-07, "logps/chosen": -44.03840637207031, "logps/rejected": -52.65192413330078, "loss": 0.622, "losses/dpo": 0.5322688817977905, "losses/sft": 1.9210578203201294, "losses/total": 0.5322688817977905, "ref_logps/chosen": -39.61579132080078, "ref_logps/rejected": -46.18141174316406, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4422611594200134, "rewards/margins": 0.2047904133796692, "rewards/rejected": -0.6470515727996826, "step": 82 }, { "epoch": 0.63, "grad_norm": 7.7041080998180425, "learning_rate": 4.3960674157303366e-07, "logps/chosen": -44.86250305175781, "logps/rejected": -52.133052825927734, "loss": 0.6265, "losses/dpo": 0.5972993969917297, "losses/sft": 1.4703764915466309, "losses/total": 0.5972993969917297, "ref_logps/chosen": -40.2207145690918, "ref_logps/rejected": -45.56148910522461, "rewards/accuracies": 0.6875, "rewards/chosen": -0.46417874097824097, "rewards/margins": 0.192977637052536, "rewards/rejected": -0.6571563482284546, "step": 83 }, { "epoch": 0.63, "grad_norm": 7.893501393776045, "learning_rate": 4.382022471910112e-07, "logps/chosen": -45.97453689575195, "logps/rejected": -55.243316650390625, "loss": 0.6289, "losses/dpo": 0.5659444332122803, "losses/sft": 1.6541783809661865, "losses/total": 0.5659444332122803, "ref_logps/chosen": -40.886837005615234, "ref_logps/rejected": -48.23194885253906, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.5087698698043823, "rewards/margins": 0.19236721098423004, "rewards/rejected": -0.7011370062828064, "step": 84 }, { "epoch": 0.64, "grad_norm": 9.509139423826628, "learning_rate": 4.367977528089887e-07, "logps/chosen": -44.12831115722656, "logps/rejected": -54.7608642578125, "loss": 0.6193, "losses/dpo": 0.5432471036911011, "losses/sft": 1.8381226062774658, "losses/total": 0.5432471036911011, "ref_logps/chosen": -39.457069396972656, "ref_logps/rejected": -47.7266731262207, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.46712398529052734, "rewards/margins": 0.23629523813724518, "rewards/rejected": -0.7034192681312561, "step": 85 }, { "epoch": 0.65, "grad_norm": 7.887107642093141, "learning_rate": 4.353932584269663e-07, "logps/chosen": -45.145904541015625, "logps/rejected": -55.40123748779297, "loss": 0.6092, "losses/dpo": 0.5780594944953918, "losses/sft": 1.9163440465927124, "losses/total": 0.5780594944953918, "ref_logps/chosen": -40.329139709472656, "ref_logps/rejected": -48.183074951171875, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4816761910915375, "rewards/margins": 0.24014019966125488, "rewards/rejected": -0.7218164205551147, "step": 86 }, { "epoch": 0.66, "grad_norm": 8.39204893477319, "learning_rate": 4.339887640449438e-07, "logps/chosen": -46.145851135253906, "logps/rejected": -53.961036682128906, "loss": 0.6403, "losses/dpo": 0.60181725025177, "losses/sft": 1.625700831413269, "losses/total": 0.60181725025177, "ref_logps/chosen": -40.71726608276367, "ref_logps/rejected": -46.57926940917969, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.5428579449653625, "rewards/margins": 0.19531863927841187, "rewards/rejected": -0.7381765842437744, "step": 87 }, { "epoch": 0.66, "grad_norm": 8.63628479914341, "learning_rate": 4.3258426966292134e-07, "logps/chosen": -48.23160171508789, "logps/rejected": -53.496604919433594, "loss": 0.6382, "losses/dpo": 0.5790094137191772, "losses/sft": 1.4581667184829712, "losses/total": 0.5790094137191772, "ref_logps/chosen": -42.78254699707031, "ref_logps/rejected": -46.28538131713867, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.5449056029319763, "rewards/margins": 0.1762169450521469, "rewards/rejected": -0.7211225628852844, "step": 88 }, { "epoch": 0.67, "grad_norm": 7.682264878764254, "learning_rate": 4.311797752808989e-07, "logps/chosen": -44.9177360534668, "logps/rejected": -57.826866149902344, "loss": 0.5789, "losses/dpo": 0.5269919037818909, "losses/sft": 1.9525985717773438, "losses/total": 0.5269919037818909, "ref_logps/chosen": -39.326385498046875, "ref_logps/rejected": -48.92866134643555, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5591354370117188, "rewards/margins": 0.33068495988845825, "rewards/rejected": -0.889820396900177, "step": 89 }, { "epoch": 0.68, "grad_norm": 9.993714538070773, "learning_rate": 4.297752808988764e-07, "logps/chosen": -47.74348068237305, "logps/rejected": -51.28035354614258, "loss": 0.6687, "losses/dpo": 0.6379462480545044, "losses/sft": 1.61128830909729, "losses/total": 0.6379462480545044, "ref_logps/chosen": -41.72521209716797, "ref_logps/rejected": -44.09041976928711, "rewards/accuracies": 0.640625, "rewards/chosen": -0.6018266677856445, "rewards/margins": 0.11716663837432861, "rewards/rejected": -0.7189933061599731, "step": 90 }, { "epoch": 0.69, "grad_norm": 9.79439579593073, "learning_rate": 4.2837078651685396e-07, "logps/chosen": -46.20167541503906, "logps/rejected": -54.01323318481445, "loss": 0.6199, "losses/dpo": 0.4756242632865906, "losses/sft": 1.6830320358276367, "losses/total": 0.4756242632865906, "ref_logps/chosen": -40.16909408569336, "ref_logps/rejected": -45.69634246826172, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.6032581329345703, "rewards/margins": 0.22843076288700104, "rewards/rejected": -0.8316888809204102, "step": 91 }, { "epoch": 0.69, "grad_norm": 8.115443534609298, "learning_rate": 4.269662921348314e-07, "logps/chosen": -49.6710319519043, "logps/rejected": -60.84608840942383, "loss": 0.5948, "losses/dpo": 0.6356014013290405, "losses/sft": 1.7809040546417236, "losses/total": 0.6356014013290405, "ref_logps/chosen": -43.4056282043457, "ref_logps/rejected": -51.69749069213867, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.6265405416488647, "rewards/margins": 0.2883196473121643, "rewards/rejected": -0.914860188961029, "step": 92 }, { "epoch": 0.7, "grad_norm": 8.161770613063224, "learning_rate": 4.2556179775280896e-07, "logps/chosen": -44.44715881347656, "logps/rejected": -52.76496124267578, "loss": 0.6499, "losses/dpo": 0.5320106744766235, "losses/sft": 1.5193849802017212, "losses/total": 0.5320106744766235, "ref_logps/chosen": -38.260562896728516, "ref_logps/rejected": -44.902198791503906, "rewards/accuracies": 0.625, "rewards/chosen": -0.618659496307373, "rewards/margins": 0.1676165610551834, "rewards/rejected": -0.78627610206604, "step": 93 }, { "epoch": 0.71, "grad_norm": 11.63146009410558, "learning_rate": 4.2415730337078647e-07, "logps/chosen": -48.569419860839844, "logps/rejected": -55.14238357543945, "loss": 0.6309, "losses/dpo": 0.5968553423881531, "losses/sft": 1.5720221996307373, "losses/total": 0.5968553423881531, "ref_logps/chosen": -42.18726348876953, "ref_logps/rejected": -46.23759460449219, "rewards/accuracies": 0.625, "rewards/chosen": -0.6382158994674683, "rewards/margins": 0.2522626221179962, "rewards/rejected": -0.8904784917831421, "step": 94 }, { "epoch": 0.72, "grad_norm": 7.826574210546138, "learning_rate": 4.22752808988764e-07, "logps/chosen": -48.970035552978516, "logps/rejected": -56.224632263183594, "loss": 0.6209, "losses/dpo": 0.566143810749054, "losses/sft": 1.7626792192459106, "losses/total": 0.566143810749054, "ref_logps/chosen": -42.288108825683594, "ref_logps/rejected": -47.209964752197266, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.6681923270225525, "rewards/margins": 0.23327398300170898, "rewards/rejected": -0.9014662504196167, "step": 95 }, { "epoch": 0.72, "grad_norm": 7.984390488888068, "learning_rate": 4.2134831460674153e-07, "logps/chosen": -44.690216064453125, "logps/rejected": -57.38431930541992, "loss": 0.5916, "losses/dpo": 0.5278609395027161, "losses/sft": 1.7001551389694214, "losses/total": 0.5278609395027161, "ref_logps/chosen": -39.018001556396484, "ref_logps/rejected": -48.73064422607422, "rewards/accuracies": 0.734375, "rewards/chosen": -0.567221999168396, "rewards/margins": 0.2981455326080322, "rewards/rejected": -0.8653674125671387, "step": 96 }, { "epoch": 0.73, "grad_norm": 8.140376307221391, "learning_rate": 4.199438202247191e-07, "logps/chosen": -45.94129180908203, "logps/rejected": -57.70640563964844, "loss": 0.5843, "losses/dpo": 0.6323425769805908, "losses/sft": 1.9729546308517456, "losses/total": 0.6323425769805908, "ref_logps/chosen": -39.79209899902344, "ref_logps/rejected": -47.78965377807617, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.6149196624755859, "rewards/margins": 0.37675485014915466, "rewards/rejected": -0.991674542427063, "step": 97 }, { "epoch": 0.74, "grad_norm": 8.940775697669213, "learning_rate": 4.1853932584269664e-07, "logps/chosen": -49.44526672363281, "logps/rejected": -54.83664321899414, "loss": 0.6672, "losses/dpo": 0.6795445084571838, "losses/sft": 1.7624062299728394, "losses/total": 0.6795445084571838, "ref_logps/chosen": -42.365447998046875, "ref_logps/rejected": -46.211761474609375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7079817652702332, "rewards/margins": 0.15450690686702728, "rewards/rejected": -0.8624885678291321, "step": 98 }, { "epoch": 0.75, "grad_norm": 7.425234946687916, "learning_rate": 4.1713483146067415e-07, "logps/chosen": -43.5301513671875, "logps/rejected": -52.020164489746094, "loss": 0.5957, "losses/dpo": 0.6246699690818787, "losses/sft": 1.7049494981765747, "losses/total": 0.6246699690818787, "ref_logps/chosen": -37.423370361328125, "ref_logps/rejected": -42.76348876953125, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.6106777191162109, "rewards/margins": 0.31499022245407104, "rewards/rejected": -0.9256680607795715, "step": 99 }, { "epoch": 0.75, "grad_norm": 9.179756029554161, "learning_rate": 4.157303370786517e-07, "logps/chosen": -49.9049072265625, "logps/rejected": -51.58677673339844, "loss": 0.6817, "losses/dpo": 0.4553123712539673, "losses/sft": 1.8781102895736694, "losses/total": 0.4553123712539673, "ref_logps/chosen": -42.09438705444336, "ref_logps/rejected": -42.1667594909668, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7810521721839905, "rewards/margins": 0.16094914078712463, "rewards/rejected": -0.9420013427734375, "step": 100 }, { "epoch": 0.76, "grad_norm": 8.48875168411848, "learning_rate": 4.1432584269662915e-07, "logps/chosen": -43.814697265625, "logps/rejected": -56.10200881958008, "loss": 0.5895, "losses/dpo": 0.6046161651611328, "losses/sft": 1.7643048763275146, "losses/total": 0.6046161651611328, "ref_logps/chosen": -37.67271423339844, "ref_logps/rejected": -46.54473114013672, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6141979694366455, "rewards/margins": 0.3415302038192749, "rewards/rejected": -0.9557281732559204, "step": 101 }, { "epoch": 0.77, "grad_norm": 8.913022100408304, "learning_rate": 4.129213483146067e-07, "logps/chosen": -43.33885955810547, "logps/rejected": -49.23841094970703, "loss": 0.6399, "losses/dpo": 0.5780912637710571, "losses/sft": 2.1572117805480957, "losses/total": 0.5780912637710571, "ref_logps/chosen": -37.323486328125, "ref_logps/rejected": -41.119110107421875, "rewards/accuracies": 0.609375, "rewards/chosen": -0.6015373468399048, "rewards/margins": 0.21039217710494995, "rewards/rejected": -0.8119295239448547, "step": 102 }, { "epoch": 0.78, "grad_norm": 8.206141020123177, "learning_rate": 4.115168539325842e-07, "logps/chosen": -48.0225830078125, "logps/rejected": -54.95545196533203, "loss": 0.6109, "losses/dpo": 0.5648887753486633, "losses/sft": 1.740958571434021, "losses/total": 0.5648887753486633, "ref_logps/chosen": -41.10425567626953, "ref_logps/rejected": -45.241817474365234, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.6918322443962097, "rewards/margins": 0.27953118085861206, "rewards/rejected": -0.9713634848594666, "step": 103 }, { "epoch": 0.78, "grad_norm": 7.999449117068295, "learning_rate": 4.1011235955056177e-07, "logps/chosen": -48.33061981201172, "logps/rejected": -56.14019775390625, "loss": 0.6036, "losses/dpo": 0.5798739194869995, "losses/sft": 1.6369647979736328, "losses/total": 0.5798739194869995, "ref_logps/chosen": -40.94407653808594, "ref_logps/rejected": -45.9286003112793, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7386540174484253, "rewards/margins": 0.28250569105148315, "rewards/rejected": -1.0211596488952637, "step": 104 }, { "epoch": 0.79, "grad_norm": 7.961208032843563, "learning_rate": 4.0870786516853933e-07, "logps/chosen": -43.01828384399414, "logps/rejected": -55.2956657409668, "loss": 0.5814, "losses/dpo": 0.5777114629745483, "losses/sft": 2.0679547786712646, "losses/total": 0.5777114629745483, "ref_logps/chosen": -36.25102233886719, "ref_logps/rejected": -45.063175201416016, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6767261624336243, "rewards/margins": 0.3465230464935303, "rewards/rejected": -1.0232491493225098, "step": 105 }, { "epoch": 0.8, "grad_norm": 8.162855072558866, "learning_rate": 4.0730337078651683e-07, "logps/chosen": -38.68614959716797, "logps/rejected": -51.58649444580078, "loss": 0.6392, "losses/dpo": 0.6045973896980286, "losses/sft": 1.7759897708892822, "losses/total": 0.6045973896980286, "ref_logps/chosen": -32.37638854980469, "ref_logps/rejected": -42.94057083129883, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.6309766173362732, "rewards/margins": 0.2336157262325287, "rewards/rejected": -0.8645923733711243, "step": 106 }, { "epoch": 0.81, "grad_norm": 10.312576606575677, "learning_rate": 4.058988764044944e-07, "logps/chosen": -50.928245544433594, "logps/rejected": -59.36943054199219, "loss": 0.654, "losses/dpo": 0.735203206539154, "losses/sft": 2.190847158432007, "losses/total": 0.735203206539154, "ref_logps/chosen": -42.11771774291992, "ref_logps/rejected": -48.740692138671875, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.8810529112815857, "rewards/margins": 0.18182089924812317, "rewards/rejected": -1.0628738403320312, "step": 107 }, { "epoch": 0.82, "grad_norm": 8.648201545330428, "learning_rate": 4.044943820224719e-07, "logps/chosen": -47.14788818359375, "logps/rejected": -54.653385162353516, "loss": 0.63, "losses/dpo": 0.6501352787017822, "losses/sft": 2.0834176540374756, "losses/total": 0.6501352787017822, "ref_logps/chosen": -39.38991928100586, "ref_logps/rejected": -44.46839904785156, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.7757968902587891, "rewards/margins": 0.24270157516002655, "rewards/rejected": -1.0184985399246216, "step": 108 }, { "epoch": 0.82, "grad_norm": 9.8578017889453, "learning_rate": 4.0308988764044945e-07, "logps/chosen": -48.94048309326172, "logps/rejected": -56.453369140625, "loss": 0.6293, "losses/dpo": 0.7049952149391174, "losses/sft": 2.280228614807129, "losses/total": 0.7049952149391174, "ref_logps/chosen": -41.544471740722656, "ref_logps/rejected": -46.083526611328125, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.7396014332771301, "rewards/margins": 0.29738324880599976, "rewards/rejected": -1.0369845628738403, "step": 109 }, { "epoch": 0.83, "grad_norm": 8.394061989951249, "learning_rate": 4.0168539325842696e-07, "logps/chosen": -45.53207015991211, "logps/rejected": -54.01716613769531, "loss": 0.6652, "losses/dpo": 0.5625556111335754, "losses/sft": 2.00128436088562, "losses/total": 0.5625556111335754, "ref_logps/chosen": -37.465904235839844, "ref_logps/rejected": -44.16294860839844, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8066164255142212, "rewards/margins": 0.17880576848983765, "rewards/rejected": -0.9854221940040588, "step": 110 }, { "epoch": 0.84, "grad_norm": 8.76540952525436, "learning_rate": 4.0028089887640446e-07, "logps/chosen": -46.494407653808594, "logps/rejected": -56.70625686645508, "loss": 0.6355, "losses/dpo": 0.816941499710083, "losses/sft": 2.149186372756958, "losses/total": 0.816941499710083, "ref_logps/chosen": -38.6019401550293, "ref_logps/rejected": -46.42439270019531, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.7892469167709351, "rewards/margins": 0.23893946409225464, "rewards/rejected": -1.028186321258545, "step": 111 }, { "epoch": 0.85, "grad_norm": 8.706675493262694, "learning_rate": 3.9887640449438196e-07, "logps/chosen": -48.493709564208984, "logps/rejected": -58.21279525756836, "loss": 0.6039, "losses/dpo": 0.49378710985183716, "losses/sft": 1.3456647396087646, "losses/total": 0.49378710985183716, "ref_logps/chosen": -41.08473205566406, "ref_logps/rejected": -47.72603225708008, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.7408972978591919, "rewards/margins": 0.30777889490127563, "rewards/rejected": -1.0486761331558228, "step": 112 }, { "epoch": 0.85, "grad_norm": 8.390537463814029, "learning_rate": 3.974719101123595e-07, "logps/chosen": -45.94505310058594, "logps/rejected": -59.44303894042969, "loss": 0.5863, "losses/dpo": 0.7109102606773376, "losses/sft": 1.734868049621582, "losses/total": 0.7109102606773376, "ref_logps/chosen": -39.46669387817383, "ref_logps/rejected": -49.24740219116211, "rewards/accuracies": 0.75, "rewards/chosen": -0.6478357911109924, "rewards/margins": 0.37172842025756836, "rewards/rejected": -1.0195642709732056, "step": 113 }, { "epoch": 0.86, "grad_norm": 7.999605487961454, "learning_rate": 3.960674157303371e-07, "logps/chosen": -43.57410430908203, "logps/rejected": -53.34558868408203, "loss": 0.6172, "losses/dpo": 0.5053269863128662, "losses/sft": 1.450685739517212, "losses/total": 0.5053269863128662, "ref_logps/chosen": -36.833072662353516, "ref_logps/rejected": -43.94281768798828, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.6741028428077698, "rewards/margins": 0.26617470383644104, "rewards/rejected": -0.9402774572372437, "step": 114 }, { "epoch": 0.87, "grad_norm": 9.415075126299298, "learning_rate": 3.946629213483146e-07, "logps/chosen": -46.565555572509766, "logps/rejected": -57.010372161865234, "loss": 0.5955, "losses/dpo": 0.5628423690795898, "losses/sft": 1.7239865064620972, "losses/total": 0.5628423690795898, "ref_logps/chosen": -39.13492202758789, "ref_logps/rejected": -46.0317497253418, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.743063747882843, "rewards/margins": 0.3547991216182709, "rewards/rejected": -1.097862958908081, "step": 115 }, { "epoch": 0.88, "grad_norm": 9.600908058178481, "learning_rate": 3.9325842696629214e-07, "logps/chosen": -50.78204345703125, "logps/rejected": -56.04452896118164, "loss": 0.6852, "losses/dpo": 0.635813295841217, "losses/sft": 1.73310124874115, "losses/total": 0.635813295841217, "ref_logps/chosen": -42.01323318481445, "ref_logps/rejected": -45.830718994140625, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.8768815994262695, "rewards/margins": 0.14449933171272278, "rewards/rejected": -1.02138090133667, "step": 116 }, { "epoch": 0.88, "grad_norm": 8.373342286084757, "learning_rate": 3.9185393258426964e-07, "logps/chosen": -47.82256317138672, "logps/rejected": -54.956302642822266, "loss": 0.6136, "losses/dpo": 0.6849408149719238, "losses/sft": 2.210094451904297, "losses/total": 0.6849408149719238, "ref_logps/chosen": -40.209930419921875, "ref_logps/rejected": -44.30906295776367, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.7612636089324951, "rewards/margins": 0.3034607172012329, "rewards/rejected": -1.0647242069244385, "step": 117 }, { "epoch": 0.89, "grad_norm": 9.637403138990082, "learning_rate": 3.904494382022472e-07, "logps/chosen": -48.02635955810547, "logps/rejected": -57.71215057373047, "loss": 0.5835, "losses/dpo": 0.6933724880218506, "losses/sft": 1.4713902473449707, "losses/total": 0.6933724880218506, "ref_logps/chosen": -40.86788558959961, "ref_logps/rejected": -47.20375442504883, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.7158471345901489, "rewards/margins": 0.33499258756637573, "rewards/rejected": -1.0508397817611694, "step": 118 }, { "epoch": 0.9, "grad_norm": 8.531064508491891, "learning_rate": 3.890449438202247e-07, "logps/chosen": -46.2841796875, "logps/rejected": -54.74563980102539, "loss": 0.6334, "losses/dpo": 0.702929675579071, "losses/sft": 1.360397219657898, "losses/total": 0.702929675579071, "ref_logps/chosen": -39.4974365234375, "ref_logps/rejected": -45.235111236572266, "rewards/accuracies": 0.640625, "rewards/chosen": -0.6786739826202393, "rewards/margins": 0.2723783850669861, "rewards/rejected": -0.9510524272918701, "step": 119 }, { "epoch": 0.91, "grad_norm": 41.566258002848386, "learning_rate": 3.876404494382022e-07, "logps/chosen": -44.022640228271484, "logps/rejected": -55.926021575927734, "loss": 0.596, "losses/dpo": 0.5612522959709167, "losses/sft": 1.8725919723510742, "losses/total": 0.5612522959709167, "ref_logps/chosen": -37.003326416015625, "ref_logps/rejected": -45.59555435180664, "rewards/accuracies": 0.671875, "rewards/chosen": -0.701931357383728, "rewards/margins": 0.3311149477958679, "rewards/rejected": -1.0330464839935303, "step": 120 }, { "epoch": 0.91, "grad_norm": 8.107636440875833, "learning_rate": 3.8623595505617977e-07, "logps/chosen": -46.37071990966797, "logps/rejected": -57.88187026977539, "loss": 0.5981, "losses/dpo": 0.5683261156082153, "losses/sft": 1.2367005348205566, "losses/total": 0.5683261156082153, "ref_logps/chosen": -40.31574630737305, "ref_logps/rejected": -48.33371353149414, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.6054975986480713, "rewards/margins": 0.3493175208568573, "rewards/rejected": -0.9548150300979614, "step": 121 }, { "epoch": 0.92, "grad_norm": 7.976666948784577, "learning_rate": 3.8483146067415727e-07, "logps/chosen": -49.71910095214844, "logps/rejected": -61.38713073730469, "loss": 0.5628, "losses/dpo": 0.6633545756340027, "losses/sft": 2.0874011516571045, "losses/total": 0.6633545756340027, "ref_logps/chosen": -42.128177642822266, "ref_logps/rejected": -49.5493278503418, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7590923309326172, "rewards/margins": 0.42468804121017456, "rewards/rejected": -1.1837804317474365, "step": 122 }, { "epoch": 0.93, "grad_norm": 8.273120167281697, "learning_rate": 3.834269662921348e-07, "logps/chosen": -44.52275848388672, "logps/rejected": -56.69093322753906, "loss": 0.5973, "losses/dpo": 0.7617586851119995, "losses/sft": 1.4152177572250366, "losses/total": 0.7617586851119995, "ref_logps/chosen": -38.226234436035156, "ref_logps/rejected": -47.10552215576172, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.6296522617340088, "rewards/margins": 0.32888925075531006, "rewards/rejected": -0.9585415124893188, "step": 123 }, { "epoch": 0.94, "grad_norm": 8.426019615665346, "learning_rate": 3.8202247191011233e-07, "logps/chosen": -45.17867660522461, "logps/rejected": -52.477989196777344, "loss": 0.5956, "losses/dpo": 0.8184994459152222, "losses/sft": 2.1767871379852295, "losses/total": 0.8184994459152222, "ref_logps/chosen": -39.22052001953125, "ref_logps/rejected": -42.9237060546875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5958150625228882, "rewards/margins": 0.35961273312568665, "rewards/rejected": -0.9554278254508972, "step": 124 }, { "epoch": 0.94, "grad_norm": 9.445037206868065, "learning_rate": 3.806179775280899e-07, "logps/chosen": -49.57510757446289, "logps/rejected": -57.41535568237305, "loss": 0.626, "losses/dpo": 0.5964499711990356, "losses/sft": 1.574311375617981, "losses/total": 0.5964499711990356, "ref_logps/chosen": -42.10932159423828, "ref_logps/rejected": -47.498348236083984, "rewards/accuracies": 0.6875, "rewards/chosen": -0.746578574180603, "rewards/margins": 0.24512259662151337, "rewards/rejected": -0.9917011260986328, "step": 125 }, { "epoch": 0.95, "grad_norm": 8.142660343260747, "learning_rate": 3.792134831460674e-07, "logps/chosen": -49.39699935913086, "logps/rejected": -60.31464385986328, "loss": 0.5557, "losses/dpo": 0.4981670677661896, "losses/sft": 1.7636176347732544, "losses/total": 0.4981670677661896, "ref_logps/chosen": -42.645172119140625, "ref_logps/rejected": -49.29505157470703, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.675183117389679, "rewards/margins": 0.4267764091491699, "rewards/rejected": -1.1019595861434937, "step": 126 }, { "epoch": 0.96, "grad_norm": 10.371790207828976, "learning_rate": 3.7780898876404495e-07, "logps/chosen": -49.649539947509766, "logps/rejected": -55.797752380371094, "loss": 0.6236, "losses/dpo": 0.6386290788650513, "losses/sft": 1.9548349380493164, "losses/total": 0.6386290788650513, "ref_logps/chosen": -43.093387603759766, "ref_logps/rejected": -46.55876541137695, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.6556151509284973, "rewards/margins": 0.2682836949825287, "rewards/rejected": -0.9238989353179932, "step": 127 }, { "epoch": 0.97, "grad_norm": 8.551344033897433, "learning_rate": 3.7640449438202245e-07, "logps/chosen": -47.41435623168945, "logps/rejected": -53.9222526550293, "loss": 0.6097, "losses/dpo": 0.5466079711914062, "losses/sft": 1.6678849458694458, "losses/total": 0.5466079711914062, "ref_logps/chosen": -40.969482421875, "ref_logps/rejected": -44.308204650878906, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6444875001907349, "rewards/margins": 0.3169165849685669, "rewards/rejected": -0.9614041447639465, "step": 128 }, { "epoch": 0.97, "grad_norm": 8.48986980564796, "learning_rate": 3.75e-07, "logps/chosen": -46.43178176879883, "logps/rejected": -54.83501434326172, "loss": 0.6116, "losses/dpo": 0.7334883213043213, "losses/sft": 2.06964111328125, "losses/total": 0.7334883213043213, "ref_logps/chosen": -39.70704650878906, "ref_logps/rejected": -45.3802490234375, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6724739074707031, "rewards/margins": 0.2730027139186859, "rewards/rejected": -0.9454765915870667, "step": 129 }, { "epoch": 0.98, "grad_norm": 7.931029049263691, "learning_rate": 3.735955056179775e-07, "logps/chosen": -41.7554817199707, "logps/rejected": -51.33645248413086, "loss": 0.6118, "losses/dpo": 0.6140174865722656, "losses/sft": 1.9485254287719727, "losses/total": 0.6140174865722656, "ref_logps/chosen": -36.31658172607422, "ref_logps/rejected": -43.00047302246094, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5438905954360962, "rewards/margins": 0.28970640897750854, "rewards/rejected": -0.8335970640182495, "step": 130 }, { "epoch": 0.99, "grad_norm": 7.961225244135859, "learning_rate": 3.72191011235955e-07, "logps/chosen": -45.26293182373047, "logps/rejected": -53.40849304199219, "loss": 0.5975, "losses/dpo": 0.6238459348678589, "losses/sft": 1.9288058280944824, "losses/total": 0.6238459348678589, "ref_logps/chosen": -39.287940979003906, "ref_logps/rejected": -43.95112991333008, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.597498893737793, "rewards/margins": 0.34823763370513916, "rewards/rejected": -0.9457363486289978, "step": 131 }, { "epoch": 1.0, "grad_norm": 8.463248361122965, "learning_rate": 3.707865168539326e-07, "logps/chosen": -46.658084869384766, "logps/rejected": -55.08595275878906, "loss": 0.6124, "losses/dpo": 0.5780600309371948, "losses/sft": 2.4361934661865234, "losses/total": 0.5780600309371948, "ref_logps/chosen": -40.17093276977539, "ref_logps/rejected": -45.625953674316406, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6487153172492981, "rewards/margins": 0.29728472232818604, "rewards/rejected": -0.9460000395774841, "step": 132 }, { "epoch": 1.0, "grad_norm": 7.966461556458486, "learning_rate": 3.693820224719101e-07, "logps/chosen": -43.571929931640625, "logps/rejected": -56.445987701416016, "loss": 0.5996, "losses/dpo": 0.6271636486053467, "losses/sft": 1.7036837339401245, "losses/total": 0.6271636486053467, "ref_logps/chosen": -37.748313903808594, "ref_logps/rejected": -46.95930862426758, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5823614001274109, "rewards/margins": 0.366305947303772, "rewards/rejected": -0.9486674070358276, "step": 133 }, { "epoch": 1.01, "grad_norm": 8.685706334967172, "learning_rate": 3.6797752808988764e-07, "logps/chosen": -45.6722412109375, "logps/rejected": -58.07893371582031, "loss": 0.5569, "losses/dpo": 0.44357961416244507, "losses/sft": 1.9906073808670044, "losses/total": 0.44357961416244507, "ref_logps/chosen": -39.666465759277344, "ref_logps/rejected": -47.67109298706055, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6005774736404419, "rewards/margins": 0.44020622968673706, "rewards/rejected": -1.0407837629318237, "step": 134 }, { "epoch": 1.02, "grad_norm": 7.5465440969699475, "learning_rate": 3.6657303370786514e-07, "logps/chosen": -44.68495178222656, "logps/rejected": -59.658729553222656, "loss": 0.5418, "losses/dpo": 0.6047704815864563, "losses/sft": 1.8062564134597778, "losses/total": 0.6047704815864563, "ref_logps/chosen": -38.53544616699219, "ref_logps/rejected": -48.757301330566406, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6149506568908691, "rewards/margins": 0.47519204020500183, "rewards/rejected": -1.0901426076889038, "step": 135 }, { "epoch": 1.03, "grad_norm": 8.040538818716184, "learning_rate": 3.651685393258427e-07, "logps/chosen": -49.753868103027344, "logps/rejected": -55.77690124511719, "loss": 0.5676, "losses/dpo": 0.6332641243934631, "losses/sft": 1.9821722507476807, "losses/total": 0.6332641243934631, "ref_logps/chosen": -43.2289924621582, "ref_logps/rejected": -44.969783782958984, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6524879932403564, "rewards/margins": 0.4282234311103821, "rewards/rejected": -1.0807113647460938, "step": 136 }, { "epoch": 1.03, "grad_norm": 8.176454529563474, "learning_rate": 3.637640449438202e-07, "logps/chosen": -44.740989685058594, "logps/rejected": -56.03924560546875, "loss": 0.5426, "losses/dpo": 0.5114879608154297, "losses/sft": 1.7858185768127441, "losses/total": 0.5114879608154297, "ref_logps/chosen": -39.14986038208008, "ref_logps/rejected": -45.894187927246094, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5591133236885071, "rewards/margins": 0.45539283752441406, "rewards/rejected": -1.014506220817566, "step": 137 }, { "epoch": 1.04, "grad_norm": 7.378583429433305, "learning_rate": 3.6235955056179776e-07, "logps/chosen": -43.69382095336914, "logps/rejected": -58.01210021972656, "loss": 0.5576, "losses/dpo": 0.5225633978843689, "losses/sft": 1.5681252479553223, "losses/total": 0.5225633978843689, "ref_logps/chosen": -37.7261848449707, "ref_logps/rejected": -47.4239616394043, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5967639684677124, "rewards/margins": 0.46204984188079834, "rewards/rejected": -1.0588138103485107, "step": 138 }, { "epoch": 1.05, "grad_norm": 6.804949706358365, "learning_rate": 3.6095505617977526e-07, "logps/chosen": -41.86065673828125, "logps/rejected": -56.17615509033203, "loss": 0.541, "losses/dpo": 0.5155816674232483, "losses/sft": 1.66976797580719, "losses/total": 0.5155816674232483, "ref_logps/chosen": -36.93909454345703, "ref_logps/rejected": -46.33450698852539, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.49215659499168396, "rewards/margins": 0.492008239030838, "rewards/rejected": -0.9841648936271667, "step": 139 }, { "epoch": 1.06, "grad_norm": 9.08939326139503, "learning_rate": 3.5955056179775277e-07, "logps/chosen": -50.215492248535156, "logps/rejected": -61.04364013671875, "loss": 0.4877, "losses/dpo": 0.4505589008331299, "losses/sft": 2.045948028564453, "losses/total": 0.4505589008331299, "ref_logps/chosen": -44.16120147705078, "ref_logps/rejected": -48.857383728027344, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.6054291725158691, "rewards/margins": 0.6131964921951294, "rewards/rejected": -1.218625545501709, "step": 140 }, { "epoch": 1.06, "grad_norm": 7.204665546592892, "learning_rate": 3.581460674157303e-07, "logps/chosen": -44.662261962890625, "logps/rejected": -56.869659423828125, "loss": 0.5186, "losses/dpo": 0.4171184301376343, "losses/sft": 1.8761239051818848, "losses/total": 0.4171184301376343, "ref_logps/chosen": -38.65668487548828, "ref_logps/rejected": -45.534423828125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6005581617355347, "rewards/margins": 0.5329651832580566, "rewards/rejected": -1.1335232257843018, "step": 141 }, { "epoch": 1.07, "grad_norm": 7.887278895914157, "learning_rate": 3.5674157303370783e-07, "logps/chosen": -42.77589416503906, "logps/rejected": -51.52098083496094, "loss": 0.5407, "losses/dpo": 0.35035043954849243, "losses/sft": 1.5323569774627686, "losses/total": 0.35035043954849243, "ref_logps/chosen": -37.22514343261719, "ref_logps/rejected": -41.38079833984375, "rewards/accuracies": 0.75, "rewards/chosen": -0.5550752282142639, "rewards/margins": 0.45894336700439453, "rewards/rejected": -1.0140186548233032, "step": 142 }, { "epoch": 1.08, "grad_norm": 7.8842349887311505, "learning_rate": 3.553370786516854e-07, "logps/chosen": -48.38291549682617, "logps/rejected": -60.9052619934082, "loss": 0.5223, "losses/dpo": 0.3222103714942932, "losses/sft": 1.7116018533706665, "losses/total": 0.3222103714942932, "ref_logps/chosen": -41.83588409423828, "ref_logps/rejected": -48.54487228393555, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.6547029614448547, "rewards/margins": 0.5813360810279846, "rewards/rejected": -1.2360389232635498, "step": 143 }, { "epoch": 1.09, "grad_norm": 8.55836602139343, "learning_rate": 3.539325842696629e-07, "logps/chosen": -43.38884353637695, "logps/rejected": -58.84518051147461, "loss": 0.5957, "losses/dpo": 0.458510160446167, "losses/sft": 1.5300884246826172, "losses/total": 0.458510160446167, "ref_logps/chosen": -36.08906555175781, "ref_logps/rejected": -47.359981536865234, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7299779653549194, "rewards/margins": 0.41854166984558105, "rewards/rejected": -1.14851975440979, "step": 144 }, { "epoch": 1.09, "grad_norm": 7.848515280846644, "learning_rate": 3.5252808988764045e-07, "logps/chosen": -44.214759826660156, "logps/rejected": -55.23337936401367, "loss": 0.5469, "losses/dpo": 0.47486573457717896, "losses/sft": 1.7108842134475708, "losses/total": 0.47486573457717896, "ref_logps/chosen": -38.41852569580078, "ref_logps/rejected": -44.74144744873047, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5796229243278503, "rewards/margins": 0.4695700407028198, "rewards/rejected": -1.049193024635315, "step": 145 }, { "epoch": 1.1, "grad_norm": 7.658781592826533, "learning_rate": 3.51123595505618e-07, "logps/chosen": -44.951683044433594, "logps/rejected": -60.23070526123047, "loss": 0.5266, "losses/dpo": 0.5872490406036377, "losses/sft": 1.8830925226211548, "losses/total": 0.5872490406036377, "ref_logps/chosen": -38.69530487060547, "ref_logps/rejected": -48.41132354736328, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6256377696990967, "rewards/margins": 0.5562998056411743, "rewards/rejected": -1.1819374561309814, "step": 146 }, { "epoch": 1.11, "grad_norm": 8.54544198888784, "learning_rate": 3.497191011235955e-07, "logps/chosen": -48.944068908691406, "logps/rejected": -63.39553451538086, "loss": 0.533, "losses/dpo": 0.40039166808128357, "losses/sft": 1.862857460975647, "losses/total": 0.40039166808128357, "ref_logps/chosen": -42.56263732910156, "ref_logps/rejected": -50.94089126586914, "rewards/accuracies": 0.75, "rewards/chosen": -0.6381431818008423, "rewards/margins": 0.6073207259178162, "rewards/rejected": -1.2454639673233032, "step": 147 }, { "epoch": 1.12, "grad_norm": 7.389416730585067, "learning_rate": 3.48314606741573e-07, "logps/chosen": -38.91257858276367, "logps/rejected": -49.71195602416992, "loss": 0.5426, "losses/dpo": 0.46995991468429565, "losses/sft": 1.7020084857940674, "losses/total": 0.46995991468429565, "ref_logps/chosen": -33.769771575927734, "ref_logps/rejected": -40.01531219482422, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.5142804384231567, "rewards/margins": 0.45538395643234253, "rewards/rejected": -0.969664454460144, "step": 148 }, { "epoch": 1.12, "grad_norm": 7.674169098254074, "learning_rate": 3.469101123595505e-07, "logps/chosen": -51.57361602783203, "logps/rejected": -61.665931701660156, "loss": 0.4713, "losses/dpo": 0.4245557487010956, "losses/sft": 1.5430463552474976, "losses/total": 0.4245557487010956, "ref_logps/chosen": -44.85792541503906, "ref_logps/rejected": -48.205955505371094, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6715684533119202, "rewards/margins": 0.6744291186332703, "rewards/rejected": -1.3459975719451904, "step": 149 }, { "epoch": 1.13, "grad_norm": 22.33237134875282, "learning_rate": 3.4550561797752807e-07, "logps/chosen": -47.884517669677734, "logps/rejected": -58.32150650024414, "loss": 0.5577, "losses/dpo": 0.5011469721794128, "losses/sft": 1.6741063594818115, "losses/total": 0.5011469721794128, "ref_logps/chosen": -40.72835159301758, "ref_logps/rejected": -46.43974685668945, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.7156162261962891, "rewards/margins": 0.47256001830101013, "rewards/rejected": -1.1881763935089111, "step": 150 }, { "epoch": 1.14, "grad_norm": 7.494096272004472, "learning_rate": 3.441011235955056e-07, "logps/chosen": -47.11280059814453, "logps/rejected": -60.22679901123047, "loss": 0.4938, "losses/dpo": 0.5828070044517517, "losses/sft": 1.9693294763565063, "losses/total": 0.5828070044517517, "ref_logps/chosen": -41.27467727661133, "ref_logps/rejected": -47.46904754638672, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5838119983673096, "rewards/margins": 0.691962480545044, "rewards/rejected": -1.2757744789123535, "step": 151 }, { "epoch": 1.15, "grad_norm": 7.516820355286503, "learning_rate": 3.4269662921348313e-07, "logps/chosen": -40.18495559692383, "logps/rejected": -54.60972595214844, "loss": 0.5232, "losses/dpo": 0.5568748116493225, "losses/sft": 1.7009055614471436, "losses/total": 0.5568748116493225, "ref_logps/chosen": -33.816017150878906, "ref_logps/rejected": -43.013160705566406, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6368939280509949, "rewards/margins": 0.5227622389793396, "rewards/rejected": -1.159656047821045, "step": 152 }, { "epoch": 1.15, "grad_norm": 7.502946402300403, "learning_rate": 3.4129213483146064e-07, "logps/chosen": -42.238433837890625, "logps/rejected": -53.268348693847656, "loss": 0.543, "losses/dpo": 0.5467118620872498, "losses/sft": 1.2100037336349487, "losses/total": 0.5467118620872498, "ref_logps/chosen": -35.477867126464844, "ref_logps/rejected": -41.79436111450195, "rewards/accuracies": 0.765625, "rewards/chosen": -0.676056981086731, "rewards/margins": 0.47134220600128174, "rewards/rejected": -1.1473990678787231, "step": 153 }, { "epoch": 1.16, "grad_norm": 7.772041407679626, "learning_rate": 3.398876404494382e-07, "logps/chosen": -44.26182174682617, "logps/rejected": -64.4139633178711, "loss": 0.4692, "losses/dpo": 0.4075261354446411, "losses/sft": 1.9415578842163086, "losses/total": 0.4075261354446411, "ref_logps/chosen": -37.64110565185547, "ref_logps/rejected": -50.549678802490234, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6620715260505676, "rewards/margins": 0.7243567109107971, "rewards/rejected": -1.3864283561706543, "step": 154 }, { "epoch": 1.17, "grad_norm": 9.887168758150576, "learning_rate": 3.3848314606741575e-07, "logps/chosen": -46.856407165527344, "logps/rejected": -60.13237380981445, "loss": 0.5388, "losses/dpo": 0.4830757975578308, "losses/sft": 1.654982328414917, "losses/total": 0.4830757975578308, "ref_logps/chosen": -39.95355987548828, "ref_logps/rejected": -47.57228088378906, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6902844905853271, "rewards/margins": 0.5657243132591248, "rewards/rejected": -1.2560089826583862, "step": 155 }, { "epoch": 1.18, "grad_norm": 8.0774394495718, "learning_rate": 3.3707865168539325e-07, "logps/chosen": -46.11971664428711, "logps/rejected": -58.2076301574707, "loss": 0.5571, "losses/dpo": 0.6668601036071777, "losses/sft": 2.1451334953308105, "losses/total": 0.6668601036071777, "ref_logps/chosen": -38.718963623046875, "ref_logps/rejected": -45.97993850708008, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.7400756478309631, "rewards/margins": 0.48269355297088623, "rewards/rejected": -1.2227692604064941, "step": 156 }, { "epoch": 1.18, "grad_norm": 7.6995260059404735, "learning_rate": 3.356741573033708e-07, "logps/chosen": -42.61933898925781, "logps/rejected": -53.914058685302734, "loss": 0.5563, "losses/dpo": 0.5214600563049316, "losses/sft": 1.6869933605194092, "losses/total": 0.5214600563049316, "ref_logps/chosen": -35.26099395751953, "ref_logps/rejected": -41.69981002807617, "rewards/accuracies": 0.75, "rewards/chosen": -0.7358340620994568, "rewards/margins": 0.4855908751487732, "rewards/rejected": -1.2214250564575195, "step": 157 }, { "epoch": 1.19, "grad_norm": 10.378363778579734, "learning_rate": 3.3426966292134826e-07, "logps/chosen": -46.86024856567383, "logps/rejected": -61.01960754394531, "loss": 0.5047, "losses/dpo": 0.35230398178100586, "losses/sft": 2.1814329624176025, "losses/total": 0.35230398178100586, "ref_logps/chosen": -40.29644775390625, "ref_logps/rejected": -48.00575256347656, "rewards/accuracies": 0.75, "rewards/chosen": -0.6563804149627686, "rewards/margins": 0.6450048685073853, "rewards/rejected": -1.3013852834701538, "step": 158 }, { "epoch": 1.2, "grad_norm": 8.023005518353788, "learning_rate": 3.328651685393258e-07, "logps/chosen": -48.07395553588867, "logps/rejected": -60.29581069946289, "loss": 0.5284, "losses/dpo": 0.57308030128479, "losses/sft": 2.2541966438293457, "losses/total": 0.57308030128479, "ref_logps/chosen": -39.92966842651367, "ref_logps/rejected": -46.47784423828125, "rewards/accuracies": 0.75, "rewards/chosen": -0.8144292831420898, "rewards/margins": 0.5673672556877136, "rewards/rejected": -1.3817965984344482, "step": 159 }, { "epoch": 1.21, "grad_norm": 7.252828799858416, "learning_rate": 3.314606741573033e-07, "logps/chosen": -45.51723861694336, "logps/rejected": -60.294551849365234, "loss": 0.4636, "losses/dpo": 0.581541895866394, "losses/sft": 1.6298561096191406, "losses/total": 0.581541895866394, "ref_logps/chosen": -38.64426040649414, "ref_logps/rejected": -45.62995147705078, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6872978210449219, "rewards/margins": 0.7791618704795837, "rewards/rejected": -1.4664596319198608, "step": 160 }, { "epoch": 1.22, "grad_norm": 8.57918326492943, "learning_rate": 3.300561797752809e-07, "logps/chosen": -50.444820404052734, "logps/rejected": -56.87160110473633, "loss": 0.5883, "losses/dpo": 0.5911461710929871, "losses/sft": 2.1341657638549805, "losses/total": 0.5911461710929871, "ref_logps/chosen": -41.919921875, "ref_logps/rejected": -43.998451232910156, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8524903059005737, "rewards/margins": 0.43482455611228943, "rewards/rejected": -1.2873148918151855, "step": 161 }, { "epoch": 1.22, "grad_norm": 8.38374098065899, "learning_rate": 3.2865168539325844e-07, "logps/chosen": -50.57026672363281, "logps/rejected": -61.819053649902344, "loss": 0.5544, "losses/dpo": 0.5535627603530884, "losses/sft": 1.9382483959197998, "losses/total": 0.5535627603530884, "ref_logps/chosen": -41.4891357421875, "ref_logps/rejected": -47.33740234375, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.9081130027770996, "rewards/margins": 0.540052056312561, "rewards/rejected": -1.4481650590896606, "step": 162 }, { "epoch": 1.23, "grad_norm": 8.053746658561026, "learning_rate": 3.2724719101123594e-07, "logps/chosen": -49.71845245361328, "logps/rejected": -55.962120056152344, "loss": 0.5659, "losses/dpo": 0.5589174628257751, "losses/sft": 2.0584986209869385, "losses/total": 0.5589174628257751, "ref_logps/chosen": -41.30320739746094, "ref_logps/rejected": -42.65535354614258, "rewards/accuracies": 0.734375, "rewards/chosen": -0.841524600982666, "rewards/margins": 0.4891516864299774, "rewards/rejected": -1.3306763172149658, "step": 163 }, { "epoch": 1.24, "grad_norm": 8.128166160006492, "learning_rate": 3.258426966292135e-07, "logps/chosen": -49.184532165527344, "logps/rejected": -62.28634262084961, "loss": 0.5594, "losses/dpo": 0.6036563515663147, "losses/sft": 1.7208425998687744, "losses/total": 0.6036563515663147, "ref_logps/chosen": -40.15517044067383, "ref_logps/rejected": -47.867679595947266, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9029368162155151, "rewards/margins": 0.5389291644096375, "rewards/rejected": -1.441866159439087, "step": 164 }, { "epoch": 1.25, "grad_norm": 7.785958027787598, "learning_rate": 3.24438202247191e-07, "logps/chosen": -46.6161003112793, "logps/rejected": -64.03797912597656, "loss": 0.5012, "losses/dpo": 0.36227643489837646, "losses/sft": 1.8801686763763428, "losses/total": 0.36227643489837646, "ref_logps/chosen": -39.068603515625, "ref_logps/rejected": -49.793888092041016, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7547495365142822, "rewards/margins": 0.6696599125862122, "rewards/rejected": -1.42440927028656, "step": 165 }, { "epoch": 1.25, "grad_norm": 8.005747191405192, "learning_rate": 3.2303370786516856e-07, "logps/chosen": -47.46510696411133, "logps/rejected": -64.76780700683594, "loss": 0.495, "losses/dpo": 0.5762285590171814, "losses/sft": 1.6507606506347656, "losses/total": 0.5762285590171814, "ref_logps/chosen": -39.22062683105469, "ref_logps/rejected": -49.37548065185547, "rewards/accuracies": 0.765625, "rewards/chosen": -0.824447751045227, "rewards/margins": 0.7147842645645142, "rewards/rejected": -1.5392321348190308, "step": 166 }, { "epoch": 1.26, "grad_norm": 8.587266771042302, "learning_rate": 3.21629213483146e-07, "logps/chosen": -44.891380310058594, "logps/rejected": -63.128692626953125, "loss": 0.476, "losses/dpo": 0.3979690968990326, "losses/sft": 1.9852135181427002, "losses/total": 0.3979690968990326, "ref_logps/chosen": -36.76803970336914, "ref_logps/rejected": -47.980194091796875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8123339414596558, "rewards/margins": 0.7025157809257507, "rewards/rejected": -1.5148497819900513, "step": 167 }, { "epoch": 1.27, "grad_norm": 8.326932342308945, "learning_rate": 3.2022471910112357e-07, "logps/chosen": -47.12702941894531, "logps/rejected": -63.2083625793457, "loss": 0.4967, "losses/dpo": 0.46893593668937683, "losses/sft": 2.4472603797912598, "losses/total": 0.46893593668937683, "ref_logps/chosen": -38.752357482910156, "ref_logps/rejected": -48.1095085144043, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.8374671339988708, "rewards/margins": 0.6724185943603516, "rewards/rejected": -1.5098857879638672, "step": 168 }, { "epoch": 1.28, "grad_norm": 8.055720143262379, "learning_rate": 3.1882022471910107e-07, "logps/chosen": -47.89605712890625, "logps/rejected": -61.83879852294922, "loss": 0.5001, "losses/dpo": 0.5356773138046265, "losses/sft": 2.059915781021118, "losses/total": 0.5356773138046265, "ref_logps/chosen": -38.51155090332031, "ref_logps/rejected": -45.76554870605469, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9384507536888123, "rewards/margins": 0.6688745021820068, "rewards/rejected": -1.6073251962661743, "step": 169 }, { "epoch": 1.28, "grad_norm": 7.586041013117629, "learning_rate": 3.1741573033707863e-07, "logps/chosen": -48.82367706298828, "logps/rejected": -61.73891830444336, "loss": 0.482, "losses/dpo": 0.4217451214790344, "losses/sft": 1.6581342220306396, "losses/total": 0.4217451214790344, "ref_logps/chosen": -40.12367248535156, "ref_logps/rejected": -45.60664367675781, "rewards/accuracies": 0.78125, "rewards/chosen": -0.870000422000885, "rewards/margins": 0.7432273626327515, "rewards/rejected": -1.6132278442382812, "step": 170 }, { "epoch": 1.29, "grad_norm": 8.642028616636065, "learning_rate": 3.160112359550562e-07, "logps/chosen": -48.638423919677734, "logps/rejected": -60.67471694946289, "loss": 0.5046, "losses/dpo": 0.5264810919761658, "losses/sft": 1.7720236778259277, "losses/total": 0.5264810919761658, "ref_logps/chosen": -39.852603912353516, "ref_logps/rejected": -44.74626159667969, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.8785818815231323, "rewards/margins": 0.7142631411552429, "rewards/rejected": -1.592844843864441, "step": 171 }, { "epoch": 1.3, "grad_norm": 8.335982070287319, "learning_rate": 3.146067415730337e-07, "logps/chosen": -52.76824951171875, "logps/rejected": -63.022315979003906, "loss": 0.5192, "losses/dpo": 0.4350988268852234, "losses/sft": 2.3970541954040527, "losses/total": 0.4350988268852234, "ref_logps/chosen": -42.989315032958984, "ref_logps/rejected": -47.05906677246094, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9778933525085449, "rewards/margins": 0.6184311509132385, "rewards/rejected": -1.5963245630264282, "step": 172 }, { "epoch": 1.31, "grad_norm": 8.969238567903124, "learning_rate": 3.1320224719101125e-07, "logps/chosen": -50.0399169921875, "logps/rejected": -61.899139404296875, "loss": 0.5552, "losses/dpo": 0.6638925075531006, "losses/sft": 2.114647150039673, "losses/total": 0.6638925075531006, "ref_logps/chosen": -40.658668518066406, "ref_logps/rejected": -46.74747848510742, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.938124418258667, "rewards/margins": 0.5770419239997864, "rewards/rejected": -1.515166163444519, "step": 173 }, { "epoch": 1.31, "grad_norm": 8.235186141597438, "learning_rate": 3.1179775280898875e-07, "logps/chosen": -52.105690002441406, "logps/rejected": -62.85676574707031, "loss": 0.5092, "losses/dpo": 0.7253843545913696, "losses/sft": 2.424346446990967, "losses/total": 0.7253843545913696, "ref_logps/chosen": -43.18442916870117, "ref_logps/rejected": -47.48811340332031, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8921262621879578, "rewards/margins": 0.6447390913963318, "rewards/rejected": -1.5368653535842896, "step": 174 }, { "epoch": 1.32, "grad_norm": 7.582218244857748, "learning_rate": 3.103932584269663e-07, "logps/chosen": -50.67258071899414, "logps/rejected": -66.15370178222656, "loss": 0.4585, "losses/dpo": 0.45553505420684814, "losses/sft": 1.935417890548706, "losses/total": 0.45553505420684814, "ref_logps/chosen": -41.14632034301758, "ref_logps/rejected": -48.69713592529297, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9526264667510986, "rewards/margins": 0.7930303812026978, "rewards/rejected": -1.745656967163086, "step": 175 }, { "epoch": 1.33, "grad_norm": 8.60397689359237, "learning_rate": 3.0898876404494376e-07, "logps/chosen": -48.78178405761719, "logps/rejected": -56.637123107910156, "loss": 0.5424, "losses/dpo": 0.46894580125808716, "losses/sft": 1.4491811990737915, "losses/total": 0.46894580125808716, "ref_logps/chosen": -39.89250183105469, "ref_logps/rejected": -41.9540901184082, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8889281749725342, "rewards/margins": 0.5793753862380981, "rewards/rejected": -1.4683035612106323, "step": 176 }, { "epoch": 1.34, "grad_norm": 8.889595865380464, "learning_rate": 3.075842696629213e-07, "logps/chosen": -54.22248077392578, "logps/rejected": -62.7822380065918, "loss": 0.5381, "losses/dpo": 0.5002489686012268, "losses/sft": 1.6078195571899414, "losses/total": 0.5002489686012268, "ref_logps/chosen": -43.68180847167969, "ref_logps/rejected": -46.31658172607422, "rewards/accuracies": 0.7265625, "rewards/chosen": -1.0540671348571777, "rewards/margins": 0.592498779296875, "rewards/rejected": -1.6465659141540527, "step": 177 }, { "epoch": 1.34, "grad_norm": 8.532472841711035, "learning_rate": 3.0617977528089887e-07, "logps/chosen": -49.95827865600586, "logps/rejected": -61.806705474853516, "loss": 0.5407, "losses/dpo": 0.6499188542366028, "losses/sft": 2.202641725540161, "losses/total": 0.6499188542366028, "ref_logps/chosen": -39.29413986206055, "ref_logps/rejected": -44.9674072265625, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0664141178131104, "rewards/margins": 0.6175155639648438, "rewards/rejected": -1.6839298009872437, "step": 178 }, { "epoch": 1.35, "grad_norm": 8.909287859284307, "learning_rate": 3.047752808988764e-07, "logps/chosen": -51.441497802734375, "logps/rejected": -62.11806106567383, "loss": 0.527, "losses/dpo": 0.2888008952140808, "losses/sft": 1.8648221492767334, "losses/total": 0.2888008952140808, "ref_logps/chosen": -41.599273681640625, "ref_logps/rejected": -46.178306579589844, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9842216372489929, "rewards/margins": 0.6097543239593506, "rewards/rejected": -1.5939760208129883, "step": 179 }, { "epoch": 1.36, "grad_norm": 8.118331116269069, "learning_rate": 3.0337078651685393e-07, "logps/chosen": -50.35320281982422, "logps/rejected": -68.11038208007812, "loss": 0.4825, "losses/dpo": 0.2915097177028656, "losses/sft": 2.2731690406799316, "losses/total": 0.2915097177028656, "ref_logps/chosen": -40.37377166748047, "ref_logps/rejected": -50.12421798706055, "rewards/accuracies": 0.8125, "rewards/chosen": -0.997943103313446, "rewards/margins": 0.8006736636161804, "rewards/rejected": -1.7986167669296265, "step": 180 }, { "epoch": 1.37, "grad_norm": 7.611558494319546, "learning_rate": 3.0196629213483144e-07, "logps/chosen": -48.98621368408203, "logps/rejected": -63.958377838134766, "loss": 0.4771, "losses/dpo": 0.8142632246017456, "losses/sft": 2.000248670578003, "losses/total": 0.8142632246017456, "ref_logps/chosen": -39.60884475708008, "ref_logps/rejected": -46.56087875366211, "rewards/accuracies": 0.75, "rewards/chosen": -0.9377367496490479, "rewards/margins": 0.8020133376121521, "rewards/rejected": -1.7397500276565552, "step": 181 }, { "epoch": 1.37, "grad_norm": 7.87112340864733, "learning_rate": 3.00561797752809e-07, "logps/chosen": -50.70545959472656, "logps/rejected": -60.79212951660156, "loss": 0.481, "losses/dpo": 0.4779345989227295, "losses/sft": 2.002454996109009, "losses/total": 0.4779345989227295, "ref_logps/chosen": -40.73643112182617, "ref_logps/rejected": -43.39889144897461, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.9969026446342468, "rewards/margins": 0.7424213886260986, "rewards/rejected": -1.7393240928649902, "step": 182 }, { "epoch": 1.38, "grad_norm": 8.88113336772084, "learning_rate": 2.991573033707865e-07, "logps/chosen": -49.58301544189453, "logps/rejected": -58.122802734375, "loss": 0.5481, "losses/dpo": 0.5509602427482605, "losses/sft": 2.1736414432525635, "losses/total": 0.5509602427482605, "ref_logps/chosen": -39.8292350769043, "ref_logps/rejected": -42.636268615722656, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.9753779768943787, "rewards/margins": 0.5732761025428772, "rewards/rejected": -1.5486540794372559, "step": 183 }, { "epoch": 1.39, "grad_norm": 8.086846655989897, "learning_rate": 2.9775280898876406e-07, "logps/chosen": -49.57462692260742, "logps/rejected": -59.01618194580078, "loss": 0.5191, "losses/dpo": 0.41641122102737427, "losses/sft": 1.5094877481460571, "losses/total": 0.41641122102737427, "ref_logps/chosen": -40.067962646484375, "ref_logps/rejected": -43.40235900878906, "rewards/accuracies": 0.75, "rewards/chosen": -0.9506663084030151, "rewards/margins": 0.6107163429260254, "rewards/rejected": -1.561382532119751, "step": 184 }, { "epoch": 1.4, "grad_norm": 14.80695955125612, "learning_rate": 2.9634831460674156e-07, "logps/chosen": -49.670318603515625, "logps/rejected": -63.068153381347656, "loss": 0.4798, "losses/dpo": 0.71369868516922, "losses/sft": 2.1753716468811035, "losses/total": 0.71369868516922, "ref_logps/chosen": -40.42032241821289, "ref_logps/rejected": -45.94530487060547, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.9249992966651917, "rewards/margins": 0.7872861623764038, "rewards/rejected": -1.7122855186462402, "step": 185 }, { "epoch": 1.4, "grad_norm": 7.764172479016813, "learning_rate": 2.9494382022471906e-07, "logps/chosen": -46.467315673828125, "logps/rejected": -61.956661224365234, "loss": 0.4781, "losses/dpo": 0.6086141467094421, "losses/sft": 1.7402431964874268, "losses/total": 0.6086141467094421, "ref_logps/chosen": -37.99009704589844, "ref_logps/rejected": -45.549415588378906, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8477218747138977, "rewards/margins": 0.7930029630661011, "rewards/rejected": -1.640724778175354, "step": 186 }, { "epoch": 1.41, "grad_norm": 8.22953072067457, "learning_rate": 2.935393258426966e-07, "logps/chosen": -50.86983871459961, "logps/rejected": -62.61697769165039, "loss": 0.5075, "losses/dpo": 0.4520256519317627, "losses/sft": 2.4947710037231445, "losses/total": 0.4520256519317627, "ref_logps/chosen": -40.587074279785156, "ref_logps/rejected": -45.32030487060547, "rewards/accuracies": 0.7734375, "rewards/chosen": -1.0282765626907349, "rewards/margins": 0.7013900279998779, "rewards/rejected": -1.7296665906906128, "step": 187 }, { "epoch": 1.42, "grad_norm": 7.9279440455740735, "learning_rate": 2.921348314606741e-07, "logps/chosen": -44.96842956542969, "logps/rejected": -62.975059509277344, "loss": 0.5187, "losses/dpo": 0.5396404266357422, "losses/sft": 2.048635482788086, "losses/total": 0.5396404266357422, "ref_logps/chosen": -34.73614501953125, "ref_logps/rejected": -46.000450134277344, "rewards/accuracies": 0.7109375, "rewards/chosen": -1.023228406906128, "rewards/margins": 0.6742324829101562, "rewards/rejected": -1.6974608898162842, "step": 188 }, { "epoch": 1.43, "grad_norm": 7.307548456693543, "learning_rate": 2.907303370786517e-07, "logps/chosen": -45.52065658569336, "logps/rejected": -59.8885498046875, "loss": 0.4755, "losses/dpo": 0.40278539061546326, "losses/sft": 2.2951760292053223, "losses/total": 0.40278539061546326, "ref_logps/chosen": -37.46648406982422, "ref_logps/rejected": -44.16557312011719, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8054174184799194, "rewards/margins": 0.7668801546096802, "rewards/rejected": -1.5722976922988892, "step": 189 }, { "epoch": 1.43, "grad_norm": 8.912502235874074, "learning_rate": 2.893258426966292e-07, "logps/chosen": -45.76905059814453, "logps/rejected": -57.87416076660156, "loss": 0.5655, "losses/dpo": 0.5564082860946655, "losses/sft": 1.3282924890518188, "losses/total": 0.5564082860946655, "ref_logps/chosen": -36.90949630737305, "ref_logps/rejected": -43.09407424926758, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8859553933143616, "rewards/margins": 0.5920534133911133, "rewards/rejected": -1.47800874710083, "step": 190 }, { "epoch": 1.44, "grad_norm": 7.963453038939498, "learning_rate": 2.8792134831460674e-07, "logps/chosen": -48.343929290771484, "logps/rejected": -68.3438720703125, "loss": 0.4613, "losses/dpo": 0.4805658161640167, "losses/sft": 2.2470126152038574, "losses/total": 0.4805658161640167, "ref_logps/chosen": -40.115596771240234, "ref_logps/rejected": -51.82239532470703, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8228334784507751, "rewards/margins": 0.8293145895004272, "rewards/rejected": -1.6521480083465576, "step": 191 }, { "epoch": 1.45, "grad_norm": 8.622412292262098, "learning_rate": 2.8651685393258425e-07, "logps/chosen": -51.2635383605957, "logps/rejected": -63.78586959838867, "loss": 0.4851, "losses/dpo": 0.38889288902282715, "losses/sft": 2.7701683044433594, "losses/total": 0.38889288902282715, "ref_logps/chosen": -42.14338684082031, "ref_logps/rejected": -46.94654083251953, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9120146036148071, "rewards/margins": 0.7719184756278992, "rewards/rejected": -1.683933138847351, "step": 192 }, { "epoch": 1.46, "grad_norm": 7.759600823991896, "learning_rate": 2.851123595505618e-07, "logps/chosen": -46.17103958129883, "logps/rejected": -60.57073974609375, "loss": 0.5056, "losses/dpo": 0.6096426248550415, "losses/sft": 2.101557493209839, "losses/total": 0.6096426248550415, "ref_logps/chosen": -37.342018127441406, "ref_logps/rejected": -44.97611999511719, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8829020261764526, "rewards/margins": 0.6765601634979248, "rewards/rejected": -1.559462308883667, "step": 193 }, { "epoch": 1.46, "grad_norm": 8.401972111307101, "learning_rate": 2.8370786516853936e-07, "logps/chosen": -49.945350646972656, "logps/rejected": -64.91703033447266, "loss": 0.4635, "losses/dpo": 0.5600602626800537, "losses/sft": 1.5397871732711792, "losses/total": 0.5600602626800537, "ref_logps/chosen": -41.32893371582031, "ref_logps/rejected": -47.677955627441406, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.8616417050361633, "rewards/margins": 0.8622665405273438, "rewards/rejected": -1.7239081859588623, "step": 194 }, { "epoch": 1.47, "grad_norm": 8.847895232096153, "learning_rate": 2.823033707865168e-07, "logps/chosen": -45.08359146118164, "logps/rejected": -65.03630828857422, "loss": 0.5214, "losses/dpo": 0.5835427045822144, "losses/sft": 1.4693069458007812, "losses/total": 0.5835427045822144, "ref_logps/chosen": -35.359466552734375, "ref_logps/rejected": -47.392391204833984, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9724127650260925, "rewards/margins": 0.7919799089431763, "rewards/rejected": -1.764392614364624, "step": 195 }, { "epoch": 1.48, "grad_norm": 8.759081047535012, "learning_rate": 2.8089887640449437e-07, "logps/chosen": -45.32444763183594, "logps/rejected": -63.420326232910156, "loss": 0.5066, "losses/dpo": 0.38791435956954956, "losses/sft": 1.907132625579834, "losses/total": 0.38791435956954956, "ref_logps/chosen": -35.994544982910156, "ref_logps/rejected": -46.32176971435547, "rewards/accuracies": 0.78125, "rewards/chosen": -0.932990550994873, "rewards/margins": 0.776865541934967, "rewards/rejected": -1.7098561525344849, "step": 196 }, { "epoch": 1.49, "grad_norm": 8.54168900663033, "learning_rate": 2.794943820224719e-07, "logps/chosen": -53.67341613769531, "logps/rejected": -73.58582305908203, "loss": 0.4837, "losses/dpo": 0.5046126842498779, "losses/sft": 2.1525635719299316, "losses/total": 0.5046126842498779, "ref_logps/chosen": -42.92031478881836, "ref_logps/rejected": -54.27576446533203, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0753098726272583, "rewards/margins": 0.8556962013244629, "rewards/rejected": -1.9310060739517212, "step": 197 }, { "epoch": 1.49, "grad_norm": 9.360560561178314, "learning_rate": 2.7808988764044943e-07, "logps/chosen": -49.29429626464844, "logps/rejected": -58.852745056152344, "loss": 0.5219, "losses/dpo": 0.6207550168037415, "losses/sft": 1.4964567422866821, "losses/total": 0.6207550168037415, "ref_logps/chosen": -39.879859924316406, "ref_logps/rejected": -42.66207504272461, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9414433836936951, "rewards/margins": 0.6776232719421387, "rewards/rejected": -1.6190667152404785, "step": 198 }, { "epoch": 1.5, "grad_norm": 7.766094681277619, "learning_rate": 2.7668539325842694e-07, "logps/chosen": -44.67707443237305, "logps/rejected": -62.46302032470703, "loss": 0.4513, "losses/dpo": 0.3586122393608093, "losses/sft": 1.7712942361831665, "losses/total": 0.3586122393608093, "ref_logps/chosen": -35.756446838378906, "ref_logps/rejected": -45.0746955871582, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8920624256134033, "rewards/margins": 0.8467705249786377, "rewards/rejected": -1.738832950592041, "step": 199 }, { "epoch": 1.51, "grad_norm": 8.387391054319354, "learning_rate": 2.752808988764045e-07, "logps/chosen": -51.25703430175781, "logps/rejected": -66.00263977050781, "loss": 0.4694, "losses/dpo": 0.4337347447872162, "losses/sft": 2.2087419033050537, "losses/total": 0.4337347447872162, "ref_logps/chosen": -41.61771011352539, "ref_logps/rejected": -48.16603469848633, "rewards/accuracies": 0.8125, "rewards/chosen": -0.96393221616745, "rewards/margins": 0.8197280764579773, "rewards/rejected": -1.7836604118347168, "step": 200 }, { "epoch": 1.52, "grad_norm": 8.762356246957351, "learning_rate": 2.73876404494382e-07, "logps/chosen": -48.135372161865234, "logps/rejected": -59.56908416748047, "loss": 0.5349, "losses/dpo": 0.5300882458686829, "losses/sft": 1.96072256565094, "losses/total": 0.5300882458686829, "ref_logps/chosen": -39.096519470214844, "ref_logps/rejected": -44.62074279785156, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9038856625556946, "rewards/margins": 0.5909484028816223, "rewards/rejected": -1.4948341846466064, "step": 201 }, { "epoch": 1.52, "grad_norm": 8.01612481161353, "learning_rate": 2.7247191011235955e-07, "logps/chosen": -48.03499984741211, "logps/rejected": -58.95391845703125, "loss": 0.4596, "losses/dpo": 0.5687890648841858, "losses/sft": 1.5117721557617188, "losses/total": 0.5687890648841858, "ref_logps/chosen": -40.076637268066406, "ref_logps/rejected": -42.60528564453125, "rewards/accuracies": 0.8359375, "rewards/chosen": -0.795836865901947, "rewards/margins": 0.839026153087616, "rewards/rejected": -1.634863018989563, "step": 202 }, { "epoch": 1.53, "grad_norm": 8.170779294474082, "learning_rate": 2.710674157303371e-07, "logps/chosen": -53.97732925415039, "logps/rejected": -66.41366577148438, "loss": 0.4697, "losses/dpo": 0.440776526927948, "losses/sft": 2.09440279006958, "losses/total": 0.440776526927948, "ref_logps/chosen": -44.04298782348633, "ref_logps/rejected": -48.34318161010742, "rewards/accuracies": 0.8359375, "rewards/chosen": -0.9934341311454773, "rewards/margins": 0.8136138319969177, "rewards/rejected": -1.8070482015609741, "step": 203 }, { "epoch": 1.54, "grad_norm": 8.062617165532144, "learning_rate": 2.6966292134831456e-07, "logps/chosen": -51.31819152832031, "logps/rejected": -66.41058349609375, "loss": 0.4809, "losses/dpo": 0.5263036489486694, "losses/sft": 1.6697288751602173, "losses/total": 0.5263036489486694, "ref_logps/chosen": -41.211219787597656, "ref_logps/rejected": -48.88929748535156, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0106972455978394, "rewards/margins": 0.7414315938949585, "rewards/rejected": -1.7521288394927979, "step": 204 }, { "epoch": 1.55, "grad_norm": 8.232167547461342, "learning_rate": 2.682584269662921e-07, "logps/chosen": -50.255958557128906, "logps/rejected": -64.30512237548828, "loss": 0.4906, "losses/dpo": 0.5959848761558533, "losses/sft": 2.509490966796875, "losses/total": 0.5959848761558533, "ref_logps/chosen": -40.21955490112305, "ref_logps/rejected": -46.348358154296875, "rewards/accuracies": 0.7578125, "rewards/chosen": -1.0036402940750122, "rewards/margins": 0.7920358180999756, "rewards/rejected": -1.7956762313842773, "step": 205 }, { "epoch": 1.55, "grad_norm": 8.658885149622282, "learning_rate": 2.668539325842696e-07, "logps/chosen": -48.23283004760742, "logps/rejected": -63.8531608581543, "loss": 0.5012, "losses/dpo": 0.3958526849746704, "losses/sft": 1.8156052827835083, "losses/total": 0.3958526849746704, "ref_logps/chosen": -39.19361877441406, "ref_logps/rejected": -46.780818939208984, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9039209485054016, "rewards/margins": 0.8033130168914795, "rewards/rejected": -1.7072339057922363, "step": 206 }, { "epoch": 1.56, "grad_norm": 9.062222930248339, "learning_rate": 2.654494382022472e-07, "logps/chosen": -48.18266296386719, "logps/rejected": -60.7551155090332, "loss": 0.5163, "losses/dpo": 0.5344985127449036, "losses/sft": 1.5162503719329834, "losses/total": 0.5344985127449036, "ref_logps/chosen": -38.624839782714844, "ref_logps/rejected": -43.208770751953125, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.9557825326919556, "rewards/margins": 0.798852264881134, "rewards/rejected": -1.7546348571777344, "step": 207 }, { "epoch": 1.57, "grad_norm": 8.337997292070591, "learning_rate": 2.640449438202247e-07, "logps/chosen": -46.034690856933594, "logps/rejected": -58.21810531616211, "loss": 0.5159, "losses/dpo": 0.5175392627716064, "losses/sft": 1.6583523750305176, "losses/total": 0.5175392627716064, "ref_logps/chosen": -37.22336959838867, "ref_logps/rejected": -42.30153274536133, "rewards/accuracies": 0.75, "rewards/chosen": -0.8811318278312683, "rewards/margins": 0.7105254530906677, "rewards/rejected": -1.5916571617126465, "step": 208 }, { "epoch": 1.58, "grad_norm": 8.605777081243177, "learning_rate": 2.6264044943820224e-07, "logps/chosen": -49.07252502441406, "logps/rejected": -62.991764068603516, "loss": 0.5234, "losses/dpo": 0.3841923177242279, "losses/sft": 2.0660533905029297, "losses/total": 0.3841923177242279, "ref_logps/chosen": -38.91202163696289, "ref_logps/rejected": -45.201744079589844, "rewards/accuracies": 0.734375, "rewards/chosen": -1.0160505771636963, "rewards/margins": 0.7629517316818237, "rewards/rejected": -1.7790021896362305, "step": 209 }, { "epoch": 1.58, "grad_norm": 8.594902163030543, "learning_rate": 2.612359550561798e-07, "logps/chosen": -48.94342803955078, "logps/rejected": -67.45938110351562, "loss": 0.4628, "losses/dpo": 0.5517236590385437, "losses/sft": 1.831754446029663, "losses/total": 0.5517236590385437, "ref_logps/chosen": -39.35862731933594, "ref_logps/rejected": -48.98806381225586, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9584797620773315, "rewards/margins": 0.8886520862579346, "rewards/rejected": -1.8471317291259766, "step": 210 }, { "epoch": 1.59, "grad_norm": 9.393987371477554, "learning_rate": 2.598314606741573e-07, "logps/chosen": -51.1754035949707, "logps/rejected": -62.90049362182617, "loss": 0.538, "losses/dpo": 0.4337689280509949, "losses/sft": 1.871553897857666, "losses/total": 0.4337689280509949, "ref_logps/chosen": -40.097930908203125, "ref_logps/rejected": -44.89445877075195, "rewards/accuracies": 0.734375, "rewards/chosen": -1.1077474355697632, "rewards/margins": 0.6928560137748718, "rewards/rejected": -1.8006032705307007, "step": 211 }, { "epoch": 1.6, "grad_norm": 8.268094387246107, "learning_rate": 2.5842696629213486e-07, "logps/chosen": -46.02891159057617, "logps/rejected": -58.77110290527344, "loss": 0.5183, "losses/dpo": 0.3652556836605072, "losses/sft": 2.1132707595825195, "losses/total": 0.3652556836605072, "ref_logps/chosen": -37.71674346923828, "ref_logps/rejected": -43.298362731933594, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8312174677848816, "rewards/margins": 0.7160569429397583, "rewards/rejected": -1.5472744703292847, "step": 212 }, { "epoch": 1.61, "grad_norm": 8.156887652455024, "learning_rate": 2.5702247191011236e-07, "logps/chosen": -53.53913879394531, "logps/rejected": -65.96989440917969, "loss": 0.454, "losses/dpo": 0.6029412746429443, "losses/sft": 2.5100276470184326, "losses/total": 0.6029412746429443, "ref_logps/chosen": -43.845726013183594, "ref_logps/rejected": -47.28349304199219, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9693412780761719, "rewards/margins": 0.899299681186676, "rewards/rejected": -1.8686408996582031, "step": 213 }, { "epoch": 1.62, "grad_norm": 9.181481809417177, "learning_rate": 2.5561797752808987e-07, "logps/chosen": -51.8013801574707, "logps/rejected": -63.46139907836914, "loss": 0.5059, "losses/dpo": 0.14845682680606842, "losses/sft": 1.4499884843826294, "losses/total": 0.14845682680606842, "ref_logps/chosen": -42.47813415527344, "ref_logps/rejected": -46.913028717041016, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9323242902755737, "rewards/margins": 0.7225131988525391, "rewards/rejected": -1.6548374891281128, "step": 214 }, { "epoch": 1.62, "grad_norm": 15.612026068166484, "learning_rate": 2.5421348314606737e-07, "logps/chosen": -53.003204345703125, "logps/rejected": -66.69940948486328, "loss": 0.4433, "losses/dpo": 0.37596985697746277, "losses/sft": 1.6896804571151733, "losses/total": 0.37596985697746277, "ref_logps/chosen": -43.648799896240234, "ref_logps/rejected": -48.52040481567383, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.9354407787322998, "rewards/margins": 0.8824598789215088, "rewards/rejected": -1.8179006576538086, "step": 215 }, { "epoch": 1.63, "grad_norm": 9.017380971745943, "learning_rate": 2.5280898876404493e-07, "logps/chosen": -53.18143081665039, "logps/rejected": -68.83674621582031, "loss": 0.5005, "losses/dpo": 0.9546025395393372, "losses/sft": 2.2383298873901367, "losses/total": 0.9546025395393372, "ref_logps/chosen": -42.14701843261719, "ref_logps/rejected": -50.13629913330078, "rewards/accuracies": 0.765625, "rewards/chosen": -1.103441834449768, "rewards/margins": 0.7666029334068298, "rewards/rejected": -1.8700445890426636, "step": 216 }, { "epoch": 1.64, "grad_norm": 7.002144270086572, "learning_rate": 2.5140449438202243e-07, "logps/chosen": -39.812992095947266, "logps/rejected": -58.04911804199219, "loss": 0.4312, "losses/dpo": 0.3237588107585907, "losses/sft": 1.9024207592010498, "losses/total": 0.3237588107585907, "ref_logps/chosen": -31.948955535888672, "ref_logps/rejected": -40.56779479980469, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7864038348197937, "rewards/margins": 0.9617283940315247, "rewards/rejected": -1.7481321096420288, "step": 217 }, { "epoch": 1.65, "grad_norm": 8.018556807115827, "learning_rate": 2.5e-07, "logps/chosen": -47.59326934814453, "logps/rejected": -64.22843170166016, "loss": 0.4616, "losses/dpo": 0.3011893630027771, "losses/sft": 1.4501286745071411, "losses/total": 0.3011893630027771, "ref_logps/chosen": -37.48702621459961, "ref_logps/rejected": -45.30727767944336, "rewards/accuracies": 0.7734375, "rewards/chosen": -1.0106242895126343, "rewards/margins": 0.8814913630485535, "rewards/rejected": -1.892115592956543, "step": 218 }, { "epoch": 1.65, "grad_norm": 9.421297346982232, "learning_rate": 2.485955056179775e-07, "logps/chosen": -52.299922943115234, "logps/rejected": -62.2135009765625, "loss": 0.5647, "losses/dpo": 0.6175757050514221, "losses/sft": 2.058591365814209, "losses/total": 0.6175757050514221, "ref_logps/chosen": -41.575767517089844, "ref_logps/rejected": -45.39363479614258, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0724154710769653, "rewards/margins": 0.6095717549324036, "rewards/rejected": -1.6819872856140137, "step": 219 }, { "epoch": 1.66, "grad_norm": 10.845286708552022, "learning_rate": 2.4719101123595505e-07, "logps/chosen": -49.824981689453125, "logps/rejected": -61.322139739990234, "loss": 0.4823, "losses/dpo": 0.5226192474365234, "losses/sft": 1.8831043243408203, "losses/total": 0.5226192474365234, "ref_logps/chosen": -39.458709716796875, "ref_logps/rejected": -43.33076477050781, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0366266965866089, "rewards/margins": 0.7625109553337097, "rewards/rejected": -1.7991377115249634, "step": 220 }, { "epoch": 1.67, "grad_norm": 7.736788176006532, "learning_rate": 2.4578651685393255e-07, "logps/chosen": -48.30406951904297, "logps/rejected": -62.53293228149414, "loss": 0.4473, "losses/dpo": 0.24700236320495605, "losses/sft": 1.5504862070083618, "losses/total": 0.24700236320495605, "ref_logps/chosen": -38.438228607177734, "ref_logps/rejected": -43.876705169677734, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.9865838289260864, "rewards/margins": 0.8790390491485596, "rewards/rejected": -1.8656229972839355, "step": 221 }, { "epoch": 1.68, "grad_norm": 7.9918716943613894, "learning_rate": 2.443820224719101e-07, "logps/chosen": -46.76765441894531, "logps/rejected": -63.072418212890625, "loss": 0.4597, "losses/dpo": 0.5004298686981201, "losses/sft": 2.039869785308838, "losses/total": 0.5004298686981201, "ref_logps/chosen": -37.515079498291016, "ref_logps/rejected": -44.76315689086914, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9252572655677795, "rewards/margins": 0.9056685566902161, "rewards/rejected": -1.8309259414672852, "step": 222 }, { "epoch": 1.68, "grad_norm": 8.812574527536418, "learning_rate": 2.429775280898876e-07, "logps/chosen": -48.68953323364258, "logps/rejected": -70.00579071044922, "loss": 0.4842, "losses/dpo": 0.5064282417297363, "losses/sft": 1.860622763633728, "losses/total": 0.5064282417297363, "ref_logps/chosen": -37.93241500854492, "ref_logps/rejected": -49.59477996826172, "rewards/accuracies": 0.7421875, "rewards/chosen": -1.0757118463516235, "rewards/margins": 0.9653894305229187, "rewards/rejected": -2.0411009788513184, "step": 223 }, { "epoch": 1.69, "grad_norm": 8.262706110631338, "learning_rate": 2.4157303370786517e-07, "logps/chosen": -51.853031158447266, "logps/rejected": -66.55961608886719, "loss": 0.4643, "losses/dpo": 0.32682162523269653, "losses/sft": 2.5111522674560547, "losses/total": 0.32682162523269653, "ref_logps/chosen": -42.025306701660156, "ref_logps/rejected": -47.904380798339844, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9827719330787659, "rewards/margins": 0.8827516436576843, "rewards/rejected": -1.8655235767364502, "step": 224 }, { "epoch": 1.7, "grad_norm": 9.91340992789526, "learning_rate": 2.401685393258427e-07, "logps/chosen": -51.620609283447266, "logps/rejected": -61.01588821411133, "loss": 0.5382, "losses/dpo": 0.47428151965141296, "losses/sft": 1.9888023138046265, "losses/total": 0.47428151965141296, "ref_logps/chosen": -40.93183135986328, "ref_logps/rejected": -43.67429733276367, "rewards/accuracies": 0.7265625, "rewards/chosen": -1.0688778162002563, "rewards/margins": 0.6652814745903015, "rewards/rejected": -1.7341593503952026, "step": 225 }, { "epoch": 1.71, "grad_norm": 8.353660910725022, "learning_rate": 2.3876404494382023e-07, "logps/chosen": -49.402103424072266, "logps/rejected": -61.825992584228516, "loss": 0.4796, "losses/dpo": 0.4575900137424469, "losses/sft": 2.0813965797424316, "losses/total": 0.4575900137424469, "ref_logps/chosen": -39.205528259277344, "ref_logps/rejected": -43.332237243652344, "rewards/accuracies": 0.7734375, "rewards/chosen": -1.019657850265503, "rewards/margins": 0.8297175765037537, "rewards/rejected": -1.8493753671646118, "step": 226 }, { "epoch": 1.71, "grad_norm": 8.489694346068541, "learning_rate": 2.3735955056179774e-07, "logps/chosen": -49.869972229003906, "logps/rejected": -61.926387786865234, "loss": 0.5007, "losses/dpo": 0.5196319222450256, "losses/sft": 2.3197567462921143, "losses/total": 0.5196319222450256, "ref_logps/chosen": -40.345924377441406, "ref_logps/rejected": -43.80282211303711, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.9524051547050476, "rewards/margins": 0.8599514365196228, "rewards/rejected": -1.8123565912246704, "step": 227 }, { "epoch": 1.72, "grad_norm": 8.455711622023179, "learning_rate": 2.3595505617977527e-07, "logps/chosen": -48.83377456665039, "logps/rejected": -62.39826202392578, "loss": 0.4853, "losses/dpo": 0.33744513988494873, "losses/sft": 2.174797296524048, "losses/total": 0.33744513988494873, "ref_logps/chosen": -38.7586555480957, "ref_logps/rejected": -44.35865783691406, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0075117349624634, "rewards/margins": 0.7964487075805664, "rewards/rejected": -1.8039604425430298, "step": 228 }, { "epoch": 1.73, "grad_norm": 8.501384419530382, "learning_rate": 2.345505617977528e-07, "logps/chosen": -46.079429626464844, "logps/rejected": -62.38417053222656, "loss": 0.4593, "losses/dpo": 0.45904022455215454, "losses/sft": 2.14939546585083, "losses/total": 0.45904022455215454, "ref_logps/chosen": -37.05137634277344, "ref_logps/rejected": -44.551025390625, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9028058052062988, "rewards/margins": 0.8805083632469177, "rewards/rejected": -1.7833141088485718, "step": 229 }, { "epoch": 1.74, "grad_norm": 8.42568627673927, "learning_rate": 2.331460674157303e-07, "logps/chosen": -53.63593292236328, "logps/rejected": -67.56592559814453, "loss": 0.4141, "losses/dpo": 0.35481470823287964, "losses/sft": 1.9238269329071045, "losses/total": 0.35481470823287964, "ref_logps/chosen": -43.60770797729492, "ref_logps/rejected": -47.71710205078125, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.0028222799301147, "rewards/margins": 0.9820606112480164, "rewards/rejected": -1.9848829507827759, "step": 230 }, { "epoch": 1.74, "grad_norm": 8.985097587921354, "learning_rate": 2.3174157303370786e-07, "logps/chosen": -49.890769958496094, "logps/rejected": -64.36235046386719, "loss": 0.5005, "losses/dpo": 0.4699353277683258, "losses/sft": 1.5774719715118408, "losses/total": 0.4699353277683258, "ref_logps/chosen": -39.884613037109375, "ref_logps/rejected": -46.45123291015625, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0006158351898193, "rewards/margins": 0.7904958724975586, "rewards/rejected": -1.7911114692687988, "step": 231 }, { "epoch": 1.75, "grad_norm": 8.37260934759885, "learning_rate": 2.303370786516854e-07, "logps/chosen": -47.88700485229492, "logps/rejected": -65.87992095947266, "loss": 0.4563, "losses/dpo": 0.4289396107196808, "losses/sft": 1.9490468502044678, "losses/total": 0.4289396107196808, "ref_logps/chosen": -38.816673278808594, "ref_logps/rejected": -48.137718200683594, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.9070336222648621, "rewards/margins": 0.8671874403953552, "rewards/rejected": -1.7742209434509277, "step": 232 }, { "epoch": 1.76, "grad_norm": 8.439180796005859, "learning_rate": 2.2893258426966292e-07, "logps/chosen": -47.75567626953125, "logps/rejected": -59.802364349365234, "loss": 0.5339, "losses/dpo": 0.3888584077358246, "losses/sft": 1.453101396560669, "losses/total": 0.3888584077358246, "ref_logps/chosen": -38.37287521362305, "ref_logps/rejected": -43.31857681274414, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.9382801055908203, "rewards/margins": 0.7100984454154968, "rewards/rejected": -1.6483784914016724, "step": 233 }, { "epoch": 1.77, "grad_norm": 8.820079708795298, "learning_rate": 2.2752808988764045e-07, "logps/chosen": -53.505958557128906, "logps/rejected": -66.17211151123047, "loss": 0.4792, "losses/dpo": 0.5977008938789368, "losses/sft": 1.5070393085479736, "losses/total": 0.5977008938789368, "ref_logps/chosen": -42.90943145751953, "ref_logps/rejected": -47.009498596191406, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0596526861190796, "rewards/margins": 0.8566086292266846, "rewards/rejected": -1.9162614345550537, "step": 234 }, { "epoch": 1.77, "grad_norm": 8.954294680805216, "learning_rate": 2.2612359550561795e-07, "logps/chosen": -51.98886489868164, "logps/rejected": -65.2154769897461, "loss": 0.4863, "losses/dpo": 0.4061537981033325, "losses/sft": 1.7505851984024048, "losses/total": 0.4061537981033325, "ref_logps/chosen": -41.45209503173828, "ref_logps/rejected": -46.05805969238281, "rewards/accuracies": 0.7734375, "rewards/chosen": -1.0536775588989258, "rewards/margins": 0.8620648980140686, "rewards/rejected": -1.9157423973083496, "step": 235 }, { "epoch": 1.78, "grad_norm": 8.276096992832155, "learning_rate": 2.2471910112359549e-07, "logps/chosen": -46.52098846435547, "logps/rejected": -60.28398895263672, "loss": 0.4849, "losses/dpo": 0.4493502676486969, "losses/sft": 1.8399590253829956, "losses/total": 0.4493502676486969, "ref_logps/chosen": -37.41412353515625, "ref_logps/rejected": -42.95138931274414, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.9106867909431458, "rewards/margins": 0.8225734233856201, "rewards/rejected": -1.733260154724121, "step": 236 }, { "epoch": 1.79, "grad_norm": 9.148667637325849, "learning_rate": 2.2331460674157302e-07, "logps/chosen": -49.735443115234375, "logps/rejected": -68.21788787841797, "loss": 0.49, "losses/dpo": 0.2873924672603607, "losses/sft": 1.5266362428665161, "losses/total": 0.2873924672603607, "ref_logps/chosen": -39.63804244995117, "ref_logps/rejected": -49.29356384277344, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0097399950027466, "rewards/margins": 0.8826919198036194, "rewards/rejected": -1.8924317359924316, "step": 237 }, { "epoch": 1.8, "grad_norm": 8.143852894216051, "learning_rate": 2.2191011235955055e-07, "logps/chosen": -48.24225616455078, "logps/rejected": -63.4071044921875, "loss": 0.4505, "losses/dpo": 0.4800674617290497, "losses/sft": 1.840663194656372, "losses/total": 0.4800674617290497, "ref_logps/chosen": -38.12638854980469, "ref_logps/rejected": -44.40414047241211, "rewards/accuracies": 0.7890625, "rewards/chosen": -1.0115869045257568, "rewards/margins": 0.8887090682983398, "rewards/rejected": -1.9002958536148071, "step": 238 }, { "epoch": 1.8, "grad_norm": 8.978669612809192, "learning_rate": 2.205056179775281e-07, "logps/chosen": -49.006935119628906, "logps/rejected": -64.8754653930664, "loss": 0.4701, "losses/dpo": 0.3446974754333496, "losses/sft": 1.800228476524353, "losses/total": 0.3446974754333496, "ref_logps/chosen": -39.40437698364258, "ref_logps/rejected": -46.17237091064453, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9602562189102173, "rewards/margins": 0.9100530743598938, "rewards/rejected": -1.8703094720840454, "step": 239 }, { "epoch": 1.81, "grad_norm": 9.39799976416451, "learning_rate": 2.191011235955056e-07, "logps/chosen": -49.0966682434082, "logps/rejected": -62.466331481933594, "loss": 0.4979, "losses/dpo": 0.7114862203598022, "losses/sft": 1.6450343132019043, "losses/total": 0.7114862203598022, "ref_logps/chosen": -38.396942138671875, "ref_logps/rejected": -43.12867736816406, "rewards/accuracies": 0.7265625, "rewards/chosen": -1.0699726343154907, "rewards/margins": 0.8637927770614624, "rewards/rejected": -1.9337654113769531, "step": 240 }, { "epoch": 1.82, "grad_norm": 7.9075518471198585, "learning_rate": 2.1769662921348314e-07, "logps/chosen": -49.44485092163086, "logps/rejected": -64.67866516113281, "loss": 0.4424, "losses/dpo": 0.3071708679199219, "losses/sft": 2.3602986335754395, "losses/total": 0.3071708679199219, "ref_logps/chosen": -39.44845199584961, "ref_logps/rejected": -45.48222351074219, "rewards/accuracies": 0.796875, "rewards/chosen": -0.999640703201294, "rewards/margins": 0.9200041890144348, "rewards/rejected": -1.919644832611084, "step": 241 }, { "epoch": 1.83, "grad_norm": 8.785359064590306, "learning_rate": 2.1629213483146067e-07, "logps/chosen": -53.305233001708984, "logps/rejected": -65.16683959960938, "loss": 0.4948, "losses/dpo": 0.5694843530654907, "losses/sft": 1.6856664419174194, "losses/total": 0.5694843530654907, "ref_logps/chosen": -42.595603942871094, "ref_logps/rejected": -46.201744079589844, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0709630250930786, "rewards/margins": 0.8255467414855957, "rewards/rejected": -1.8965098857879639, "step": 242 }, { "epoch": 1.83, "grad_norm": 9.446502314051532, "learning_rate": 2.148876404494382e-07, "logps/chosen": -48.44091033935547, "logps/rejected": -60.52621078491211, "loss": 0.526, "losses/dpo": 0.45424705743789673, "losses/sft": 2.151371479034424, "losses/total": 0.45424705743789673, "ref_logps/chosen": -38.4775390625, "ref_logps/rejected": -43.26105499267578, "rewards/accuracies": 0.75, "rewards/chosen": -0.9963367581367493, "rewards/margins": 0.730178952217102, "rewards/rejected": -1.726515531539917, "step": 243 }, { "epoch": 1.84, "grad_norm": 8.81696350872476, "learning_rate": 2.134831460674157e-07, "logps/chosen": -50.6672477722168, "logps/rejected": -66.28202819824219, "loss": 0.4731, "losses/dpo": 0.3640851080417633, "losses/sft": 1.5808416604995728, "losses/total": 0.3640851080417633, "ref_logps/chosen": -40.499908447265625, "ref_logps/rejected": -47.59101486206055, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0167338848114014, "rewards/margins": 0.852367103099823, "rewards/rejected": -1.8691009283065796, "step": 244 }, { "epoch": 1.85, "grad_norm": 9.28428850947705, "learning_rate": 2.1207865168539323e-07, "logps/chosen": -54.07783889770508, "logps/rejected": -65.76669311523438, "loss": 0.5163, "losses/dpo": 0.8544118404388428, "losses/sft": 1.9635812044143677, "losses/total": 0.8544118404388428, "ref_logps/chosen": -42.8679084777832, "ref_logps/rejected": -46.79513931274414, "rewards/accuracies": 0.7421875, "rewards/chosen": -1.120992660522461, "rewards/margins": 0.7761632204055786, "rewards/rejected": -1.897156000137329, "step": 245 }, { "epoch": 1.86, "grad_norm": 9.752420565118218, "learning_rate": 2.1067415730337076e-07, "logps/chosen": -49.50055694580078, "logps/rejected": -65.74305725097656, "loss": 0.4852, "losses/dpo": 0.23906151950359344, "losses/sft": 1.417936086654663, "losses/total": 0.23906151950359344, "ref_logps/chosen": -39.56950378417969, "ref_logps/rejected": -47.11359405517578, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9931050539016724, "rewards/margins": 0.8698407411575317, "rewards/rejected": -1.8629456758499146, "step": 246 }, { "epoch": 1.86, "grad_norm": 9.475549639179285, "learning_rate": 2.0926966292134832e-07, "logps/chosen": -49.32406997680664, "logps/rejected": -63.67836380004883, "loss": 0.5131, "losses/dpo": 0.4199092984199524, "losses/sft": 1.6027624607086182, "losses/total": 0.4199092984199524, "ref_logps/chosen": -39.44010925292969, "ref_logps/rejected": -46.58472442626953, "rewards/accuracies": 0.75, "rewards/chosen": -0.988396942615509, "rewards/margins": 0.7209669947624207, "rewards/rejected": -1.7093638181686401, "step": 247 }, { "epoch": 1.87, "grad_norm": 8.691211910246372, "learning_rate": 2.0786516853932585e-07, "logps/chosen": -50.76154327392578, "logps/rejected": -63.2697868347168, "loss": 0.4931, "losses/dpo": 0.48489272594451904, "losses/sft": 2.4353795051574707, "losses/total": 0.48489272594451904, "ref_logps/chosen": -40.483741760253906, "ref_logps/rejected": -45.29812240600586, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0277801752090454, "rewards/margins": 0.7693858742713928, "rewards/rejected": -1.7971662282943726, "step": 248 }, { "epoch": 1.88, "grad_norm": 9.872927469509172, "learning_rate": 2.0646067415730336e-07, "logps/chosen": -49.138465881347656, "logps/rejected": -66.16885375976562, "loss": 0.5055, "losses/dpo": 0.5177363753318787, "losses/sft": 1.834108829498291, "losses/total": 0.5177363753318787, "ref_logps/chosen": -38.63552474975586, "ref_logps/rejected": -48.135929107666016, "rewards/accuracies": 0.765625, "rewards/chosen": -1.050294280052185, "rewards/margins": 0.7529983520507812, "rewards/rejected": -1.8032926321029663, "step": 249 }, { "epoch": 1.89, "grad_norm": 9.265714326509956, "learning_rate": 2.0505617977528089e-07, "logps/chosen": -48.308753967285156, "logps/rejected": -66.52729034423828, "loss": 0.486, "losses/dpo": 0.6348212957382202, "losses/sft": 1.772031307220459, "losses/total": 0.6348212957382202, "ref_logps/chosen": -38.7848014831543, "ref_logps/rejected": -48.895755767822266, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.9523951411247253, "rewards/margins": 0.8107584714889526, "rewards/rejected": -1.7631536722183228, "step": 250 }, { "epoch": 1.89, "grad_norm": 9.119904977454695, "learning_rate": 2.0365168539325842e-07, "logps/chosen": -50.29534912109375, "logps/rejected": -64.1716537475586, "loss": 0.4977, "losses/dpo": 0.522094190120697, "losses/sft": 1.8622956275939941, "losses/total": 0.522094190120697, "ref_logps/chosen": -39.614479064941406, "ref_logps/rejected": -45.53257751464844, "rewards/accuracies": 0.796875, "rewards/chosen": -1.0680873394012451, "rewards/margins": 0.7958202958106995, "rewards/rejected": -1.8639075756072998, "step": 251 }, { "epoch": 1.9, "grad_norm": 8.300519651321538, "learning_rate": 2.0224719101123595e-07, "logps/chosen": -48.15717315673828, "logps/rejected": -66.43309783935547, "loss": 0.4749, "losses/dpo": 0.7441291809082031, "losses/sft": 2.440709352493286, "losses/total": 0.7441291809082031, "ref_logps/chosen": -38.77737808227539, "ref_logps/rejected": -47.56039047241211, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.9379786252975464, "rewards/margins": 0.9492916464805603, "rewards/rejected": -1.887270212173462, "step": 252 }, { "epoch": 1.91, "grad_norm": 9.317842509294723, "learning_rate": 2.0084269662921348e-07, "logps/chosen": -44.09011459350586, "logps/rejected": -59.16832733154297, "loss": 0.5178, "losses/dpo": 0.8791393041610718, "losses/sft": 2.401695489883423, "losses/total": 0.8791393041610718, "ref_logps/chosen": -35.33125305175781, "ref_logps/rejected": -43.098087310791016, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.8758863210678101, "rewards/margins": 0.7311373353004456, "rewards/rejected": -1.60702383518219, "step": 253 }, { "epoch": 1.92, "grad_norm": 8.201438226460152, "learning_rate": 1.9943820224719098e-07, "logps/chosen": -51.51540756225586, "logps/rejected": -67.69857788085938, "loss": 0.4104, "losses/dpo": 0.48046156764030457, "losses/sft": 1.7079527378082275, "losses/total": 0.48046156764030457, "ref_logps/chosen": -42.20224380493164, "ref_logps/rejected": -47.85743713378906, "rewards/accuracies": 0.8359375, "rewards/chosen": -0.9313161373138428, "rewards/margins": 1.0527985095977783, "rewards/rejected": -1.9841147661209106, "step": 254 }, { "epoch": 1.92, "grad_norm": 9.182124453243851, "learning_rate": 1.9803370786516854e-07, "logps/chosen": -52.86988067626953, "logps/rejected": -64.82228088378906, "loss": 0.4958, "losses/dpo": 0.4478102922439575, "losses/sft": 1.8136005401611328, "losses/total": 0.4478102922439575, "ref_logps/chosen": -43.32461929321289, "ref_logps/rejected": -48.31917190551758, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.9545266032218933, "rewards/margins": 0.6957840323448181, "rewards/rejected": -1.650310754776001, "step": 255 }, { "epoch": 1.93, "grad_norm": 8.664903887618609, "learning_rate": 1.9662921348314607e-07, "logps/chosen": -47.40139389038086, "logps/rejected": -61.24211502075195, "loss": 0.5069, "losses/dpo": 0.45637544989585876, "losses/sft": 2.080510139465332, "losses/total": 0.45637544989585876, "ref_logps/chosen": -37.96201705932617, "ref_logps/rejected": -44.53689956665039, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.9439379572868347, "rewards/margins": 0.7265833616256714, "rewards/rejected": -1.6705212593078613, "step": 256 }, { "epoch": 1.94, "grad_norm": 8.357609816704013, "learning_rate": 1.952247191011236e-07, "logps/chosen": -51.97878646850586, "logps/rejected": -66.72062683105469, "loss": 0.4312, "losses/dpo": 0.3072975277900696, "losses/sft": 1.8392665386199951, "losses/total": 0.3072975277900696, "ref_logps/chosen": -42.230186462402344, "ref_logps/rejected": -47.218257904052734, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9748601913452148, "rewards/margins": 0.9753769040107727, "rewards/rejected": -1.9502369165420532, "step": 257 }, { "epoch": 1.95, "grad_norm": 8.434371561430744, "learning_rate": 1.938202247191011e-07, "logps/chosen": -50.701534271240234, "logps/rejected": -65.6552734375, "loss": 0.4402, "losses/dpo": 0.4483451545238495, "losses/sft": 1.6883811950683594, "losses/total": 0.4483451545238495, "ref_logps/chosen": -40.96206283569336, "ref_logps/rejected": -46.52254104614258, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.9739474654197693, "rewards/margins": 0.9393259286880493, "rewards/rejected": -1.9132733345031738, "step": 258 }, { "epoch": 1.95, "grad_norm": 8.326106947840566, "learning_rate": 1.9241573033707863e-07, "logps/chosen": -49.05642318725586, "logps/rejected": -62.65263366699219, "loss": 0.4559, "losses/dpo": 0.43069154024124146, "losses/sft": 2.6451079845428467, "losses/total": 0.43069154024124146, "ref_logps/chosen": -39.344356536865234, "ref_logps/rejected": -43.80147933959961, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9712071418762207, "rewards/margins": 0.9139088988304138, "rewards/rejected": -1.8851161003112793, "step": 259 }, { "epoch": 1.96, "grad_norm": 10.317747735678177, "learning_rate": 1.9101123595505617e-07, "logps/chosen": -47.29045867919922, "logps/rejected": -60.76853942871094, "loss": 0.5233, "losses/dpo": 0.6233609914779663, "losses/sft": 2.166818618774414, "losses/total": 0.6233609914779663, "ref_logps/chosen": -37.4037971496582, "ref_logps/rejected": -43.672698974609375, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.9886665344238281, "rewards/margins": 0.7209180593490601, "rewards/rejected": -1.7095845937728882, "step": 260 }, { "epoch": 1.97, "grad_norm": 8.570482678193486, "learning_rate": 1.896067415730337e-07, "logps/chosen": -47.826866149902344, "logps/rejected": -64.69200897216797, "loss": 0.4646, "losses/dpo": 0.39222848415374756, "losses/sft": 1.7622792720794678, "losses/total": 0.39222848415374756, "ref_logps/chosen": -37.465206146240234, "ref_logps/rejected": -45.77225112915039, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0361659526824951, "rewards/margins": 0.8558104634284973, "rewards/rejected": -1.8919763565063477, "step": 261 }, { "epoch": 1.98, "grad_norm": 8.082044540577316, "learning_rate": 1.8820224719101123e-07, "logps/chosen": -44.3437385559082, "logps/rejected": -65.46928405761719, "loss": 0.4247, "losses/dpo": 0.25466495752334595, "losses/sft": 1.6573872566223145, "losses/total": 0.25466495752334595, "ref_logps/chosen": -35.26055908203125, "ref_logps/rejected": -46.34375, "rewards/accuracies": 0.84375, "rewards/chosen": -0.908318042755127, "rewards/margins": 1.0042363405227661, "rewards/rejected": -1.9125542640686035, "step": 262 }, { "epoch": 1.98, "grad_norm": 7.886213605680278, "learning_rate": 1.8679775280898876e-07, "logps/chosen": -44.65882873535156, "logps/rejected": -65.91732025146484, "loss": 0.4233, "losses/dpo": 0.37609466910362244, "losses/sft": 1.7309682369232178, "losses/total": 0.37609466910362244, "ref_logps/chosen": -34.98197555541992, "ref_logps/rejected": -45.339569091796875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9676854610443115, "rewards/margins": 1.0900897979736328, "rewards/rejected": -2.0577754974365234, "step": 263 }, { "epoch": 1.99, "grad_norm": 9.420349702400047, "learning_rate": 1.853932584269663e-07, "logps/chosen": -55.511512756347656, "logps/rejected": -67.9210205078125, "loss": 0.4846, "losses/dpo": 0.35744136571884155, "losses/sft": 1.9689966440200806, "losses/total": 0.35744136571884155, "ref_logps/chosen": -44.93099594116211, "ref_logps/rejected": -48.529052734375, "rewards/accuracies": 0.765625, "rewards/chosen": -1.058051347732544, "rewards/margins": 0.8811461925506592, "rewards/rejected": -1.9391975402832031, "step": 264 }, { "epoch": 2.0, "grad_norm": 8.90872581509833, "learning_rate": 1.8398876404494382e-07, "logps/chosen": -50.006412506103516, "logps/rejected": -65.82926940917969, "loss": 0.4658, "losses/dpo": 0.42064571380615234, "losses/sft": 1.965097427368164, "losses/total": 0.42064571380615234, "ref_logps/chosen": -39.1809196472168, "ref_logps/rejected": -45.722938537597656, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.0825499296188354, "rewards/margins": 0.9280825853347778, "rewards/rejected": -2.0106325149536133, "step": 265 }, { "epoch": 2.01, "grad_norm": 7.965650717330079, "learning_rate": 1.8258426966292135e-07, "logps/chosen": -50.00637435913086, "logps/rejected": -65.08876037597656, "loss": 0.4116, "losses/dpo": 0.36081230640411377, "losses/sft": 2.0271382331848145, "losses/total": 0.36081230640411377, "ref_logps/chosen": -39.65681076049805, "ref_logps/rejected": -44.160343170166016, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0349565744400024, "rewards/margins": 1.0578850507736206, "rewards/rejected": -2.092841625213623, "step": 266 }, { "epoch": 2.02, "grad_norm": 8.213010773638015, "learning_rate": 1.8117977528089888e-07, "logps/chosen": -49.70448303222656, "logps/rejected": -64.45352172851562, "loss": 0.4328, "losses/dpo": 0.4751141667366028, "losses/sft": 2.163590431213379, "losses/total": 0.4751141667366028, "ref_logps/chosen": -39.66984558105469, "ref_logps/rejected": -45.02206802368164, "rewards/accuracies": 0.7890625, "rewards/chosen": -1.0034637451171875, "rewards/margins": 0.9396811723709106, "rewards/rejected": -1.943144679069519, "step": 267 }, { "epoch": 2.02, "grad_norm": 8.627785952714778, "learning_rate": 1.7977528089887638e-07, "logps/chosen": -47.43844985961914, "logps/rejected": -61.57966995239258, "loss": 0.4366, "losses/dpo": 0.4777096211910248, "losses/sft": 2.011448860168457, "losses/total": 0.4777096211910248, "ref_logps/chosen": -37.887752532958984, "ref_logps/rejected": -42.577537536621094, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.9550699591636658, "rewards/margins": 0.945143461227417, "rewards/rejected": -1.9002132415771484, "step": 268 }, { "epoch": 2.03, "grad_norm": 9.54969996951565, "learning_rate": 1.7837078651685391e-07, "logps/chosen": -50.50410842895508, "logps/rejected": -64.97512817382812, "loss": 0.4367, "losses/dpo": 0.4186415672302246, "losses/sft": 2.4884321689605713, "losses/total": 0.4186415672302246, "ref_logps/chosen": -39.366878509521484, "ref_logps/rejected": -44.09857940673828, "rewards/accuracies": 0.7890625, "rewards/chosen": -1.1137233972549438, "rewards/margins": 0.973931610584259, "rewards/rejected": -2.0876548290252686, "step": 269 }, { "epoch": 2.04, "grad_norm": 7.541865642493473, "learning_rate": 1.7696629213483144e-07, "logps/chosen": -53.07981491088867, "logps/rejected": -69.67108917236328, "loss": 0.3686, "losses/dpo": 0.23445191979408264, "losses/sft": 1.9627153873443604, "losses/total": 0.23445191979408264, "ref_logps/chosen": -41.86531448364258, "ref_logps/rejected": -46.77862548828125, "rewards/accuracies": 0.890625, "rewards/chosen": -1.121450424194336, "rewards/margins": 1.1677953004837036, "rewards/rejected": -2.289245843887329, "step": 270 }, { "epoch": 2.05, "grad_norm": 7.78557132067994, "learning_rate": 1.75561797752809e-07, "logps/chosen": -44.59587478637695, "logps/rejected": -61.11756896972656, "loss": 0.4304, "losses/dpo": 0.40728461742401123, "losses/sft": 2.1670854091644287, "losses/total": 0.40728461742401123, "ref_logps/chosen": -34.50210189819336, "ref_logps/rejected": -41.35382843017578, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.009376883506775, "rewards/margins": 0.9669971466064453, "rewards/rejected": -1.9763740301132202, "step": 271 }, { "epoch": 2.05, "grad_norm": 8.691528110147415, "learning_rate": 1.741573033707865e-07, "logps/chosen": -47.1044807434082, "logps/rejected": -62.402366638183594, "loss": 0.4439, "losses/dpo": 0.6048084497451782, "losses/sft": 2.738722801208496, "losses/total": 0.6048084497451782, "ref_logps/chosen": -36.921566009521484, "ref_logps/rejected": -41.66175842285156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0182914733886719, "rewards/margins": 1.0557701587677002, "rewards/rejected": -2.074061632156372, "step": 272 }, { "epoch": 2.06, "grad_norm": 7.3620175100007135, "learning_rate": 1.7275280898876404e-07, "logps/chosen": -51.797447204589844, "logps/rejected": -69.3549575805664, "loss": 0.3635, "losses/dpo": 0.4010230600833893, "losses/sft": 1.782325029373169, "losses/total": 0.4010230600833893, "ref_logps/chosen": -41.61540222167969, "ref_logps/rejected": -47.2431526184082, "rewards/accuracies": 0.8671875, "rewards/chosen": -1.0182045698165894, "rewards/margins": 1.192975640296936, "rewards/rejected": -2.2111802101135254, "step": 273 }, { "epoch": 2.07, "grad_norm": 8.804306158193548, "learning_rate": 1.7134831460674157e-07, "logps/chosen": -51.407501220703125, "logps/rejected": -64.26744079589844, "loss": 0.4578, "losses/dpo": 0.6319560408592224, "losses/sft": 2.214840888977051, "losses/total": 0.6319560408592224, "ref_logps/chosen": -40.07988739013672, "ref_logps/rejected": -43.7442741394043, "rewards/accuracies": 0.7890625, "rewards/chosen": -1.1327617168426514, "rewards/margins": 0.919555127620697, "rewards/rejected": -2.052316665649414, "step": 274 }, { "epoch": 2.08, "grad_norm": 9.887135650412674, "learning_rate": 1.699438202247191e-07, "logps/chosen": -48.88153839111328, "logps/rejected": -64.42852783203125, "loss": 0.4198, "losses/dpo": 0.36265987157821655, "losses/sft": 2.2624428272247314, "losses/total": 0.36265987157821655, "ref_logps/chosen": -38.071693420410156, "ref_logps/rejected": -42.96941375732422, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.080984115600586, "rewards/margins": 1.064927339553833, "rewards/rejected": -2.145911455154419, "step": 275 }, { "epoch": 2.08, "grad_norm": 7.789984027153804, "learning_rate": 1.6853932584269663e-07, "logps/chosen": -53.47975158691406, "logps/rejected": -69.208251953125, "loss": 0.4077, "losses/dpo": 0.3306717872619629, "losses/sft": 1.8213623762130737, "losses/total": 0.3306717872619629, "ref_logps/chosen": -42.26997375488281, "ref_logps/rejected": -47.255218505859375, "rewards/accuracies": 0.859375, "rewards/chosen": -1.1209776401519775, "rewards/margins": 1.0743255615234375, "rewards/rejected": -2.195303440093994, "step": 276 }, { "epoch": 2.09, "grad_norm": 8.577961125042698, "learning_rate": 1.6713483146067413e-07, "logps/chosen": -48.9437255859375, "logps/rejected": -66.896484375, "loss": 0.4237, "losses/dpo": 0.375847727060318, "losses/sft": 1.7302836179733276, "losses/total": 0.375847727060318, "ref_logps/chosen": -38.10498046875, "ref_logps/rejected": -45.20066452026367, "rewards/accuracies": 0.828125, "rewards/chosen": -1.0838744640350342, "rewards/margins": 1.0857088565826416, "rewards/rejected": -2.1695830821990967, "step": 277 }, { "epoch": 2.1, "grad_norm": 7.88842347856655, "learning_rate": 1.6573033707865166e-07, "logps/chosen": -50.38557815551758, "logps/rejected": -71.69691467285156, "loss": 0.3884, "losses/dpo": 0.20783157646656036, "losses/sft": 1.65842604637146, "losses/total": 0.20783157646656036, "ref_logps/chosen": -38.84345245361328, "ref_logps/rejected": -47.66883850097656, "rewards/accuracies": 0.8671875, "rewards/chosen": -1.1542127132415771, "rewards/margins": 1.2485952377319336, "rewards/rejected": -2.40280818939209, "step": 278 }, { "epoch": 2.11, "grad_norm": 8.27071868454665, "learning_rate": 1.6432584269662922e-07, "logps/chosen": -50.050907135009766, "logps/rejected": -71.91120147705078, "loss": 0.3873, "losses/dpo": 0.36935174465179443, "losses/sft": 2.4310526847839355, "losses/total": 0.36935174465179443, "ref_logps/chosen": -38.60224914550781, "ref_logps/rejected": -48.42378234863281, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.1448655128479004, "rewards/margins": 1.2038761377334595, "rewards/rejected": -2.3487415313720703, "step": 279 }, { "epoch": 2.11, "grad_norm": 8.202604558490952, "learning_rate": 1.6292134831460675e-07, "logps/chosen": -49.78974533081055, "logps/rejected": -63.648345947265625, "loss": 0.4287, "losses/dpo": 0.4768953025341034, "losses/sft": 1.486172080039978, "losses/total": 0.4768953025341034, "ref_logps/chosen": -38.987091064453125, "ref_logps/rejected": -42.706016540527344, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0802651643753052, "rewards/margins": 1.0139687061309814, "rewards/rejected": -2.094233751296997, "step": 280 }, { "epoch": 2.12, "grad_norm": 7.6837084768811295, "learning_rate": 1.6151685393258428e-07, "logps/chosen": -46.654842376708984, "logps/rejected": -65.57063293457031, "loss": 0.4002, "losses/dpo": 0.25231242179870605, "losses/sft": 1.6833255290985107, "losses/total": 0.25231242179870605, "ref_logps/chosen": -35.983055114746094, "ref_logps/rejected": -43.31926345825195, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.0671789646148682, "rewards/margins": 1.1579577922821045, "rewards/rejected": -2.2251367568969727, "step": 281 }, { "epoch": 2.13, "grad_norm": 7.367492054152066, "learning_rate": 1.6011235955056178e-07, "logps/chosen": -47.970069885253906, "logps/rejected": -67.12972259521484, "loss": 0.3817, "losses/dpo": 0.2698941230773926, "losses/sft": 2.0059823989868164, "losses/total": 0.2698941230773926, "ref_logps/chosen": -37.96726608276367, "ref_logps/rejected": -44.837013244628906, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.0002803802490234, "rewards/margins": 1.2289901971817017, "rewards/rejected": -2.2292706966400146, "step": 282 }, { "epoch": 2.14, "grad_norm": 8.198398756823499, "learning_rate": 1.5870786516853931e-07, "logps/chosen": -52.00371551513672, "logps/rejected": -71.66854095458984, "loss": 0.373, "losses/dpo": 0.37371307611465454, "losses/sft": 2.166947841644287, "losses/total": 0.37371307611465454, "ref_logps/chosen": -40.8586540222168, "ref_logps/rejected": -47.34501266479492, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1145060062408447, "rewards/margins": 1.3178460597991943, "rewards/rejected": -2.432352304458618, "step": 283 }, { "epoch": 2.14, "grad_norm": 7.211631798978324, "learning_rate": 1.5730337078651685e-07, "logps/chosen": -45.30519104003906, "logps/rejected": -67.6802749633789, "loss": 0.3529, "losses/dpo": 0.6495727896690369, "losses/sft": 2.1896119117736816, "losses/total": 0.6495727896690369, "ref_logps/chosen": -35.870975494384766, "ref_logps/rejected": -45.01594924926758, "rewards/accuracies": 0.875, "rewards/chosen": -0.9434216022491455, "rewards/margins": 1.3230111598968506, "rewards/rejected": -2.266432523727417, "step": 284 }, { "epoch": 2.15, "grad_norm": 8.73649186021766, "learning_rate": 1.5589887640449438e-07, "logps/chosen": -51.24886703491211, "logps/rejected": -73.5231704711914, "loss": 0.3706, "losses/dpo": 0.2774621248245239, "losses/sft": 2.14704966545105, "losses/total": 0.2774621248245239, "ref_logps/chosen": -39.21043395996094, "ref_logps/rejected": -48.85007858276367, "rewards/accuracies": 0.859375, "rewards/chosen": -1.2038426399230957, "rewards/margins": 1.2634668350219727, "rewards/rejected": -2.4673094749450684, "step": 285 }, { "epoch": 2.16, "grad_norm": 9.965422332405613, "learning_rate": 1.5449438202247188e-07, "logps/chosen": -56.63559341430664, "logps/rejected": -69.31437683105469, "loss": 0.4581, "losses/dpo": 1.019913673400879, "losses/sft": 2.6583502292633057, "losses/total": 1.019913673400879, "ref_logps/chosen": -42.96681594848633, "ref_logps/rejected": -44.904361724853516, "rewards/accuracies": 0.765625, "rewards/chosen": -1.3668776750564575, "rewards/margins": 1.0741242170333862, "rewards/rejected": -2.4410018920898438, "step": 286 }, { "epoch": 2.17, "grad_norm": 9.04029000539786, "learning_rate": 1.5308988764044944e-07, "logps/chosen": -50.918128967285156, "logps/rejected": -67.66094970703125, "loss": 0.4491, "losses/dpo": 0.536713182926178, "losses/sft": 2.0050907135009766, "losses/total": 0.536713182926178, "ref_logps/chosen": -37.77289581298828, "ref_logps/rejected": -43.736061096191406, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3145227432250977, "rewards/margins": 1.0779664516448975, "rewards/rejected": -2.392489194869995, "step": 287 }, { "epoch": 2.17, "grad_norm": 7.9909005031619476, "learning_rate": 1.5168539325842697e-07, "logps/chosen": -48.0228271484375, "logps/rejected": -72.348876953125, "loss": 0.3569, "losses/dpo": 0.22953173518180847, "losses/sft": 1.752846360206604, "losses/total": 0.22953173518180847, "ref_logps/chosen": -36.91614532470703, "ref_logps/rejected": -47.88288497924805, "rewards/accuracies": 0.890625, "rewards/chosen": -1.1106677055358887, "rewards/margins": 1.3359307050704956, "rewards/rejected": -2.4465982913970947, "step": 288 }, { "epoch": 2.18, "grad_norm": 9.189297221788385, "learning_rate": 1.502808988764045e-07, "logps/chosen": -55.16273880004883, "logps/rejected": -74.13533782958984, "loss": 0.4081, "losses/dpo": 0.39626190066337585, "losses/sft": 2.1484994888305664, "losses/total": 0.39626190066337585, "ref_logps/chosen": -42.12677001953125, "ref_logps/rejected": -48.699642181396484, "rewards/accuracies": 0.7890625, "rewards/chosen": -1.3035968542099, "rewards/margins": 1.2399725914001465, "rewards/rejected": -2.543569326400757, "step": 289 }, { "epoch": 2.19, "grad_norm": 8.223258235820202, "learning_rate": 1.4887640449438203e-07, "logps/chosen": -46.8680534362793, "logps/rejected": -64.84513854980469, "loss": 0.3998, "losses/dpo": 0.38319000601768494, "losses/sft": 2.0748698711395264, "losses/total": 0.38319000601768494, "ref_logps/chosen": -35.66654968261719, "ref_logps/rejected": -42.36066818237305, "rewards/accuracies": 0.828125, "rewards/chosen": -1.1201505661010742, "rewards/margins": 1.1282968521118164, "rewards/rejected": -2.2484474182128906, "step": 290 }, { "epoch": 2.2, "grad_norm": 9.322598904199673, "learning_rate": 1.4747191011235953e-07, "logps/chosen": -51.94579315185547, "logps/rejected": -76.23491668701172, "loss": 0.4365, "losses/dpo": 0.6639813184738159, "losses/sft": 3.0463194847106934, "losses/total": 0.6639813184738159, "ref_logps/chosen": -37.92943572998047, "ref_logps/rejected": -49.77875518798828, "rewards/accuracies": 0.7578125, "rewards/chosen": -1.4016355276107788, "rewards/margins": 1.243980884552002, "rewards/rejected": -2.6456165313720703, "step": 291 }, { "epoch": 2.2, "grad_norm": 9.28502050286711, "learning_rate": 1.4606741573033706e-07, "logps/chosen": -51.56089782714844, "logps/rejected": -67.3809814453125, "loss": 0.4295, "losses/dpo": 0.2485855668783188, "losses/sft": 2.5399341583251953, "losses/total": 0.2485855668783188, "ref_logps/chosen": -38.323081970214844, "ref_logps/rejected": -43.60871505737305, "rewards/accuracies": 0.859375, "rewards/chosen": -1.3237823247909546, "rewards/margins": 1.0534443855285645, "rewards/rejected": -2.3772268295288086, "step": 292 }, { "epoch": 2.21, "grad_norm": 10.352671653799021, "learning_rate": 1.446629213483146e-07, "logps/chosen": -60.71104431152344, "logps/rejected": -76.54723358154297, "loss": 0.4205, "losses/dpo": 0.5032411813735962, "losses/sft": 2.2452447414398193, "losses/total": 0.5032411813735962, "ref_logps/chosen": -45.97325897216797, "ref_logps/rejected": -49.70647430419922, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.4737780094146729, "rewards/margins": 1.2102973461151123, "rewards/rejected": -2.684075117111206, "step": 293 }, { "epoch": 2.22, "grad_norm": 9.489700991538179, "learning_rate": 1.4325842696629212e-07, "logps/chosen": -51.33311462402344, "logps/rejected": -72.56240844726562, "loss": 0.4041, "losses/dpo": 0.39366570115089417, "losses/sft": 1.5643844604492188, "losses/total": 0.39366570115089417, "ref_logps/chosen": -38.85292053222656, "ref_logps/rejected": -47.41047668457031, "rewards/accuracies": 0.796875, "rewards/chosen": -1.2480189800262451, "rewards/margins": 1.2671747207641602, "rewards/rejected": -2.5151939392089844, "step": 294 }, { "epoch": 2.23, "grad_norm": 8.202282597323313, "learning_rate": 1.4185393258426968e-07, "logps/chosen": -51.06114959716797, "logps/rejected": -74.44361114501953, "loss": 0.3473, "losses/dpo": 0.40783169865608215, "losses/sft": 2.3142831325531006, "losses/total": 0.40783169865608215, "ref_logps/chosen": -39.6861457824707, "ref_logps/rejected": -49.402000427246094, "rewards/accuracies": 0.90625, "rewards/chosen": -1.1375010013580322, "rewards/margins": 1.3666609525680542, "rewards/rejected": -2.504162073135376, "step": 295 }, { "epoch": 2.23, "grad_norm": 8.616438775614668, "learning_rate": 1.4044943820224718e-07, "logps/chosen": -50.71443557739258, "logps/rejected": -67.99952697753906, "loss": 0.4191, "losses/dpo": 0.3447108566761017, "losses/sft": 1.6047589778900146, "losses/total": 0.3447108566761017, "ref_logps/chosen": -38.159549713134766, "ref_logps/rejected": -44.2279052734375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2554888725280762, "rewards/margins": 1.12167227268219, "rewards/rejected": -2.3771612644195557, "step": 296 }, { "epoch": 2.24, "grad_norm": 8.42295456953331, "learning_rate": 1.3904494382022472e-07, "logps/chosen": -50.11585235595703, "logps/rejected": -69.99391174316406, "loss": 0.4154, "losses/dpo": 0.6180249452590942, "losses/sft": 1.9767247438430786, "losses/total": 0.6180249452590942, "ref_logps/chosen": -37.855167388916016, "ref_logps/rejected": -45.18976974487305, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.2260689735412598, "rewards/margins": 1.2543449401855469, "rewards/rejected": -2.4804139137268066, "step": 297 }, { "epoch": 2.25, "grad_norm": 7.916024207524859, "learning_rate": 1.3764044943820225e-07, "logps/chosen": -48.01890563964844, "logps/rejected": -71.55242919921875, "loss": 0.3286, "losses/dpo": 0.39366215467453003, "losses/sft": 1.6183239221572876, "losses/total": 0.39366215467453003, "ref_logps/chosen": -36.65507888793945, "ref_logps/rejected": -45.11772537231445, "rewards/accuracies": 0.875, "rewards/chosen": -1.1363829374313354, "rewards/margins": 1.5070867538452148, "rewards/rejected": -2.64346981048584, "step": 298 }, { "epoch": 2.26, "grad_norm": 8.381587007100304, "learning_rate": 1.3623595505617978e-07, "logps/chosen": -48.97580337524414, "logps/rejected": -71.05843353271484, "loss": 0.369, "losses/dpo": 0.4669285714626312, "losses/sft": 2.4736876487731934, "losses/total": 0.4669285714626312, "ref_logps/chosen": -36.59314727783203, "ref_logps/rejected": -46.38197708129883, "rewards/accuracies": 0.8671875, "rewards/chosen": -1.2382655143737793, "rewards/margins": 1.229379415512085, "rewards/rejected": -2.4676451683044434, "step": 299 }, { "epoch": 2.26, "grad_norm": 7.816178695034966, "learning_rate": 1.3483146067415728e-07, "logps/chosen": -48.929466247558594, "logps/rejected": -70.30461883544922, "loss": 0.3683, "losses/dpo": 0.30282700061798096, "losses/sft": 2.068610906600952, "losses/total": 0.30282700061798096, "ref_logps/chosen": -36.281883239746094, "ref_logps/rejected": -44.1247444152832, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2647581100463867, "rewards/margins": 1.3532286882400513, "rewards/rejected": -2.6179869174957275, "step": 300 }, { "epoch": 2.27, "grad_norm": 9.42088429106351, "learning_rate": 1.334269662921348e-07, "logps/chosen": -53.99761199951172, "logps/rejected": -68.16130828857422, "loss": 0.4154, "losses/dpo": 0.7334519028663635, "losses/sft": 2.3190560340881348, "losses/total": 0.7334519028663635, "ref_logps/chosen": -41.94767379760742, "ref_logps/rejected": -43.898292541503906, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.2049940824508667, "rewards/margins": 1.2213077545166016, "rewards/rejected": -2.4263014793395996, "step": 301 }, { "epoch": 2.28, "grad_norm": 8.474867745213691, "learning_rate": 1.3202247191011234e-07, "logps/chosen": -56.917606353759766, "logps/rejected": -78.60893249511719, "loss": 0.3643, "losses/dpo": 0.30533695220947266, "losses/sft": 2.2758193016052246, "losses/total": 0.30533695220947266, "ref_logps/chosen": -43.154685974121094, "ref_logps/rejected": -50.671146392822266, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3762919902801514, "rewards/margins": 1.4174861907958984, "rewards/rejected": -2.793778419494629, "step": 302 }, { "epoch": 2.29, "grad_norm": 9.098388851491974, "learning_rate": 1.306179775280899e-07, "logps/chosen": -52.541847229003906, "logps/rejected": -75.27273559570312, "loss": 0.4022, "losses/dpo": 0.7690958380699158, "losses/sft": 2.406214714050293, "losses/total": 0.7690958380699158, "ref_logps/chosen": -39.33592224121094, "ref_logps/rejected": -50.239105224609375, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3205927610397339, "rewards/margins": 1.1827703714370728, "rewards/rejected": -2.5033628940582275, "step": 303 }, { "epoch": 2.29, "grad_norm": 7.915112908663982, "learning_rate": 1.2921348314606743e-07, "logps/chosen": -52.54154968261719, "logps/rejected": -79.33431243896484, "loss": 0.3, "losses/dpo": 0.22580446302890778, "losses/sft": 2.1299729347229004, "losses/total": 0.22580446302890778, "ref_logps/chosen": -39.62370300292969, "ref_logps/rejected": -51.30101013183594, "rewards/accuracies": 0.90625, "rewards/chosen": -1.2917848825454712, "rewards/margins": 1.511545181274414, "rewards/rejected": -2.8033299446105957, "step": 304 }, { "epoch": 2.3, "grad_norm": 8.60697942251823, "learning_rate": 1.2780898876404493e-07, "logps/chosen": -54.95063781738281, "logps/rejected": -76.17906188964844, "loss": 0.3787, "losses/dpo": 0.48840370774269104, "losses/sft": 2.068924903869629, "losses/total": 0.48840370774269104, "ref_logps/chosen": -40.245113372802734, "ref_logps/rejected": -48.299339294433594, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.4705531597137451, "rewards/margins": 1.317419409751892, "rewards/rejected": -2.7879724502563477, "step": 305 }, { "epoch": 2.31, "grad_norm": 8.916108799954989, "learning_rate": 1.2640449438202246e-07, "logps/chosen": -54.08748245239258, "logps/rejected": -72.3311767578125, "loss": 0.384, "losses/dpo": 0.27609461545944214, "losses/sft": 2.02748703956604, "losses/total": 0.27609461545944214, "ref_logps/chosen": -40.30693817138672, "ref_logps/rejected": -46.038856506347656, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3780547380447388, "rewards/margins": 1.2511768341064453, "rewards/rejected": -2.6292316913604736, "step": 306 }, { "epoch": 2.32, "grad_norm": 9.08799197478477, "learning_rate": 1.25e-07, "logps/chosen": -54.119285583496094, "logps/rejected": -65.3755111694336, "loss": 0.4074, "losses/dpo": 0.314796507358551, "losses/sft": 2.275911808013916, "losses/total": 0.314796507358551, "ref_logps/chosen": -40.99217224121094, "ref_logps/rejected": -41.43158721923828, "rewards/accuracies": 0.8125, "rewards/chosen": -1.312711477279663, "rewards/margins": 1.081681251525879, "rewards/rejected": -2.394392967224121, "step": 307 }, { "epoch": 2.32, "grad_norm": 8.103580469459054, "learning_rate": 1.2359550561797752e-07, "logps/chosen": -51.23326873779297, "logps/rejected": -75.66114807128906, "loss": 0.3713, "losses/dpo": 0.387349396944046, "losses/sft": 2.3062949180603027, "losses/total": 0.387349396944046, "ref_logps/chosen": -36.80121994018555, "ref_logps/rejected": -47.23471450805664, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.4432051181793213, "rewards/margins": 1.399438738822937, "rewards/rejected": -2.842643976211548, "step": 308 }, { "epoch": 2.33, "grad_norm": 8.263469946945373, "learning_rate": 1.2219101123595506e-07, "logps/chosen": -51.04032897949219, "logps/rejected": -73.06275177001953, "loss": 0.3571, "losses/dpo": 0.4852275252342224, "losses/sft": 2.0593721866607666, "losses/total": 0.4852275252342224, "ref_logps/chosen": -37.46986770629883, "ref_logps/rejected": -45.448211669921875, "rewards/accuracies": 0.859375, "rewards/chosen": -1.3570460081100464, "rewards/margins": 1.4044082164764404, "rewards/rejected": -2.7614541053771973, "step": 309 }, { "epoch": 2.34, "grad_norm": 9.638610818670363, "learning_rate": 1.2078651685393259e-07, "logps/chosen": -61.78599166870117, "logps/rejected": -79.4295883178711, "loss": 0.3814, "losses/dpo": 0.3531866669654846, "losses/sft": 2.5623600482940674, "losses/total": 0.3531866669654846, "ref_logps/chosen": -47.63096618652344, "ref_logps/rejected": -52.177154541015625, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4155021905899048, "rewards/margins": 1.3097403049468994, "rewards/rejected": -2.7252423763275146, "step": 310 }, { "epoch": 2.35, "grad_norm": 8.100555853256143, "learning_rate": 1.1938202247191012e-07, "logps/chosen": -51.98362350463867, "logps/rejected": -76.51303100585938, "loss": 0.3451, "losses/dpo": 0.25824400782585144, "losses/sft": 1.8786492347717285, "losses/total": 0.25824400782585144, "ref_logps/chosen": -38.72646713256836, "ref_logps/rejected": -48.84027862548828, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.3257157802581787, "rewards/margins": 1.4415602684020996, "rewards/rejected": -2.767275810241699, "step": 311 }, { "epoch": 2.35, "grad_norm": 8.895756271806501, "learning_rate": 1.1797752808988763e-07, "logps/chosen": -54.543087005615234, "logps/rejected": -75.70824432373047, "loss": 0.3937, "losses/dpo": 0.44249895215034485, "losses/sft": 2.0751869678497314, "losses/total": 0.44249895215034485, "ref_logps/chosen": -40.311073303222656, "ref_logps/rejected": -47.72747802734375, "rewards/accuracies": 0.7734375, "rewards/chosen": -1.423201322555542, "rewards/margins": 1.3748749494552612, "rewards/rejected": -2.7980761528015137, "step": 312 }, { "epoch": 2.36, "grad_norm": 9.181986369222448, "learning_rate": 1.1657303370786515e-07, "logps/chosen": -53.058631896972656, "logps/rejected": -76.43999481201172, "loss": 0.3426, "losses/dpo": 0.37313902378082275, "losses/sft": 1.9281624555587769, "losses/total": 0.37313902378082275, "ref_logps/chosen": -38.72821044921875, "ref_logps/rejected": -47.35258483886719, "rewards/accuracies": 0.9140625, "rewards/chosen": -1.433042049407959, "rewards/margins": 1.4756982326507568, "rewards/rejected": -2.908740282058716, "step": 313 }, { "epoch": 2.37, "grad_norm": 10.667806756150233, "learning_rate": 1.151685393258427e-07, "logps/chosen": -55.91019058227539, "logps/rejected": -73.06080627441406, "loss": 0.4539, "losses/dpo": 0.27354708313941956, "losses/sft": 2.2720413208007812, "losses/total": 0.27354708313941956, "ref_logps/chosen": -40.70777893066406, "ref_logps/rejected": -46.47440719604492, "rewards/accuracies": 0.7734375, "rewards/chosen": -1.520241141319275, "rewards/margins": 1.138399600982666, "rewards/rejected": -2.6586403846740723, "step": 314 }, { "epoch": 2.38, "grad_norm": 9.237157441443594, "learning_rate": 1.1376404494382023e-07, "logps/chosen": -52.99339294433594, "logps/rejected": -77.64442443847656, "loss": 0.413, "losses/dpo": 0.5332150459289551, "losses/sft": 2.2471044063568115, "losses/total": 0.5332150459289551, "ref_logps/chosen": -38.779422760009766, "ref_logps/rejected": -49.12788772583008, "rewards/accuracies": 0.8046875, "rewards/chosen": -1.4213968515396118, "rewards/margins": 1.4302568435668945, "rewards/rejected": -2.851653814315796, "step": 315 }, { "epoch": 2.38, "grad_norm": 10.26818606773468, "learning_rate": 1.1235955056179774e-07, "logps/chosen": -54.45465087890625, "logps/rejected": -77.27416229248047, "loss": 0.3916, "losses/dpo": 0.3256514072418213, "losses/sft": 2.224207878112793, "losses/total": 0.3256514072418213, "ref_logps/chosen": -39.11296463012695, "ref_logps/rejected": -47.37888717651367, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.5341691970825195, "rewards/margins": 1.4553582668304443, "rewards/rejected": -2.9895272254943848, "step": 316 }, { "epoch": 2.39, "grad_norm": 8.908115110424331, "learning_rate": 1.1095505617977527e-07, "logps/chosen": -52.21783447265625, "logps/rejected": -74.33990478515625, "loss": 0.3491, "losses/dpo": 0.4351283013820648, "losses/sft": 2.3193869590759277, "losses/total": 0.4351283013820648, "ref_logps/chosen": -38.05769348144531, "ref_logps/rejected": -46.181705474853516, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.4160147905349731, "rewards/margins": 1.3998043537139893, "rewards/rejected": -2.815819263458252, "step": 317 }, { "epoch": 2.4, "grad_norm": 9.20983406154459, "learning_rate": 1.095505617977528e-07, "logps/chosen": -51.61767578125, "logps/rejected": -72.1242446899414, "loss": 0.4144, "losses/dpo": 0.21413980424404144, "losses/sft": 1.885907530784607, "losses/total": 0.21413980424404144, "ref_logps/chosen": -37.064613342285156, "ref_logps/rejected": -44.01332473754883, "rewards/accuracies": 0.8046875, "rewards/chosen": -1.4553061723709106, "rewards/margins": 1.355785608291626, "rewards/rejected": -2.811091899871826, "step": 318 }, { "epoch": 2.41, "grad_norm": 9.683467779206334, "learning_rate": 1.0814606741573033e-07, "logps/chosen": -52.75745391845703, "logps/rejected": -68.34822082519531, "loss": 0.4051, "losses/dpo": 0.5242694020271301, "losses/sft": 1.8490060567855835, "losses/total": 0.5242694020271301, "ref_logps/chosen": -38.73335266113281, "ref_logps/rejected": -42.334041595458984, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4024099111557007, "rewards/margins": 1.1990087032318115, "rewards/rejected": -2.6014187335968018, "step": 319 }, { "epoch": 2.42, "grad_norm": 10.094879725137059, "learning_rate": 1.0674157303370785e-07, "logps/chosen": -55.7940788269043, "logps/rejected": -71.7145767211914, "loss": 0.4083, "losses/dpo": 0.33178359270095825, "losses/sft": 2.2600364685058594, "losses/total": 0.33178359270095825, "ref_logps/chosen": -41.903411865234375, "ref_logps/rejected": -45.71622848510742, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.389066457748413, "rewards/margins": 1.2107690572738647, "rewards/rejected": -2.5998356342315674, "step": 320 }, { "epoch": 2.42, "grad_norm": 8.737981073393495, "learning_rate": 1.0533707865168538e-07, "logps/chosen": -52.43000793457031, "logps/rejected": -70.14034271240234, "loss": 0.4104, "losses/dpo": 0.4298698902130127, "losses/sft": 1.832787036895752, "losses/total": 0.4298698902130127, "ref_logps/chosen": -38.05841064453125, "ref_logps/rejected": -43.76458740234375, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4371598958969116, "rewards/margins": 1.2004159688949585, "rewards/rejected": -2.63757586479187, "step": 321 }, { "epoch": 2.43, "grad_norm": 8.489155654631809, "learning_rate": 1.0393258426966293e-07, "logps/chosen": -54.25529479980469, "logps/rejected": -76.93711853027344, "loss": 0.3281, "losses/dpo": 0.2001137137413025, "losses/sft": 1.605088710784912, "losses/total": 0.2001137137413025, "ref_logps/chosen": -41.82042694091797, "ref_logps/rejected": -49.489280700683594, "rewards/accuracies": 0.859375, "rewards/chosen": -1.2434866428375244, "rewards/margins": 1.5012969970703125, "rewards/rejected": -2.744783401489258, "step": 322 }, { "epoch": 2.44, "grad_norm": 8.976932361180113, "learning_rate": 1.0252808988764044e-07, "logps/chosen": -51.483985900878906, "logps/rejected": -75.30467224121094, "loss": 0.3253, "losses/dpo": 0.28753212094306946, "losses/sft": 2.173304557800293, "losses/total": 0.28753212094306946, "ref_logps/chosen": -38.227813720703125, "ref_logps/rejected": -47.102657318115234, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.3256170749664307, "rewards/margins": 1.4945844411849976, "rewards/rejected": -2.8202013969421387, "step": 323 }, { "epoch": 2.45, "grad_norm": 7.593474701084862, "learning_rate": 1.0112359550561797e-07, "logps/chosen": -47.74129867553711, "logps/rejected": -69.7999267578125, "loss": 0.3669, "losses/dpo": 0.43520650267601013, "losses/sft": 1.8445793390274048, "losses/total": 0.43520650267601013, "ref_logps/chosen": -35.14937210083008, "ref_logps/rejected": -43.280242919921875, "rewards/accuracies": 0.828125, "rewards/chosen": -1.259192943572998, "rewards/margins": 1.3927757740020752, "rewards/rejected": -2.6519687175750732, "step": 324 }, { "epoch": 2.45, "grad_norm": 9.113140126947922, "learning_rate": 9.971910112359549e-08, "logps/chosen": -51.48912811279297, "logps/rejected": -75.93363189697266, "loss": 0.3625, "losses/dpo": 0.271272748708725, "losses/sft": 2.1029720306396484, "losses/total": 0.271272748708725, "ref_logps/chosen": -38.47105026245117, "ref_logps/rejected": -48.987518310546875, "rewards/accuracies": 0.859375, "rewards/chosen": -1.3018079996109009, "rewards/margins": 1.3928041458129883, "rewards/rejected": -2.6946120262145996, "step": 325 }, { "epoch": 2.46, "grad_norm": 9.203607647069793, "learning_rate": 9.831460674157303e-08, "logps/chosen": -56.46796417236328, "logps/rejected": -72.37566375732422, "loss": 0.3758, "losses/dpo": 0.27879202365875244, "losses/sft": 1.8894522190093994, "losses/total": 0.27879202365875244, "ref_logps/chosen": -43.5599365234375, "ref_logps/rejected": -46.51646423339844, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.2908027172088623, "rewards/margins": 1.295116901397705, "rewards/rejected": -2.5859196186065674, "step": 326 }, { "epoch": 2.47, "grad_norm": 8.9943305703751, "learning_rate": 9.691011235955055e-08, "logps/chosen": -55.962684631347656, "logps/rejected": -77.36865997314453, "loss": 0.383, "losses/dpo": 0.2792072296142578, "losses/sft": 1.8898770809173584, "losses/total": 0.2792072296142578, "ref_logps/chosen": -41.8868522644043, "ref_logps/rejected": -48.986183166503906, "rewards/accuracies": 0.859375, "rewards/chosen": -1.4075829982757568, "rewards/margins": 1.430665373802185, "rewards/rejected": -2.8382484912872314, "step": 327 }, { "epoch": 2.48, "grad_norm": 9.502289617698672, "learning_rate": 9.550561797752808e-08, "logps/chosen": -50.20685577392578, "logps/rejected": -66.51139831542969, "loss": 0.441, "losses/dpo": 0.2940795123577118, "losses/sft": 2.298060894012451, "losses/total": 0.2940795123577118, "ref_logps/chosen": -37.659244537353516, "ref_logps/rejected": -42.670841217041016, "rewards/accuracies": 0.8046875, "rewards/chosen": -1.254760980606079, "rewards/margins": 1.129294514656067, "rewards/rejected": -2.3840553760528564, "step": 328 }, { "epoch": 2.48, "grad_norm": 9.657263439264016, "learning_rate": 9.410112359550561e-08, "logps/chosen": -54.32592010498047, "logps/rejected": -69.8943862915039, "loss": 0.4163, "losses/dpo": 0.6257603764533997, "losses/sft": 2.595241069793701, "losses/total": 0.6257603764533997, "ref_logps/chosen": -40.53725814819336, "ref_logps/rejected": -43.76425552368164, "rewards/accuracies": 0.796875, "rewards/chosen": -1.37886643409729, "rewards/margins": 1.2341458797454834, "rewards/rejected": -2.6130123138427734, "step": 329 }, { "epoch": 2.49, "grad_norm": 9.274122078417514, "learning_rate": 9.269662921348314e-08, "logps/chosen": -54.48114013671875, "logps/rejected": -74.62823486328125, "loss": 0.3801, "losses/dpo": 0.46192625164985657, "losses/sft": 2.048821449279785, "losses/total": 0.46192625164985657, "ref_logps/chosen": -40.628318786621094, "ref_logps/rejected": -47.394065856933594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.385282278060913, "rewards/margins": 1.3381340503692627, "rewards/rejected": -2.7234160900115967, "step": 330 }, { "epoch": 2.5, "grad_norm": 11.530277636718996, "learning_rate": 9.129213483146067e-08, "logps/chosen": -51.81992721557617, "logps/rejected": -78.34001922607422, "loss": 0.3378, "losses/dpo": 0.36353716254234314, "losses/sft": 2.3431830406188965, "losses/total": 0.36353716254234314, "ref_logps/chosen": -38.79859161376953, "ref_logps/rejected": -50.49066925048828, "rewards/accuracies": 0.875, "rewards/chosen": -1.302133560180664, "rewards/margins": 1.4828013181686401, "rewards/rejected": -2.7849345207214355, "step": 331 }, { "epoch": 2.51, "grad_norm": 9.663796087127917, "learning_rate": 8.988764044943819e-08, "logps/chosen": -56.09492492675781, "logps/rejected": -77.04325866699219, "loss": 0.3814, "losses/dpo": 0.25816428661346436, "losses/sft": 2.8091163635253906, "losses/total": 0.25816428661346436, "ref_logps/chosen": -40.704925537109375, "ref_logps/rejected": -47.98851013183594, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.5390002727508545, "rewards/margins": 1.3664746284484863, "rewards/rejected": -2.90547513961792, "step": 332 }, { "epoch": 2.51, "grad_norm": 9.817894775320998, "learning_rate": 8.848314606741572e-08, "logps/chosen": -53.74916076660156, "logps/rejected": -71.23921966552734, "loss": 0.4129, "losses/dpo": 0.5282669067382812, "losses/sft": 2.027956962585449, "losses/total": 0.5282669067382812, "ref_logps/chosen": -40.569034576416016, "ref_logps/rejected": -45.03144073486328, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3180131912231445, "rewards/margins": 1.302764654159546, "rewards/rejected": -2.6207778453826904, "step": 333 }, { "epoch": 2.52, "grad_norm": 9.915518749588111, "learning_rate": 8.707865168539325e-08, "logps/chosen": -53.48023986816406, "logps/rejected": -72.4287109375, "loss": 0.4231, "losses/dpo": 0.4929217994213104, "losses/sft": 2.577164888381958, "losses/total": 0.4929217994213104, "ref_logps/chosen": -39.79558563232422, "ref_logps/rejected": -45.82670593261719, "rewards/accuracies": 0.796875, "rewards/chosen": -1.3684654235839844, "rewards/margins": 1.291735291481018, "rewards/rejected": -2.660200595855713, "step": 334 }, { "epoch": 2.53, "grad_norm": 9.335165389726255, "learning_rate": 8.567415730337078e-08, "logps/chosen": -52.41722106933594, "logps/rejected": -71.85494995117188, "loss": 0.3694, "losses/dpo": 0.317619651556015, "losses/sft": 2.0792832374572754, "losses/total": 0.317619651556015, "ref_logps/chosen": -39.57048416137695, "ref_logps/rejected": -46.20240783691406, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.2846734523773193, "rewards/margins": 1.280580997467041, "rewards/rejected": -2.5652544498443604, "step": 335 }, { "epoch": 2.54, "grad_norm": 9.185058383312379, "learning_rate": 8.426966292134831e-08, "logps/chosen": -56.19029235839844, "logps/rejected": -80.49638366699219, "loss": 0.3411, "losses/dpo": 0.2321043312549591, "losses/sft": 1.5318742990493774, "losses/total": 0.2321043312549591, "ref_logps/chosen": -41.53580856323242, "ref_logps/rejected": -50.73744201660156, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4654479026794434, "rewards/margins": 1.5104467868804932, "rewards/rejected": -2.9758946895599365, "step": 336 }, { "epoch": 2.54, "grad_norm": 8.541745993770446, "learning_rate": 8.286516853932583e-08, "logps/chosen": -49.73480987548828, "logps/rejected": -73.88976287841797, "loss": 0.3512, "losses/dpo": 0.2616669237613678, "losses/sft": 1.7109529972076416, "losses/total": 0.2616669237613678, "ref_logps/chosen": -37.30692672729492, "ref_logps/rejected": -46.78838348388672, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.2427881956100464, "rewards/margins": 1.4673501253128052, "rewards/rejected": -2.7101383209228516, "step": 337 }, { "epoch": 2.55, "grad_norm": 10.42078485613911, "learning_rate": 8.146067415730337e-08, "logps/chosen": -52.26924514770508, "logps/rejected": -67.19551086425781, "loss": 0.4575, "losses/dpo": 0.4895854890346527, "losses/sft": 2.276334762573242, "losses/total": 0.4895854890346527, "ref_logps/chosen": -39.17838668823242, "ref_logps/rejected": -43.06511688232422, "rewards/accuracies": 0.7890625, "rewards/chosen": -1.3090859651565552, "rewards/margins": 1.1039537191390991, "rewards/rejected": -2.4130399227142334, "step": 338 }, { "epoch": 2.56, "grad_norm": 10.673884752576804, "learning_rate": 8.005617977528089e-08, "logps/chosen": -54.1285514831543, "logps/rejected": -66.32559967041016, "loss": 0.4616, "losses/dpo": 0.3069703280925751, "losses/sft": 1.7043497562408447, "losses/total": 0.3069703280925751, "ref_logps/chosen": -39.07845687866211, "ref_logps/rejected": -40.795433044433594, "rewards/accuracies": 0.7421875, "rewards/chosen": -1.5050091743469238, "rewards/margins": 1.0480072498321533, "rewards/rejected": -2.5530166625976562, "step": 339 }, { "epoch": 2.57, "grad_norm": 7.734430324176608, "learning_rate": 7.865168539325842e-08, "logps/chosen": -51.84765625, "logps/rejected": -73.76638793945312, "loss": 0.343, "losses/dpo": 0.21601220965385437, "losses/sft": 1.731180191040039, "losses/total": 0.21601220965385437, "ref_logps/chosen": -38.94496154785156, "ref_logps/rejected": -47.23394012451172, "rewards/accuracies": 0.8671875, "rewards/chosen": -1.2902700901031494, "rewards/margins": 1.3629752397537231, "rewards/rejected": -2.653244972229004, "step": 340 }, { "epoch": 2.57, "grad_norm": 9.718523923824968, "learning_rate": 7.724719101123594e-08, "logps/chosen": -54.3284797668457, "logps/rejected": -74.42861938476562, "loss": 0.4342, "losses/dpo": 0.5424623489379883, "losses/sft": 2.519442081451416, "losses/total": 0.5424623489379883, "ref_logps/chosen": -40.32830047607422, "ref_logps/rejected": -47.90203094482422, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4000180959701538, "rewards/margins": 1.2526406049728394, "rewards/rejected": -2.652658700942993, "step": 341 }, { "epoch": 2.58, "grad_norm": 8.766478003038216, "learning_rate": 7.584269662921348e-08, "logps/chosen": -56.134185791015625, "logps/rejected": -74.1839370727539, "loss": 0.3627, "losses/dpo": 0.2985873520374298, "losses/sft": 2.3777570724487305, "losses/total": 0.2985873520374298, "ref_logps/chosen": -40.96846008300781, "ref_logps/rejected": -45.8743896484375, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.5165728330612183, "rewards/margins": 1.3143821954727173, "rewards/rejected": -2.8309547901153564, "step": 342 }, { "epoch": 2.59, "grad_norm": 10.011243908821191, "learning_rate": 7.443820224719101e-08, "logps/chosen": -51.62477493286133, "logps/rejected": -70.33786010742188, "loss": 0.4342, "losses/dpo": 0.337339848279953, "losses/sft": 2.341553211212158, "losses/total": 0.337339848279953, "ref_logps/chosen": -37.967830657958984, "ref_logps/rejected": -45.28611755371094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.365694522857666, "rewards/margins": 1.1394801139831543, "rewards/rejected": -2.5051746368408203, "step": 343 }, { "epoch": 2.6, "grad_norm": 9.59771843166304, "learning_rate": 7.303370786516853e-08, "logps/chosen": -51.54905319213867, "logps/rejected": -71.37974548339844, "loss": 0.4085, "losses/dpo": 0.39244934916496277, "losses/sft": 1.864844799041748, "losses/total": 0.39244934916496277, "ref_logps/chosen": -38.66598129272461, "ref_logps/rejected": -45.7724609375, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.2883074283599854, "rewards/margins": 1.272420883178711, "rewards/rejected": -2.5607285499572754, "step": 344 }, { "epoch": 2.6, "grad_norm": 8.700784807594031, "learning_rate": 7.162921348314606e-08, "logps/chosen": -56.775753021240234, "logps/rejected": -77.216796875, "loss": 0.3416, "losses/dpo": 0.22181375324726105, "losses/sft": 2.4348971843719482, "losses/total": 0.22181375324726105, "ref_logps/chosen": -42.833614349365234, "ref_logps/rejected": -49.19655227661133, "rewards/accuracies": 0.8984375, "rewards/chosen": -1.3942136764526367, "rewards/margins": 1.4078103303909302, "rewards/rejected": -2.8020238876342773, "step": 345 }, { "epoch": 2.61, "grad_norm": 9.30159664736646, "learning_rate": 7.022471910112359e-08, "logps/chosen": -48.35330581665039, "logps/rejected": -68.90961456298828, "loss": 0.4087, "losses/dpo": 0.43862560391426086, "losses/sft": 1.861382007598877, "losses/total": 0.43862560391426086, "ref_logps/chosen": -35.64689254760742, "ref_logps/rejected": -43.799800872802734, "rewards/accuracies": 0.8125, "rewards/chosen": -1.270641803741455, "rewards/margins": 1.240339756011963, "rewards/rejected": -2.510981559753418, "step": 346 }, { "epoch": 2.62, "grad_norm": 9.466845106547478, "learning_rate": 6.882022471910112e-08, "logps/chosen": -52.57171630859375, "logps/rejected": -66.74671173095703, "loss": 0.3995, "losses/dpo": 0.5215581655502319, "losses/sft": 2.0002975463867188, "losses/total": 0.5215581655502319, "ref_logps/chosen": -39.363014221191406, "ref_logps/rejected": -41.616329193115234, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.3208706378936768, "rewards/margins": 1.1921679973602295, "rewards/rejected": -2.5130386352539062, "step": 347 }, { "epoch": 2.63, "grad_norm": 8.897974125441753, "learning_rate": 6.741573033707864e-08, "logps/chosen": -54.96879577636719, "logps/rejected": -71.28080749511719, "loss": 0.4172, "losses/dpo": 0.6290773749351501, "losses/sft": 2.65497088432312, "losses/total": 0.6290773749351501, "ref_logps/chosen": -41.37034225463867, "ref_logps/rejected": -45.2210578918457, "rewards/accuracies": 0.796875, "rewards/chosen": -1.3598453998565674, "rewards/margins": 1.2461297512054443, "rewards/rejected": -2.605975389480591, "step": 348 }, { "epoch": 2.63, "grad_norm": 8.450179202589874, "learning_rate": 6.601123595505617e-08, "logps/chosen": -56.42803192138672, "logps/rejected": -78.08199310302734, "loss": 0.3279, "losses/dpo": 0.2672095000743866, "losses/sft": 1.7004587650299072, "losses/total": 0.2672095000743866, "ref_logps/chosen": -43.05862808227539, "ref_logps/rejected": -49.85722732543945, "rewards/accuracies": 0.8828125, "rewards/chosen": -1.3369402885437012, "rewards/margins": 1.4855366945266724, "rewards/rejected": -2.822477102279663, "step": 349 }, { "epoch": 2.64, "grad_norm": 9.946490252779716, "learning_rate": 6.460674157303371e-08, "logps/chosen": -52.30256652832031, "logps/rejected": -67.33949279785156, "loss": 0.4179, "losses/dpo": 0.23541654646396637, "losses/sft": 1.6304823160171509, "losses/total": 0.23541654646396637, "ref_logps/chosen": -39.795745849609375, "ref_logps/rejected": -43.05610275268555, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.2506815195083618, "rewards/margins": 1.177657961845398, "rewards/rejected": -2.4283392429351807, "step": 350 }, { "epoch": 2.65, "grad_norm": 8.420756732795637, "learning_rate": 6.320224719101123e-08, "logps/chosen": -50.62635040283203, "logps/rejected": -71.55708312988281, "loss": 0.3839, "losses/dpo": 0.3438160717487335, "losses/sft": 1.8534799814224243, "losses/total": 0.3438160717487335, "ref_logps/chosen": -39.325897216796875, "ref_logps/rejected": -46.86465835571289, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.1300455331802368, "rewards/margins": 1.3391977548599243, "rewards/rejected": -2.469243049621582, "step": 351 }, { "epoch": 2.66, "grad_norm": 9.697968850403186, "learning_rate": 6.179775280898876e-08, "logps/chosen": -54.621578216552734, "logps/rejected": -70.27922058105469, "loss": 0.4183, "losses/dpo": 0.35120806097984314, "losses/sft": 2.030266284942627, "losses/total": 0.35120806097984314, "ref_logps/chosen": -41.76460266113281, "ref_logps/rejected": -45.33925247192383, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2856972217559814, "rewards/margins": 1.2083001136779785, "rewards/rejected": -2.49399733543396, "step": 352 }, { "epoch": 2.66, "grad_norm": 9.120026907057964, "learning_rate": 6.039325842696629e-08, "logps/chosen": -52.02517318725586, "logps/rejected": -74.53661346435547, "loss": 0.409, "losses/dpo": 0.3873208463191986, "losses/sft": 1.7444610595703125, "losses/total": 0.3873208463191986, "ref_logps/chosen": -38.21184539794922, "ref_logps/rejected": -49.15116882324219, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3813323974609375, "rewards/margins": 1.157212495803833, "rewards/rejected": -2.5385448932647705, "step": 353 }, { "epoch": 2.67, "grad_norm": 9.69567118291811, "learning_rate": 5.898876404494382e-08, "logps/chosen": -52.73221969604492, "logps/rejected": -70.13288879394531, "loss": 0.4226, "losses/dpo": 0.3050675392150879, "losses/sft": 1.7437413930892944, "losses/total": 0.3050675392150879, "ref_logps/chosen": -40.23515701293945, "ref_logps/rejected": -45.879547119140625, "rewards/accuracies": 0.8046875, "rewards/chosen": -1.2497066259384155, "rewards/margins": 1.1756272315979004, "rewards/rejected": -2.4253337383270264, "step": 354 }, { "epoch": 2.68, "grad_norm": 8.584053557094956, "learning_rate": 5.758426966292135e-08, "logps/chosen": -57.64381408691406, "logps/rejected": -72.0084457397461, "loss": 0.3835, "losses/dpo": 0.40820345282554626, "losses/sft": 2.4096083641052246, "losses/total": 0.40820345282554626, "ref_logps/chosen": -43.7611198425293, "ref_logps/rejected": -46.36372375488281, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3882694244384766, "rewards/margins": 1.176202654838562, "rewards/rejected": -2.564471960067749, "step": 355 }, { "epoch": 2.69, "grad_norm": 8.221246825721817, "learning_rate": 5.617977528089887e-08, "logps/chosen": -46.82723617553711, "logps/rejected": -68.99041748046875, "loss": 0.3539, "losses/dpo": 0.29153013229370117, "losses/sft": 1.427022099494934, "losses/total": 0.29153013229370117, "ref_logps/chosen": -35.502262115478516, "ref_logps/rejected": -43.46721649169922, "rewards/accuracies": 0.859375, "rewards/chosen": -1.132498025894165, "rewards/margins": 1.419821858406067, "rewards/rejected": -2.5523197650909424, "step": 356 }, { "epoch": 2.69, "grad_norm": 9.482797590566683, "learning_rate": 5.47752808988764e-08, "logps/chosen": -51.7399787902832, "logps/rejected": -71.89605712890625, "loss": 0.3979, "losses/dpo": 0.21694956719875336, "losses/sft": 1.9680830240249634, "losses/total": 0.21694956719875336, "ref_logps/chosen": -38.23194122314453, "ref_logps/rejected": -45.57975769042969, "rewards/accuracies": 0.859375, "rewards/chosen": -1.350803256034851, "rewards/margins": 1.2808265686035156, "rewards/rejected": -2.631629705429077, "step": 357 }, { "epoch": 2.7, "grad_norm": 8.17039044420366, "learning_rate": 5.3370786516853926e-08, "logps/chosen": -52.407249450683594, "logps/rejected": -71.58457946777344, "loss": 0.3517, "losses/dpo": 0.45670050382614136, "losses/sft": 2.3598852157592773, "losses/total": 0.45670050382614136, "ref_logps/chosen": -40.96891784667969, "ref_logps/rejected": -45.695167541503906, "rewards/accuracies": 0.8828125, "rewards/chosen": -1.143832802772522, "rewards/margins": 1.4451087713241577, "rewards/rejected": -2.5889415740966797, "step": 358 }, { "epoch": 2.71, "grad_norm": 7.803291186480779, "learning_rate": 5.196629213483146e-08, "logps/chosen": -47.12774658203125, "logps/rejected": -70.28164672851562, "loss": 0.331, "losses/dpo": 0.3020516037940979, "losses/sft": 1.725950837135315, "losses/total": 0.3020516037940979, "ref_logps/chosen": -36.658851623535156, "ref_logps/rejected": -44.756195068359375, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.0468891859054565, "rewards/margins": 1.5056557655334473, "rewards/rejected": -2.5525450706481934, "step": 359 }, { "epoch": 2.72, "grad_norm": 8.941952443820963, "learning_rate": 5.056179775280899e-08, "logps/chosen": -51.66205596923828, "logps/rejected": -69.529296875, "loss": 0.3878, "losses/dpo": 0.4756266474723816, "losses/sft": 1.6768076419830322, "losses/total": 0.4756266474723816, "ref_logps/chosen": -39.167083740234375, "ref_logps/rejected": -45.206932067871094, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.2494975328445435, "rewards/margins": 1.1827386617660522, "rewards/rejected": -2.4322359561920166, "step": 360 }, { "epoch": 2.72, "grad_norm": 9.404438765920613, "learning_rate": 4.915730337078652e-08, "logps/chosen": -54.10792922973633, "logps/rejected": -70.66584014892578, "loss": 0.385, "losses/dpo": 0.25079599022865295, "losses/sft": 2.628451108932495, "losses/total": 0.25079599022865295, "ref_logps/chosen": -41.44969177246094, "ref_logps/rejected": -44.49009704589844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2658233642578125, "rewards/margins": 1.3517518043518066, "rewards/rejected": -2.6175754070281982, "step": 361 }, { "epoch": 2.73, "grad_norm": 8.339761715598929, "learning_rate": 4.775280898876404e-08, "logps/chosen": -52.00060272216797, "logps/rejected": -70.13935852050781, "loss": 0.3657, "losses/dpo": 0.410220742225647, "losses/sft": 2.064330577850342, "losses/total": 0.410220742225647, "ref_logps/chosen": -38.8874626159668, "ref_logps/rejected": -44.29195785522461, "rewards/accuracies": 0.890625, "rewards/chosen": -1.311313509941101, "rewards/margins": 1.27342689037323, "rewards/rejected": -2.584740400314331, "step": 362 }, { "epoch": 2.74, "grad_norm": 8.695799719443274, "learning_rate": 4.634831460674157e-08, "logps/chosen": -54.19892120361328, "logps/rejected": -70.33839416503906, "loss": 0.3834, "losses/dpo": 0.4246940612792969, "losses/sft": 1.6766891479492188, "losses/total": 0.4246940612792969, "ref_logps/chosen": -40.98695755004883, "ref_logps/rejected": -44.57014846801758, "rewards/accuracies": 0.8046875, "rewards/chosen": -1.3211965560913086, "rewards/margins": 1.255626916885376, "rewards/rejected": -2.5768234729766846, "step": 363 }, { "epoch": 2.75, "grad_norm": 7.953708445192091, "learning_rate": 4.4943820224719096e-08, "logps/chosen": -51.214752197265625, "logps/rejected": -75.3336181640625, "loss": 0.3295, "losses/dpo": 0.17288488149642944, "losses/sft": 2.220893383026123, "losses/total": 0.17288488149642944, "ref_logps/chosen": -38.46108627319336, "ref_logps/rejected": -48.44053268432617, "rewards/accuracies": 0.8984375, "rewards/chosen": -1.2753666639328003, "rewards/margins": 1.4139418601989746, "rewards/rejected": -2.6893081665039062, "step": 364 }, { "epoch": 2.75, "grad_norm": 8.821196988330435, "learning_rate": 4.3539325842696626e-08, "logps/chosen": -56.51776123046875, "logps/rejected": -75.40132904052734, "loss": 0.35, "losses/dpo": 0.169864684343338, "losses/sft": 2.520303964614868, "losses/total": 0.169864684343338, "ref_logps/chosen": -43.79001235961914, "ref_logps/rejected": -48.247989654541016, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.2727751731872559, "rewards/margins": 1.4425586462020874, "rewards/rejected": -2.715333938598633, "step": 365 }, { "epoch": 2.76, "grad_norm": 9.689618011799487, "learning_rate": 4.213483146067416e-08, "logps/chosen": -57.19207000732422, "logps/rejected": -72.71266174316406, "loss": 0.407, "losses/dpo": 0.27488580346107483, "losses/sft": 1.8573498725891113, "losses/total": 0.27488580346107483, "ref_logps/chosen": -43.24185562133789, "ref_logps/rejected": -46.41039276123047, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.3950214385986328, "rewards/margins": 1.2352051734924316, "rewards/rejected": -2.6302266120910645, "step": 366 }, { "epoch": 2.77, "grad_norm": 9.338062839876327, "learning_rate": 4.073033707865169e-08, "logps/chosen": -50.966712951660156, "logps/rejected": -68.7747802734375, "loss": 0.4169, "losses/dpo": 0.3666359782218933, "losses/sft": 2.0023789405822754, "losses/total": 0.3666359782218933, "ref_logps/chosen": -38.069068908691406, "ref_logps/rejected": -44.10002899169922, "rewards/accuracies": 0.8046875, "rewards/chosen": -1.2897647619247437, "rewards/margins": 1.1777102947235107, "rewards/rejected": -2.467475175857544, "step": 367 }, { "epoch": 2.78, "grad_norm": 7.949833391744097, "learning_rate": 3.932584269662921e-08, "logps/chosen": -47.72471237182617, "logps/rejected": -70.84202575683594, "loss": 0.3803, "losses/dpo": 0.3086835443973541, "losses/sft": 1.9907643795013428, "losses/total": 0.3086835443973541, "ref_logps/chosen": -35.20145034790039, "ref_logps/rejected": -44.49205780029297, "rewards/accuracies": 0.828125, "rewards/chosen": -1.2523258924484253, "rewards/margins": 1.3826706409454346, "rewards/rejected": -2.6349964141845703, "step": 368 }, { "epoch": 2.78, "grad_norm": 9.235066572868517, "learning_rate": 3.792134831460674e-08, "logps/chosen": -52.30189895629883, "logps/rejected": -70.8028335571289, "loss": 0.3852, "losses/dpo": 0.2891031503677368, "losses/sft": 1.8405730724334717, "losses/total": 0.2891031503677368, "ref_logps/chosen": -39.477725982666016, "ref_logps/rejected": -45.31201934814453, "rewards/accuracies": 0.859375, "rewards/chosen": -1.2824174165725708, "rewards/margins": 1.266663908958435, "rewards/rejected": -2.549081563949585, "step": 369 }, { "epoch": 2.79, "grad_norm": 9.190353581688715, "learning_rate": 3.6516853932584266e-08, "logps/chosen": -49.18308639526367, "logps/rejected": -67.69566345214844, "loss": 0.4, "losses/dpo": 0.3436277508735657, "losses/sft": 2.0218303203582764, "losses/total": 0.3436277508735657, "ref_logps/chosen": -36.68975830078125, "ref_logps/rejected": -43.71223449707031, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.2493327856063843, "rewards/margins": 1.1490094661712646, "rewards/rejected": -2.3983423709869385, "step": 370 }, { "epoch": 2.8, "grad_norm": 8.148479571122722, "learning_rate": 3.5112359550561796e-08, "logps/chosen": -51.570220947265625, "logps/rejected": -71.7831039428711, "loss": 0.3307, "losses/dpo": 0.5651198625564575, "losses/sft": 2.007855176925659, "losses/total": 0.5651198625564575, "ref_logps/chosen": -40.369407653808594, "ref_logps/rejected": -46.21443176269531, "rewards/accuracies": 0.875, "rewards/chosen": -1.1200807094573975, "rewards/margins": 1.4367868900299072, "rewards/rejected": -2.5568673610687256, "step": 371 }, { "epoch": 2.81, "grad_norm": 9.663237502055296, "learning_rate": 3.370786516853932e-08, "logps/chosen": -55.40485382080078, "logps/rejected": -73.81289672851562, "loss": 0.4029, "losses/dpo": 0.7689430713653564, "losses/sft": 1.9435756206512451, "losses/total": 0.7689430713653564, "ref_logps/chosen": -42.52079772949219, "ref_logps/rejected": -47.747249603271484, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2884055376052856, "rewards/margins": 1.3181602954864502, "rewards/rejected": -2.6065659523010254, "step": 372 }, { "epoch": 2.82, "grad_norm": 8.41997319865235, "learning_rate": 3.230337078651686e-08, "logps/chosen": -55.254249572753906, "logps/rejected": -81.1642074584961, "loss": 0.3146, "losses/dpo": 0.3366415500640869, "losses/sft": 1.8378387689590454, "losses/total": 0.3366415500640869, "ref_logps/chosen": -41.7273063659668, "ref_logps/rejected": -51.86448669433594, "rewards/accuracies": 0.8984375, "rewards/chosen": -1.3526947498321533, "rewards/margins": 1.577277421951294, "rewards/rejected": -2.929971933364868, "step": 373 }, { "epoch": 2.82, "grad_norm": 9.08048419718233, "learning_rate": 3.089887640449438e-08, "logps/chosen": -52.86750793457031, "logps/rejected": -77.41828918457031, "loss": 0.3294, "losses/dpo": 0.23508216440677643, "losses/sft": 1.5872150659561157, "losses/total": 0.23508216440677643, "ref_logps/chosen": -39.141883850097656, "ref_logps/rejected": -49.06714630126953, "rewards/accuracies": 0.875, "rewards/chosen": -1.3725626468658447, "rewards/margins": 1.4625511169433594, "rewards/rejected": -2.835113763809204, "step": 374 }, { "epoch": 2.83, "grad_norm": 9.149313086592246, "learning_rate": 2.949438202247191e-08, "logps/chosen": -50.02390670776367, "logps/rejected": -75.56754302978516, "loss": 0.362, "losses/dpo": 0.6652272939682007, "losses/sft": 2.926239252090454, "losses/total": 0.6652272939682007, "ref_logps/chosen": -37.26597595214844, "ref_logps/rejected": -48.15172576904297, "rewards/accuracies": 0.875, "rewards/chosen": -1.2757928371429443, "rewards/margins": 1.4657888412475586, "rewards/rejected": -2.741581916809082, "step": 375 }, { "epoch": 2.84, "grad_norm": 9.114150173411574, "learning_rate": 2.8089887640449436e-08, "logps/chosen": -55.91659164428711, "logps/rejected": -74.04378509521484, "loss": 0.3668, "losses/dpo": 0.2893008589744568, "losses/sft": 2.1376471519470215, "losses/total": 0.2893008589744568, "ref_logps/chosen": -42.005611419677734, "ref_logps/rejected": -46.799495697021484, "rewards/accuracies": 0.859375, "rewards/chosen": -1.3910987377166748, "rewards/margins": 1.333329677581787, "rewards/rejected": -2.724428415298462, "step": 376 }, { "epoch": 2.85, "grad_norm": 10.02587161317734, "learning_rate": 2.6685393258426963e-08, "logps/chosen": -53.21587371826172, "logps/rejected": -71.60870361328125, "loss": 0.4256, "losses/dpo": 0.7751315236091614, "losses/sft": 2.11029314994812, "losses/total": 0.7751315236091614, "ref_logps/chosen": -40.663108825683594, "ref_logps/rejected": -46.480648040771484, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.2552767992019653, "rewards/margins": 1.25752854347229, "rewards/rejected": -2.512805700302124, "step": 377 }, { "epoch": 2.85, "grad_norm": 8.810937586248796, "learning_rate": 2.5280898876404493e-08, "logps/chosen": -52.51762008666992, "logps/rejected": -77.57078552246094, "loss": 0.3381, "losses/dpo": 0.28002500534057617, "losses/sft": 1.5633399486541748, "losses/total": 0.28002500534057617, "ref_logps/chosen": -39.36627960205078, "ref_logps/rejected": -50.626708984375, "rewards/accuracies": 0.875, "rewards/chosen": -1.315134048461914, "rewards/margins": 1.3792742490768433, "rewards/rejected": -2.6944081783294678, "step": 378 }, { "epoch": 2.86, "grad_norm": 9.293485858540686, "learning_rate": 2.387640449438202e-08, "logps/chosen": -51.378074645996094, "logps/rejected": -64.74043273925781, "loss": 0.4262, "losses/dpo": 0.7660055756568909, "losses/sft": 2.2007508277893066, "losses/total": 0.7660055756568909, "ref_logps/chosen": -39.29324722290039, "ref_logps/rejected": -41.33624267578125, "rewards/accuracies": 0.7890625, "rewards/chosen": -1.2084828615188599, "rewards/margins": 1.131935954093933, "rewards/rejected": -2.340418815612793, "step": 379 }, { "epoch": 2.87, "grad_norm": 8.825439696536032, "learning_rate": 2.2471910112359548e-08, "logps/chosen": -55.97784423828125, "logps/rejected": -78.24275207519531, "loss": 0.345, "losses/dpo": 0.33630573749542236, "losses/sft": 2.7268307209014893, "losses/total": 0.33630573749542236, "ref_logps/chosen": -41.14834976196289, "ref_logps/rejected": -49.048004150390625, "rewards/accuracies": 0.8671875, "rewards/chosen": -1.4829493761062622, "rewards/margins": 1.4365259408950806, "rewards/rejected": -2.9194750785827637, "step": 380 }, { "epoch": 2.88, "grad_norm": 8.683259186904719, "learning_rate": 2.106741573033708e-08, "logps/chosen": -52.257469177246094, "logps/rejected": -67.984130859375, "loss": 0.4087, "losses/dpo": 0.3859608471393585, "losses/sft": 1.8246614933013916, "losses/total": 0.3859608471393585, "ref_logps/chosen": -38.84019470214844, "ref_logps/rejected": -43.60353088378906, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.341727375984192, "rewards/margins": 1.0963327884674072, "rewards/rejected": -2.4380600452423096, "step": 381 }, { "epoch": 2.88, "grad_norm": 8.270857768884154, "learning_rate": 1.9662921348314606e-08, "logps/chosen": -54.033363342285156, "logps/rejected": -77.31697082519531, "loss": 0.3287, "losses/dpo": 0.19281096756458282, "losses/sft": 1.8291985988616943, "losses/total": 0.19281096756458282, "ref_logps/chosen": -39.56074523925781, "ref_logps/rejected": -47.55027770996094, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.4472615718841553, "rewards/margins": 1.5294086933135986, "rewards/rejected": -2.9766697883605957, "step": 382 }, { "epoch": 2.89, "grad_norm": 9.68267538629506, "learning_rate": 1.8258426966292133e-08, "logps/chosen": -53.385498046875, "logps/rejected": -68.63336181640625, "loss": 0.391, "losses/dpo": 0.5480431318283081, "losses/sft": 2.3586955070495605, "losses/total": 0.5480431318283081, "ref_logps/chosen": -40.53166198730469, "ref_logps/rejected": -44.218746185302734, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.2853829860687256, "rewards/margins": 1.156078815460205, "rewards/rejected": -2.4414615631103516, "step": 383 }, { "epoch": 2.9, "grad_norm": 8.588925065399472, "learning_rate": 1.685393258426966e-08, "logps/chosen": -53.107017517089844, "logps/rejected": -73.8092041015625, "loss": 0.3472, "losses/dpo": 0.5091351866722107, "losses/sft": 2.4279067516326904, "losses/total": 0.5091351866722107, "ref_logps/chosen": -39.874427795410156, "ref_logps/rejected": -47.138092041015625, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.3232589960098267, "rewards/margins": 1.34385085105896, "rewards/rejected": -2.667109966278076, "step": 384 }, { "epoch": 2.91, "grad_norm": 9.061222152581863, "learning_rate": 1.544943820224719e-08, "logps/chosen": -55.36473846435547, "logps/rejected": -71.19318389892578, "loss": 0.3925, "losses/dpo": 0.7466526627540588, "losses/sft": 2.359135627746582, "losses/total": 0.7466526627540588, "ref_logps/chosen": -41.025726318359375, "ref_logps/rejected": -44.06968688964844, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4339020252227783, "rewards/margins": 1.2784475088119507, "rewards/rejected": -2.7123494148254395, "step": 385 }, { "epoch": 2.91, "grad_norm": 9.909969914239525, "learning_rate": 1.4044943820224718e-08, "logps/chosen": -51.63406753540039, "logps/rejected": -77.14066314697266, "loss": 0.3813, "losses/dpo": 0.6047529578208923, "losses/sft": 1.7509853839874268, "losses/total": 0.6047529578208923, "ref_logps/chosen": -37.65089416503906, "ref_logps/rejected": -48.93791198730469, "rewards/accuracies": 0.7734375, "rewards/chosen": -1.398316740989685, "rewards/margins": 1.4219584465026855, "rewards/rejected": -2.82027530670166, "step": 386 }, { "epoch": 2.92, "grad_norm": 8.501319268805696, "learning_rate": 1.2640449438202247e-08, "logps/chosen": -53.17372512817383, "logps/rejected": -68.52401733398438, "loss": 0.3433, "losses/dpo": 0.2757856249809265, "losses/sft": 2.1045045852661133, "losses/total": 0.2757856249809265, "ref_logps/chosen": -40.52703857421875, "ref_logps/rejected": -42.728126525878906, "rewards/accuracies": 0.875, "rewards/chosen": -1.264668345451355, "rewards/margins": 1.3149209022521973, "rewards/rejected": -2.579589366912842, "step": 387 }, { "epoch": 2.93, "grad_norm": 8.718869385679747, "learning_rate": 1.1235955056179774e-08, "logps/chosen": -54.90761184692383, "logps/rejected": -70.40953826904297, "loss": 0.3789, "losses/dpo": 0.5766834020614624, "losses/sft": 2.222163200378418, "losses/total": 0.5766834020614624, "ref_logps/chosen": -41.6505012512207, "ref_logps/rejected": -44.63560485839844, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.3257105350494385, "rewards/margins": 1.2516822814941406, "rewards/rejected": -2.577392816543579, "step": 388 }, { "epoch": 2.94, "grad_norm": 9.426380018310086, "learning_rate": 9.831460674157303e-09, "logps/chosen": -54.11854934692383, "logps/rejected": -71.68621826171875, "loss": 0.386, "losses/dpo": 0.20794588327407837, "losses/sft": 2.1166539192199707, "losses/total": 0.20794588327407837, "ref_logps/chosen": -40.966426849365234, "ref_logps/rejected": -45.38508605957031, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3152116537094116, "rewards/margins": 1.3149020671844482, "rewards/rejected": -2.6301136016845703, "step": 389 }, { "epoch": 2.94, "grad_norm": 10.46768940793886, "learning_rate": 8.42696629213483e-09, "logps/chosen": -54.164424896240234, "logps/rejected": -71.98051452636719, "loss": 0.4893, "losses/dpo": 0.39178702235221863, "losses/sft": 2.2996134757995605, "losses/total": 0.39178702235221863, "ref_logps/chosen": -40.31678009033203, "ref_logps/rejected": -46.53054428100586, "rewards/accuracies": 0.75, "rewards/chosen": -1.384764313697815, "rewards/margins": 1.1602333784103394, "rewards/rejected": -2.5449976921081543, "step": 390 }, { "epoch": 2.95, "grad_norm": 8.941416739373434, "learning_rate": 7.022471910112359e-09, "logps/chosen": -53.463233947753906, "logps/rejected": -71.62788391113281, "loss": 0.3318, "losses/dpo": 0.2409718632698059, "losses/sft": 1.6294231414794922, "losses/total": 0.2409718632698059, "ref_logps/chosen": -41.0477294921875, "ref_logps/rejected": -44.89318084716797, "rewards/accuracies": 0.8984375, "rewards/chosen": -1.2415508031845093, "rewards/margins": 1.4319190979003906, "rewards/rejected": -2.6734697818756104, "step": 391 }, { "epoch": 2.96, "grad_norm": 8.98210941116973, "learning_rate": 5.617977528089887e-09, "logps/chosen": -54.616600036621094, "logps/rejected": -73.2689208984375, "loss": 0.3832, "losses/dpo": 0.43616408109664917, "losses/sft": 2.2494640350341797, "losses/total": 0.43616408109664917, "ref_logps/chosen": -40.2120361328125, "ref_logps/rejected": -46.42675018310547, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.4404562711715698, "rewards/margins": 1.2437611818313599, "rewards/rejected": -2.684217691421509, "step": 392 }, { "epoch": 2.97, "grad_norm": 9.151862468783703, "learning_rate": 4.213483146067415e-09, "logps/chosen": -51.817893981933594, "logps/rejected": -69.46862030029297, "loss": 0.3906, "losses/dpo": 0.2829042077064514, "losses/sft": 2.443455696105957, "losses/total": 0.2829042077064514, "ref_logps/chosen": -38.848846435546875, "ref_logps/rejected": -43.35674285888672, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2969045639038086, "rewards/margins": 1.3142831325531006, "rewards/rejected": -2.61118745803833, "step": 393 }, { "epoch": 2.97, "grad_norm": 10.678776530633808, "learning_rate": 2.8089887640449435e-09, "logps/chosen": -54.69252014160156, "logps/rejected": -73.899658203125, "loss": 0.4591, "losses/dpo": 0.4431733191013336, "losses/sft": 2.1250791549682617, "losses/total": 0.4431733191013336, "ref_logps/chosen": -39.718658447265625, "ref_logps/rejected": -48.00882339477539, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4973857402801514, "rewards/margins": 1.0916969776153564, "rewards/rejected": -2.589082717895508, "step": 394 }, { "epoch": 2.98, "grad_norm": 8.192368817594446, "learning_rate": 1.4044943820224717e-09, "logps/chosen": -50.57025909423828, "logps/rejected": -69.49359130859375, "loss": 0.3625, "losses/dpo": 0.550754964351654, "losses/sft": 2.1057140827178955, "losses/total": 0.550754964351654, "ref_logps/chosen": -38.39902877807617, "ref_logps/rejected": -43.90056228637695, "rewards/accuracies": 0.875, "rewards/chosen": -1.2171236276626587, "rewards/margins": 1.342179298400879, "rewards/rejected": -2.559302806854248, "step": 395 }, { "epoch": 2.99, "grad_norm": 8.91219728509694, "learning_rate": 0.0, "logps/chosen": -56.36674499511719, "logps/rejected": -75.95218658447266, "loss": 0.3695, "losses/dpo": 0.45165300369262695, "losses/sft": 1.7463542222976685, "losses/total": 0.45165300369262695, "ref_logps/chosen": -42.22056198120117, "ref_logps/rejected": -48.152099609375, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.4146177768707275, "rewards/margins": 1.3653908967971802, "rewards/rejected": -2.7800087928771973, "step": 396 }, { "epoch": 2.99, "step": 396, "total_flos": 0.0, "train_loss": 0.5140665640132596, "train_runtime": 34070.7646, "train_samples_per_second": 1.493, "train_steps_per_second": 0.012 } ], "logging_steps": 1.0, "max_steps": 396, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 70, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }