diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -11,10 +11,10 @@ { "epoch": 0.0, "learning_rate": 4e-08, - "logits/chosen": -2.682399272918701, - "logits/rejected": -2.7047135829925537, - "logps/chosen": -275.10638427734375, - "logps/rejected": -271.8466491699219, + "logits/chosen": -2.6824121475219727, + "logits/rejected": -2.7049124240875244, + "logps/chosen": -275.1597900390625, + "logps/rejected": -271.6430969238281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -25,1937 +25,1937 @@ { "epoch": 0.01, "learning_rate": 4.0000000000000003e-07, - "logits/chosen": -2.606243848800659, - "logits/rejected": -2.633491277694702, - "logps/chosen": -301.7389831542969, - "logps/rejected": -324.2469787597656, - "loss": 0.6931, - "rewards/accuracies": 0.4166666567325592, - "rewards/chosen": 0.00018933300452772528, - "rewards/margins": 9.413135558133945e-06, - "rewards/rejected": 0.00017991992353927344, + "logits/chosen": -2.6062471866607666, + "logits/rejected": -2.633519172668457, + "logps/chosen": -301.788818359375, + "logps/rejected": -324.23992919921875, + "loss": 0.6934, + "rewards/accuracies": 0.4027777910232544, + "rewards/chosen": -0.0003357415844220668, + "rewards/margins": -0.00043744672439061105, + "rewards/rejected": 0.00010170514724450186, "step": 10 }, { "epoch": 0.02, "learning_rate": 8.000000000000001e-07, - "logits/chosen": -2.5866377353668213, - "logits/rejected": -2.5900259017944336, - "logps/chosen": -269.0643615722656, - "logps/rejected": -289.1509094238281, - "loss": 0.6931, - "rewards/accuracies": 0.53125, - "rewards/chosen": -0.0011652575340121984, - "rewards/margins": 6.114605639595538e-05, - "rewards/rejected": -0.0012264035176485777, + "logits/chosen": -2.5865533351898193, + "logits/rejected": -2.589942693710327, + "logps/chosen": -269.1646423339844, + "logps/rejected": -289.1620178222656, + "loss": 0.6934, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.0017813893500715494, + "rewards/margins": -0.00042842660332098603, + "rewards/rejected": -0.0013529626885429025, "step": 20 }, { "epoch": 0.02, "learning_rate": 1.2000000000000002e-06, - "logits/chosen": -2.57534122467041, - "logits/rejected": -2.5880730152130127, - "logps/chosen": -291.5533752441406, - "logps/rejected": -311.4080505371094, - "loss": 0.693, - "rewards/accuracies": 0.46875, - "rewards/chosen": -0.0030827566515654325, - "rewards/margins": 0.0002471129409968853, - "rewards/rejected": -0.0033298698253929615, + "logits/chosen": -2.5753893852233887, + "logits/rejected": -2.58815860748291, + "logps/chosen": -291.62603759765625, + "logps/rejected": -311.4107971191406, + "loss": 0.6933, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.0035930208396166563, + "rewards/margins": -0.00022997017367742956, + "rewards/rejected": -0.0033630509860813618, "step": 30 }, { "epoch": 0.03, "learning_rate": 1.6000000000000001e-06, - "logits/chosen": -2.612988233566284, - "logits/rejected": -2.619412899017334, - "logps/chosen": -264.4569396972656, - "logps/rejected": -273.54107666015625, - "loss": 0.6927, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -0.005209661088883877, - "rewards/margins": 0.0008221397292800248, - "rewards/rejected": -0.006031800992786884, + "logits/chosen": -2.612914562225342, + "logits/rejected": -2.619166135787964, + "logps/chosen": -264.53851318359375, + "logps/rejected": -273.5920104980469, + "loss": 0.6929, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.005524822510778904, + "rewards/margins": 0.0004894191515631974, + "rewards/rejected": -0.006014241836965084, "step": 40 }, { "epoch": 0.04, "learning_rate": 2.0000000000000003e-06, - "logits/chosen": -2.563633680343628, - "logits/rejected": -2.5437724590301514, - "logps/chosen": -264.8982849121094, - "logps/rejected": -269.68804931640625, + "logits/chosen": -2.56376576423645, + "logits/rejected": -2.5438730716705322, + "logps/chosen": -264.94989013671875, + "logps/rejected": -269.7061767578125, "loss": 0.6932, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": -0.008133028633892536, - "rewards/margins": -1.2002186849713326e-05, - "rewards/rejected": -0.008121026679873466, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.008829522877931595, + "rewards/margins": -0.0001453599688829854, + "rewards/rejected": -0.008684162981808186, "step": 50 }, { "epoch": 0.05, "learning_rate": 2.4000000000000003e-06, - "logits/chosen": -2.6302762031555176, - "logits/rejected": -2.633596420288086, - "logps/chosen": -277.2352600097656, - "logps/rejected": -296.19476318359375, - "loss": 0.6921, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.013791452161967754, - "rewards/margins": 0.0020633486565202475, - "rewards/rejected": -0.01585480198264122, + "logits/chosen": -2.630518674850464, + "logits/rejected": -2.6339123249053955, + "logps/chosen": -277.233642578125, + "logps/rejected": -296.16107177734375, + "loss": 0.6919, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.013874058611690998, + "rewards/margins": 0.0025573372840881348, + "rewards/rejected": -0.016431394964456558, "step": 60 }, { "epoch": 0.06, "learning_rate": 2.8000000000000003e-06, - "logits/chosen": -2.6230902671813965, - "logits/rejected": -2.6135356426239014, - "logps/chosen": -280.782958984375, - "logps/rejected": -286.6590881347656, - "loss": 0.6931, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -0.01885022595524788, - "rewards/margins": 0.00015832395001780242, - "rewards/rejected": -0.01900855079293251, + "logits/chosen": -2.623034954071045, + "logits/rejected": -2.6133813858032227, + "logps/chosen": -280.8143005371094, + "logps/rejected": -286.6803283691406, + "loss": 0.693, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.01927858218550682, + "rewards/margins": 0.00032674067188054323, + "rewards/rejected": -0.019605323672294617, "step": 70 }, { "epoch": 0.06, "learning_rate": 3.2000000000000003e-06, - "logits/chosen": -2.6521055698394775, - "logits/rejected": -2.650635004043579, - "logps/chosen": -276.92156982421875, - "logps/rejected": -297.58477783203125, + "logits/chosen": -2.6520907878875732, + "logits/rejected": -2.6506881713867188, + "logps/chosen": -276.94451904296875, + "logps/rejected": -297.6499328613281, "loss": 0.6924, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.017211003229022026, - "rewards/margins": 0.0015935760457068682, - "rewards/rejected": -0.018804579973220825, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.017164330929517746, + "rewards/margins": 0.0016351321246474981, + "rewards/rejected": -0.018799465149641037, "step": 80 }, { "epoch": 0.07, "learning_rate": 3.6000000000000003e-06, - "logits/chosen": -2.650332450866699, - "logits/rejected": -2.639291286468506, - "logps/chosen": -311.17218017578125, - "logps/rejected": -316.7694091796875, + "logits/chosen": -2.6500167846679688, + "logits/rejected": -2.6388657093048096, + "logps/chosen": -311.24853515625, + "logps/rejected": -316.82427978515625, "loss": 0.6891, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.017588406801223755, - "rewards/margins": 0.008481341414153576, - "rewards/rejected": -0.026069749146699905, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.01826111041009426, + "rewards/margins": 0.008519862778484821, + "rewards/rejected": -0.026780972257256508, "step": 90 }, { "epoch": 0.08, "learning_rate": 4.000000000000001e-06, - "logits/chosen": -2.6498260498046875, - "logits/rejected": -2.6482200622558594, - "logps/chosen": -276.51043701171875, - "logps/rejected": -290.5121154785156, + "logits/chosen": -2.649533271789551, + "logits/rejected": -2.647923231124878, + "logps/chosen": -276.5948486328125, + "logps/rejected": -290.6126708984375, "loss": 0.6909, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.02430056408047676, - "rewards/margins": 0.004733243025839329, - "rewards/rejected": -0.029033806174993515, + "rewards/chosen": -0.025616105645895004, + "rewards/margins": 0.00467633968219161, + "rewards/rejected": -0.030292445793747902, "step": 100 }, { "epoch": 0.08, - "eval_logits/chosen": -2.6246910095214844, - "eval_logits/rejected": -2.6219358444213867, - "eval_logps/chosen": -222.40379333496094, - "eval_logps/rejected": -228.25218200683594, - "eval_loss": 0.6920604109764099, - "eval_rewards/accuracies": 0.5426666736602783, - "eval_rewards/chosen": -0.01691427268087864, - "eval_rewards/margins": 0.0023173687513917685, - "eval_rewards/rejected": -0.01923164166510105, - "eval_runtime": 1621.9667, - "eval_samples_per_second": 1.846, - "eval_steps_per_second": 0.231, + "eval_logits/chosen": -2.6257693767547607, + "eval_logits/rejected": -2.622962474822998, + "eval_logps/chosen": -222.508056640625, + "eval_logps/rejected": -228.3364715576172, + "eval_loss": 0.6922365427017212, + "eval_rewards/accuracies": 0.5236666798591614, + "eval_rewards/chosen": -0.01801561377942562, + "eval_rewards/margins": 0.0019916673190891743, + "eval_rewards/rejected": -0.020007280632853508, + "eval_runtime": 1605.2167, + "eval_samples_per_second": 1.865, + "eval_steps_per_second": 0.234, "step": 100 }, { "epoch": 0.09, "learning_rate": 4.4e-06, - "logits/chosen": -2.6501355171203613, - "logits/rejected": -2.652731418609619, - "logps/chosen": -298.9729919433594, - "logps/rejected": -309.8643493652344, - "loss": 0.6891, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.025512468069791794, - "rewards/margins": 0.008652618154883385, - "rewards/rejected": -0.03416508436203003, + "logits/chosen": -2.6500422954559326, + "logits/rejected": -2.6527457237243652, + "logps/chosen": -299.06976318359375, + "logps/rejected": -309.92266845703125, + "loss": 0.6888, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.026254886761307716, + "rewards/margins": 0.009161447174847126, + "rewards/rejected": -0.03541633114218712, "step": 110 }, { "epoch": 0.1, "learning_rate": 4.800000000000001e-06, - "logits/chosen": -2.676758289337158, - "logits/rejected": -2.6891021728515625, - "logps/chosen": -278.479736328125, - "logps/rejected": -296.43212890625, - "loss": 0.6903, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -0.031918395310640335, - "rewards/margins": 0.006463131867349148, - "rewards/rejected": -0.03838152438402176, + "logits/chosen": -2.6758484840393066, + "logits/rejected": -2.6881041526794434, + "logps/chosen": -278.5473327636719, + "logps/rejected": -296.552734375, + "loss": 0.6901, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.03307301551103592, + "rewards/margins": 0.006715013645589352, + "rewards/rejected": -0.039788030087947845, "step": 120 }, { "epoch": 0.1, "learning_rate": 4.999755876225375e-06, - "logits/chosen": -2.645005702972412, - "logits/rejected": -2.62728214263916, - "logps/chosen": -294.44366455078125, - "logps/rejected": -315.0718994140625, - "loss": 0.6878, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.035983841866254807, - "rewards/margins": 0.011375428177416325, - "rewards/rejected": -0.047359269112348557, + "logits/chosen": -2.6458847522735596, + "logits/rejected": -2.6281027793884277, + "logps/chosen": -294.466796875, + "logps/rejected": -315.05035400390625, + "loss": 0.6875, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.036008208990097046, + "rewards/margins": 0.011871300637722015, + "rewards/rejected": -0.04787950962781906, "step": 130 }, { "epoch": 0.11, "learning_rate": 4.997803172081864e-06, - "logits/chosen": -2.6803853511810303, - "logits/rejected": -2.680997371673584, - "logps/chosen": -289.1062927246094, - "logps/rejected": -302.7191467285156, - "loss": 0.6855, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.041079964488744736, - "rewards/margins": 0.016574053093791008, - "rewards/rejected": -0.057654011994600296, + "logits/chosen": -2.6800694465637207, + "logits/rejected": -2.6806652545928955, + "logps/chosen": -289.0479736328125, + "logps/rejected": -302.6410217285156, + "loss": 0.685, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03994801267981529, + "rewards/margins": 0.017537599429488182, + "rewards/rejected": -0.057485610246658325, "step": 140 }, { "epoch": 0.12, "learning_rate": 4.9938992891651825e-06, - "logits/chosen": -2.6616640090942383, - "logits/rejected": -2.6513876914978027, - "logps/chosen": -277.707763671875, - "logps/rejected": -300.9044494628906, - "loss": 0.683, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.05293840169906616, - "rewards/margins": 0.0216156505048275, - "rewards/rejected": -0.07455406337976456, + "logits/chosen": -2.662532091140747, + "logits/rejected": -2.652069568634033, + "logps/chosen": -277.6954040527344, + "logps/rejected": -300.7398986816406, + "loss": 0.6833, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.05284956097602844, + "rewards/margins": 0.02098439633846283, + "rewards/rejected": -0.07383395731449127, "step": 150 }, { "epoch": 0.13, "learning_rate": 4.988047277024456e-06, - "logits/chosen": -2.7210304737091064, - "logits/rejected": -2.7316393852233887, - "logps/chosen": -288.5920715332031, - "logps/rejected": -304.19964599609375, - "loss": 0.6804, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.07934443652629852, - "rewards/margins": 0.026776760816574097, - "rewards/rejected": -0.10612119734287262, + "logits/chosen": -2.7222561836242676, + "logits/rejected": -2.732853412628174, + "logps/chosen": -288.4869689941406, + "logps/rejected": -303.99114990234375, + "loss": 0.6812, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.07921075075864792, + "rewards/margins": 0.024997711181640625, + "rewards/rejected": -0.10420846939086914, "step": 160 }, { "epoch": 0.14, "learning_rate": 4.980251707005417e-06, - "logits/chosen": -2.71783185005188, - "logits/rejected": -2.690868854522705, - "logps/chosen": -307.91888427734375, - "logps/rejected": -315.943359375, - "loss": 0.689, - "rewards/accuracies": 0.53125, - "rewards/chosen": -0.10906956344842911, - "rewards/margins": 0.009946177713572979, - "rewards/rejected": -0.11901573836803436, + "logits/chosen": -2.719470500946045, + "logits/rejected": -2.6922366619110107, + "logps/chosen": -307.56341552734375, + "logps/rejected": -315.6242370605469, + "loss": 0.6891, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.10631716251373291, + "rewards/margins": 0.0096724983304739, + "rewards/rejected": -0.11598964780569077, "step": 170 }, { "epoch": 0.14, "learning_rate": 4.970518668679459e-06, - "logits/chosen": -2.729719638824463, - "logits/rejected": -2.714111804962158, - "logps/chosen": -304.322998046875, - "logps/rejected": -311.78167724609375, - "loss": 0.6816, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.09336166828870773, - "rewards/margins": 0.0258515365421772, - "rewards/rejected": -0.11921320110559464, + "logits/chosen": -2.7305121421813965, + "logits/rejected": -2.715012311935425, + "logps/chosen": -303.9269104003906, + "logps/rejected": -311.3086853027344, + "loss": 0.6821, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.08977223932743073, + "rewards/margins": 0.02493356727063656, + "rewards/rejected": -0.11470580101013184, "step": 180 }, { "epoch": 0.15, "learning_rate": 4.958855765086722e-06, - "logits/chosen": -2.757159471511841, - "logits/rejected": -2.7543435096740723, - "logps/chosen": -284.23687744140625, - "logps/rejected": -293.60595703125, - "loss": 0.6848, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -0.06431926041841507, - "rewards/margins": 0.019170444458723068, - "rewards/rejected": -0.08348970115184784, + "logits/chosen": -2.7576236724853516, + "logits/rejected": -2.754725933074951, + "logps/chosen": -283.64117431640625, + "logps/rejected": -293.0472717285156, + "loss": 0.6843, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.058794498443603516, + "rewards/margins": 0.02001199498772621, + "rewards/rejected": -0.07880649715662003, "step": 190 }, { "epoch": 0.16, "learning_rate": 4.945272106796919e-06, - "logits/chosen": -2.770078420639038, - "logits/rejected": -2.7745845317840576, - "logps/chosen": -285.0936584472656, - "logps/rejected": -300.80975341796875, - "loss": 0.684, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.0751299113035202, - "rewards/margins": 0.02216259017586708, - "rewards/rejected": -0.09729250520467758, + "logits/chosen": -2.7707252502441406, + "logits/rejected": -2.7755463123321533, + "logps/chosen": -284.79974365234375, + "logps/rejected": -300.4746398925781, + "loss": 0.6843, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.07238452136516571, + "rewards/margins": 0.021619267761707306, + "rewards/rejected": -0.09400378167629242, "step": 200 }, { "epoch": 0.16, - "eval_logits/chosen": -2.762911558151245, - "eval_logits/rejected": -2.759880542755127, - "eval_logps/chosen": -228.746826171875, - "eval_logps/rejected": -235.77212524414062, - "eval_loss": 0.6872997283935547, - "eval_rewards/accuracies": 0.5566666722297668, - "eval_rewards/chosen": -0.08034466207027435, - "eval_rewards/margins": 0.014086335897445679, - "eval_rewards/rejected": -0.09443099796772003, - "eval_runtime": 1664.3062, - "eval_samples_per_second": 1.799, - "eval_steps_per_second": 0.225, + "eval_logits/chosen": -2.7632017135620117, + "eval_logits/rejected": -2.7602405548095703, + "eval_logps/chosen": -228.6281280517578, + "eval_logps/rejected": -235.652587890625, + "eval_loss": 0.6873453259468079, + "eval_rewards/accuracies": 0.5546666383743286, + "eval_rewards/chosen": -0.07921627163887024, + "eval_rewards/margins": 0.013952008448541164, + "eval_rewards/rejected": -0.09316828101873398, + "eval_runtime": 1606.4122, + "eval_samples_per_second": 1.864, + "eval_steps_per_second": 0.233, "step": 200 }, { "epoch": 0.17, "learning_rate": 4.929778304792537e-06, - "logits/chosen": -2.7531464099884033, - "logits/rejected": -2.7568936347961426, - "logps/chosen": -310.95513916015625, - "logps/rejected": -315.43353271484375, - "loss": 0.6749, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.08977816253900528, - "rewards/margins": 0.044371671974658966, - "rewards/rejected": -0.13414981961250305, + "logits/chosen": -2.7544033527374268, + "logits/rejected": -2.758392333984375, + "logps/chosen": -311.075439453125, + "logps/rejected": -315.38006591796875, + "loss": 0.6759, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.09139387309551239, + "rewards/margins": 0.04242325574159622, + "rewards/rejected": -0.1338171362876892, "step": 210 }, { "epoch": 0.18, "learning_rate": 4.912386462179987e-06, - "logits/chosen": -2.7818262577056885, - "logits/rejected": -2.777902126312256, - "logps/chosen": -298.80780029296875, - "logps/rejected": -325.99853515625, - "loss": 0.6741, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.10158131271600723, - "rewards/margins": 0.044485487043857574, - "rewards/rejected": -0.1460667848587036, + "logits/chosen": -2.783567190170288, + "logits/rejected": -2.779317617416382, + "logps/chosen": -298.64764404296875, + "logps/rejected": -325.9937744140625, + "loss": 0.6736, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.10082165896892548, + "rewards/margins": 0.04564264789223671, + "rewards/rejected": -0.14646431803703308, "step": 220 }, { "epoch": 0.18, "learning_rate": 4.893110164735167e-06, - "logits/chosen": -2.8827590942382812, - "logits/rejected": -2.8796629905700684, - "logps/chosen": -305.14068603515625, - "logps/rejected": -319.39471435546875, - "loss": 0.6796, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -0.1311652809381485, - "rewards/margins": 0.0319681391119957, - "rewards/rejected": -0.1631334125995636, + "logits/chosen": -2.8824093341827393, + "logits/rejected": -2.879509449005127, + "logps/chosen": -305.2967224121094, + "logps/rejected": -319.56695556640625, + "loss": 0.6795, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.13284233212471008, + "rewards/margins": 0.03221544623374939, + "rewards/rejected": -0.16505777835845947, "step": 230 }, { "epoch": 0.19, "learning_rate": 4.871964470290823e-06, - "logits/chosen": -2.90864634513855, - "logits/rejected": -2.9260551929473877, - "logps/chosen": -309.19970703125, - "logps/rejected": -332.04119873046875, - "loss": 0.6654, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.1595773994922638, - "rewards/margins": 0.06538807600736618, - "rewards/rejected": -0.22496548295021057, + "logits/chosen": -2.902723789215088, + "logits/rejected": -2.9203548431396484, + "logps/chosen": -309.1792907714844, + "logps/rejected": -331.7867736816406, + "loss": 0.6664, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.15928030014038086, + "rewards/margins": 0.0632576271891594, + "rewards/rejected": -0.22253794968128204, "step": 240 }, { "epoch": 0.2, "learning_rate": 4.848965896974006e-06, - "logits/chosen": -2.947906494140625, - "logits/rejected": -2.940717935562134, - "logps/chosen": -302.98651123046875, - "logps/rejected": -325.380859375, - "loss": 0.6756, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -0.19427716732025146, - "rewards/margins": 0.043551910668611526, - "rewards/rejected": -0.2378290891647339, + "logits/chosen": -2.9424166679382324, + "logits/rejected": -2.9350640773773193, + "logps/chosen": -302.81573486328125, + "logps/rejected": -325.1131896972656, + "loss": 0.676, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.19259200990200043, + "rewards/margins": 0.042614568024873734, + "rewards/rejected": -0.23520657420158386, "step": 250 }, { "epoch": 0.21, "learning_rate": 4.8241324103028055e-06, - "logits/chosen": -3.116504430770874, - "logits/rejected": -3.088792324066162, - "logps/chosen": -312.82159423828125, - "logps/rejected": -328.6378479003906, - "loss": 0.6614, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.24597156047821045, - "rewards/margins": 0.07650710642337799, - "rewards/rejected": -0.322478711605072, + "logits/chosen": -3.106982707977295, + "logits/rejected": -3.080066204071045, + "logps/chosen": -312.7992248535156, + "logps/rejected": -328.4107360839844, + "loss": 0.6624, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.24676008522510529, + "rewards/margins": 0.07433095574378967, + "rewards/rejected": -0.32109108567237854, "step": 260 }, { "epoch": 0.22, "learning_rate": 4.797483409152438e-06, - "logits/chosen": -3.2219741344451904, - "logits/rejected": -3.211695432662964, - "logps/chosen": -308.4544372558594, - "logps/rejected": -333.4085388183594, - "loss": 0.6625, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.3319942355155945, - "rewards/margins": 0.08054188638925552, - "rewards/rejected": -0.412536084651947, + "logits/chosen": -3.210594892501831, + "logits/rejected": -3.2006404399871826, + "logps/chosen": -308.2965393066406, + "logps/rejected": -333.50439453125, + "loss": 0.6611, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.33079180121421814, + "rewards/margins": 0.08313147723674774, + "rewards/rejected": -0.4139232635498047, "step": 270 }, { "epoch": 0.22, "learning_rate": 4.769039710601669e-06, - "logits/chosen": -3.368110179901123, - "logits/rejected": -3.3736987113952637, - "logps/chosen": -316.3270263671875, - "logps/rejected": -338.3800048828125, - "loss": 0.6608, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.42326441407203674, - "rewards/margins": 0.08332939445972443, - "rewards/rejected": -0.5065938234329224, + "logits/chosen": -3.357815980911255, + "logits/rejected": -3.363719940185547, + "logps/chosen": -316.1602783203125, + "logps/rejected": -338.216552734375, + "loss": 0.6612, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.42240840196609497, + "rewards/margins": 0.08246298134326935, + "rewards/rejected": -0.5048713088035583, "step": 280 }, { "epoch": 0.23, "learning_rate": 4.738823533671383e-06, - "logits/chosen": -3.503385543823242, - "logits/rejected": -3.490826368331909, - "logps/chosen": -351.4194030761719, - "logps/rejected": -368.7189025878906, - "loss": 0.6789, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.5290869474411011, - "rewards/margins": 0.048080917447805405, - "rewards/rejected": -0.5771678686141968, + "logits/chosen": -3.4880664348602295, + "logits/rejected": -3.4769935607910156, + "logps/chosen": -351.0909729003906, + "logps/rejected": -368.14385986328125, + "loss": 0.6803, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5264540314674377, + "rewards/margins": 0.04465585574507713, + "rewards/rejected": -0.5711098909378052, "step": 290 }, { "epoch": 0.24, "learning_rate": 4.706858481968017e-06, - "logits/chosen": -3.464003801345825, - "logits/rejected": -3.469198226928711, - "logps/chosen": -340.75830078125, - "logps/rejected": -352.869384765625, - "loss": 0.6795, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.6002467274665833, - "rewards/margins": 0.04778647795319557, - "rewards/rejected": -0.6480332612991333, + "logits/chosen": -3.45086407661438, + "logits/rejected": -3.455495834350586, + "logps/chosen": -340.1499938964844, + "logps/rejected": -352.09429931640625, + "loss": 0.68, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5941203236579895, + "rewards/margins": 0.046401310712099075, + "rewards/rejected": -0.6405216455459595, "step": 300 }, { "epoch": 0.24, - "eval_logits/chosen": -3.5198659896850586, - "eval_logits/rejected": -3.5140511989593506, - "eval_logps/chosen": -272.0929260253906, - "eval_logps/rejected": -281.4856262207031, - "eval_loss": 0.683901846408844, - "eval_rewards/accuracies": 0.5460000038146973, - "eval_rewards/chosen": -0.5138051509857178, - "eval_rewards/margins": 0.037760715931653976, - "eval_rewards/rejected": -0.551565945148468, - "eval_runtime": 1688.0776, - "eval_samples_per_second": 1.774, - "eval_steps_per_second": 0.222, + "eval_logits/chosen": -3.506669044494629, + "eval_logits/rejected": -3.501077175140381, + "eval_logps/chosen": -272.03741455078125, + "eval_logps/rejected": -281.38311767578125, + "eval_loss": 0.6839653253555298, + "eval_rewards/accuracies": 0.5473333597183228, + "eval_rewards/chosen": -0.5133091807365417, + "eval_rewards/margins": 0.03716452047228813, + "eval_rewards/rejected": -0.5504736304283142, + "eval_runtime": 1603.1979, + "eval_samples_per_second": 1.868, + "eval_steps_per_second": 0.234, "step": 300 }, { "epoch": 0.25, "learning_rate": 4.673169525245416e-06, - "logits/chosen": -3.4468257427215576, - "logits/rejected": -3.422842502593994, - "logps/chosen": -337.1869812011719, - "logps/rejected": -369.9530029296875, - "loss": 0.6606, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.5638504028320312, - "rewards/margins": 0.09270543605089188, - "rewards/rejected": -0.6565557718276978, + "logits/chosen": -3.4355697631835938, + "logits/rejected": -3.412278652191162, + "logps/chosen": -336.8961181640625, + "logps/rejected": -369.5231018066406, + "loss": 0.6613, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5616176724433899, + "rewards/margins": 0.09071463346481323, + "rewards/rejected": -0.6523322463035583, "step": 310 }, { "epoch": 0.26, "learning_rate": 4.63778297989952e-06, - "logits/chosen": -3.598461866378784, - "logits/rejected": -3.5812110900878906, - "logps/chosen": -344.33819580078125, - "logps/rejected": -364.0965270996094, - "loss": 0.6658, + "logits/chosen": -3.588507890701294, + "logits/rejected": -3.570230484008789, + "logps/chosen": -343.86956787109375, + "logps/rejected": -364.1554260253906, + "loss": 0.663, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.5991551280021667, - "rewards/margins": 0.08249818533658981, - "rewards/rejected": -0.6816532015800476, + "rewards/chosen": -0.5944578051567078, + "rewards/margins": 0.08861590176820755, + "rewards/rejected": -0.6830736994743347, "step": 320 }, { "epoch": 0.26, "learning_rate": 4.60072648841109e-06, - "logits/chosen": -3.7547969818115234, - "logits/rejected": -3.74609637260437, - "logps/chosen": -364.9189758300781, - "logps/rejected": -394.4504089355469, - "loss": 0.6271, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7271077632904053, - "rewards/margins": 0.17919641733169556, - "rewards/rejected": -0.9063041806221008, + "logits/chosen": -3.744804859161377, + "logits/rejected": -3.735940456390381, + "logps/chosen": -364.733642578125, + "logps/rejected": -394.76348876953125, + "loss": 0.6249, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7255194783210754, + "rewards/margins": 0.18435899913311005, + "rewards/rejected": -0.9098785519599915, "step": 330 }, { "epoch": 0.27, "learning_rate": 4.562028997752574e-06, - "logits/chosen": -3.9404075145721436, - "logits/rejected": -3.9303627014160156, - "logps/chosen": -380.1604919433594, - "logps/rejected": -408.8302307128906, - "loss": 0.676, + "logits/chosen": -3.9339213371276855, + "logits/rejected": -3.9240550994873047, + "logps/chosen": -382.0846252441406, + "logps/rejected": -410.460205078125, + "loss": 0.6783, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.8844378590583801, - "rewards/margins": 0.08313676714897156, - "rewards/rejected": -0.9675747156143188, + "rewards/chosen": -0.9037133455276489, + "rewards/margins": 0.0800480842590332, + "rewards/rejected": -0.9837613105773926, "step": 340 }, { "epoch": 0.28, "learning_rate": 4.521720736775947e-06, - "logits/chosen": -3.974989414215088, - "logits/rejected": -3.998753786087036, - "logps/chosen": -397.6668701171875, - "logps/rejected": -411.4002990722656, - "loss": 0.6559, + "logits/chosen": -3.9775962829589844, + "logits/rejected": -4.001706123352051, + "logps/chosen": -399.79351806640625, + "logps/rejected": -413.55987548828125, + "loss": 0.655, "rewards/accuracies": 0.625, - "rewards/chosen": -1.0197855234146118, - "rewards/margins": 0.12257959693670273, - "rewards/rejected": -1.1423652172088623, + "rewards/chosen": -1.040808916091919, + "rewards/margins": 0.1233123168349266, + "rewards/rejected": -1.1641212701797485, "step": 350 }, { "epoch": 0.29, "learning_rate": 4.479833192599198e-06, - "logits/chosen": -3.942905902862549, - "logits/rejected": -3.9193332195281982, - "logps/chosen": -387.99420166015625, - "logps/rejected": -410.8706970214844, - "loss": 0.6613, + "logits/chosen": -3.9514153003692627, + "logits/rejected": -3.929912567138672, + "logps/chosen": -390.7148132324219, + "logps/rejected": -413.5050354003906, + "loss": 0.6615, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.8740938305854797, - "rewards/margins": 0.10989212989807129, - "rewards/rejected": -0.9839859008789062, + "rewards/chosen": -0.901436984539032, + "rewards/margins": 0.10929825156927109, + "rewards/rejected": -1.010735273361206, "step": 360 }, { "epoch": 0.3, "learning_rate": 4.436399086009928e-06, - "logits/chosen": -3.781745195388794, - "logits/rejected": -3.746504306793213, - "logps/chosen": -363.17657470703125, - "logps/rejected": -384.13323974609375, - "loss": 0.6487, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.7560319304466248, - "rewards/margins": 0.12066509574651718, - "rewards/rejected": -0.8766969442367554, + "logits/chosen": -3.7934257984161377, + "logits/rejected": -3.7594401836395264, + "logps/chosen": -366.05413818359375, + "logps/rejected": -387.0696716308594, + "loss": 0.6481, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7858292460441589, + "rewards/margins": 0.1209360808134079, + "rewards/rejected": -0.9067652821540833, "step": 370 }, { "epoch": 0.3, "learning_rate": 4.391452345905239e-06, - "logits/chosen": -3.672318696975708, - "logits/rejected": -3.6834559440612793, - "logps/chosen": -373.22344970703125, - "logps/rejected": -389.52777099609375, - "loss": 0.6611, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.8263392448425293, - "rewards/margins": 0.11394073814153671, - "rewards/rejected": -0.9402799606323242, + "logits/chosen": -3.6905906200408936, + "logits/rejected": -3.701895236968994, + "logps/chosen": -377.90118408203125, + "logps/rejected": -394.57147216796875, + "loss": 0.6598, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.8731447458267212, + "rewards/margins": 0.11739823967218399, + "rewards/rejected": -0.990543007850647, "step": 380 }, { "epoch": 0.31, "learning_rate": 4.3450280827879125e-06, - "logits/chosen": -3.7310726642608643, - "logits/rejected": -3.7564334869384766, - "logps/chosen": -374.36260986328125, - "logps/rejected": -394.80010986328125, - "loss": 0.6607, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.8552727699279785, - "rewards/margins": 0.11295287311077118, - "rewards/rejected": -0.9682257771492004, + "logits/chosen": -3.7442736625671387, + "logits/rejected": -3.769087553024292, + "logps/chosen": -379.1444091796875, + "logps/rejected": -399.7411804199219, + "loss": 0.6601, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9029915928840637, + "rewards/margins": 0.11410637944936752, + "rewards/rejected": -1.0170979499816895, "step": 390 }, { "epoch": 0.32, "learning_rate": 4.297162561339554e-06, - "logits/chosen": -3.6382896900177, - "logits/rejected": -3.6042380332946777, - "logps/chosen": -380.48590087890625, - "logps/rejected": -407.5166015625, - "loss": 0.6561, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.8749423027038574, - "rewards/margins": 0.1411461979150772, - "rewards/rejected": -1.016088604927063, + "logits/chosen": -3.6351356506347656, + "logits/rejected": -3.6010661125183105, + "logps/chosen": -385.5706481933594, + "logps/rejected": -412.40130615234375, + "loss": 0.657, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9261330366134644, + "rewards/margins": 0.13883258402347565, + "rewards/rejected": -1.0649656057357788, "step": 400 }, { "epoch": 0.32, - "eval_logits/chosen": -3.7580296993255615, - "eval_logits/rejected": -3.7484097480773926, - "eval_logps/chosen": -302.29541015625, - "eval_logps/rejected": -314.21051025390625, - "eval_loss": 0.6812021136283875, - "eval_rewards/accuracies": 0.5573333501815796, - "eval_rewards/chosen": -0.815830409526825, - "eval_rewards/margins": 0.06298430263996124, - "eval_rewards/rejected": -0.8788146376609802, - "eval_runtime": 1618.6439, - "eval_samples_per_second": 1.85, - "eval_steps_per_second": 0.232, + "eval_logits/chosen": -3.7522990703582764, + "eval_logits/rejected": -3.7427072525024414, + "eval_logps/chosen": -306.9489440917969, + "eval_logps/rejected": -319.11859130859375, + "eval_loss": 0.6807547211647034, + "eval_rewards/accuracies": 0.5583333373069763, + "eval_rewards/chosen": -0.862424373626709, + "eval_rewards/margins": 0.06540393829345703, + "eval_rewards/rejected": -0.9278281927108765, + "eval_runtime": 1662.1993, + "eval_samples_per_second": 1.801, + "eval_steps_per_second": 0.226, "step": 400 }, { "epoch": 0.33, "learning_rate": 4.247893172092157e-06, - "logits/chosen": -3.615405559539795, - "logits/rejected": -3.612015962600708, - "logps/chosen": -370.3289794921875, - "logps/rejected": -405.1226806640625, - "loss": 0.6581, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.9003459215164185, - "rewards/margins": 0.13107234239578247, - "rewards/rejected": -1.0314182043075562, + "logits/chosen": -3.6088128089904785, + "logits/rejected": -3.6062328815460205, + "logps/chosen": -375.0890197753906, + "logps/rejected": -409.6904296875, + "loss": 0.6588, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.948278546333313, + "rewards/margins": 0.12885506451129913, + "rewards/rejected": -1.0771336555480957, "step": 410 }, { "epoch": 0.34, "learning_rate": 4.197258402220187e-06, - "logits/chosen": -3.65478515625, - "logits/rejected": -3.6666762828826904, - "logps/chosen": -379.541748046875, - "logps/rejected": -418.70880126953125, - "loss": 0.657, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.9110058546066284, - "rewards/margins": 0.3442539870738983, - "rewards/rejected": -1.2552598714828491, + "logits/chosen": -3.6457858085632324, + "logits/rejected": -3.658412218093872, + "logps/chosen": -383.5570068359375, + "logps/rejected": -422.8628845214844, + "loss": 0.6555, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.950586199760437, + "rewards/margins": 0.34554851055145264, + "rewards/rejected": -1.2961347103118896, "step": 420 }, { "epoch": 0.34, "learning_rate": 4.145297805476023e-06, - "logits/chosen": -3.6817328929901123, - "logits/rejected": -3.688814640045166, - "logps/chosen": -376.09783935546875, - "logps/rejected": -406.4654846191406, - "loss": 0.6439, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.8991987109184265, - "rewards/margins": 0.1862173080444336, - "rewards/rejected": -1.0854160785675049, + "logits/chosen": -3.6771697998046875, + "logits/rejected": -3.6845245361328125, + "logps/chosen": -379.49029541015625, + "logps/rejected": -410.7177734375, + "loss": 0.6399, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9332340359687805, + "rewards/margins": 0.1954960823059082, + "rewards/rejected": -1.1287301778793335, "step": 430 }, { "epoch": 0.35, "learning_rate": 4.092051971292228e-06, - "logits/chosen": -3.716754198074341, - "logits/rejected": -3.7106194496154785, - "logps/chosen": -376.1932067871094, - "logps/rejected": -401.6561584472656, - "loss": 0.6554, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.954433798789978, - "rewards/margins": 0.13505356013774872, - "rewards/rejected": -1.0894873142242432, + "logits/chosen": -3.715477705001831, + "logits/rejected": -3.7073216438293457, + "logps/chosen": -379.3064270019531, + "logps/rejected": -404.5085144042969, + "loss": 0.6564, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9848654866218567, + "rewards/margins": 0.13351285457611084, + "rewards/rejected": -1.1183784008026123, "step": 440 }, { "epoch": 0.36, "learning_rate": 4.037562493074792e-06, - "logits/chosen": -3.8129425048828125, - "logits/rejected": -3.833683729171753, - "logps/chosen": -403.4329528808594, - "logps/rejected": -423.4547424316406, - "loss": 0.6365, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.0463995933532715, - "rewards/margins": 0.19165393710136414, - "rewards/rejected": -1.2380534410476685, + "logits/chosen": -3.800858974456787, + "logits/rejected": -3.8201656341552734, + "logps/chosen": -404.48602294921875, + "logps/rejected": -423.95330810546875, + "loss": 0.6378, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.056680679321289, + "rewards/margins": 0.18673589825630188, + "rewards/rejected": -1.2434165477752686, "step": 450 }, { "epoch": 0.37, "learning_rate": 3.981871935712112e-06, - "logits/chosen": -3.983973741531372, - "logits/rejected": -3.940070629119873, - "logps/chosen": -385.3892517089844, - "logps/rejected": -416.1813049316406, - "loss": 0.6533, + "logits/chosen": -3.9585278034210205, + "logits/rejected": -3.9149413108825684, + "logps/chosen": -382.9043884277344, + "logps/rejected": -413.051025390625, + "loss": 0.6556, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.0447807312011719, - "rewards/margins": 0.14543746411800385, - "rewards/rejected": -1.190218210220337, + "rewards/chosen": -1.0201416015625, + "rewards/margins": 0.13921280205249786, + "rewards/rejected": -1.1593544483184814, "step": 460 }, { "epoch": 0.38, "learning_rate": 3.925023802325094e-06, - "logits/chosen": -4.042995929718018, - "logits/rejected": -4.0136399269104, - "logps/chosen": -401.97381591796875, - "logps/rejected": -437.3184509277344, - "loss": 0.6455, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.0883817672729492, - "rewards/margins": 0.2549799978733063, - "rewards/rejected": -1.343361735343933, + "logits/chosen": -4.015332221984863, + "logits/rejected": -3.9858086109161377, + "logps/chosen": -397.7122802734375, + "logps/rejected": -432.3436584472656, + "loss": 0.6466, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.045425295829773, + "rewards/margins": 0.2483961582183838, + "rewards/rejected": -1.2938215732574463, "step": 470 }, { "epoch": 0.38, "learning_rate": 3.867062500284342e-06, - "logits/chosen": -4.073556423187256, - "logits/rejected": -4.043025493621826, - "logps/chosen": -388.99090576171875, - "logps/rejected": -425.56787109375, - "loss": 0.6433, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.1277819871902466, - "rewards/margins": 0.1785212755203247, - "rewards/rejected": -1.3063032627105713, + "logits/chosen": -4.052220344543457, + "logits/rejected": -4.021946907043457, + "logps/chosen": -385.6368103027344, + "logps/rejected": -421.88824462890625, + "loss": 0.6432, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0936793088912964, + "rewards/margins": 0.17554286122322083, + "rewards/rejected": -1.2692222595214844, "step": 480 }, { "epoch": 0.39, "learning_rate": 3.8080333065209885e-06, - "logits/chosen": -4.076624393463135, - "logits/rejected": -4.084932327270508, - "logps/chosen": -391.6429138183594, - "logps/rejected": -392.52362060546875, - "loss": 0.7064, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.1167463064193726, - "rewards/margins": 0.06357622146606445, - "rewards/rejected": -1.180322527885437, + "logits/chosen": -4.067862510681152, + "logits/rejected": -4.076433181762695, + "logps/chosen": -389.60333251953125, + "logps/rejected": -390.48321533203125, + "loss": 0.7053, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.09650719165802, + "rewards/margins": 0.06325383484363556, + "rewards/rejected": -1.1597610712051392, "step": 490 }, { "epoch": 0.4, "learning_rate": 3.7479823321582624e-06, - "logits/chosen": -3.93993878364563, - "logits/rejected": -3.9026870727539062, - "logps/chosen": -378.84918212890625, - "logps/rejected": -416.9828186035156, - "loss": 0.633, + "logits/chosen": -3.942683458328247, + "logits/rejected": -3.9045321941375732, + "logps/chosen": -378.3946228027344, + "logps/rejected": -416.160400390625, + "loss": 0.6342, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.9214572906494141, - "rewards/margins": 0.19020086526870728, - "rewards/rejected": -1.1116580963134766, + "rewards/chosen": -0.91758793592453, + "rewards/margins": 0.18643911182880402, + "rewards/rejected": -1.1040270328521729, "step": 500 }, { "epoch": 0.4, - "eval_logits/chosen": -4.107741355895996, - "eval_logits/rejected": -4.097755432128906, - "eval_logps/chosen": -310.98577880859375, - "eval_logps/rejected": -324.42694091796875, - "eval_loss": 0.678744912147522, - "eval_rewards/accuracies": 0.5596666932106018, - "eval_rewards/chosen": -0.9027342796325684, - "eval_rewards/margins": 0.0782446414232254, - "eval_rewards/rejected": -0.9809789061546326, - "eval_runtime": 1619.4632, - "eval_samples_per_second": 1.849, - "eval_steps_per_second": 0.232, + "eval_logits/chosen": -4.11630392074585, + "eval_logits/rejected": -4.106530666351318, + "eval_logps/chosen": -313.0755920410156, + "eval_logps/rejected": -326.7316589355469, + "eval_loss": 0.6783922910690308, + "eval_rewards/accuracies": 0.5580000281333923, + "eval_rewards/chosen": -0.9236910939216614, + "eval_rewards/margins": 0.08026820421218872, + "eval_rewards/rejected": -1.00395929813385, + "eval_runtime": 1601.521, + "eval_samples_per_second": 1.869, + "eval_steps_per_second": 0.234, "step": 500 }, { "epoch": 0.41, "learning_rate": 3.686956486491419e-06, - "logits/chosen": -3.9462807178497314, - "logits/rejected": -3.941249132156372, - "logps/chosen": -386.48590087890625, - "logps/rejected": -424.64508056640625, - "loss": 0.6313, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.9227803349494934, - "rewards/margins": 0.29515519738197327, - "rewards/rejected": -1.217935562133789, + "logits/chosen": -3.9557979106903076, + "logits/rejected": -3.950779676437378, + "logps/chosen": -388.673583984375, + "logps/rejected": -427.42962646484375, + "loss": 0.6299, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9443014860153198, + "rewards/margins": 0.30186328291893005, + "rewards/rejected": -1.2461649179458618, "step": 510 }, { "epoch": 0.42, "learning_rate": 3.625003440344166e-06, - "logits/chosen": -4.044493198394775, - "logits/rejected": -4.073317527770996, - "logps/chosen": -369.5896301269531, - "logps/rejected": -382.105224609375, - "loss": 0.664, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.9153301119804382, - "rewards/margins": 0.08651997148990631, - "rewards/rejected": -1.0018501281738281, + "logits/chosen": -4.053747653961182, + "logits/rejected": -4.082505226135254, + "logps/chosen": -371.04339599609375, + "logps/rejected": -383.62921142578125, + "loss": 0.6644, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.9302754402160645, + "rewards/margins": 0.08710617572069168, + "rewards/rejected": -1.0173815488815308, "step": 520 }, { "epoch": 0.42, "learning_rate": 3.562171588830231e-06, - "logits/chosen": -4.016448497772217, - "logits/rejected": -3.9958884716033936, - "logps/chosen": -377.91912841796875, - "logps/rejected": -404.9534606933594, - "loss": 0.6943, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.8825477361679077, - "rewards/margins": 0.0769728347659111, - "rewards/rejected": -0.9595205187797546, + "logits/chosen": -4.011529445648193, + "logits/rejected": -3.9905009269714355, + "logps/chosen": -379.8844299316406, + "logps/rejected": -406.698974609375, + "loss": 0.699, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.9027377963066101, + "rewards/margins": 0.07473494112491608, + "rewards/rejected": -0.9774727821350098, "step": 530 }, { "epoch": 0.43, "learning_rate": 3.4985100135491245e-06, - "logits/chosen": -4.008540630340576, - "logits/rejected": -3.9679737091064453, - "logps/chosen": -382.65924072265625, - "logps/rejected": -425.71954345703125, - "loss": 0.629, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.8818261027336121, - "rewards/margins": 0.2214728146791458, - "rewards/rejected": -1.1032989025115967, + "logits/chosen": -3.9951107501983643, + "logits/rejected": -3.956082582473755, + "logps/chosen": -383.92889404296875, + "logps/rejected": -426.62347412109375, + "loss": 0.6307, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8952111005783081, + "rewards/margins": 0.21770212054252625, + "rewards/rejected": -1.1129133701324463, "step": 540 }, { "epoch": 0.44, "learning_rate": 3.4340684442456673e-06, - "logits/chosen": -4.049837589263916, - "logits/rejected": -4.043179035186768, - "logps/chosen": -384.42938232421875, - "logps/rejected": -410.5345153808594, - "loss": 0.6434, + "logits/chosen": -4.0311408042907715, + "logits/rejected": -4.022164821624756, + "logps/chosen": -383.31878662109375, + "logps/rejected": -409.2886047363281, + "loss": 0.6428, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.9648186564445496, - "rewards/margins": 0.1496874988079071, - "rewards/rejected": -1.1145063638687134, + "rewards/chosen": -0.953955352306366, + "rewards/margins": 0.1485908329486847, + "rewards/rejected": -1.1025463342666626, "step": 550 }, { "epoch": 0.45, "learning_rate": 3.3688972199631974e-06, - "logits/chosen": -4.042217254638672, - "logits/rejected": -4.064842224121094, - "logps/chosen": -387.6910705566406, - "logps/rejected": -413.78662109375, - "loss": 0.6329, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.9935464859008789, - "rewards/margins": 0.24879872798919678, - "rewards/rejected": -1.2423454523086548, + "logits/chosen": -4.0242533683776855, + "logits/rejected": -4.047430992126465, + "logps/chosen": -385.8880920410156, + "logps/rejected": -412.330810546875, + "loss": 0.6328, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9757701754570007, + "rewards/margins": 0.2515910267829895, + "rewards/rejected": -1.2273612022399902, "step": 560 }, { "epoch": 0.46, "learning_rate": 3.3030472497208354e-06, - "logits/chosen": -4.0646257400512695, - "logits/rejected": -4.018919944763184, - "logps/chosen": -385.509765625, - "logps/rejected": -450.21429443359375, - "loss": 0.6219, + "logits/chosen": -4.045630931854248, + "logits/rejected": -4.000221252441406, + "logps/chosen": -382.03607177734375, + "logps/rejected": -447.540283203125, + "loss": 0.6153, "rewards/accuracies": 0.625, - "rewards/chosen": -1.0814400911331177, - "rewards/margins": 0.26284489035606384, - "rewards/rejected": -1.3442847728729248, + "rewards/chosen": -1.047086477279663, + "rewards/margins": 0.2707362473011017, + "rewards/rejected": -1.3178224563598633, "step": 570 }, { "epoch": 0.46, "learning_rate": 3.236569972745492e-06, - "logits/chosen": -4.145129203796387, - "logits/rejected": -4.1337690353393555, - "logps/chosen": -380.5148620605469, - "logps/rejected": -406.1288146972656, - "loss": 0.6628, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.0649586915969849, - "rewards/margins": 0.15909543633460999, - "rewards/rejected": -1.2240540981292725, + "logits/chosen": -4.149961471557617, + "logits/rejected": -4.139186382293701, + "logps/chosen": -379.27294921875, + "logps/rejected": -405.08392333984375, + "loss": 0.6644, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.05259108543396, + "rewards/margins": 0.16109797358512878, + "rewards/rejected": -1.213688850402832, "step": 580 }, { "epoch": 0.47, "learning_rate": 3.1695173182897126e-06, - "logits/chosen": -4.133418083190918, - "logits/rejected": -4.11216402053833, - "logps/chosen": -398.23626708984375, - "logps/rejected": -435.56646728515625, - "loss": 0.6554, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1820625066757202, - "rewards/margins": 0.16709741950035095, - "rewards/rejected": -1.349160075187683, + "logits/chosen": -4.157639026641846, + "logits/rejected": -4.137297630310059, + "logps/chosen": -400.1607666015625, + "logps/rejected": -438.38275146484375, + "loss": 0.6558, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2017654180526733, + "rewards/margins": 0.17644351720809937, + "rewards/rejected": -1.378208875656128, "step": 590 }, { "epoch": 0.48, "learning_rate": 3.10194166506673e-06, - "logits/chosen": -4.175902366638184, - "logits/rejected": -4.1359357833862305, - "logps/chosen": -390.8631286621094, - "logps/rejected": -439.28436279296875, - "loss": 0.6302, + "logits/chosen": -4.194487571716309, + "logits/rejected": -4.153265476226807, + "logps/chosen": -393.9078063964844, + "logps/rejected": -442.1280822753906, + "loss": 0.6341, "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.061226487159729, - "rewards/margins": 0.2546694874763489, - "rewards/rejected": -1.3158957958221436, + "rewards/chosen": -1.0909817218780518, + "rewards/margins": 0.2533242404460907, + "rewards/rejected": -1.3443058729171753, "step": 600 }, { "epoch": 0.48, - "eval_logits/chosen": -4.443523406982422, - "eval_logits/rejected": -4.431849002838135, - "eval_logps/chosen": -337.6354675292969, - "eval_logps/rejected": -353.24932861328125, - "eval_loss": 0.6784851551055908, - "eval_rewards/accuracies": 0.5596666932106018, - "eval_rewards/chosen": -1.1692306995391846, - "eval_rewards/margins": 0.09997232258319855, - "eval_rewards/rejected": -1.2692030668258667, - "eval_runtime": 1619.3984, - "eval_samples_per_second": 1.849, - "eval_steps_per_second": 0.232, + "eval_logits/chosen": -4.4554667472839355, + "eval_logits/rejected": -4.444517135620117, + "eval_logps/chosen": -340.80841064453125, + "eval_logps/rejected": -357.2098693847656, + "eval_loss": 0.6766642928123474, + "eval_rewards/accuracies": 0.5630000233650208, + "eval_rewards/chosen": -1.2010191679000854, + "eval_rewards/margins": 0.1077219545841217, + "eval_rewards/rejected": -1.3087410926818848, + "eval_runtime": 1605.4995, + "eval_samples_per_second": 1.865, + "eval_steps_per_second": 0.234, "step": 600 }, { "epoch": 0.49, "learning_rate": 3.0338958003344115e-06, - "logits/chosen": -4.3245649337768555, - "logits/rejected": -4.272718906402588, - "logps/chosen": -396.521240234375, - "logps/rejected": -432.1895446777344, - "loss": 0.6883, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2242915630340576, - "rewards/margins": 0.24001073837280273, - "rewards/rejected": -1.4643023014068604, + "logits/chosen": -4.3319549560546875, + "logits/rejected": -4.279036045074463, + "logps/chosen": -399.0089416503906, + "logps/rejected": -435.59075927734375, + "loss": 0.692, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2491410970687866, + "rewards/margins": 0.2489248812198639, + "rewards/rejected": -1.4980661869049072, "step": 610 }, { "epoch": 0.5, "learning_rate": 2.9654328786600823e-06, - "logits/chosen": -4.306203365325928, - "logits/rejected": -4.252989768981934, - "logps/chosen": -397.3540344238281, - "logps/rejected": -442.0118103027344, - "loss": 0.6197, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.2108550071716309, - "rewards/margins": 0.21409356594085693, - "rewards/rejected": -1.4249485731124878, + "logits/chosen": -4.308805465698242, + "logits/rejected": -4.258028984069824, + "logps/chosen": -398.9922180175781, + "logps/rejected": -443.50775146484375, + "loss": 0.6204, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2274248600006104, + "rewards/margins": 0.2125413715839386, + "rewards/rejected": -1.4399662017822266, "step": 620 }, { "epoch": 0.5, "learning_rate": 2.896606380398402e-06, - "logits/chosen": -4.365767478942871, - "logits/rejected": -4.406495094299316, - "logps/chosen": -417.7538146972656, - "logps/rejected": -443.05548095703125, - "loss": 0.6623, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3476839065551758, - "rewards/margins": 0.18296115100383759, - "rewards/rejected": -1.5306451320648193, + "logits/chosen": -4.377072334289551, + "logits/rejected": -4.414391994476318, + "logps/chosen": -420.06884765625, + "logps/rejected": -445.095458984375, + "loss": 0.6656, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3716115951538086, + "rewards/margins": 0.1795826405286789, + "rewards/rejected": -1.551194190979004, "step": 630 }, { "epoch": 0.51, "learning_rate": 2.827470069914772e-06, - "logits/chosen": -4.2744035720825195, - "logits/rejected": -4.236593723297119, - "logps/chosen": -425.37908935546875, - "logps/rejected": -453.02862548828125, - "loss": 0.6756, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3940837383270264, - "rewards/margins": 0.14192768931388855, - "rewards/rejected": -1.5360115766525269, + "logits/chosen": -4.2714762687683105, + "logits/rejected": -4.23270320892334, + "logps/chosen": -426.16943359375, + "logps/rejected": -454.91278076171875, + "loss": 0.6689, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4019109010696411, + "rewards/margins": 0.15336069464683533, + "rewards/rejected": -1.5552715063095093, "step": 640 }, { "epoch": 0.52, "learning_rate": 2.7580779535868675e-06, - "logits/chosen": -4.252664089202881, - "logits/rejected": -4.254392147064209, - "logps/chosen": -409.19378662109375, - "logps/rejected": -438.70556640625, - "loss": 0.6575, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.230445384979248, - "rewards/margins": 0.18368306756019592, - "rewards/rejected": -1.4141284227371216, + "logits/chosen": -4.235100746154785, + "logits/rejected": -4.236804008483887, + "logps/chosen": -408.59002685546875, + "logps/rejected": -438.03204345703125, + "loss": 0.6618, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2248241901397705, + "rewards/margins": 0.1827341616153717, + "rewards/rejected": -1.4075584411621094, "step": 650 }, { "epoch": 0.53, "learning_rate": 2.688484237617129e-06, - "logits/chosen": -4.151357650756836, - "logits/rejected": -4.122767925262451, - "logps/chosen": -400.3338928222656, - "logps/rejected": -436.16571044921875, - "loss": 0.6222, + "logits/chosen": -4.117595672607422, + "logits/rejected": -4.08989143371582, + "logps/chosen": -397.66925048828125, + "logps/rejected": -433.625732421875, + "loss": 0.6206, "rewards/accuracies": 0.65625, - "rewards/chosen": -1.108682632446289, - "rewards/margins": 0.24583642184734344, - "rewards/rejected": -1.3545191287994385, + "rewards/chosen": -1.0825097560882568, + "rewards/margins": 0.2466181069612503, + "rewards/rejected": -1.3291277885437012, "step": 660 }, { "epoch": 0.54, "learning_rate": 2.6187432856891585e-06, - "logits/chosen": -4.1051225662231445, - "logits/rejected": -4.0978288650512695, - "logps/chosen": -407.8655700683594, - "logps/rejected": -446.1006774902344, - "loss": 0.6432, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.1575034856796265, - "rewards/margins": 0.18804897367954254, - "rewards/rejected": -1.3455523252487183, + "logits/chosen": -4.0606913566589355, + "logits/rejected": -4.052728176116943, + "logps/chosen": -407.27392578125, + "logps/rejected": -445.926025390625, + "loss": 0.6409, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.151308298110962, + "rewards/margins": 0.19334861636161804, + "rewards/rejected": -1.344657063484192, "step": 670 }, { "epoch": 0.54, "learning_rate": 2.548909576501096e-06, - "logits/chosen": -4.150703430175781, - "logits/rejected": -4.143389701843262, - "logps/chosen": -413.3582458496094, - "logps/rejected": -443.1758728027344, - "loss": 0.6353, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2182948589324951, - "rewards/margins": 0.19429844617843628, - "rewards/rejected": -1.4125932455062866, + "logits/chosen": -4.1063737869262695, + "logits/rejected": -4.097175121307373, + "logps/chosen": -412.988037109375, + "logps/rejected": -442.6796875, + "loss": 0.6362, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2146575450897217, + "rewards/margins": 0.1937810331583023, + "rewards/rejected": -1.4084386825561523, "step": 680 }, { "epoch": 0.55, "learning_rate": 2.4790376612091503e-06, - "logits/chosen": -4.271695613861084, - "logits/rejected": -4.229399681091309, - "logps/chosen": -443.7752990722656, - "logps/rejected": -475.551025390625, - "loss": 0.6236, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.456383228302002, - "rewards/margins": 0.23586714267730713, - "rewards/rejected": -1.6922504901885986, + "logits/chosen": -4.220733642578125, + "logits/rejected": -4.182897090911865, + "logps/chosen": -444.1158142089844, + "logps/rejected": -476.25286865234375, + "loss": 0.6217, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4591256380081177, + "rewards/margins": 0.2400621622800827, + "rewards/rejected": -1.6991876363754272, "step": 690 }, { "epoch": 0.56, "learning_rate": 2.40918212081453e-06, - "logits/chosen": -4.358768939971924, - "logits/rejected": -4.3066534996032715, - "logps/chosen": -407.3147888183594, - "logps/rejected": -466.919921875, - "loss": 0.5743, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3695669174194336, - "rewards/margins": 0.4060749411582947, - "rewards/rejected": -1.7756417989730835, + "logits/chosen": -4.308882713317871, + "logits/rejected": -4.258685111999512, + "logps/chosen": -410.50714111328125, + "logps/rejected": -470.98712158203125, + "loss": 0.573, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4014512300491333, + "rewards/margins": 0.4143608510494232, + "rewards/rejected": -1.8158118724822998, "step": 700 }, { "epoch": 0.56, - "eval_logits/chosen": -4.518208026885986, - "eval_logits/rejected": -4.504719257354736, - "eval_logps/chosen": -375.0574645996094, - "eval_logps/rejected": -392.7273254394531, - "eval_loss": 0.6835331916809082, - "eval_rewards/accuracies": 0.5630000233650208, - "eval_rewards/chosen": -1.5434508323669434, - "eval_rewards/margins": 0.12053229659795761, - "eval_rewards/rejected": -1.6639831066131592, - "eval_runtime": 1619.3262, - "eval_samples_per_second": 1.849, - "eval_steps_per_second": 0.232, + "eval_logits/chosen": -4.460686683654785, + "eval_logits/rejected": -4.447921276092529, + "eval_logps/chosen": -376.8887939453125, + "eval_logps/rejected": -395.554931640625, + "eval_loss": 0.680784285068512, + "eval_rewards/accuracies": 0.5633333325386047, + "eval_rewards/chosen": -1.561822772026062, + "eval_rewards/margins": 0.1303686946630478, + "eval_rewards/rejected": -1.6921913623809814, + "eval_runtime": 1600.8511, + "eval_samples_per_second": 1.87, + "eval_steps_per_second": 0.234, "step": 700 }, { "epoch": 0.57, "learning_rate": 2.3393975235270654e-06, - "logits/chosen": -4.303341865539551, - "logits/rejected": -4.286491394042969, - "logps/chosen": -452.05718994140625, - "logps/rejected": -493.17144775390625, - "loss": 0.6602, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.5600488185882568, - "rewards/margins": 0.23919770121574402, - "rewards/rejected": -1.7992465496063232, + "logits/chosen": -4.24044132232666, + "logits/rejected": -4.221581935882568, + "logps/chosen": -451.12353515625, + "logps/rejected": -492.0777893066406, + "loss": 0.6601, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5504473447799683, + "rewards/margins": 0.23769822716712952, + "rewards/rejected": -1.788145661354065, "step": 710 }, { "epoch": 0.58, "learning_rate": 2.2697383821388153e-06, - "logits/chosen": -4.293368816375732, - "logits/rejected": -4.3109025955200195, - "logps/chosen": -435.1492614746094, - "logps/rejected": -460.28033447265625, - "loss": 0.6504, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.4816340208053589, - "rewards/margins": 0.2186344861984253, - "rewards/rejected": -1.7002685070037842, + "logits/chosen": -4.228194236755371, + "logits/rejected": -4.242656230926514, + "logps/chosen": -432.177001953125, + "logps/rejected": -457.5282287597656, + "loss": 0.6498, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4517402648925781, + "rewards/margins": 0.22140760719776154, + "rewards/rejected": -1.6731477975845337, "step": 720 }, { "epoch": 0.58, "learning_rate": 2.2002591114409657e-06, - "logits/chosen": -4.212637424468994, - "logits/rejected": -4.208783149719238, - "logps/chosen": -432.06805419921875, - "logps/rejected": -468.85601806640625, - "loss": 0.6325, + "logits/chosen": -4.1490349769592285, + "logits/rejected": -4.144831657409668, + "logps/chosen": -428.9891662597656, + "logps/rejected": -466.90228271484375, + "loss": 0.6282, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.403395652770996, - "rewards/margins": 0.24927303194999695, - "rewards/rejected": -1.6526685953140259, + "rewards/chosen": -1.3725817203521729, + "rewards/margins": 0.26138487458229065, + "rewards/rejected": -1.6339666843414307, "step": 730 }, { "epoch": 0.59, "learning_rate": 2.131013985717285e-06, - "logits/chosen": -4.271391868591309, - "logits/rejected": -4.220091819763184, - "logps/chosen": -442.1729431152344, - "logps/rejected": -491.44384765625, - "loss": 0.6362, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.4394041299819946, - "rewards/margins": 0.2614460587501526, - "rewards/rejected": -1.7008501291275024, + "logits/chosen": -4.210469722747803, + "logits/rejected": -4.1584882736206055, + "logps/chosen": -442.4186096191406, + "logps/rejected": -490.4705505371094, + "loss": 0.6473, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4414502382278442, + "rewards/margins": 0.2493906468153, + "rewards/rejected": -1.690840721130371, "step": 740 }, { "epoch": 0.6, "learning_rate": 2.062057096347338e-06, - "logits/chosen": -4.25800895690918, - "logits/rejected": -4.223499774932861, - "logps/chosen": -419.89495849609375, - "logps/rejected": -435.30389404296875, - "loss": 0.6593, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2841438055038452, - "rewards/margins": 0.15097984671592712, - "rewards/rejected": -1.4351234436035156, + "logits/chosen": -4.1790266036987305, + "logits/rejected": -4.148745059967041, + "logps/chosen": -418.1016540527344, + "logps/rejected": -432.9185485839844, + "loss": 0.6612, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2657266855239868, + "rewards/margins": 0.14516393840312958, + "rewards/rejected": -1.4108905792236328, "step": 750 }, { "epoch": 0.61, "learning_rate": 1.9934423095525733e-06, - "logits/chosen": -4.121432304382324, - "logits/rejected": -4.1321306228637695, - "logps/chosen": -416.6729431152344, - "logps/rejected": -442.0414123535156, - "loss": 0.6534, + "logits/chosen": -4.040548324584961, + "logits/rejected": -4.0535197257995605, + "logps/chosen": -413.2955017089844, + "logps/rejected": -438.1127014160156, + "loss": 0.6539, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.1921896934509277, - "rewards/margins": 0.2546849846839905, - "rewards/rejected": -1.4468748569488525, + "rewards/chosen": -1.1577858924865723, + "rewards/margins": 0.2499733865261078, + "rewards/rejected": -1.4077593088150024, "step": 760 }, { "epoch": 0.62, "learning_rate": 1.9252232243182986e-06, - "logits/chosen": -4.221813678741455, - "logits/rejected": -4.169572830200195, - "logps/chosen": -377.6047668457031, - "logps/rejected": -425.6900939941406, - "loss": 0.6026, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.071094036102295, - "rewards/margins": 0.307754784822464, - "rewards/rejected": -1.3788487911224365, + "logits/chosen": -4.132304668426514, + "logits/rejected": -4.086350440979004, + "logps/chosen": -374.2073974609375, + "logps/rejected": -421.70947265625, + "loss": 0.6035, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0375020503997803, + "rewards/margins": 0.3012349009513855, + "rewards/rejected": -1.3387370109558105, "step": 770 }, { "epoch": 0.62, "learning_rate": 1.8574531305244043e-06, - "logits/chosen": -4.010577201843262, - "logits/rejected": -3.9968719482421875, - "logps/chosen": -418.9974060058594, - "logps/rejected": -465.41650390625, - "loss": 0.5956, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2276278734207153, - "rewards/margins": 0.3166094124317169, - "rewards/rejected": -1.5442373752593994, + "logits/chosen": -3.9333484172821045, + "logits/rejected": -3.921320676803589, + "logps/chosen": -413.23236083984375, + "logps/rejected": -459.419677734375, + "loss": 0.5945, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1705034971237183, + "rewards/margins": 0.31472960114479065, + "rewards/rejected": -1.485233187675476, "step": 780 }, { "epoch": 0.63, "learning_rate": 1.7901849673175559e-06, - "logits/chosen": -4.077489376068115, - "logits/rejected": -4.033568382263184, - "logps/chosen": -420.52984619140625, - "logps/rejected": -455.3899841308594, - "loss": 0.6434, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2396165132522583, - "rewards/margins": 0.20500020682811737, - "rewards/rejected": -1.4446165561676025, + "logits/chosen": -4.004987716674805, + "logits/rejected": -3.96270751953125, + "logps/chosen": -415.433349609375, + "logps/rejected": -450.65447998046875, + "loss": 0.6413, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1887439489364624, + "rewards/margins": 0.20863094925880432, + "rewards/rejected": -1.3973749876022339, "step": 790 }, { "epoch": 0.64, "learning_rate": 1.7234712817573555e-06, - "logits/chosen": -4.063477516174316, - "logits/rejected": -4.062304496765137, - "logps/chosen": -456.8814392089844, - "logps/rejected": -480.8173828125, - "loss": 0.6443, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.4422929286956787, - "rewards/margins": 0.22365431487560272, - "rewards/rejected": -1.6659473180770874, + "logits/chosen": -3.995453357696533, + "logits/rejected": -3.992877960205078, + "logps/chosen": -452.4390563964844, + "logps/rejected": -477.8495178222656, + "loss": 0.6384, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3979339599609375, + "rewards/margins": 0.2388983964920044, + "rewards/rejected": -1.6368324756622314, "step": 800 }, { "epoch": 0.64, - "eval_logits/chosen": -4.257167816162109, - "eval_logits/rejected": -4.245348930358887, - "eval_logps/chosen": -359.3107604980469, - "eval_logps/rejected": -377.020751953125, - "eval_loss": 0.6778839230537415, - "eval_rewards/accuracies": 0.5666666626930237, - "eval_rewards/chosen": -1.385983943939209, - "eval_rewards/margins": 0.12093351036310196, - "eval_rewards/rejected": -1.5069174766540527, - "eval_runtime": 1620.4754, - "eval_samples_per_second": 1.848, - "eval_steps_per_second": 0.231, + "eval_logits/chosen": -4.182389259338379, + "eval_logits/rejected": -4.171318531036377, + "eval_logps/chosen": -357.1873474121094, + "eval_logps/rejected": -375.4593200683594, + "eval_loss": 0.6752617955207825, + "eval_rewards/accuracies": 0.5659999847412109, + "eval_rewards/chosen": -1.3648083209991455, + "eval_rewards/margins": 0.12642760574817657, + "eval_rewards/rejected": -1.4912358522415161, + "eval_runtime": 1604.6041, + "eval_samples_per_second": 1.866, + "eval_steps_per_second": 0.234, "step": 800 }, { "epoch": 0.65, "learning_rate": 1.6573641877687936e-06, - "logits/chosen": -4.076521873474121, - "logits/rejected": -4.057218074798584, - "logps/chosen": -422.85699462890625, - "logps/rejected": -470.95465087890625, - "loss": 0.6161, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3746501207351685, - "rewards/margins": 0.2610599100589752, - "rewards/rejected": -1.6357100009918213, + "logits/chosen": -4.01367712020874, + "logits/rejected": -3.9943137168884277, + "logps/chosen": -420.02734375, + "logps/rejected": -469.4754333496094, + "loss": 0.612, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3465534448623657, + "rewards/margins": 0.27429622411727905, + "rewards/rejected": -1.620849370956421, "step": 810 }, { "epoch": 0.66, "learning_rate": 1.591915325433034e-06, - "logits/chosen": -4.133788108825684, - "logits/rejected": -4.142486572265625, - "logps/chosen": -414.0416564941406, - "logps/rejected": -449.82373046875, - "loss": 0.6194, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.315294861793518, - "rewards/margins": 0.30571961402893066, - "rewards/rejected": -1.6210145950317383, + "logits/chosen": -4.069581031799316, + "logits/rejected": -4.076575756072998, + "logps/chosen": -411.4861755371094, + "logps/rejected": -448.5120544433594, + "loss": 0.6138, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.289924144744873, + "rewards/margins": 0.3182913661003113, + "rewards/rejected": -1.608215570449829, "step": 820 }, { "epoch": 0.66, "learning_rate": 1.5271758206483664e-06, - "logits/chosen": -4.143270015716553, - "logits/rejected": -4.132315158843994, - "logps/chosen": -438.13037109375, - "logps/rejected": -471.50030517578125, - "loss": 0.6481, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.4561512470245361, - "rewards/margins": 0.22672787308692932, - "rewards/rejected": -1.682879090309143, + "logits/chosen": -4.08115291595459, + "logits/rejected": -4.065010070800781, + "logps/chosen": -438.2860412597656, + "logps/rejected": -472.0508728027344, + "loss": 0.6431, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.458389163017273, + "rewards/margins": 0.23057003319263458, + "rewards/rejected": -1.6889593601226807, "step": 830 }, { "epoch": 0.67, "learning_rate": 1.4631962451927966e-06, - "logits/chosen": -4.0487775802612305, - "logits/rejected": -4.032698631286621, - "logps/chosen": -431.3470153808594, - "logps/rejected": -473.2076110839844, - "loss": 0.6076, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3846697807312012, - "rewards/margins": 0.27571359276771545, - "rewards/rejected": -1.6603834629058838, + "logits/chosen": -3.989629030227661, + "logits/rejected": -3.970881938934326, + "logps/chosen": -431.90972900390625, + "logps/rejected": -473.61505126953125, + "loss": 0.6104, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3902056217193604, + "rewards/margins": 0.27469927072525024, + "rewards/rejected": -1.6649048328399658, "step": 840 }, { "epoch": 0.68, "learning_rate": 1.4000265772195032e-06, - "logits/chosen": -4.225982666015625, - "logits/rejected": -4.171608924865723, - "logps/chosen": -430.31201171875, - "logps/rejected": -475.3255920410156, - "loss": 0.6196, + "logits/chosen": -4.167831897735596, + "logits/rejected": -4.113857746124268, + "logps/chosen": -430.6526794433594, + "logps/rejected": -475.681884765625, + "loss": 0.6188, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3736594915390015, - "rewards/margins": 0.2828761339187622, - "rewards/rejected": -1.6565355062484741, + "rewards/chosen": -1.3779888153076172, + "rewards/margins": 0.2821890711784363, + "rewards/rejected": -1.6601779460906982, "step": 850 }, { "epoch": 0.69, "learning_rate": 1.3377161622160077e-06, - "logits/chosen": -4.169137001037598, - "logits/rejected": -4.160987854003906, - "logps/chosen": -430.1625061035156, - "logps/rejected": -470.7685546875, - "loss": 0.6046, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3943125009536743, - "rewards/margins": 0.2797546982765198, - "rewards/rejected": -1.6740672588348389, + "logits/chosen": -4.1135406494140625, + "logits/rejected": -4.103851795196533, + "logps/chosen": -431.333740234375, + "logps/rejected": -472.18505859375, + "loss": 0.6059, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4065897464752197, + "rewards/margins": 0.28285306692123413, + "rewards/rejected": -1.6894428730010986, "step": 860 }, { "epoch": 0.7, "learning_rate": 1.276313674457553e-06, - "logits/chosen": -4.306554794311523, - "logits/rejected": -4.296151161193848, - "logps/chosen": -415.35101318359375, - "logps/rejected": -470.18145751953125, - "loss": 0.5782, + "logits/chosen": -4.243846416473389, + "logits/rejected": -4.234537601470947, + "logps/chosen": -417.1517639160156, + "logps/rejected": -473.24517822265625, + "loss": 0.574, "rewards/accuracies": 0.71875, - "rewards/chosen": -1.413987636566162, - "rewards/margins": 0.35374554991722107, - "rewards/rejected": -1.7677332162857056, + "rewards/chosen": -1.4315975904464722, + "rewards/margins": 0.36709946393966675, + "rewards/rejected": -1.7986972332000732, "step": 870 }, { "epoch": 0.7, "learning_rate": 1.2158670789848095e-06, - "logits/chosen": -4.3886284828186035, - "logits/rejected": -4.385241508483887, - "logps/chosen": -460.49005126953125, - "logps/rejected": -502.52130126953125, - "loss": 0.608, + "logits/chosen": -4.32526159286499, + "logits/rejected": -4.3212785720825195, + "logps/chosen": -463.43548583984375, + "logps/rejected": -504.515380859375, + "loss": 0.609, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.6725581884384155, - "rewards/margins": 0.34351325035095215, - "rewards/rejected": -2.016071319580078, + "rewards/chosen": -1.7021582126617432, + "rewards/margins": 0.33437836170196533, + "rewards/rejected": -2.036536693572998, "step": 880 }, { "epoch": 0.71, "learning_rate": 1.1564235941356016e-06, - "logits/chosen": -4.544154167175293, - "logits/rejected": -4.480313301086426, - "logps/chosen": -452.08245849609375, - "logps/rejected": -509.763427734375, - "loss": 0.6428, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.7576968669891357, - "rewards/margins": 0.3635411858558655, - "rewards/rejected": -2.1212382316589355, + "logits/chosen": -4.47339391708374, + "logits/rejected": -4.411566257476807, + "logps/chosen": -455.85986328125, + "logps/rejected": -513.0088500976562, + "loss": 0.6417, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.7953081130981445, + "rewards/margins": 0.3589649796485901, + "rewards/rejected": -2.154273271560669, "step": 890 }, { "epoch": 0.72, "learning_rate": 1.0980296546599254e-06, - "logits/chosen": -4.359221935272217, - "logits/rejected": -4.357415199279785, - "logps/chosen": -476.0204162597656, - "logps/rejected": -513.69287109375, - "loss": 0.6651, + "logits/chosen": -4.28812313079834, + "logits/rejected": -4.28458309173584, + "logps/chosen": -480.78045654296875, + "logps/rejected": -516.3783569335938, + "loss": 0.6728, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.769547462463379, - "rewards/margins": 0.4242987036705017, - "rewards/rejected": -2.1938462257385254, + "rewards/chosen": -1.8175837993621826, + "rewards/margins": 0.40267056226730347, + "rewards/rejected": -2.220254421234131, "step": 900 }, { "epoch": 0.72, - "eval_logits/chosen": -4.617808818817139, - "eval_logits/rejected": -4.603901386260986, - "eval_logps/chosen": -387.0414123535156, - "eval_logps/rejected": -406.733154296875, - "eval_loss": 0.6818779706954956, - "eval_rewards/accuracies": 0.5693333148956299, - "eval_rewards/chosen": -1.6632905006408691, - "eval_rewards/margins": 0.14075076580047607, - "eval_rewards/rejected": -1.8040413856506348, - "eval_runtime": 1619.8038, - "eval_samples_per_second": 1.848, - "eval_steps_per_second": 0.232, + "eval_logits/chosen": -4.539231300354004, + "eval_logits/rejected": -4.5250043869018555, + "eval_logps/chosen": -389.2689208984375, + "eval_logps/rejected": -409.6128234863281, + "eval_loss": 0.6791353821754456, + "eval_rewards/accuracies": 0.5713333487510681, + "eval_rewards/chosen": -1.6856240034103394, + "eval_rewards/margins": 0.14714699983596802, + "eval_rewards/rejected": -1.8327711820602417, + "eval_runtime": 1601.9204, + "eval_samples_per_second": 1.869, + "eval_steps_per_second": 0.234, "step": 900 }, { "epoch": 0.73, "learning_rate": 1.040730875447083e-06, - "logits/chosen": -4.3218793869018555, - "logits/rejected": -4.3270487785339355, - "logps/chosen": -451.809814453125, - "logps/rejected": -478.60491943359375, - "loss": 0.6503, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.565606713294983, - "rewards/margins": 0.20372018218040466, - "rewards/rejected": -1.7693268060684204, + "logits/chosen": -4.249950885772705, + "logits/rejected": -4.256103515625, + "logps/chosen": -454.0740661621094, + "logps/rejected": -481.5208435058594, + "loss": 0.6463, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5884063243865967, + "rewards/margins": 0.2100685089826584, + "rewards/rejected": -1.798474669456482, "step": 910 }, { "epoch": 0.74, "learning_rate": 9.845720158932414e-07, - "logits/chosen": -4.289103031158447, - "logits/rejected": -4.304252624511719, - "logps/chosen": -420.05926513671875, - "logps/rejected": -442.1366271972656, - "loss": 0.6505, + "logits/chosen": -4.217880725860596, + "logits/rejected": -4.230615615844727, + "logps/chosen": -421.2108459472656, + "logps/rejected": -443.8993225097656, + "loss": 0.6443, "rewards/accuracies": 0.625, - "rewards/chosen": -1.4516870975494385, - "rewards/margins": 0.20668797194957733, - "rewards/rejected": -1.6583747863769531, + "rewards/chosen": -1.46346914768219, + "rewards/margins": 0.2133568972349167, + "rewards/rejected": -1.6768258810043335, "step": 920 }, { "epoch": 0.74, "learning_rate": 9.295969449372796e-07, - "logits/chosen": -4.317067623138428, - "logits/rejected": -4.289021968841553, - "logps/chosen": -417.4593811035156, - "logps/rejected": -454.826904296875, - "loss": 0.6219, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3394687175750732, - "rewards/margins": 0.25516074895858765, - "rewards/rejected": -1.5946292877197266, + "logits/chosen": -4.250065803527832, + "logits/rejected": -4.226481914520264, + "logps/chosen": -418.61883544921875, + "logps/rejected": -455.81170654296875, + "loss": 0.6211, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3508899211883545, + "rewards/margins": 0.25358742475509644, + "rewards/rejected": -1.6044775247573853, "step": 930 }, { "epoch": 0.75, "learning_rate": 8.758486067922176e-07, - "logits/chosen": -4.306538105010986, - "logits/rejected": -4.262487411499023, - "logps/chosen": -414.806884765625, - "logps/rejected": -465.29931640625, - "loss": 0.5998, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3654025793075562, - "rewards/margins": 0.3171504735946655, - "rewards/rejected": -1.6825529336929321, + "logits/chosen": -4.2414469718933105, + "logits/rejected": -4.200159549713135, + "logps/chosen": -413.689697265625, + "logps/rejected": -464.592529296875, + "loss": 0.5956, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3547637462615967, + "rewards/margins": 0.32075968384742737, + "rewards/rejected": -1.6755234003067017, "step": 940 }, { "epoch": 0.76, "learning_rate": 8.233689873990006e-07, - "logits/chosen": -4.317531585693359, - "logits/rejected": -4.282795429229736, - "logps/chosen": -420.382568359375, - "logps/rejected": -493.355712890625, - "loss": 0.5953, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.419424295425415, - "rewards/margins": 0.5507811903953552, - "rewards/rejected": -1.970205545425415, + "logits/chosen": -4.249865531921387, + "logits/rejected": -4.214231491088867, + "logps/chosen": -419.63397216796875, + "logps/rejected": -491.7757873535156, + "loss": 0.5892, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4119402170181274, + "rewards/margins": 0.5434682369232178, + "rewards/rejected": -1.9554083347320557, "step": 950 }, { "epoch": 0.77, "learning_rate": 7.721990816288555e-07, - "logits/chosen": -4.283775329589844, - "logits/rejected": -4.2459330558776855, - "logps/chosen": -395.4678955078125, - "logps/rejected": -434.7349548339844, - "loss": 0.6193, + "logits/chosen": -4.227639198303223, + "logits/rejected": -4.187970161437988, + "logps/chosen": -395.7589416503906, + "logps/rejected": -436.3963928222656, + "loss": 0.612, "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3240183591842651, - "rewards/margins": 0.2636045813560486, - "rewards/rejected": -1.587622880935669, + "rewards/chosen": -1.3264906406402588, + "rewards/margins": 0.2781200408935547, + "rewards/rejected": -1.604610800743103, "step": 960 }, { "epoch": 0.78, "learning_rate": 7.223788612598148e-07, - "logits/chosen": -4.310162544250488, - "logits/rejected": -4.287927627563477, - "logps/chosen": -436.2978515625, - "logps/rejected": -470.63079833984375, - "loss": 0.6316, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.4355452060699463, - "rewards/margins": 0.18634586036205292, - "rewards/rejected": -1.6218910217285156, + "logits/chosen": -4.25420618057251, + "logits/rejected": -4.232513427734375, + "logps/chosen": -437.45721435546875, + "logps/rejected": -472.1407165527344, + "loss": 0.6314, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4481703042984009, + "rewards/margins": 0.1885295808315277, + "rewards/rejected": -1.636699914932251, "step": 970 }, { "epoch": 0.78, "learning_rate": 6.73947243752448e-07, - "logits/chosen": -4.377969264984131, - "logits/rejected": -4.389736652374268, - "logps/chosen": -458.318359375, - "logps/rejected": -487.77874755859375, - "loss": 0.6632, + "logits/chosen": -4.329432964324951, + "logits/rejected": -4.338817596435547, + "logps/chosen": -462.8741760253906, + "logps/rejected": -490.8958435058594, + "loss": 0.6649, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.6293509006500244, - "rewards/margins": 0.36782675981521606, - "rewards/rejected": -1.9971777200698853, + "rewards/chosen": -1.6740070581436157, + "rewards/margins": 0.3544973134994507, + "rewards/rejected": -2.0285041332244873, "step": 980 }, { "epoch": 0.79, "learning_rate": 6.269420618491759e-07, - "logits/chosen": -4.321467876434326, - "logits/rejected": -4.286978721618652, - "logps/chosen": -420.07220458984375, - "logps/rejected": -458.8563537597656, - "loss": 0.6383, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.4905959367752075, - "rewards/margins": 0.22982291877269745, - "rewards/rejected": -1.720418930053711, + "logits/chosen": -4.2786760330200195, + "logits/rejected": -4.245335578918457, + "logps/chosen": -423.904296875, + "logps/rejected": -463.173095703125, + "loss": 0.6426, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5292352437973022, + "rewards/margins": 0.23503565788269043, + "rewards/rejected": -1.7642710208892822, "step": 990 }, { "epoch": 0.8, "learning_rate": 5.814000340209267e-07, - "logits/chosen": -4.299461364746094, - "logits/rejected": -4.237879276275635, - "logps/chosen": -435.0042419433594, - "logps/rejected": -491.1070861816406, - "loss": 0.5993, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.4882529973983765, - "rewards/margins": 0.3519567549228668, - "rewards/rejected": -1.8402099609375, + "logits/chosen": -4.25986385345459, + "logits/rejected": -4.194713592529297, + "logps/chosen": -439.45086669921875, + "logps/rejected": -495.15771484375, + "loss": 0.603, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.533198356628418, + "rewards/margins": 0.34696659445762634, + "rewards/rejected": -1.8801651000976562, "step": 1000 }, { "epoch": 0.8, - "eval_logits/chosen": -4.549124717712402, - "eval_logits/rejected": -4.535641193389893, - "eval_logps/chosen": -378.4713439941406, - "eval_logps/rejected": -398.2364196777344, - "eval_loss": 0.6785325407981873, - "eval_rewards/accuracies": 0.5683333277702332, - "eval_rewards/chosen": -1.577589511871338, - "eval_rewards/margins": 0.1414840668439865, - "eval_rewards/rejected": -1.7190735340118408, - "eval_runtime": 1620.6034, - "eval_samples_per_second": 1.847, - "eval_steps_per_second": 0.231, + "eval_logits/chosen": -4.503380298614502, + "eval_logits/rejected": -4.4899396896362305, + "eval_logps/chosen": -382.5414733886719, + "eval_logps/rejected": -403.2298889160156, + "eval_loss": 0.676650881767273, + "eval_rewards/accuracies": 0.5770000219345093, + "eval_rewards/chosen": -1.618349313735962, + "eval_rewards/margins": 0.15059219300746918, + "eval_rewards/rejected": -1.7689412832260132, + "eval_runtime": 1634.1825, + "eval_samples_per_second": 1.832, + "eval_steps_per_second": 0.229, "step": 1000 }, { "epoch": 0.81, "learning_rate": 5.373567357842111e-07, - "logits/chosen": -4.278590202331543, - "logits/rejected": -4.246352195739746, - "logps/chosen": -429.3436584472656, - "logps/rejected": -471.84722900390625, - "loss": 0.5957, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.422374963760376, - "rewards/margins": 0.33030739426612854, - "rewards/rejected": -1.7526824474334717, + "logits/chosen": -4.2339959144592285, + "logits/rejected": -4.205721378326416, + "logps/chosen": -433.2574768066406, + "logps/rejected": -476.2079162597656, + "loss": 0.5976, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4623594284057617, + "rewards/margins": 0.33363303542137146, + "rewards/rejected": -1.7959926128387451, "step": 1010 }, { "epoch": 0.82, "learning_rate": 4.948465719110226e-07, - "logits/chosen": -4.373248100280762, - "logits/rejected": -4.3430304527282715, - "logps/chosen": -415.3047790527344, - "logps/rejected": -445.019287109375, - "loss": 0.6443, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.40928053855896, - "rewards/margins": 0.228702574968338, - "rewards/rejected": -1.6379830837249756, + "logits/chosen": -4.32258415222168, + "logits/rejected": -4.291365623474121, + "logps/chosen": -420.07330322265625, + "logps/rejected": -451.5908203125, + "loss": 0.6421, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.45734441280365, + "rewards/margins": 0.24639002978801727, + "rewards/rejected": -1.7037346363067627, "step": 1020 }, { "epoch": 0.82, "learning_rate": 4.539027495532766e-07, - "logits/chosen": -4.33120059967041, - "logits/rejected": -4.347836494445801, - "logps/chosen": -415.25555419921875, - "logps/rejected": -453.518798828125, - "loss": 0.6177, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.4035775661468506, - "rewards/margins": 0.28108319640159607, - "rewards/rejected": -1.6846606731414795, + "logits/chosen": -4.28394889831543, + "logits/rejected": -4.300284385681152, + "logps/chosen": -417.7969665527344, + "logps/rejected": -456.79180908203125, + "loss": 0.6168, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4292632341384888, + "rewards/margins": 0.28898704051971436, + "rewards/rejected": -1.718250036239624, "step": 1030 }, { "epoch": 0.83, "learning_rate": 4.14557252302783e-07, - "logits/chosen": -4.341530799865723, - "logits/rejected": -4.305572032928467, - "logps/chosen": -433.0341796875, - "logps/rejected": -472.1036682128906, - "loss": 0.6713, + "logits/chosen": -4.28948450088501, + "logits/rejected": -4.251836776733398, + "logps/chosen": -435.3079528808594, + "logps/rejected": -474.75006103515625, + "loss": 0.6739, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.5271722078323364, - "rewards/margins": 0.23339995741844177, - "rewards/rejected": -1.760572075843811, + "rewards/chosen": -1.5503196716308594, + "rewards/margins": 0.23696406185626984, + "rewards/rejected": -1.7872836589813232, "step": 1040 }, { "epoch": 0.84, "learning_rate": 3.7684081520700884e-07, - "logits/chosen": -4.2357988357543945, - "logits/rejected": -4.237625598907471, - "logps/chosen": -450.54827880859375, - "logps/rejected": -471.02423095703125, - "loss": 0.6429, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.5025393962860107, - "rewards/margins": 0.2623196542263031, - "rewards/rejected": -1.7648589611053467, + "logits/chosen": -4.184053897857666, + "logits/rejected": -4.1847357749938965, + "logps/chosen": -452.0301818847656, + "logps/rejected": -472.8980407714844, + "loss": 0.6398, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5176578760147095, + "rewards/margins": 0.2665901780128479, + "rewards/rejected": -1.7842479944229126, "step": 1050 }, { "epoch": 0.85, "learning_rate": 3.407829007601507e-07, - "logits/chosen": -4.270508766174316, - "logits/rejected": -4.225382328033447, - "logps/chosen": -428.3861389160156, - "logps/rejected": -479.0707092285156, - "loss": 0.6119, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.4203639030456543, - "rewards/margins": 0.3360464870929718, - "rewards/rejected": -1.7564103603363037, + "logits/chosen": -4.219012260437012, + "logits/rejected": -4.17362117767334, + "logps/chosen": -430.40997314453125, + "logps/rejected": -481.66363525390625, + "loss": 0.6108, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4400875568389893, + "rewards/margins": 0.3422803282737732, + "rewards/rejected": -1.7823677062988281, "step": 1060 }, { "epoch": 0.86, "learning_rate": 3.064116758882724e-07, - "logits/chosen": -4.24053955078125, - "logits/rejected": -4.1828999519348145, - "logps/chosen": -443.78485107421875, - "logps/rejected": -504.28070068359375, - "loss": 0.6037, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.5132863521575928, - "rewards/margins": 0.4130435883998871, - "rewards/rejected": -1.9263302087783813, + "logits/chosen": -4.185604572296143, + "logits/rejected": -4.131557941436768, + "logps/chosen": -446.9971618652344, + "logps/rejected": -507.7557067871094, + "loss": 0.607, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5457991361618042, + "rewards/margins": 0.41560330986976624, + "rewards/rejected": -1.9614025354385376, "step": 1070 }, { "epoch": 0.86, "learning_rate": 2.737539899464908e-07, - "logits/chosen": -4.2971696853637695, - "logits/rejected": -4.288466453552246, - "logps/chosen": -403.1819763183594, - "logps/rejected": -454.8402404785156, - "loss": 0.6141, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3817451000213623, - "rewards/margins": 0.379900187253952, - "rewards/rejected": -1.7616455554962158, + "logits/chosen": -4.246437072753906, + "logits/rejected": -4.235133647918701, + "logps/chosen": -405.3433532714844, + "logps/rejected": -456.494140625, + "loss": 0.6183, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4034773111343384, + "rewards/margins": 0.37458181381225586, + "rewards/rejected": -1.7780590057373047, "step": 1080 }, { "epoch": 0.87, "learning_rate": 2.4283535374538645e-07, - "logits/chosen": -4.242150783538818, - "logits/rejected": -4.240577220916748, - "logps/chosen": -434.7925720214844, - "logps/rejected": -473.67803955078125, - "loss": 0.6122, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.4003090858459473, - "rewards/margins": 0.31833842396736145, - "rewards/rejected": -1.7186473608016968, + "logits/chosen": -4.18662691116333, + "logits/rejected": -4.184709072113037, + "logps/chosen": -436.64849853515625, + "logps/rejected": -474.5104064941406, + "loss": 0.6166, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4191557168960571, + "rewards/margins": 0.30806541442871094, + "rewards/rejected": -1.727220892906189, "step": 1090 }, { "epoch": 0.88, "learning_rate": 2.1367991962303298e-07, - "logits/chosen": -4.232297420501709, - "logits/rejected": -4.2173943519592285, - "logps/chosen": -418.9490661621094, - "logps/rejected": -446.4789123535156, - "loss": 0.6759, + "logits/chosen": -4.172500133514404, + "logits/rejected": -4.158700942993164, + "logps/chosen": -419.20977783203125, + "logps/rejected": -447.9820251464844, + "loss": 0.6732, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.5112884044647217, - "rewards/margins": 0.1443391889333725, - "rewards/rejected": -1.6556276082992554, + "rewards/chosen": -1.5145409107208252, + "rewards/margins": 0.15635111927986145, + "rewards/rejected": -1.6708920001983643, "step": 1100 }, { "epoch": 0.88, - "eval_logits/chosen": -4.485485553741455, - "eval_logits/rejected": -4.472198009490967, - "eval_logps/chosen": -375.86541748046875, - "eval_logps/rejected": -395.56036376953125, - "eval_loss": 0.6777821779251099, - "eval_rewards/accuracies": 0.5686666369438171, - "eval_rewards/chosen": -1.5515305995941162, - "eval_rewards/margins": 0.14078289270401, - "eval_rewards/rejected": -1.6923134326934814, - "eval_runtime": 1618.2899, - "eval_samples_per_second": 1.85, - "eval_steps_per_second": 0.232, + "eval_logits/chosen": -4.4235520362854, + "eval_logits/rejected": -4.410409927368164, + "eval_logps/chosen": -377.40155029296875, + "eval_logps/rejected": -397.9567565917969, + "eval_loss": 0.6752503514289856, + "eval_rewards/accuracies": 0.5776666402816772, + "eval_rewards/chosen": -1.5669503211975098, + "eval_rewards/margins": 0.1492597907781601, + "eval_rewards/rejected": -1.7162100076675415, + "eval_runtime": 1708.8293, + "eval_samples_per_second": 1.752, + "eval_steps_per_second": 0.219, "step": 1100 }, { "epoch": 0.89, "learning_rate": 1.8631046257820278e-07, - "logits/chosen": -4.233702659606934, - "logits/rejected": -4.249814033508301, - "logps/chosen": -432.20458984375, - "logps/rejected": -461.7394104003906, - "loss": 0.6275, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.41855788230896, - "rewards/margins": 0.2600322365760803, - "rewards/rejected": -1.678590178489685, + "logits/chosen": -4.177311897277832, + "logits/rejected": -4.192288398742676, + "logps/chosen": -433.6348571777344, + "logps/rejected": -464.0807189941406, + "loss": 0.6238, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4329723119735718, + "rewards/margins": 0.2695394456386566, + "rewards/rejected": -1.7025117874145508, "step": 1110 }, { "epoch": 0.9, "learning_rate": 1.6074836247950143e-07, - "logits/chosen": -4.206725597381592, - "logits/rejected": -4.209759712219238, - "logps/chosen": -443.52459716796875, - "logps/rejected": -470.96588134765625, - "loss": 0.6415, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.4318974018096924, - "rewards/margins": 0.21563585102558136, - "rewards/rejected": -1.6475334167480469, + "logits/chosen": -4.152982234954834, + "logits/rejected": -4.155800819396973, + "logps/chosen": -444.71722412109375, + "logps/rejected": -471.8829650878906, + "loss": 0.6421, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4441962242126465, + "rewards/margins": 0.21269936859607697, + "rewards/rejected": -1.656895399093628, "step": 1120 }, { "epoch": 0.9, "learning_rate": 1.370135873643097e-07, - "logits/chosen": -4.226916313171387, - "logits/rejected": -4.266509532928467, - "logps/chosen": -436.6336975097656, - "logps/rejected": -456.3214416503906, - "loss": 0.6262, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4590485095977783, - "rewards/margins": 0.2844700217247009, - "rewards/rejected": -1.7435184717178345, + "logits/chosen": -4.171530723571777, + "logits/rejected": -4.211108684539795, + "logps/chosen": -438.52178955078125, + "logps/rejected": -458.4681701660156, + "loss": 0.6246, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4780899286270142, + "rewards/margins": 0.28698402643203735, + "rewards/rejected": -1.7650740146636963, "step": 1130 }, { "epoch": 0.91, "learning_rate": 1.1512467784059372e-07, - "logits/chosen": -4.302299499511719, - "logits/rejected": -4.261002540588379, - "logps/chosen": -398.5301208496094, - "logps/rejected": -437.83673095703125, - "loss": 0.6221, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3698493242263794, - "rewards/margins": 0.2959776520729065, - "rewards/rejected": -1.6658270359039307, + "logits/chosen": -4.241854667663574, + "logits/rejected": -4.201731204986572, + "logps/chosen": -400.08306884765625, + "logps/rejected": -440.25250244140625, + "loss": 0.6219, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.385571837425232, + "rewards/margins": 0.30431246757507324, + "rewards/rejected": -1.6898844242095947, "step": 1140 }, { "epoch": 0.92, "learning_rate": 9.509873260376251e-08, - "logits/chosen": -4.228875160217285, - "logits/rejected": -4.17572021484375, - "logps/chosen": -429.0230407714844, - "logps/rejected": -501.6388244628906, - "loss": 0.5814, + "logits/chosen": -4.173992156982422, + "logits/rejected": -4.1186909675598145, + "logps/chosen": -429.03570556640625, + "logps/rejected": -502.2874450683594, + "loss": 0.5799, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.4344505071640015, - "rewards/margins": 0.4450379014015198, - "rewards/rejected": -1.8794885873794556, + "rewards/chosen": -1.4350130558013916, + "rewards/margins": 0.4506250321865082, + "rewards/rejected": -1.8856379985809326, "step": 1150 }, { "epoch": 0.93, "learning_rate": 7.695139507988559e-08, - "logits/chosen": -4.261081218719482, - "logits/rejected": -4.282492160797119, - "logps/chosen": -444.05096435546875, - "logps/rejected": -479.0032653808594, - "loss": 0.6376, + "logits/chosen": -4.202746391296387, + "logits/rejected": -4.225730895996094, + "logps/chosen": -442.8606872558594, + "logps/rejected": -478.89776611328125, + "loss": 0.6358, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.457777500152588, - "rewards/margins": 0.2464013397693634, - "rewards/rejected": -1.704178810119629, + "rewards/chosen": -1.4460103511810303, + "rewards/margins": 0.257608562707901, + "rewards/rejected": -1.7036190032958984, "step": 1160 }, { "epoch": 0.94, "learning_rate": 6.069684120570684e-08, - "logits/chosen": -4.286696434020996, - "logits/rejected": -4.1916022300720215, - "logps/chosen": -431.74053955078125, - "logps/rejected": -482.0506896972656, - "loss": 0.6355, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.5193744897842407, - "rewards/margins": 0.3015151619911194, - "rewards/rejected": -1.8208894729614258, + "logits/chosen": -4.227932929992676, + "logits/rejected": -4.133112907409668, + "logps/chosen": -433.41192626953125, + "logps/rejected": -483.49383544921875, + "loss": 0.6375, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5354173183441162, + "rewards/margins": 0.30013877153396606, + "rewards/rejected": -1.8355562686920166, "step": 1170 }, { "epoch": 0.94, "learning_rate": 4.634776835499871e-08, - "logits/chosen": -4.216092109680176, - "logits/rejected": -4.185781955718994, - "logps/chosen": -413.5948791503906, - "logps/rejected": -455.0482482910156, - "loss": 0.6371, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.4376914501190186, - "rewards/margins": 0.27367502450942993, - "rewards/rejected": -1.7113662958145142, + "logits/chosen": -4.156412124633789, + "logits/rejected": -4.1273393630981445, + "logps/chosen": -413.9664611816406, + "logps/rejected": -457.55877685546875, + "loss": 0.6273, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4416838884353638, + "rewards/margins": 0.2954239845275879, + "rewards/rejected": -1.7371078729629517, "step": 1180 }, { "epoch": 0.95, "learning_rate": 3.3915385419908964e-08, - "logits/chosen": -4.179436206817627, - "logits/rejected": -4.222240447998047, - "logps/chosen": -430.4244079589844, - "logps/rejected": -464.51629638671875, - "loss": 0.6297, + "logits/chosen": -4.120321750640869, + "logits/rejected": -4.165139675140381, + "logps/chosen": -432.268310546875, + "logps/rejected": -465.0489807128906, + "loss": 0.6372, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.4367833137512207, - "rewards/margins": 0.29136672616004944, - "rewards/rejected": -1.7281500101089478, + "rewards/chosen": -1.4545971155166626, + "rewards/margins": 0.2788635790348053, + "rewards/rejected": -1.7334604263305664, "step": 1190 }, { "epoch": 0.96, "learning_rate": 2.3409404055043938e-08, - "logits/chosen": -4.306519508361816, - "logits/rejected": -4.272242546081543, - "logps/chosen": -437.791259765625, - "logps/rejected": -471.58575439453125, - "loss": 0.6402, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.5211617946624756, - "rewards/margins": 0.2325367033481598, - "rewards/rejected": -1.7536985874176025, + "logits/chosen": -4.251112937927246, + "logits/rejected": -4.216982841491699, + "logps/chosen": -437.51922607421875, + "logps/rejected": -471.106201171875, + "loss": 0.6431, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.5188627243041992, + "rewards/margins": 0.23063072562217712, + "rewards/rejected": -1.7494935989379883, "step": 1200 }, { "epoch": 0.96, - "eval_logits/chosen": -4.457742691040039, - "eval_logits/rejected": -4.444427967071533, - "eval_logps/chosen": -375.10284423828125, - "eval_logps/rejected": -394.7001647949219, - "eval_loss": 0.6772644519805908, - "eval_rewards/accuracies": 0.5690000057220459, - "eval_rewards/chosen": -1.5439047813415527, - "eval_rewards/margins": 0.13980673253536224, - "eval_rewards/rejected": -1.6837116479873657, - "eval_runtime": 1624.6813, - "eval_samples_per_second": 1.843, - "eval_steps_per_second": 0.231, + "eval_logits/chosen": -4.397921085357666, + "eval_logits/rejected": -4.384759902954102, + "eval_logps/chosen": -375.8670959472656, + "eval_logps/rejected": -396.2835693359375, + "eval_loss": 0.6749910712242126, + "eval_rewards/accuracies": 0.5773333311080933, + "eval_rewards/chosen": -1.5516059398651123, + "eval_rewards/margins": 0.14787256717681885, + "eval_rewards/rejected": -1.6994785070419312, + "eval_runtime": 1629.1995, + "eval_samples_per_second": 1.838, + "eval_steps_per_second": 0.23, "step": 1200 }, { "epoch": 0.97, "learning_rate": 1.4838031091134186e-08, - "logits/chosen": -4.243393898010254, - "logits/rejected": -4.175347805023193, - "logps/chosen": -404.5989990234375, - "logps/rejected": -469.9576110839844, - "loss": 0.5982, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3969910144805908, - "rewards/margins": 0.3869909644126892, - "rewards/rejected": -1.7839819192886353, + "logits/chosen": -4.187882423400879, + "logits/rejected": -4.119401454925537, + "logps/chosen": -407.01824951171875, + "logps/rejected": -472.28631591796875, + "loss": 0.6076, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4217784404754639, + "rewards/margins": 0.3864634931087494, + "rewards/rejected": -1.808241844177246, "step": 1210 }, { "epoch": 0.98, "learning_rate": 8.207962124201774e-09, - "logits/chosen": -4.263760566711426, - "logits/rejected": -4.225130081176758, - "logps/chosen": -435.9114685058594, - "logps/rejected": -472.66839599609375, - "loss": 0.6017, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.37319016456604, - "rewards/margins": 0.3209651708602905, - "rewards/rejected": -1.6941554546356201, + "logits/chosen": -4.208897590637207, + "logits/rejected": -4.17116641998291, + "logps/chosen": -435.45013427734375, + "logps/rejected": -472.75421142578125, + "loss": 0.6003, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3684009313583374, + "rewards/margins": 0.32645002007484436, + "rewards/rejected": -1.6948511600494385, "step": 1220 }, { "epoch": 0.98, "learning_rate": 3.5243762852441023e-09, - "logits/chosen": -4.210858345031738, - "logits/rejected": -4.163503170013428, - "logps/chosen": -427.44342041015625, - "logps/rejected": -469.5209045410156, - "loss": 0.6376, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.4386152029037476, - "rewards/margins": 0.2547362446784973, - "rewards/rejected": -1.6933513879776, + "logits/chosen": -4.157495021820068, + "logits/rejected": -4.110323429107666, + "logps/chosen": -427.61651611328125, + "logps/rejected": -471.6131286621094, + "loss": 0.6286, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4408422708511353, + "rewards/margins": 0.2736855149269104, + "rewards/rejected": -1.7145278453826904, "step": 1230 }, { "epoch": 0.99, "learning_rate": 7.909321945129278e-10, - "logits/chosen": -4.18636417388916, - "logits/rejected": -4.141125679016113, - "logps/chosen": -443.1625061035156, - "logps/rejected": -493.43975830078125, - "loss": 0.6016, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4015429019927979, - "rewards/margins": 0.35573890805244446, - "rewards/rejected": -1.7572818994522095, + "logits/chosen": -4.126371383666992, + "logits/rejected": -4.085521221160889, + "logps/chosen": -441.79095458984375, + "logps/rejected": -493.57049560546875, + "loss": 0.5948, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3874802589416504, + "rewards/margins": 0.3715583086013794, + "rewards/rejected": -1.7590383291244507, "step": 1240 }, { "epoch": 1.0, "step": 1249, "total_flos": 0.0, - "train_loss": 0.6475976420174225, - "train_runtime": 42677.4758, + "train_loss": 0.647387065536218, + "train_runtime": 42673.6748, "train_samples_per_second": 0.469, "train_steps_per_second": 0.029 }