diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,20 +1,20 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.9992254066615027, + "epoch": 2.998451213216314, "eval_steps": 100, - "global_step": 726, + "global_step": 2904, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "learning_rate": 6.84931506849315e-09, - "logits/chosen": -2.348097085952759, - "logits/rejected": -2.4175631999969482, - "logps/chosen": -271.4220275878906, - "logps/rejected": -208.93710327148438, + "learning_rate": 1.7182130584192438e-09, + "logits/chosen": -1.8589259386062622, + "logits/rejected": -1.833059310913086, + "logps/chosen": -252.5240478515625, + "logps/rejected": -228.24844360351562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -23,1073 +23,4125 @@ "step": 1 }, { - "epoch": 0.04, - "learning_rate": 6.84931506849315e-08, - "logits/chosen": -2.4228014945983887, - "logits/rejected": -2.356138229370117, - "logps/chosen": -293.40594482421875, - "logps/rejected": -226.2930450439453, + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -1.834423542022705, + "logits/rejected": -1.7540024518966675, + "logps/chosen": -268.87396240234375, + "logps/rejected": -193.86492919921875, "loss": 0.6938, - "rewards/accuracies": 0.4652777910232544, - "rewards/chosen": 0.00204316433519125, - "rewards/margins": 0.002578532788902521, - "rewards/rejected": -0.0005353688611648977, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0023073465563356876, + "rewards/margins": 0.00032774999272078276, + "rewards/rejected": 0.0019795962143689394, "step": 10 }, { - "epoch": 0.08, - "learning_rate": 1.36986301369863e-07, - "logits/chosen": -2.431565046310425, - "logits/rejected": -2.40484881401062, - "logps/chosen": -278.5029296875, - "logps/rejected": -216.76760864257812, - "loss": 0.6929, - "rewards/accuracies": 0.534375011920929, - "rewards/chosen": 0.003985759802162647, - "rewards/margins": 0.004827108699828386, - "rewards/rejected": -0.0008413494797423482, + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.007349729537964, + "logits/rejected": -1.7954082489013672, + "logps/chosen": -305.00164794921875, + "logps/rejected": -236.2772674560547, + "loss": 0.6928, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.007546703331172466, + "rewards/margins": 0.0009935442358255386, + "rewards/rejected": 0.006553159095346928, "step": 20 }, { - "epoch": 0.12, - "learning_rate": 2.054794520547945e-07, - "logits/chosen": -2.38895583152771, - "logits/rejected": -2.348848819732666, - "logps/chosen": -253.019775390625, - "logps/rejected": -207.16421508789062, - "loss": 0.6933, - "rewards/accuracies": 0.4828124940395355, - "rewards/chosen": -0.0008784265955910087, - "rewards/margins": 0.0001159904059022665, - "rewards/rejected": -0.0009944172343239188, + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -1.9593127965927124, + "logits/rejected": -1.8910452127456665, + "logps/chosen": -254.04476928710938, + "logps/rejected": -215.4446258544922, + "loss": 0.6929, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0025548995472490788, + "rewards/margins": 0.005599315278232098, + "rewards/rejected": -0.003044415730983019, "step": 30 }, { - "epoch": 0.17, - "learning_rate": 2.73972602739726e-07, - "logits/chosen": -2.4601290225982666, - "logits/rejected": -2.4147398471832275, - "logps/chosen": -283.7552795410156, - "logps/rejected": -216.47750854492188, - "loss": 0.6905, - "rewards/accuracies": 0.520312488079071, - "rewards/chosen": 0.004305425100028515, - "rewards/margins": 0.006620796862989664, - "rewards/rejected": -0.0023153722286224365, + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -1.9929354190826416, + "logits/rejected": -1.9428246021270752, + "logps/chosen": -293.15924072265625, + "logps/rejected": -240.9603729248047, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0011303538922220469, + "rewards/margins": -0.0012932273093611002, + "rewards/rejected": 0.002423581201583147, "step": 40 }, { - "epoch": 0.21, - "learning_rate": 3.424657534246575e-07, - "logits/chosen": -2.4362146854400635, - "logits/rejected": -2.3963656425476074, - "logps/chosen": -267.26031494140625, - "logps/rejected": -223.655029296875, - "loss": 0.6895, - "rewards/accuracies": 0.578125, - "rewards/chosen": 0.005432569421827793, - "rewards/margins": 0.007884417660534382, - "rewards/rejected": -0.002451848704367876, + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -1.8717906475067139, + "logits/rejected": -1.8086845874786377, + "logps/chosen": -282.8701477050781, + "logps/rejected": -214.1832733154297, + "loss": 0.6947, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.005239286459982395, + "rewards/margins": -0.006810429040342569, + "rewards/rejected": 0.0015711418818682432, "step": 50 }, { - "epoch": 0.25, - "learning_rate": 4.10958904109589e-07, - "logits/chosen": -2.412108898162842, - "logits/rejected": -2.3888187408447266, - "logps/chosen": -266.83953857421875, - "logps/rejected": -214.74533081054688, - "loss": 0.6876, - "rewards/accuracies": 0.578125, - "rewards/chosen": 0.008083345368504524, - "rewards/margins": 0.013038607314229012, - "rewards/rejected": -0.0049552614800632, + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -1.7758302688598633, + "logits/rejected": -1.9071727991104126, + "logps/chosen": -241.18197631835938, + "logps/rejected": -217.21548461914062, + "loss": 0.6934, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.005848951172083616, + "rewards/margins": -0.0020723845809698105, + "rewards/rejected": -0.0037765665911138058, "step": 60 }, { - "epoch": 0.29, - "learning_rate": 4.794520547945205e-07, - "logits/chosen": -2.390169858932495, - "logits/rejected": -2.3971309661865234, - "logps/chosen": -254.08779907226562, - "logps/rejected": -214.64498901367188, - "loss": 0.6839, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.007395301945507526, - "rewards/margins": 0.017588596791028976, - "rewards/rejected": -0.010193293914198875, + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -1.9986045360565186, + "logits/rejected": -1.8738359212875366, + "logps/chosen": -298.9707336425781, + "logps/rejected": -241.0879669189453, + "loss": 0.6918, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.005427352152764797, + "rewards/margins": -0.0031789124477654696, + "rewards/rejected": -0.0022484397049993277, "step": 70 }, { - "epoch": 0.33, - "learning_rate": 4.946401225114854e-07, - "logits/chosen": -2.4293293952941895, - "logits/rejected": -2.3780810832977295, - "logps/chosen": -265.03875732421875, - "logps/rejected": -218.77554321289062, - "loss": 0.679, - "rewards/accuracies": 0.6421874761581421, - "rewards/chosen": 0.016448482871055603, - "rewards/margins": 0.03233351930975914, - "rewards/rejected": -0.015885034576058388, + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -1.9847288131713867, + "logits/rejected": -1.8877817392349243, + "logps/chosen": -287.6217346191406, + "logps/rejected": -231.26315307617188, + "loss": 0.6917, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0030095516704022884, + "rewards/margins": 0.0036421152763068676, + "rewards/rejected": -0.000632563722319901, "step": 80 }, { - "epoch": 0.37, - "learning_rate": 4.869831546707504e-07, - "logits/chosen": -2.478893756866455, - "logits/rejected": -2.422461986541748, - "logps/chosen": -271.6805114746094, - "logps/rejected": -224.47640991210938, - "loss": 0.6727, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.026180211454629898, - "rewards/margins": 0.05046360567212105, - "rewards/rejected": -0.0242833960801363, + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -1.8527441024780273, + "logits/rejected": -1.8398630619049072, + "logps/chosen": -253.68093872070312, + "logps/rejected": -205.139892578125, + "loss": 0.691, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0005496363155543804, + "rewards/margins": 0.0028937645256519318, + "rewards/rejected": -0.0023441289085894823, "step": 90 }, { - "epoch": 0.41, - "learning_rate": 4.793261868300153e-07, - "logits/chosen": -2.417175054550171, - "logits/rejected": -2.4038565158843994, - "logps/chosen": -273.87823486328125, - "logps/rejected": -227.7811279296875, - "loss": 0.6687, - "rewards/accuracies": 0.6734374761581421, - "rewards/chosen": 0.026967059820890427, - "rewards/margins": 0.05768311023712158, - "rewards/rejected": -0.030716046690940857, + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -1.7725082635879517, + "logits/rejected": -1.8361854553222656, + "logps/chosen": -260.54302978515625, + "logps/rejected": -213.9029998779297, + "loss": 0.6924, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.0016784339677542448, + "rewards/margins": 0.001359444228000939, + "rewards/rejected": 0.00031898936140351, "step": 100 }, { - "epoch": 0.45, - "learning_rate": 4.7166921898928023e-07, - "logits/chosen": -2.450390338897705, - "logits/rejected": -2.39005708694458, - "logps/chosen": -252.6284942626953, - "logps/rejected": -222.63601684570312, - "loss": 0.6633, - "rewards/accuracies": 0.653124988079071, - "rewards/chosen": 0.022277798503637314, - "rewards/margins": 0.060135841369628906, - "rewards/rejected": -0.03785804286599159, + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -1.9289977550506592, + "logits/rejected": -1.7746009826660156, + "logps/chosen": -260.48541259765625, + "logps/rejected": -220.8142547607422, + "loss": 0.6901, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007216868456453085, + "rewards/margins": 0.011277751997113228, + "rewards/rejected": -0.00406088400632143, "step": 110 }, { - "epoch": 0.5, - "learning_rate": 4.640122511485451e-07, - "logits/chosen": -2.412520170211792, - "logits/rejected": -2.401998281478882, - "logps/chosen": -256.9021301269531, - "logps/rejected": -219.8153076171875, - "loss": 0.6579, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.027152011170983315, - "rewards/margins": 0.07799454033374786, - "rewards/rejected": -0.0508425310254097, + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -1.8858321905136108, + "logits/rejected": -1.9249963760375977, + "logps/chosen": -303.71063232421875, + "logps/rejected": -223.1724395751953, + "loss": 0.6906, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.003943001385778189, + "rewards/margins": 0.011311030015349388, + "rewards/rejected": -0.007368029095232487, "step": 120 }, { - "epoch": 0.54, - "learning_rate": 4.563552833078101e-07, - "logits/chosen": -2.456604480743408, - "logits/rejected": -2.3952419757843018, - "logps/chosen": -265.9836730957031, - "logps/rejected": -225.9434356689453, - "loss": 0.6504, - "rewards/accuracies": 0.667187511920929, - "rewards/chosen": 0.02718115784227848, - "rewards/margins": 0.09283201396465302, - "rewards/rejected": -0.06565085798501968, + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -1.958130121231079, + "logits/rejected": -1.9546114206314087, + "logps/chosen": -280.0518493652344, + "logps/rejected": -233.37637329101562, + "loss": 0.6885, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.008822163566946983, + "rewards/margins": 0.019602559506893158, + "rewards/rejected": -0.0107803950086236, "step": 130 }, { - "epoch": 0.58, - "learning_rate": 4.4869831546707505e-07, - "logits/chosen": -2.4601752758026123, - "logits/rejected": -2.4006924629211426, - "logps/chosen": -270.29620361328125, - "logps/rejected": -230.3610076904297, - "loss": 0.65, - "rewards/accuracies": 0.6796875, - "rewards/chosen": 0.03528743237257004, - "rewards/margins": 0.1161808967590332, - "rewards/rejected": -0.08089347183704376, + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -2.036907911300659, + "logits/rejected": -1.927520513534546, + "logps/chosen": -286.6536865234375, + "logps/rejected": -227.0584716796875, + "loss": 0.6897, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.007285796105861664, + "rewards/margins": 0.008207736536860466, + "rewards/rejected": -0.0009219406056217849, "step": 140 }, { - "epoch": 0.62, - "learning_rate": 4.4104134762633994e-07, - "logits/chosen": -2.4845683574676514, - "logits/rejected": -2.4314005374908447, - "logps/chosen": -265.96466064453125, - "logps/rejected": -226.5783233642578, - "loss": 0.643, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.03652546927332878, - "rewards/margins": 0.12623493373394012, - "rewards/rejected": -0.08970947563648224, + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -1.878059983253479, + "logits/rejected": -1.9622585773468018, + "logps/chosen": -268.6002502441406, + "logps/rejected": -247.3784942626953, + "loss": 0.6871, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.011237042024731636, + "rewards/margins": 0.011841908097267151, + "rewards/rejected": -0.0006048673531040549, "step": 150 }, { - "epoch": 0.66, - "learning_rate": 4.333843797856049e-07, - "logits/chosen": -2.4465065002441406, - "logits/rejected": -2.402820348739624, - "logps/chosen": -279.31195068359375, - "logps/rejected": -238.9833221435547, - "loss": 0.6353, - "rewards/accuracies": 0.6781250238418579, - "rewards/chosen": 0.042833976447582245, - "rewards/margins": 0.14586737751960754, - "rewards/rejected": -0.10303342342376709, + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -2.034170150756836, + "logits/rejected": -2.049903392791748, + "logps/chosen": -272.08197021484375, + "logps/rejected": -210.9315643310547, + "loss": 0.6847, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.010213760659098625, + "rewards/margins": 0.014107374474406242, + "rewards/rejected": -0.0038936142809689045, "step": 160 }, { - "epoch": 0.7, - "learning_rate": 4.257274119448698e-07, - "logits/chosen": -2.479480028152466, - "logits/rejected": -2.4522595405578613, - "logps/chosen": -270.7332458496094, - "logps/rejected": -239.1272735595703, - "loss": 0.6338, - "rewards/accuracies": 0.667187511920929, - "rewards/chosen": 0.022215088829398155, - "rewards/margins": 0.1372935026884079, - "rewards/rejected": -0.11507842689752579, + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -1.9434633255004883, + "logits/rejected": -1.9493672847747803, + "logps/chosen": -285.885498046875, + "logps/rejected": -228.5830535888672, + "loss": 0.6876, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.0032173257786780596, + "rewards/margins": 0.009711216203868389, + "rewards/rejected": -0.006493890192359686, "step": 170 }, { - "epoch": 0.74, - "learning_rate": 4.180704441041347e-07, - "logits/chosen": -2.4355900287628174, - "logits/rejected": -2.3883790969848633, - "logps/chosen": -256.80621337890625, - "logps/rejected": -226.12857055664062, - "loss": 0.6362, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.01779203675687313, - "rewards/margins": 0.14599665999412537, - "rewards/rejected": -0.1282046139240265, + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -1.9711917638778687, + "logits/rejected": -1.9319254159927368, + "logps/chosen": -242.7555389404297, + "logps/rejected": -203.4986572265625, + "loss": 0.6841, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.011803502216935158, + "rewards/margins": 0.024607401341199875, + "rewards/rejected": -0.012803901918232441, "step": 180 }, { - "epoch": 0.78, - "learning_rate": 4.1041347626339966e-07, - "logits/chosen": -2.44791316986084, - "logits/rejected": -2.406308174133301, - "logps/chosen": -261.647705078125, - "logps/rejected": -213.19442749023438, - "loss": 0.6272, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.019534433260560036, - "rewards/margins": 0.17334969341754913, - "rewards/rejected": -0.15381526947021484, + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -1.8602354526519775, + "logits/rejected": -1.7703956365585327, + "logps/chosen": -238.77481079101562, + "logps/rejected": -194.1616973876953, + "loss": 0.679, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.014951497316360474, + "rewards/margins": 0.024222875013947487, + "rewards/rejected": -0.009271375834941864, "step": 190 }, { - "epoch": 0.83, - "learning_rate": 4.027565084226646e-07, - "logits/chosen": -2.4711079597473145, - "logits/rejected": -2.414386510848999, - "logps/chosen": -262.3142395019531, - "logps/rejected": -217.9895477294922, - "loss": 0.6187, - "rewards/accuracies": 0.692187488079071, - "rewards/chosen": 0.02562810108065605, - "rewards/margins": 0.19041840732097626, - "rewards/rejected": -0.1647903025150299, + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -1.908044457435608, + "logits/rejected": -1.8577945232391357, + "logps/chosen": -246.58517456054688, + "logps/rejected": -183.02694702148438, + "loss": 0.681, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.01906001567840576, + "rewards/margins": 0.038775451481342316, + "rewards/rejected": -0.019715435802936554, "step": 200 }, { - "epoch": 0.87, - "learning_rate": 3.9509954058192954e-07, - "logits/chosen": -2.474663019180298, - "logits/rejected": -2.4352869987487793, - "logps/chosen": -283.8729248046875, - "logps/rejected": -231.70376586914062, - "loss": 0.6162, - "rewards/accuracies": 0.653124988079071, - "rewards/chosen": 0.029517430812120438, - "rewards/margins": 0.2187691181898117, - "rewards/rejected": -0.18925169110298157, + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -1.839012861251831, + "logits/rejected": -1.811547040939331, + "logps/chosen": -285.8274841308594, + "logps/rejected": -207.23275756835938, + "loss": 0.6759, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.014916220679879189, + "rewards/margins": 0.03331177681684494, + "rewards/rejected": -0.018395554274320602, "step": 210 }, { - "epoch": 0.91, - "learning_rate": 3.874425727411945e-07, - "logits/chosen": -2.4217212200164795, - "logits/rejected": -2.4082586765289307, - "logps/chosen": -276.77996826171875, - "logps/rejected": -229.3105010986328, - "loss": 0.6138, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.017411770299077034, - "rewards/margins": 0.2216881960630417, - "rewards/rejected": -0.204276442527771, + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -1.8409188985824585, + "logits/rejected": -1.8116531372070312, + "logps/chosen": -253.5845489501953, + "logps/rejected": -224.10543823242188, + "loss": 0.6736, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.012754586525261402, + "rewards/margins": 0.03681974858045578, + "rewards/rejected": -0.024065162986516953, "step": 220 }, { - "epoch": 0.95, - "learning_rate": 3.797856049004594e-07, - "logits/chosen": -2.4441373348236084, - "logits/rejected": -2.3886184692382812, - "logps/chosen": -264.1249084472656, - "logps/rejected": -228.3013458251953, - "loss": 0.6119, - "rewards/accuracies": 0.690625011920929, - "rewards/chosen": 0.02687259018421173, - "rewards/margins": 0.21526794135570526, - "rewards/rejected": -0.18839535117149353, + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -1.8429826498031616, + "logits/rejected": -1.7822067737579346, + "logps/chosen": -256.49371337890625, + "logps/rejected": -231.0501708984375, + "loss": 0.6725, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.01603628322482109, + "rewards/margins": 0.039132822304964066, + "rewards/rejected": -0.023096540942788124, "step": 230 }, { - "epoch": 0.99, - "learning_rate": 3.7212863705972436e-07, - "logits/chosen": -2.462822198867798, - "logits/rejected": -2.418255567550659, - "logps/chosen": -271.4612731933594, - "logps/rejected": -222.49185180664062, - "loss": 0.6115, - "rewards/accuracies": 0.6953125, - "rewards/chosen": 0.025034094229340553, - "rewards/margins": 0.21872933208942413, - "rewards/rejected": -0.19369521737098694, + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -1.9088108539581299, + "logits/rejected": -1.8135614395141602, + "logps/chosen": -292.645751953125, + "logps/rejected": -206.9217529296875, + "loss": 0.6704, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.022067774087190628, + "rewards/margins": 0.044131770730018616, + "rewards/rejected": -0.02206399478018284, "step": 240 }, { - "epoch": 1.0, - "eval_logits/chosen": -2.1432242393493652, - "eval_logits/rejected": -2.0230488777160645, - "eval_logps/chosen": -264.6322937011719, - "eval_logps/rejected": -221.69525146484375, - "eval_loss": 0.6071053147315979, - "eval_rewards/accuracies": 0.6700000166893005, - "eval_rewards/chosen": 0.0028544252272695303, - "eval_rewards/margins": 0.24394646286964417, - "eval_rewards/rejected": -0.2410920411348343, - "eval_runtime": 443.3534, - "eval_samples_per_second": 4.511, - "eval_steps_per_second": 0.282, - "step": 242 - }, - { - "epoch": 1.03, - "learning_rate": 3.6447166921898925e-07, - "logits/chosen": -2.3999691009521484, - "logits/rejected": -2.347167730331421, - "logps/chosen": -257.44598388671875, - "logps/rejected": -210.4270782470703, - "loss": 0.617, - "rewards/accuracies": 0.6703125238418579, - "rewards/chosen": 0.0046371398493647575, - "rewards/margins": 0.2242714911699295, - "rewards/rejected": -0.21963433921337128, + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -1.9705746173858643, + "logits/rejected": -1.9164187908172607, + "logps/chosen": -258.31402587890625, + "logps/rejected": -226.0135955810547, + "loss": 0.6691, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02264360897243023, + "rewards/margins": 0.049908820539712906, + "rewards/rejected": -0.027265211567282677, "step": 250 }, { - "epoch": 1.07, - "learning_rate": 3.568147013782542e-07, - "logits/chosen": -2.4152634143829346, - "logits/rejected": -2.37550687789917, - "logps/chosen": -261.9479675292969, - "logps/rejected": -226.1567840576172, - "loss": 0.6004, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.021805018186569214, - "rewards/margins": 0.26413100957870483, - "rewards/rejected": -0.2423260509967804, + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -1.9965183734893799, + "logits/rejected": -1.8718656301498413, + "logps/chosen": -256.3481140136719, + "logps/rejected": -210.12704467773438, + "loss": 0.6666, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.02700256183743477, + "rewards/margins": 0.059316955506801605, + "rewards/rejected": -0.03231438994407654, "step": 260 }, { - "epoch": 1.12, - "learning_rate": 3.4915773353751913e-07, - "logits/chosen": -2.42952299118042, - "logits/rejected": -2.365931272506714, - "logps/chosen": -278.4363708496094, - "logps/rejected": -236.10116577148438, - "loss": 0.5919, - "rewards/accuracies": 0.745312511920929, - "rewards/chosen": 0.026421889662742615, - "rewards/margins": 0.33800598978996277, - "rewards/rejected": -0.31158408522605896, + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -2.004511594772339, + "logits/rejected": -1.9170646667480469, + "logps/chosen": -267.9630432128906, + "logps/rejected": -221.8316650390625, + "loss": 0.6598, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.024454887956380844, + "rewards/margins": 0.07100498676300049, + "rewards/rejected": -0.046550098806619644, "step": 270 }, { - "epoch": 1.16, - "learning_rate": 3.41500765696784e-07, - "logits/chosen": -2.4357521533966064, - "logits/rejected": -2.396134614944458, - "logps/chosen": -251.1164093017578, - "logps/rejected": -225.797119140625, - "loss": 0.6028, - "rewards/accuracies": 0.6968749761581421, - "rewards/chosen": 0.009803700260818005, - "rewards/margins": 0.2455863505601883, - "rewards/rejected": -0.2357826679944992, + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -1.9476372003555298, + "logits/rejected": -1.8397985696792603, + "logps/chosen": -277.18975830078125, + "logps/rejected": -225.5077362060547, + "loss": 0.6583, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.04471539333462715, + "rewards/margins": 0.08860420435667038, + "rewards/rejected": -0.04388881474733353, "step": 280 }, { - "epoch": 1.2, - "learning_rate": 3.33843797856049e-07, - "logits/chosen": -2.4084014892578125, - "logits/rejected": -2.394484043121338, - "logps/chosen": -283.18255615234375, - "logps/rejected": -228.3118438720703, - "loss": 0.5931, - "rewards/accuracies": 0.715624988079071, - "rewards/chosen": 0.00656654080376029, - "rewards/margins": 0.30469295382499695, - "rewards/rejected": -0.2981263995170593, + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -1.9021289348602295, + "logits/rejected": -1.9396965503692627, + "logps/chosen": -264.3427429199219, + "logps/rejected": -216.1573486328125, + "loss": 0.656, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.041467390954494476, + "rewards/margins": 0.10826016962528229, + "rewards/rejected": -0.06679276376962662, "step": 290 }, { - "epoch": 1.24, - "learning_rate": 3.2618683001531396e-07, - "logits/chosen": -2.408291816711426, - "logits/rejected": -2.337432861328125, - "logps/chosen": -261.3971252441406, - "logps/rejected": -227.752685546875, - "loss": 0.5892, - "rewards/accuracies": 0.707812488079071, - "rewards/chosen": 0.009347354993224144, - "rewards/margins": 0.2984946370124817, - "rewards/rejected": -0.289147287607193, + "epoch": 0.31, + "learning_rate": 4.982778415614236e-07, + "logits/chosen": -1.8552868366241455, + "logits/rejected": -1.7529213428497314, + "logps/chosen": -239.3164825439453, + "logps/rejected": -210.6975860595703, + "loss": 0.6489, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.02189425379037857, + "rewards/margins": 0.10783751308917999, + "rewards/rejected": -0.08594325184822083, "step": 300 }, { - "epoch": 1.28, - "learning_rate": 3.1852986217457885e-07, - "logits/chosen": -2.45731782913208, - "logits/rejected": -2.3738834857940674, - "logps/chosen": -261.27099609375, - "logps/rejected": -228.57211303710938, - "loss": 0.5974, - "rewards/accuracies": 0.7109375, - "rewards/chosen": 0.005537848919630051, - "rewards/margins": 0.2985115647315979, - "rewards/rejected": -0.29297369718551636, + "epoch": 0.32, + "learning_rate": 4.963643321852277e-07, + "logits/chosen": -1.985440969467163, + "logits/rejected": -1.9840974807739258, + "logps/chosen": -291.8209533691406, + "logps/rejected": -245.8662109375, + "loss": 0.6481, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.0408959835767746, + "rewards/margins": 0.10587803274393082, + "rewards/rejected": -0.06498204171657562, "step": 310 }, { - "epoch": 1.32, - "learning_rate": 3.108728943338438e-07, - "logits/chosen": -2.4441769123077393, - "logits/rejected": -2.4352951049804688, - "logps/chosen": -252.8352813720703, - "logps/rejected": -237.8563995361328, - "loss": 0.5959, - "rewards/accuracies": 0.667187511920929, - "rewards/chosen": -0.016652364283800125, - "rewards/margins": 0.24610964953899384, - "rewards/rejected": -0.26276201009750366, + "epoch": 0.33, + "learning_rate": 4.944508228090318e-07, + "logits/chosen": -1.8532111644744873, + "logits/rejected": -1.769161581993103, + "logps/chosen": -236.22659301757812, + "logps/rejected": -180.43446350097656, + "loss": 0.6356, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.028678173199295998, + "rewards/margins": 0.124606192111969, + "rewards/rejected": -0.09592802077531815, "step": 320 }, { - "epoch": 1.36, - "learning_rate": 3.0321592649310873e-07, - "logits/chosen": -2.424257516860962, - "logits/rejected": -2.357093572616577, - "logps/chosen": -253.7244873046875, - "logps/rejected": -224.3267822265625, - "loss": 0.5865, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.007790223695337772, - "rewards/margins": 0.30015501379966736, - "rewards/rejected": -0.30794528126716614, + "epoch": 0.34, + "learning_rate": 4.925373134328357e-07, + "logits/chosen": -1.8861719369888306, + "logits/rejected": -1.88065505027771, + "logps/chosen": -296.05987548828125, + "logps/rejected": -228.4789276123047, + "loss": 0.6429, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04638337343931198, + "rewards/margins": 0.13865838944911957, + "rewards/rejected": -0.09227502346038818, "step": 330 }, { - "epoch": 1.4, - "learning_rate": 2.955589586523736e-07, - "logits/chosen": -2.442631244659424, - "logits/rejected": -2.3830604553222656, - "logps/chosen": -265.7073974609375, - "logps/rejected": -226.4299774169922, - "loss": 0.5916, - "rewards/accuracies": 0.6953125, - "rewards/chosen": -0.016206540167331696, - "rewards/margins": 0.32533493638038635, - "rewards/rejected": -0.34154146909713745, + "epoch": 0.35, + "learning_rate": 4.906238040566398e-07, + "logits/chosen": -1.7821033000946045, + "logits/rejected": -1.6849457025527954, + "logps/chosen": -228.7474365234375, + "logps/rejected": -227.21273803710938, + "loss": 0.6433, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.01950683258473873, + "rewards/margins": 0.09855355322360992, + "rewards/rejected": -0.07904672622680664, "step": 340 }, { - "epoch": 1.45, - "learning_rate": 2.8790199081163856e-07, - "logits/chosen": -2.4397027492523193, - "logits/rejected": -2.369683265686035, - "logps/chosen": -266.0716857910156, - "logps/rejected": -225.29916381835938, - "loss": 0.5941, - "rewards/accuracies": 0.6640625, - "rewards/chosen": -0.029331281781196594, - "rewards/margins": 0.2816322445869446, - "rewards/rejected": -0.31096351146698, + "epoch": 0.36, + "learning_rate": 4.887102946804438e-07, + "logits/chosen": -2.012197256088257, + "logits/rejected": -1.9556312561035156, + "logps/chosen": -298.3753662109375, + "logps/rejected": -237.82766723632812, + "loss": 0.6307, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.04107990115880966, + "rewards/margins": 0.17710928618907928, + "rewards/rejected": -0.13602939248085022, "step": 350 }, { - "epoch": 1.49, - "learning_rate": 2.802450229709035e-07, - "logits/chosen": -2.3996329307556152, - "logits/rejected": -2.348875045776367, - "logps/chosen": -254.8851776123047, - "logps/rejected": -213.3196563720703, - "loss": 0.5843, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.037624605000019073, - "rewards/margins": 0.30041417479515076, - "rewards/rejected": -0.3380388021469116, + "epoch": 0.37, + "learning_rate": 4.867967853042479e-07, + "logits/chosen": -1.8956018686294556, + "logits/rejected": -1.8774607181549072, + "logps/chosen": -288.48712158203125, + "logps/rejected": -240.55929565429688, + "loss": 0.6225, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.021517125889658928, + "rewards/margins": 0.1826322227716446, + "rewards/rejected": -0.16111508011817932, "step": 360 }, { - "epoch": 1.53, - "learning_rate": 2.725880551301684e-07, - "logits/chosen": -2.4621551036834717, - "logits/rejected": -2.4065911769866943, - "logps/chosen": -274.72161865234375, - "logps/rejected": -232.85791015625, - "loss": 0.5928, - "rewards/accuracies": 0.6734374761581421, - "rewards/chosen": -0.0233127661049366, - "rewards/margins": 0.30943089723587036, - "rewards/rejected": -0.33274370431900024, + "epoch": 0.38, + "learning_rate": 4.84883275928052e-07, + "logits/chosen": -2.036818265914917, + "logits/rejected": -1.9309518337249756, + "logps/chosen": -297.98736572265625, + "logps/rejected": -260.5028991699219, + "loss": 0.6217, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.06692071259021759, + "rewards/margins": 0.22046951949596405, + "rewards/rejected": -0.15354883670806885, "step": 370 }, { - "epoch": 1.57, - "learning_rate": 2.649310872894334e-07, - "logits/chosen": -2.448286771774292, - "logits/rejected": -2.4156296253204346, - "logps/chosen": -274.979248046875, - "logps/rejected": -223.11279296875, - "loss": 0.5817, - "rewards/accuracies": 0.7093750238418579, - "rewards/chosen": -0.01715572364628315, - "rewards/margins": 0.3696514070034027, - "rewards/rejected": -0.3868071436882019, + "epoch": 0.39, + "learning_rate": 4.82969766551856e-07, + "logits/chosen": -1.9570751190185547, + "logits/rejected": -1.8644596338272095, + "logps/chosen": -275.3887939453125, + "logps/rejected": -229.76473999023438, + "loss": 0.6335, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.05313148349523544, + "rewards/margins": 0.19295012950897217, + "rewards/rejected": -0.13981863856315613, "step": 380 }, { - "epoch": 1.61, - "learning_rate": 2.572741194486983e-07, - "logits/chosen": -2.4490604400634766, - "logits/rejected": -2.3936374187469482, - "logps/chosen": -273.819580078125, - "logps/rejected": -208.39321899414062, - "loss": 0.5798, - "rewards/accuracies": 0.7109375, - "rewards/chosen": -0.02806365117430687, - "rewards/margins": 0.3674148619174957, - "rewards/rejected": -0.3954785466194153, + "epoch": 0.4, + "learning_rate": 4.810562571756601e-07, + "logits/chosen": -1.9391593933105469, + "logits/rejected": -1.8515427112579346, + "logps/chosen": -264.26422119140625, + "logps/rejected": -227.9110870361328, + "loss": 0.6251, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.018016207963228226, + "rewards/margins": 0.18983347713947296, + "rewards/rejected": -0.17181725800037384, "step": 390 }, { - "epoch": 1.65, - "learning_rate": 2.496171516079632e-07, - "logits/chosen": -2.4658896923065186, - "logits/rejected": -2.3998141288757324, - "logps/chosen": -293.2339172363281, - "logps/rejected": -239.4620819091797, - "loss": 0.5813, - "rewards/accuracies": 0.7281249761581421, - "rewards/chosen": 0.006555554457008839, - "rewards/margins": 0.39713528752326965, - "rewards/rejected": -0.3905797302722931, + "epoch": 0.41, + "learning_rate": 4.791427477994642e-07, + "logits/chosen": -1.8489261865615845, + "logits/rejected": -1.8198875188827515, + "logps/chosen": -247.95181274414062, + "logps/rejected": -203.08200073242188, + "loss": 0.6196, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.038538627326488495, + "rewards/margins": 0.20510415732860565, + "rewards/rejected": -0.16656549274921417, "step": 400 }, { - "epoch": 1.69, - "learning_rate": 2.4196018376722816e-07, - "logits/chosen": -2.4295265674591064, - "logits/rejected": -2.4006526470184326, - "logps/chosen": -278.5876159667969, - "logps/rejected": -228.4311065673828, - "loss": 0.5869, - "rewards/accuracies": 0.6890624761581421, - "rewards/chosen": -0.02325974591076374, - "rewards/margins": 0.365833044052124, - "rewards/rejected": -0.3890928328037262, + "epoch": 0.42, + "learning_rate": 4.772292384232682e-07, + "logits/chosen": -1.8924095630645752, + "logits/rejected": -1.8541799783706665, + "logps/chosen": -254.6027069091797, + "logps/rejected": -242.6875457763672, + "loss": 0.6208, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.014495603740215302, + "rewards/margins": 0.17543359100818634, + "rewards/rejected": -0.16093799471855164, "step": 410 }, { - "epoch": 1.74, - "learning_rate": 2.343032159264931e-07, - "logits/chosen": -2.403923511505127, - "logits/rejected": -2.3335375785827637, - "logps/chosen": -268.8808898925781, - "logps/rejected": -224.3599395751953, - "loss": 0.5782, - "rewards/accuracies": 0.7203124761581421, - "rewards/chosen": -0.01940683089196682, - "rewards/margins": 0.4325495660305023, - "rewards/rejected": -0.4519564211368561, + "epoch": 0.43, + "learning_rate": 4.753157290470723e-07, + "logits/chosen": -1.9146015644073486, + "logits/rejected": -1.826513648033142, + "logps/chosen": -239.00094604492188, + "logps/rejected": -194.21633911132812, + "loss": 0.6302, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02996433712542057, + "rewards/margins": 0.14245793223381042, + "rewards/rejected": -0.17242226004600525, "step": 420 }, { - "epoch": 1.78, - "learning_rate": 2.26646248085758e-07, - "logits/chosen": -2.3836731910705566, - "logits/rejected": -2.3358378410339355, - "logps/chosen": -259.7103576660156, - "logps/rejected": -217.7801055908203, - "loss": 0.5726, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.03996918350458145, - "rewards/margins": 0.39050012826919556, - "rewards/rejected": -0.4304693341255188, + "epoch": 0.44, + "learning_rate": 4.7340221967087635e-07, + "logits/chosen": -1.9913543462753296, + "logits/rejected": -1.9355945587158203, + "logps/chosen": -250.6079864501953, + "logps/rejected": -232.06924438476562, + "loss": 0.616, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.0017566995229572058, + "rewards/margins": 0.21342739462852478, + "rewards/rejected": -0.21518409252166748, "step": 430 }, { - "epoch": 1.82, - "learning_rate": 2.1898928024502298e-07, - "logits/chosen": -2.4447569847106934, - "logits/rejected": -2.387596845626831, - "logps/chosen": -263.4599609375, - "logps/rejected": -221.48947143554688, - "loss": 0.5757, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.018027612939476967, - "rewards/margins": 0.3998379111289978, - "rewards/rejected": -0.417865514755249, + "epoch": 0.45, + "learning_rate": 4.714887102946804e-07, + "logits/chosen": -1.9374210834503174, + "logits/rejected": -1.892005205154419, + "logps/chosen": -272.4983825683594, + "logps/rejected": -239.8685760498047, + "loss": 0.6026, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.026430880650877953, + "rewards/margins": 0.24110619723796844, + "rewards/rejected": -0.26753711700439453, "step": 440 }, { - "epoch": 1.86, - "learning_rate": 2.113323124042879e-07, - "logits/chosen": -2.4276745319366455, - "logits/rejected": -2.390892744064331, - "logps/chosen": -271.26995849609375, - "logps/rejected": -231.4360809326172, - "loss": 0.5756, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.021533582359552383, - "rewards/margins": 0.40987348556518555, - "rewards/rejected": -0.43140706419944763, + "epoch": 0.46, + "learning_rate": 4.6957520091848447e-07, + "logits/chosen": -1.8150333166122437, + "logits/rejected": -1.8681037425994873, + "logps/chosen": -262.8167419433594, + "logps/rejected": -219.86087036132812, + "loss": 0.636, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.015690278261899948, + "rewards/margins": 0.18283550441265106, + "rewards/rejected": -0.16714522242546082, "step": 450 }, { - "epoch": 1.9, - "learning_rate": 2.036753445635528e-07, - "logits/chosen": -2.4362778663635254, - "logits/rejected": -2.4150547981262207, - "logps/chosen": -284.03668212890625, - "logps/rejected": -238.2490234375, - "loss": 0.5694, - "rewards/accuracies": 0.7171875238418579, - "rewards/chosen": -0.020668620243668556, - "rewards/margins": 0.45047345757484436, - "rewards/rejected": -0.47114211320877075, + "epoch": 0.47, + "learning_rate": 4.6766169154228853e-07, + "logits/chosen": -1.8804519176483154, + "logits/rejected": -1.8782047033309937, + "logps/chosen": -257.576416015625, + "logps/rejected": -212.7547607421875, + "loss": 0.5978, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.010168890468776226, + "rewards/margins": 0.23074540495872498, + "rewards/rejected": -0.24091430008411407, "step": 460 }, { - "epoch": 1.94, - "learning_rate": 1.9601837672281775e-07, - "logits/chosen": -2.418320894241333, - "logits/rejected": -2.385996103286743, - "logps/chosen": -270.16778564453125, - "logps/rejected": -236.37612915039062, - "loss": 0.5749, - "rewards/accuracies": 0.7015625238418579, - "rewards/chosen": -0.05912749841809273, - "rewards/margins": 0.3770337700843811, - "rewards/rejected": -0.4361612796783447, + "epoch": 0.49, + "learning_rate": 4.657481821660926e-07, + "logits/chosen": -1.8830446004867554, + "logits/rejected": -1.8582241535186768, + "logps/chosen": -232.5907745361328, + "logps/rejected": -205.01565551757812, + "loss": 0.5964, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.03179175406694412, + "rewards/margins": 0.24738144874572754, + "rewards/rejected": -0.21558968722820282, "step": 470 }, { - "epoch": 1.98, - "learning_rate": 1.883614088820827e-07, - "logits/chosen": -2.438436269760132, - "logits/rejected": -2.3922858238220215, - "logps/chosen": -268.78424072265625, - "logps/rejected": -224.87625122070312, - "loss": 0.5855, - "rewards/accuracies": 0.7203124761581421, - "rewards/chosen": -0.05140022188425064, - "rewards/margins": 0.37707191705703735, - "rewards/rejected": -0.4284721314907074, + "epoch": 0.5, + "learning_rate": 4.6383467278989666e-07, + "logits/chosen": -1.9318273067474365, + "logits/rejected": -1.8749887943267822, + "logps/chosen": -274.21826171875, + "logps/rejected": -221.87710571289062, + "loss": 0.6148, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.012484663166105747, + "rewards/margins": 0.2855837941169739, + "rewards/rejected": -0.2730991244316101, "step": 480 }, { - "epoch": 2.0, - "eval_logits/chosen": -2.1276535987854004, - "eval_logits/rejected": -2.0070486068725586, - "eval_logps/chosen": -265.26239013671875, - "eval_logps/rejected": -223.9853057861328, - "eval_loss": 0.5732065439224243, - "eval_rewards/accuracies": 0.6919999718666077, - "eval_rewards/chosen": -0.06015579402446747, - "eval_rewards/margins": 0.4099440574645996, - "eval_rewards/rejected": -0.47009986639022827, - "eval_runtime": 443.3179, - "eval_samples_per_second": 4.511, - "eval_steps_per_second": 0.282, - "step": 484 - }, - { - "epoch": 2.02, - "learning_rate": 1.807044410413476e-07, - "logits/chosen": -2.4100403785705566, - "logits/rejected": -2.3768649101257324, - "logps/chosen": -257.94842529296875, - "logps/rejected": -236.3602294921875, - "loss": 0.5773, - "rewards/accuracies": 0.707812488079071, - "rewards/chosen": -0.05285615846514702, - "rewards/margins": 0.3980172574520111, - "rewards/rejected": -0.4508734345436096, + "epoch": 0.51, + "learning_rate": 4.6192116341370067e-07, + "logits/chosen": -2.0985186100006104, + "logits/rejected": -1.8489367961883545, + "logps/chosen": -277.88531494140625, + "logps/rejected": -235.677978515625, + "loss": 0.5945, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04954073578119278, + "rewards/margins": 0.3147658705711365, + "rewards/rejected": -0.2652251124382019, "step": 490 }, { - "epoch": 2.07, - "learning_rate": 1.7304747320061255e-07, - "logits/chosen": -2.4070606231689453, - "logits/rejected": -2.403454542160034, - "logps/chosen": -263.56475830078125, - "logps/rejected": -230.64697265625, - "loss": 0.577, - "rewards/accuracies": 0.7109375, - "rewards/chosen": -0.01720670983195305, - "rewards/margins": 0.4001820683479309, - "rewards/rejected": -0.41738876700401306, + "epoch": 0.52, + "learning_rate": 4.6000765403750473e-07, + "logits/chosen": -1.884758710861206, + "logits/rejected": -1.857834815979004, + "logps/chosen": -239.9447479248047, + "logps/rejected": -211.59353637695312, + "loss": 0.5969, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.019142238423228264, + "rewards/margins": 0.22466382384300232, + "rewards/rejected": -0.24380607903003693, "step": 500 }, { - "epoch": 2.11, - "learning_rate": 1.6539050535987747e-07, - "logits/chosen": -2.479860782623291, - "logits/rejected": -2.3707234859466553, - "logps/chosen": -270.1263122558594, - "logps/rejected": -225.0696563720703, - "loss": 0.5704, - "rewards/accuracies": 0.7109375, - "rewards/chosen": -0.03719691187143326, - "rewards/margins": 0.4389099180698395, - "rewards/rejected": -0.47610679268836975, + "epoch": 0.53, + "learning_rate": 4.580941446613088e-07, + "logits/chosen": -1.838883638381958, + "logits/rejected": -1.8409969806671143, + "logps/chosen": -296.5715026855469, + "logps/rejected": -247.6753692626953, + "loss": 0.5966, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0053814551793038845, + "rewards/margins": 0.2795620262622833, + "rewards/rejected": -0.28494349122047424, "step": 510 }, { - "epoch": 2.15, - "learning_rate": 1.5773353751914243e-07, - "logits/chosen": -2.486112356185913, - "logits/rejected": -2.425720691680908, - "logps/chosen": -284.87408447265625, - "logps/rejected": -229.99740600585938, - "loss": 0.5715, - "rewards/accuracies": 0.739062488079071, - "rewards/chosen": -0.023222165182232857, - "rewards/margins": 0.4711039662361145, - "rewards/rejected": -0.4943261742591858, + "epoch": 0.54, + "learning_rate": 4.5618063528511285e-07, + "logits/chosen": -1.9855676889419556, + "logits/rejected": -1.9108524322509766, + "logps/chosen": -263.95355224609375, + "logps/rejected": -235.2255096435547, + "loss": 0.5998, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0008183140307664871, + "rewards/margins": 0.33000558614730835, + "rewards/rejected": -0.329187273979187, "step": 520 }, { - "epoch": 2.19, - "learning_rate": 1.5007656967840735e-07, - "logits/chosen": -2.3832104206085205, - "logits/rejected": -2.342264175415039, - "logps/chosen": -254.04464721679688, - "logps/rejected": -230.5897216796875, - "loss": 0.5673, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.0390273854136467, - "rewards/margins": 0.44446659088134766, - "rewards/rejected": -0.4834940433502197, + "epoch": 0.55, + "learning_rate": 4.542671259089169e-07, + "logits/chosen": -1.8624356985092163, + "logits/rejected": -1.8562076091766357, + "logps/chosen": -229.1457977294922, + "logps/rejected": -215.29031372070312, + "loss": 0.604, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.05138016492128372, + "rewards/margins": 0.24304652214050293, + "rewards/rejected": -0.29442673921585083, "step": 530 }, { - "epoch": 2.23, - "learning_rate": 1.4241960183767226e-07, - "logits/chosen": -2.429234266281128, - "logits/rejected": -2.3727104663848877, - "logps/chosen": -282.85357666015625, - "logps/rejected": -235.9081573486328, - "loss": 0.5731, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.05646444484591484, - "rewards/margins": 0.42204102873802185, - "rewards/rejected": -0.47850552201271057, + "epoch": 0.56, + "learning_rate": 4.52353616532721e-07, + "logits/chosen": -1.9267988204956055, + "logits/rejected": -1.8159420490264893, + "logps/chosen": -273.76165771484375, + "logps/rejected": -231.14498901367188, + "loss": 0.6014, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.018783237785100937, + "rewards/margins": 0.30618900060653687, + "rewards/rejected": -0.287405788898468, "step": 540 }, { - "epoch": 2.27, - "learning_rate": 1.347626339969372e-07, - "logits/chosen": -2.4232146739959717, - "logits/rejected": -2.387540578842163, - "logps/chosen": -270.8162841796875, - "logps/rejected": -242.1083984375, - "loss": 0.5748, - "rewards/accuracies": 0.7109375, - "rewards/chosen": -0.012197254225611687, - "rewards/margins": 0.41364020109176636, - "rewards/rejected": -0.4258374571800232, + "epoch": 0.57, + "learning_rate": 4.5044010715652504e-07, + "logits/chosen": -1.9478130340576172, + "logits/rejected": -1.7429454326629639, + "logps/chosen": -250.5004119873047, + "logps/rejected": -221.5373077392578, + "loss": 0.5986, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.010875668376684189, + "rewards/margins": 0.25844842195510864, + "rewards/rejected": -0.24757274985313416, "step": 550 }, { - "epoch": 2.31, - "learning_rate": 1.2710566615620215e-07, - "logits/chosen": -2.3734302520751953, - "logits/rejected": -2.3281941413879395, - "logps/chosen": -274.3441467285156, - "logps/rejected": -225.1654052734375, - "loss": 0.5604, - "rewards/accuracies": 0.7046874761581421, - "rewards/chosen": -0.03567253798246384, - "rewards/margins": 0.44296032190322876, - "rewards/rejected": -0.4786328375339508, + "epoch": 0.58, + "learning_rate": 4.485265977803291e-07, + "logits/chosen": -1.9563947916030884, + "logits/rejected": -1.8643051385879517, + "logps/chosen": -287.89630126953125, + "logps/rejected": -244.06112670898438, + "loss": 0.5865, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.017231445759534836, + "rewards/margins": 0.34001216292381287, + "rewards/rejected": -0.32278066873550415, "step": 560 }, { - "epoch": 2.35, - "learning_rate": 1.1944869831546706e-07, - "logits/chosen": -2.3997061252593994, - "logits/rejected": -2.3794922828674316, - "logps/chosen": -267.03338623046875, - "logps/rejected": -238.7495880126953, - "loss": 0.573, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.07530860602855682, - "rewards/margins": 0.4091506898403168, - "rewards/rejected": -0.4844593107700348, + "epoch": 0.59, + "learning_rate": 4.4661308840413316e-07, + "logits/chosen": -1.8329023122787476, + "logits/rejected": -1.934369683265686, + "logps/chosen": -255.2831268310547, + "logps/rejected": -214.7139129638672, + "loss": 0.603, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.05096319317817688, + "rewards/margins": 0.2860959768295288, + "rewards/rejected": -0.3370591998100281, "step": 570 }, { - "epoch": 2.4, - "learning_rate": 1.11791730474732e-07, - "logits/chosen": -2.413222551345825, - "logits/rejected": -2.3744866847991943, - "logps/chosen": -262.73187255859375, - "logps/rejected": -226.4947967529297, - "loss": 0.5655, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.0698440670967102, - "rewards/margins": 0.4843732416629791, - "rewards/rejected": -0.5542173385620117, + "epoch": 0.6, + "learning_rate": 4.446995790279372e-07, + "logits/chosen": -1.9306224584579468, + "logits/rejected": -1.8157587051391602, + "logps/chosen": -284.09368896484375, + "logps/rejected": -231.8200225830078, + "loss": 0.5852, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.016507575288414955, + "rewards/margins": 0.43438202142715454, + "rewards/rejected": -0.4178744852542877, "step": 580 }, { - "epoch": 2.44, - "learning_rate": 1.0413476263399694e-07, - "logits/chosen": -2.4516360759735107, - "logits/rejected": -2.4073665142059326, - "logps/chosen": -269.501220703125, - "logps/rejected": -219.1105499267578, - "loss": 0.567, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.07218033820390701, - "rewards/margins": 0.43574219942092896, - "rewards/rejected": -0.5079224705696106, + "epoch": 0.61, + "learning_rate": 4.4278606965174123e-07, + "logits/chosen": -1.8250099420547485, + "logits/rejected": -1.757300615310669, + "logps/chosen": -238.3928680419922, + "logps/rejected": -209.80966186523438, + "loss": 0.5861, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.061179619282484055, + "rewards/margins": 0.2821842432022095, + "rewards/rejected": -0.34336385130882263, "step": 590 }, { - "epoch": 2.48, - "learning_rate": 9.647779479326186e-08, - "logits/chosen": -2.3931024074554443, - "logits/rejected": -2.3882880210876465, - "logps/chosen": -251.0735321044922, - "logps/rejected": -224.48800659179688, - "loss": 0.5767, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.06539195775985718, - "rewards/margins": 0.3568459153175354, - "rewards/rejected": -0.4222378134727478, + "epoch": 0.62, + "learning_rate": 4.408725602755453e-07, + "logits/chosen": -1.9161020517349243, + "logits/rejected": -1.7822034358978271, + "logps/chosen": -242.69790649414062, + "logps/rejected": -197.79483032226562, + "loss": 0.5904, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.059310100972652435, + "rewards/margins": 0.3206678032875061, + "rewards/rejected": -0.3799779415130615, "step": 600 }, { - "epoch": 2.52, - "learning_rate": 8.88208269525268e-08, - "logits/chosen": -2.3815078735351562, - "logits/rejected": -2.3914084434509277, - "logps/chosen": -260.74053955078125, - "logps/rejected": -223.4197540283203, - "loss": 0.5663, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.07167039811611176, - "rewards/margins": 0.41475382447242737, - "rewards/rejected": -0.4864242672920227, + "epoch": 0.63, + "learning_rate": 4.3895905089934936e-07, + "logits/chosen": -1.8835821151733398, + "logits/rejected": -1.829000473022461, + "logps/chosen": -249.94384765625, + "logps/rejected": -213.3036651611328, + "loss": 0.5885, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -9.891353693092242e-05, + "rewards/margins": 0.29385074973106384, + "rewards/rejected": -0.2939496636390686, "step": 610 }, { - "epoch": 2.56, - "learning_rate": 8.116385911179173e-08, - "logits/chosen": -2.3715600967407227, - "logits/rejected": -2.362975597381592, - "logps/chosen": -279.5382080078125, - "logps/rejected": -219.50137329101562, - "loss": 0.5684, - "rewards/accuracies": 0.7281249761581421, - "rewards/chosen": -0.020305603742599487, - "rewards/margins": 0.4676498472690582, - "rewards/rejected": -0.4879554212093353, + "epoch": 0.64, + "learning_rate": 4.370455415231534e-07, + "logits/chosen": -1.9622195959091187, + "logits/rejected": -1.8651930093765259, + "logps/chosen": -299.0150146484375, + "logps/rejected": -242.2936248779297, + "loss": 0.57, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.017981451004743576, + "rewards/margins": 0.39614418148994446, + "rewards/rejected": -0.41412559151649475, "step": 620 }, { - "epoch": 2.6, - "learning_rate": 7.350689127105667e-08, - "logits/chosen": -2.453105926513672, - "logits/rejected": -2.378638744354248, - "logps/chosen": -272.66357421875, - "logps/rejected": -232.73440551757812, - "loss": 0.5605, - "rewards/accuracies": 0.7046874761581421, - "rewards/chosen": -0.04887257143855095, - "rewards/margins": 0.47081702947616577, - "rewards/rejected": -0.519689679145813, + "epoch": 0.65, + "learning_rate": 4.351320321469575e-07, + "logits/chosen": -1.8097641468048096, + "logits/rejected": -1.8262989521026611, + "logps/chosen": -245.4670867919922, + "logps/rejected": -231.83248901367188, + "loss": 0.5847, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.011098229326307774, + "rewards/margins": 0.3274097144603729, + "rewards/rejected": -0.3385079503059387, "step": 630 }, { - "epoch": 2.64, - "learning_rate": 6.584992343032159e-08, - "logits/chosen": -2.355799436569214, - "logits/rejected": -2.3423495292663574, - "logps/chosen": -263.2275390625, - "logps/rejected": -229.0945587158203, - "loss": 0.5757, - "rewards/accuracies": 0.7015625238418579, - "rewards/chosen": -0.05284266546368599, - "rewards/margins": 0.4212065637111664, - "rewards/rejected": -0.47404924035072327, + "epoch": 0.66, + "learning_rate": 4.3321852277076154e-07, + "logits/chosen": -1.8816455602645874, + "logits/rejected": -1.80124831199646, + "logps/chosen": -251.793701171875, + "logps/rejected": -210.52481079101562, + "loss": 0.5697, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03153982385993004, + "rewards/margins": 0.38736575841903687, + "rewards/rejected": -0.4189055562019348, "step": 640 }, { - "epoch": 2.69, - "learning_rate": 5.819295558958652e-08, - "logits/chosen": -2.399625301361084, - "logits/rejected": -2.349210262298584, - "logps/chosen": -288.31817626953125, - "logps/rejected": -222.42892456054688, - "loss": 0.5645, - "rewards/accuracies": 0.7484375238418579, - "rewards/chosen": -0.02390669845044613, - "rewards/margins": 0.5248440504074097, - "rewards/rejected": -0.5487507581710815, + "epoch": 0.67, + "learning_rate": 4.313050133945656e-07, + "logits/chosen": -1.952106237411499, + "logits/rejected": -1.8799129724502563, + "logps/chosen": -270.39935302734375, + "logps/rejected": -226.58645629882812, + "loss": 0.5763, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.027015209197998047, + "rewards/margins": 0.4403092861175537, + "rewards/rejected": -0.41329407691955566, "step": 650 }, { - "epoch": 2.73, - "learning_rate": 5.0535987748851455e-08, - "logits/chosen": -2.4646735191345215, - "logits/rejected": -2.4114043712615967, - "logps/chosen": -275.809326171875, - "logps/rejected": -231.97802734375, - "loss": 0.5615, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.07590371370315552, - "rewards/margins": 0.411339670419693, - "rewards/rejected": -0.4872434139251709, + "epoch": 0.68, + "learning_rate": 4.2939150401836967e-07, + "logits/chosen": -1.9546396732330322, + "logits/rejected": -1.8226925134658813, + "logps/chosen": -264.48699951171875, + "logps/rejected": -226.3621368408203, + "loss": 0.5699, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.02068941853940487, + "rewards/margins": 0.4405900537967682, + "rewards/rejected": -0.4612794518470764, "step": 660 }, { - "epoch": 2.77, - "learning_rate": 4.287901990811638e-08, - "logits/chosen": -2.4426465034484863, - "logits/rejected": -2.3569984436035156, - "logps/chosen": -268.51104736328125, - "logps/rejected": -244.9394989013672, - "loss": 0.5639, - "rewards/accuracies": 0.7015625238418579, - "rewards/chosen": -0.04740380123257637, - "rewards/margins": 0.4665365219116211, - "rewards/rejected": -0.5139402747154236, + "epoch": 0.69, + "learning_rate": 4.2747799464217373e-07, + "logits/chosen": -1.8923488855361938, + "logits/rejected": -1.8289083242416382, + "logps/chosen": -280.862060546875, + "logps/rejected": -231.81396484375, + "loss": 0.573, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07524853944778442, + "rewards/margins": 0.38956543803215027, + "rewards/rejected": -0.4648140072822571, "step": 670 }, { - "epoch": 2.81, - "learning_rate": 3.522205206738132e-08, - "logits/chosen": -2.4069762229919434, - "logits/rejected": -2.3731188774108887, - "logps/chosen": -275.3673400878906, - "logps/rejected": -230.1641387939453, - "loss": 0.568, - "rewards/accuracies": 0.723437488079071, - "rewards/chosen": -0.07594893872737885, - "rewards/margins": 0.4454987049102783, - "rewards/rejected": -0.5214475989341736, + "epoch": 0.7, + "learning_rate": 4.255644852659778e-07, + "logits/chosen": -1.9917113780975342, + "logits/rejected": -1.8489996194839478, + "logps/chosen": -276.03302001953125, + "logps/rejected": -221.58871459960938, + "loss": 0.6129, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0764780268073082, + "rewards/margins": 0.30070167779922485, + "rewards/rejected": -0.37717974185943604, "step": 680 }, { - "epoch": 2.85, - "learning_rate": 2.7565084226646246e-08, - "logits/chosen": -2.3884053230285645, - "logits/rejected": -2.3761892318725586, - "logps/chosen": -264.29754638671875, - "logps/rejected": -236.7384033203125, - "loss": 0.5738, - "rewards/accuracies": 0.7093750238418579, - "rewards/chosen": -0.04798636958003044, - "rewards/margins": 0.462782621383667, - "rewards/rejected": -0.5107689499855042, + "epoch": 0.71, + "learning_rate": 4.236509758897818e-07, + "logits/chosen": -1.9777377843856812, + "logits/rejected": -1.9532722234725952, + "logps/chosen": -253.9358367919922, + "logps/rejected": -222.5725555419922, + "loss": 0.6016, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.10901274532079697, + "rewards/margins": 0.38281282782554626, + "rewards/rejected": -0.49182558059692383, "step": 690 }, { - "epoch": 2.89, - "learning_rate": 1.9908116385911178e-08, - "logits/chosen": -2.4066903591156006, - "logits/rejected": -2.388782501220703, - "logps/chosen": -260.63067626953125, - "logps/rejected": -227.6019744873047, - "loss": 0.5721, - "rewards/accuracies": 0.667187511920929, - "rewards/chosen": -0.0927988812327385, - "rewards/margins": 0.37709805369377136, - "rewards/rejected": -0.46989694237709045, + "epoch": 0.72, + "learning_rate": 4.2173746651358586e-07, + "logits/chosen": -1.885032057762146, + "logits/rejected": -1.7957684993743896, + "logps/chosen": -268.89044189453125, + "logps/rejected": -214.43466186523438, + "loss": 0.5913, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.005024688318371773, + "rewards/margins": 0.4345860481262207, + "rewards/rejected": -0.4295613765716553, "step": 700 }, { - "epoch": 2.93, - "learning_rate": 1.225114854517611e-08, - "logits/chosen": -2.4368255138397217, - "logits/rejected": -2.371954917907715, - "logps/chosen": -280.86224365234375, - "logps/rejected": -221.8843536376953, - "loss": 0.5689, - "rewards/accuracies": 0.715624988079071, - "rewards/chosen": -0.06322827190160751, - "rewards/margins": 0.44322142004966736, - "rewards/rejected": -0.5064496397972107, + "epoch": 0.73, + "learning_rate": 4.198239571373899e-07, + "logits/chosen": -1.9487273693084717, + "logits/rejected": -1.7702715396881104, + "logps/chosen": -256.96099853515625, + "logps/rejected": -201.6789093017578, + "loss": 0.586, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.019070638343691826, + "rewards/margins": 0.4329298138618469, + "rewards/rejected": -0.45200037956237793, "step": 710 }, { - "epoch": 2.97, - "learning_rate": 4.594180704441042e-09, - "logits/chosen": -2.4003658294677734, - "logits/rejected": -2.3783950805664062, - "logps/chosen": -257.96258544921875, - "logps/rejected": -225.0150146484375, - "loss": 0.5678, - "rewards/accuracies": 0.7015625238418579, - "rewards/chosen": -0.05350871756672859, - "rewards/margins": 0.414130300283432, - "rewards/rejected": -0.4676390290260315, + "epoch": 0.74, + "learning_rate": 4.17910447761194e-07, + "logits/chosen": -1.9309972524642944, + "logits/rejected": -1.9100515842437744, + "logps/chosen": -289.480712890625, + "logps/rejected": -249.810546875, + "loss": 0.5954, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.03227793052792549, + "rewards/margins": 0.26363760232925415, + "rewards/rejected": -0.29591551423072815, "step": 720 }, { - "epoch": 3.0, - "eval_logits/chosen": -2.1217968463897705, - "eval_logits/rejected": -2.000995397567749, - "eval_logps/chosen": -265.43603515625, - "eval_logps/rejected": -224.63743591308594, - "eval_loss": 0.5667475461959839, - "eval_rewards/accuracies": 0.7059999704360962, - "eval_rewards/chosen": -0.07752041518688202, - "eval_rewards/margins": 0.4577913284301758, - "eval_rewards/rejected": -0.5353116989135742, - "eval_runtime": 444.751, - "eval_samples_per_second": 4.497, - "eval_steps_per_second": 0.281, - "step": 726 + "epoch": 0.75, + "learning_rate": 4.1599693838499805e-07, + "logits/chosen": -1.9196300506591797, + "logits/rejected": -1.8512144088745117, + "logps/chosen": -255.46847534179688, + "logps/rejected": -244.3988037109375, + "loss": 0.5816, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.061547208577394485, + "rewards/margins": 0.30254802107810974, + "rewards/rejected": -0.36409521102905273, + "step": 730 + }, + { + "epoch": 0.76, + "learning_rate": 4.140834290088021e-07, + "logits/chosen": -1.7961009740829468, + "logits/rejected": -1.8163458108901978, + "logps/chosen": -263.87298583984375, + "logps/rejected": -192.55197143554688, + "loss": 0.5642, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.05877164006233215, + "rewards/margins": 0.5112999677658081, + "rewards/rejected": -0.5700715780258179, + "step": 740 + }, + { + "epoch": 0.77, + "learning_rate": 4.121699196326062e-07, + "logits/chosen": -1.7912629842758179, + "logits/rejected": -1.734718680381775, + "logps/chosen": -256.4027404785156, + "logps/rejected": -214.3182373046875, + "loss": 0.58, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.06740725785493851, + "rewards/margins": 0.4714421331882477, + "rewards/rejected": -0.538849413394928, + "step": 750 + }, + { + "epoch": 0.78, + "learning_rate": 4.1025641025641024e-07, + "logits/chosen": -1.853451132774353, + "logits/rejected": -1.866167426109314, + "logps/chosen": -256.91363525390625, + "logps/rejected": -242.52590942382812, + "loss": 0.5773, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13397860527038574, + "rewards/margins": 0.34158021211624146, + "rewards/rejected": -0.4755588173866272, + "step": 760 + }, + { + "epoch": 0.8, + "learning_rate": 4.083429008802143e-07, + "logits/chosen": -1.8612697124481201, + "logits/rejected": -1.7442042827606201, + "logps/chosen": -259.656982421875, + "logps/rejected": -230.004150390625, + "loss": 0.5562, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04484036937355995, + "rewards/margins": 0.37435808777809143, + "rewards/rejected": -0.4191984534263611, + "step": 770 + }, + { + "epoch": 0.81, + "learning_rate": 4.0642939150401836e-07, + "logits/chosen": -1.9800891876220703, + "logits/rejected": -1.8991854190826416, + "logps/chosen": -278.4007263183594, + "logps/rejected": -231.08724975585938, + "loss": 0.5685, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.12108190357685089, + "rewards/margins": 0.4823724329471588, + "rewards/rejected": -0.6034542918205261, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 4.0451588212782237e-07, + "logits/chosen": -1.841783881187439, + "logits/rejected": -1.8057258129119873, + "logps/chosen": -255.5233917236328, + "logps/rejected": -214.1595458984375, + "loss": 0.5415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04014988988637924, + "rewards/margins": 0.4424513876438141, + "rewards/rejected": -0.48260125517845154, + "step": 790 + }, + { + "epoch": 0.83, + "learning_rate": 4.0260237275162643e-07, + "logits/chosen": -1.8774309158325195, + "logits/rejected": -1.8645069599151611, + "logps/chosen": -272.11627197265625, + "logps/rejected": -235.83847045898438, + "loss": 0.5726, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.09583926200866699, + "rewards/margins": 0.3844660818576813, + "rewards/rejected": -0.48030534386634827, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 4.006888633754305e-07, + "logits/chosen": -1.9018551111221313, + "logits/rejected": -1.841225028038025, + "logps/chosen": -255.79263305664062, + "logps/rejected": -225.6138458251953, + "loss": 0.544, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.07462090253829956, + "rewards/margins": 0.5299316048622131, + "rewards/rejected": -0.6045525074005127, + "step": 810 + }, + { + "epoch": 0.85, + "learning_rate": 3.9877535399923456e-07, + "logits/chosen": -2.0083446502685547, + "logits/rejected": -1.8861629962921143, + "logps/chosen": -287.9122009277344, + "logps/rejected": -221.60012817382812, + "loss": 0.5607, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03642572462558746, + "rewards/margins": 0.4943397045135498, + "rewards/rejected": -0.5307654738426208, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 3.968618446230386e-07, + "logits/chosen": -1.9512745141983032, + "logits/rejected": -1.760939598083496, + "logps/chosen": -273.5978698730469, + "logps/rejected": -217.01632690429688, + "loss": 0.5422, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.03662416711449623, + "rewards/margins": 0.642555832862854, + "rewards/rejected": -0.6059316396713257, + "step": 830 + }, + { + "epoch": 0.87, + "learning_rate": 3.949483352468427e-07, + "logits/chosen": -1.833461046218872, + "logits/rejected": -1.7712657451629639, + "logps/chosen": -245.89334106445312, + "logps/rejected": -210.9978790283203, + "loss": 0.6005, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.14411701261997223, + "rewards/margins": 0.35258370637893677, + "rewards/rejected": -0.4967007637023926, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 3.9303482587064674e-07, + "logits/chosen": -1.8862361907958984, + "logits/rejected": -1.9118025302886963, + "logps/chosen": -248.77099609375, + "logps/rejected": -223.5210418701172, + "loss": 0.566, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.09480221569538116, + "rewards/margins": 0.4293753206729889, + "rewards/rejected": -0.524177610874176, + "step": 850 + }, + { + "epoch": 0.89, + "learning_rate": 3.911213164944508e-07, + "logits/chosen": -2.0598442554473877, + "logits/rejected": -1.9262638092041016, + "logps/chosen": -284.6533508300781, + "logps/rejected": -265.4016418457031, + "loss": 0.5653, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.09414469450712204, + "rewards/margins": 0.491023451089859, + "rewards/rejected": -0.585168182849884, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 3.8920780711825487e-07, + "logits/chosen": -1.8470954895019531, + "logits/rejected": -1.799574851989746, + "logps/chosen": -299.83453369140625, + "logps/rejected": -232.516357421875, + "loss": 0.5622, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.0625360757112503, + "rewards/margins": 0.5294080972671509, + "rewards/rejected": -0.59194415807724, + "step": 870 + }, + { + "epoch": 0.91, + "learning_rate": 3.8729429774205893e-07, + "logits/chosen": -1.9551128149032593, + "logits/rejected": -1.8529279232025146, + "logps/chosen": -277.6325378417969, + "logps/rejected": -234.42556762695312, + "loss": 0.5589, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.037693943828344345, + "rewards/margins": 0.5901383757591248, + "rewards/rejected": -0.627832293510437, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 3.8538078836586294e-07, + "logits/chosen": -1.9130725860595703, + "logits/rejected": -1.9241355657577515, + "logps/chosen": -269.7417297363281, + "logps/rejected": -229.4433135986328, + "loss": 0.5796, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.001588334096595645, + "rewards/margins": 0.6215437650680542, + "rewards/rejected": -0.623132050037384, + "step": 890 + }, + { + "epoch": 0.93, + "learning_rate": 3.83467278989667e-07, + "logits/chosen": -1.9254659414291382, + "logits/rejected": -1.7164087295532227, + "logps/chosen": -258.0804748535156, + "logps/rejected": -205.9691619873047, + "loss": 0.5466, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.011478861793875694, + "rewards/margins": 0.6032481789588928, + "rewards/rejected": -0.6147270202636719, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 3.8155376961347106e-07, + "logits/chosen": -1.859127402305603, + "logits/rejected": -1.8909375667572021, + "logps/chosen": -225.0897674560547, + "logps/rejected": -200.79922485351562, + "loss": 0.5657, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.09809949994087219, + "rewards/margins": 0.5184981822967529, + "rewards/rejected": -0.6165977120399475, + "step": 910 + }, + { + "epoch": 0.95, + "learning_rate": 3.796402602372751e-07, + "logits/chosen": -2.0053133964538574, + "logits/rejected": -1.962281584739685, + "logps/chosen": -268.1788330078125, + "logps/rejected": -217.4096221923828, + "loss": 0.5564, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.06839360296726227, + "rewards/margins": 0.4477333128452301, + "rewards/rejected": -0.5161268711090088, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 3.777267508610792e-07, + "logits/chosen": -1.8085294961929321, + "logits/rejected": -1.7422670125961304, + "logps/chosen": -242.10464477539062, + "logps/rejected": -197.039794921875, + "loss": 0.5648, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09487004578113556, + "rewards/margins": 0.5353657007217407, + "rewards/rejected": -0.6302357316017151, + "step": 930 + }, + { + "epoch": 0.97, + "learning_rate": 3.7581324148488325e-07, + "logits/chosen": -1.910382628440857, + "logits/rejected": -1.8696739673614502, + "logps/chosen": -257.5924072265625, + "logps/rejected": -225.9730987548828, + "loss": 0.5693, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.06897349655628204, + "rewards/margins": 0.5433675050735474, + "rewards/rejected": -0.612341046333313, + "step": 940 + }, + { + "epoch": 0.98, + "learning_rate": 3.738997321086873e-07, + "logits/chosen": -1.930299997329712, + "logits/rejected": -1.8282740116119385, + "logps/chosen": -271.03204345703125, + "logps/rejected": -227.7154083251953, + "loss": 0.5717, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.11821737140417099, + "rewards/margins": 0.47633272409439087, + "rewards/rejected": -0.5945500731468201, + "step": 950 + }, + { + "epoch": 0.99, + "learning_rate": 3.7198622273249137e-07, + "logits/chosen": -1.8711103200912476, + "logits/rejected": -1.7967535257339478, + "logps/chosen": -268.1916198730469, + "logps/rejected": -233.38150024414062, + "loss": 0.5506, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.057533152401447296, + "rewards/margins": 0.41135653853416443, + "rewards/rejected": -0.4688897132873535, + "step": 960 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -2.1122982501983643, + "eval_logits/rejected": -1.9914103746414185, + "eval_logps/chosen": -265.78839111328125, + "eval_logps/rejected": -225.7088623046875, + "eval_loss": 0.555598258972168, + "eval_rewards/accuracies": 0.7120000123977661, + "eval_rewards/chosen": -0.11275824159383774, + "eval_rewards/margins": 0.5296958684921265, + "eval_rewards/rejected": -0.6424540281295776, + "eval_runtime": 447.2998, + "eval_samples_per_second": 4.471, + "eval_steps_per_second": 0.279, + "step": 968 + }, + { + "epoch": 1.0, + "learning_rate": 3.7007271335629544e-07, + "logits/chosen": -1.9309650659561157, + "logits/rejected": -1.8147058486938477, + "logps/chosen": -267.860595703125, + "logps/rejected": -233.5863494873047, + "loss": 0.5513, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.06416840106248856, + "rewards/margins": 0.539823055267334, + "rewards/rejected": -0.6039914488792419, + "step": 970 + }, + { + "epoch": 1.01, + "learning_rate": 3.681592039800995e-07, + "logits/chosen": -1.9178416728973389, + "logits/rejected": -1.771106481552124, + "logps/chosen": -266.7783508300781, + "logps/rejected": -227.1133575439453, + "loss": 0.5523, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.08205634355545044, + "rewards/margins": 0.5639916062355042, + "rewards/rejected": -0.6460479497909546, + "step": 980 + }, + { + "epoch": 1.02, + "learning_rate": 3.662456946039035e-07, + "logits/chosen": -1.8973121643066406, + "logits/rejected": -1.7602847814559937, + "logps/chosen": -246.4560089111328, + "logps/rejected": -220.7578582763672, + "loss": 0.5985, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16480432450771332, + "rewards/margins": 0.4767611026763916, + "rewards/rejected": -0.6415655016899109, + "step": 990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6433218522770757e-07, + "logits/chosen": -1.891383409500122, + "logits/rejected": -1.7983802556991577, + "logps/chosen": -254.2264404296875, + "logps/rejected": -250.4141082763672, + "loss": 0.577, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.14169907569885254, + "rewards/margins": 0.46360692381858826, + "rewards/rejected": -0.6053060293197632, + "step": 1000 + }, + { + "epoch": 1.04, + "learning_rate": 3.6241867585151163e-07, + "logits/chosen": -1.8937060832977295, + "logits/rejected": -1.7699038982391357, + "logps/chosen": -264.3402404785156, + "logps/rejected": -229.96975708007812, + "loss": 0.5669, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.08274277299642563, + "rewards/margins": 0.4730890691280365, + "rewards/rejected": -0.5558319091796875, + "step": 1010 + }, + { + "epoch": 1.05, + "learning_rate": 3.605051664753157e-07, + "logits/chosen": -1.8347629308700562, + "logits/rejected": -1.8547874689102173, + "logps/chosen": -258.7706604003906, + "logps/rejected": -207.8643798828125, + "loss": 0.5414, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.0507674440741539, + "rewards/margins": 0.6220256090164185, + "rewards/rejected": -0.6727930903434753, + "step": 1020 + }, + { + "epoch": 1.06, + "learning_rate": 3.5859165709911975e-07, + "logits/chosen": -1.8121258020401, + "logits/rejected": -1.8327728509902954, + "logps/chosen": -271.30853271484375, + "logps/rejected": -242.1578826904297, + "loss": 0.5407, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.034175340086221695, + "rewards/margins": 0.5325769782066345, + "rewards/rejected": -0.5667523145675659, + "step": 1030 + }, + { + "epoch": 1.07, + "learning_rate": 3.566781477229238e-07, + "logits/chosen": -1.9302997589111328, + "logits/rejected": -1.8248958587646484, + "logps/chosen": -267.46441650390625, + "logps/rejected": -229.4574432373047, + "loss": 0.5558, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1353200078010559, + "rewards/margins": 0.41318050026893616, + "rewards/rejected": -0.5485004186630249, + "step": 1040 + }, + { + "epoch": 1.08, + "learning_rate": 3.547646383467279e-07, + "logits/chosen": -1.8329179286956787, + "logits/rejected": -1.8623487949371338, + "logps/chosen": -279.0738220214844, + "logps/rejected": -207.0067138671875, + "loss": 0.5283, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.1448301523923874, + "rewards/margins": 0.6097829341888428, + "rewards/rejected": -0.7546130418777466, + "step": 1050 + }, + { + "epoch": 1.09, + "learning_rate": 3.5285112897053194e-07, + "logits/chosen": -1.8090789318084717, + "logits/rejected": -1.7515262365341187, + "logps/chosen": -251.6829071044922, + "logps/rejected": -236.7270965576172, + "loss": 0.5409, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.05564679950475693, + "rewards/margins": 0.6186949014663696, + "rewards/rejected": -0.67434161901474, + "step": 1060 + }, + { + "epoch": 1.1, + "learning_rate": 3.50937619594336e-07, + "logits/chosen": -1.8456182479858398, + "logits/rejected": -1.6566247940063477, + "logps/chosen": -274.19683837890625, + "logps/rejected": -236.01339721679688, + "loss": 0.5476, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.14642968773841858, + "rewards/margins": 0.5259400010108948, + "rewards/rejected": -0.6723695993423462, + "step": 1070 + }, + { + "epoch": 1.12, + "learning_rate": 3.4902411021814007e-07, + "logits/chosen": -1.9297821521759033, + "logits/rejected": -1.7852903604507446, + "logps/chosen": -253.9114532470703, + "logps/rejected": -230.21533203125, + "loss": 0.5289, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1505003273487091, + "rewards/margins": 0.5807836055755615, + "rewards/rejected": -0.7312839031219482, + "step": 1080 + }, + { + "epoch": 1.13, + "learning_rate": 3.4711060084194413e-07, + "logits/chosen": -1.80386483669281, + "logits/rejected": -1.7010151147842407, + "logps/chosen": -283.9732971191406, + "logps/rejected": -233.914306640625, + "loss": 0.5663, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.04631692171096802, + "rewards/margins": 0.6188243627548218, + "rewards/rejected": -0.665141224861145, + "step": 1090 + }, + { + "epoch": 1.14, + "learning_rate": 3.4519709146574814e-07, + "logits/chosen": -1.8113272190093994, + "logits/rejected": -1.8279342651367188, + "logps/chosen": -243.00473022460938, + "logps/rejected": -205.41311645507812, + "loss": 0.558, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.14684683084487915, + "rewards/margins": 0.516217827796936, + "rewards/rejected": -0.6630645990371704, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 3.432835820895522e-07, + "logits/chosen": -2.0112392902374268, + "logits/rejected": -1.836309790611267, + "logps/chosen": -290.1194763183594, + "logps/rejected": -260.72442626953125, + "loss": 0.5487, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.011916500516235828, + "rewards/margins": 0.6095856428146362, + "rewards/rejected": -0.621502161026001, + "step": 1110 + }, + { + "epoch": 1.16, + "learning_rate": 3.4137007271335626e-07, + "logits/chosen": -1.8178952932357788, + "logits/rejected": -1.776402473449707, + "logps/chosen": -233.38626098632812, + "logps/rejected": -231.48934936523438, + "loss": 0.5423, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.10915694385766983, + "rewards/margins": 0.5478355884552002, + "rewards/rejected": -0.6569925546646118, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 3.394565633371603e-07, + "logits/chosen": -1.918738603591919, + "logits/rejected": -1.7267446517944336, + "logps/chosen": -297.24591064453125, + "logps/rejected": -243.4203338623047, + "loss": 0.5358, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.1057930737733841, + "rewards/margins": 0.6121749877929688, + "rewards/rejected": -0.7179681658744812, + "step": 1130 + }, + { + "epoch": 1.18, + "learning_rate": 3.375430539609644e-07, + "logits/chosen": -1.8028564453125, + "logits/rejected": -1.7646026611328125, + "logps/chosen": -254.3768768310547, + "logps/rejected": -244.33328247070312, + "loss": 0.5127, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.15064296126365662, + "rewards/margins": 0.5970216393470764, + "rewards/rejected": -0.7476645708084106, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 3.3562954458476845e-07, + "logits/chosen": -1.9135738611221313, + "logits/rejected": -1.6617759466171265, + "logps/chosen": -287.608154296875, + "logps/rejected": -218.7330780029297, + "loss": 0.5629, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.14234164357185364, + "rewards/margins": 0.6442137956619263, + "rewards/rejected": -0.7865555286407471, + "step": 1150 + }, + { + "epoch": 1.2, + "learning_rate": 3.337160352085725e-07, + "logits/chosen": -1.7726719379425049, + "logits/rejected": -1.6829383373260498, + "logps/chosen": -253.5757598876953, + "logps/rejected": -228.3224639892578, + "loss": 0.5371, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.08248370885848999, + "rewards/margins": 0.6653919219970703, + "rewards/rejected": -0.7478755712509155, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 3.3180252583237657e-07, + "logits/chosen": -1.8725894689559937, + "logits/rejected": -1.6902002096176147, + "logps/chosen": -264.3750915527344, + "logps/rejected": -224.15225219726562, + "loss": 0.548, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.1552276611328125, + "rewards/margins": 0.56285160779953, + "rewards/rejected": -0.7180793881416321, + "step": 1170 + }, + { + "epoch": 1.22, + "learning_rate": 3.2988901645618063e-07, + "logits/chosen": -1.8164494037628174, + "logits/rejected": -1.7787806987762451, + "logps/chosen": -250.0327911376953, + "logps/rejected": -222.24093627929688, + "loss": 0.5399, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.2459467649459839, + "rewards/margins": 0.5705105066299438, + "rewards/rejected": -0.8164572715759277, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 3.279755070799847e-07, + "logits/chosen": -1.8070465326309204, + "logits/rejected": -1.8013019561767578, + "logps/chosen": -243.19503784179688, + "logps/rejected": -226.52072143554688, + "loss": 0.5302, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.18580548465251923, + "rewards/margins": 0.5814526677131653, + "rewards/rejected": -0.7672581076622009, + "step": 1190 + }, + { + "epoch": 1.24, + "learning_rate": 3.260619977037887e-07, + "logits/chosen": -1.8752357959747314, + "logits/rejected": -1.743018388748169, + "logps/chosen": -254.1011505126953, + "logps/rejected": -224.49685668945312, + "loss": 0.5218, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.10988108813762665, + "rewards/margins": 0.6526879668235779, + "rewards/rejected": -0.7625690698623657, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 3.2414848832759277e-07, + "logits/chosen": -1.8495900630950928, + "logits/rejected": -1.8988640308380127, + "logps/chosen": -257.2715759277344, + "logps/rejected": -218.9684600830078, + "loss": 0.5354, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13320106267929077, + "rewards/margins": 0.61826491355896, + "rewards/rejected": -0.751465916633606, + "step": 1210 + }, + { + "epoch": 1.26, + "learning_rate": 3.2223497895139683e-07, + "logits/chosen": -1.810633659362793, + "logits/rejected": -1.708275556564331, + "logps/chosen": -262.0743408203125, + "logps/rejected": -253.2130126953125, + "loss": 0.554, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.15741756558418274, + "rewards/margins": 0.514065146446228, + "rewards/rejected": -0.6714826822280884, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 3.203214695752009e-07, + "logits/chosen": -1.902016043663025, + "logits/rejected": -1.6794923543930054, + "logps/chosen": -283.8649597167969, + "logps/rejected": -245.8186798095703, + "loss": 0.5652, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.1987680196762085, + "rewards/margins": 0.46017202734947205, + "rewards/rejected": -0.6589400768280029, + "step": 1230 + }, + { + "epoch": 1.28, + "learning_rate": 3.1840796019900495e-07, + "logits/chosen": -1.869837760925293, + "logits/rejected": -1.8361009359359741, + "logps/chosen": -283.623291015625, + "logps/rejected": -254.81338500976562, + "loss": 0.5416, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.07560212910175323, + "rewards/margins": 0.6903966665267944, + "rewards/rejected": -0.7659987807273865, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 3.16494450822809e-07, + "logits/chosen": -1.7925386428833008, + "logits/rejected": -1.7958791255950928, + "logps/chosen": -243.9271697998047, + "logps/rejected": -217.29736328125, + "loss": 0.5416, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.21342837810516357, + "rewards/margins": 0.5950428247451782, + "rewards/rejected": -0.8084712028503418, + "step": 1250 + }, + { + "epoch": 1.3, + "learning_rate": 3.145809414466131e-07, + "logits/chosen": -1.923183798789978, + "logits/rejected": -1.832482933998108, + "logps/chosen": -264.4095458984375, + "logps/rejected": -223.9330291748047, + "loss": 0.5411, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.107899509370327, + "rewards/margins": 0.7285504341125488, + "rewards/rejected": -0.836449921131134, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 3.1266743207041714e-07, + "logits/chosen": -1.9629684686660767, + "logits/rejected": -1.899493932723999, + "logps/chosen": -301.8197326660156, + "logps/rejected": -264.41986083984375, + "loss": 0.5426, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.13049855828285217, + "rewards/margins": 0.6736435890197754, + "rewards/rejected": -0.80414217710495, + "step": 1270 + }, + { + "epoch": 1.32, + "learning_rate": 3.107539226942212e-07, + "logits/chosen": -1.7553589344024658, + "logits/rejected": -1.8107513189315796, + "logps/chosen": -233.2200927734375, + "logps/rejected": -218.7114715576172, + "loss": 0.5753, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.11746728420257568, + "rewards/margins": 0.4923877716064453, + "rewards/rejected": -0.609855055809021, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 3.0884041331802526e-07, + "logits/chosen": -1.8349711894989014, + "logits/rejected": -1.747821569442749, + "logps/chosen": -250.79537963867188, + "logps/rejected": -221.89242553710938, + "loss": 0.5544, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.14142295718193054, + "rewards/margins": 0.62542724609375, + "rewards/rejected": -0.7668501734733582, + "step": 1290 + }, + { + "epoch": 1.34, + "learning_rate": 3.0692690394182927e-07, + "logits/chosen": -1.7938839197158813, + "logits/rejected": -1.9109312295913696, + "logps/chosen": -271.4996032714844, + "logps/rejected": -228.3076629638672, + "loss": 0.5514, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.1456792950630188, + "rewards/margins": 0.6148759126663208, + "rewards/rejected": -0.7605552077293396, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 3.0501339456563334e-07, + "logits/chosen": -1.8954797983169556, + "logits/rejected": -1.8206110000610352, + "logps/chosen": -279.8629150390625, + "logps/rejected": -226.99923706054688, + "loss": 0.5314, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.04422885179519653, + "rewards/margins": 0.6719091534614563, + "rewards/rejected": -0.7161380052566528, + "step": 1310 + }, + { + "epoch": 1.36, + "learning_rate": 3.030998851894374e-07, + "logits/chosen": -1.8194704055786133, + "logits/rejected": -1.6345094442367554, + "logps/chosen": -237.7805633544922, + "logps/rejected": -205.6898651123047, + "loss": 0.5165, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.18722450733184814, + "rewards/margins": 0.6255088448524475, + "rewards/rejected": -0.8127333521842957, + "step": 1320 + }, + { + "epoch": 1.37, + "learning_rate": 3.0118637581324146e-07, + "logits/chosen": -1.808471918106079, + "logits/rejected": -1.87363600730896, + "logps/chosen": -256.4935302734375, + "logps/rejected": -227.7115020751953, + "loss": 0.5419, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.07518905401229858, + "rewards/margins": 0.6806653738021851, + "rewards/rejected": -0.7558544278144836, + "step": 1330 + }, + { + "epoch": 1.38, + "learning_rate": 2.992728664370455e-07, + "logits/chosen": -1.9065767526626587, + "logits/rejected": -1.8920596837997437, + "logps/chosen": -254.2109375, + "logps/rejected": -224.54122924804688, + "loss": 0.5538, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.19278357923030853, + "rewards/margins": 0.6983119249343872, + "rewards/rejected": -0.8910955190658569, + "step": 1340 + }, + { + "epoch": 1.39, + "learning_rate": 2.973593570608496e-07, + "logits/chosen": -1.7620071172714233, + "logits/rejected": -1.6717274188995361, + "logps/chosen": -239.1033935546875, + "logps/rejected": -223.27352905273438, + "loss": 0.5505, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.21111862361431122, + "rewards/margins": 0.572688102722168, + "rewards/rejected": -0.7838066816329956, + "step": 1350 + }, + { + "epoch": 1.4, + "learning_rate": 2.9544584768465365e-07, + "logits/chosen": -1.862436294555664, + "logits/rejected": -1.830780267715454, + "logps/chosen": -266.7626647949219, + "logps/rejected": -211.84066772460938, + "loss": 0.5523, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.20181214809417725, + "rewards/margins": 0.5214165449142456, + "rewards/rejected": -0.7232287526130676, + "step": 1360 + }, + { + "epoch": 1.41, + "learning_rate": 2.935323383084577e-07, + "logits/chosen": -1.9817575216293335, + "logits/rejected": -1.858848214149475, + "logps/chosen": -278.64483642578125, + "logps/rejected": -221.8795928955078, + "loss": 0.5654, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.19922414422035217, + "rewards/margins": 0.62843918800354, + "rewards/rejected": -0.8276633024215698, + "step": 1370 + }, + { + "epoch": 1.42, + "learning_rate": 2.9161882893226177e-07, + "logits/chosen": -1.8901679515838623, + "logits/rejected": -1.7563245296478271, + "logps/chosen": -243.6305694580078, + "logps/rejected": -215.34848022460938, + "loss": 0.5456, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.028823787346482277, + "rewards/margins": 0.74996018409729, + "rewards/rejected": -0.778783917427063, + "step": 1380 + }, + { + "epoch": 1.44, + "learning_rate": 2.8970531955606583e-07, + "logits/chosen": -1.9129832983016968, + "logits/rejected": -1.8331798315048218, + "logps/chosen": -288.71820068359375, + "logps/rejected": -218.37100219726562, + "loss": 0.5641, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.11229536682367325, + "rewards/margins": 0.6004088521003723, + "rewards/rejected": -0.712704062461853, + "step": 1390 + }, + { + "epoch": 1.45, + "learning_rate": 2.8779181017986984e-07, + "logits/chosen": -1.8954912424087524, + "logits/rejected": -1.7878671884536743, + "logps/chosen": -278.468994140625, + "logps/rejected": -242.9048309326172, + "loss": 0.5541, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.13781218230724335, + "rewards/margins": 0.5570717453956604, + "rewards/rejected": -0.6948838829994202, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 2.858783008036739e-07, + "logits/chosen": -1.8558862209320068, + "logits/rejected": -1.7647888660430908, + "logps/chosen": -260.31878662109375, + "logps/rejected": -234.2343292236328, + "loss": 0.5306, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1553259789943695, + "rewards/margins": 0.6341598629951477, + "rewards/rejected": -0.7894858121871948, + "step": 1410 + }, + { + "epoch": 1.47, + "learning_rate": 2.8396479142747797e-07, + "logits/chosen": -1.827294945716858, + "logits/rejected": -1.7461655139923096, + "logps/chosen": -259.10076904296875, + "logps/rejected": -228.10107421875, + "loss": 0.5366, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.12409050762653351, + "rewards/margins": 0.7103191018104553, + "rewards/rejected": -0.8344095349311829, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 2.8205128205128203e-07, + "logits/chosen": -1.6903371810913086, + "logits/rejected": -1.6766010522842407, + "logps/chosen": -252.8011016845703, + "logps/rejected": -250.18869018554688, + "loss": 0.5391, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.09092485159635544, + "rewards/margins": 0.6495856642723083, + "rewards/rejected": -0.7405105829238892, + "step": 1430 + }, + { + "epoch": 1.49, + "learning_rate": 2.801377726750861e-07, + "logits/chosen": -1.8704569339752197, + "logits/rejected": -1.7946221828460693, + "logps/chosen": -244.53988647460938, + "logps/rejected": -214.50454711914062, + "loss": 0.544, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.18335691094398499, + "rewards/margins": 0.5455018281936646, + "rewards/rejected": -0.7288587689399719, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 2.7822426329889015e-07, + "logits/chosen": -1.8366893529891968, + "logits/rejected": -1.695364236831665, + "logps/chosen": -265.47137451171875, + "logps/rejected": -214.5950164794922, + "loss": 0.5649, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.19029895961284637, + "rewards/margins": 0.6464245915412903, + "rewards/rejected": -0.8367235064506531, + "step": 1450 + }, + { + "epoch": 1.51, + "learning_rate": 2.763107539226942e-07, + "logits/chosen": -1.8242470026016235, + "logits/rejected": -1.7533226013183594, + "logps/chosen": -276.3430480957031, + "logps/rejected": -208.4492950439453, + "loss": 0.5599, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.21704845130443573, + "rewards/margins": 0.5505436658859253, + "rewards/rejected": -0.767592191696167, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 2.743972445464983e-07, + "logits/chosen": -1.8290176391601562, + "logits/rejected": -1.7711703777313232, + "logps/chosen": -267.44952392578125, + "logps/rejected": -217.92715454101562, + "loss": 0.557, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.12331853806972504, + "rewards/margins": 0.668969988822937, + "rewards/rejected": -0.7922885417938232, + "step": 1470 + }, + { + "epoch": 1.53, + "learning_rate": 2.7248373517030234e-07, + "logits/chosen": -1.875618577003479, + "logits/rejected": -1.775080919265747, + "logps/chosen": -249.10879516601562, + "logps/rejected": -225.7317657470703, + "loss": 0.5418, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.14979299902915955, + "rewards/margins": 0.556437611579895, + "rewards/rejected": -0.7062305808067322, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 2.705702257941064e-07, + "logits/chosen": -1.7776212692260742, + "logits/rejected": -1.7803932428359985, + "logps/chosen": -279.66436767578125, + "logps/rejected": -236.1451873779297, + "loss": 0.5298, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.18123303353786469, + "rewards/margins": 0.6183615326881409, + "rewards/rejected": -0.799594521522522, + "step": 1490 + }, + { + "epoch": 1.55, + "learning_rate": 2.686567164179104e-07, + "logits/chosen": -1.9013497829437256, + "logits/rejected": -1.7950599193572998, + "logps/chosen": -257.5596008300781, + "logps/rejected": -216.0666961669922, + "loss": 0.5463, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.1224035993218422, + "rewards/margins": 0.8045811653137207, + "rewards/rejected": -0.9269847869873047, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 2.6674320704171447e-07, + "logits/chosen": -1.8473272323608398, + "logits/rejected": -1.8571611642837524, + "logps/chosen": -280.41265869140625, + "logps/rejected": -216.84228515625, + "loss": 0.5252, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.16351863741874695, + "rewards/margins": 0.5998553037643433, + "rewards/rejected": -0.7633739113807678, + "step": 1510 + }, + { + "epoch": 1.57, + "learning_rate": 2.6482969766551853e-07, + "logits/chosen": -1.8803428411483765, + "logits/rejected": -1.7163865566253662, + "logps/chosen": -219.8080596923828, + "logps/rejected": -222.00283813476562, + "loss": 0.5456, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.11293137073516846, + "rewards/margins": 0.6036781072616577, + "rewards/rejected": -0.716609537601471, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 2.629161882893226e-07, + "logits/chosen": -1.7868821620941162, + "logits/rejected": -1.7498390674591064, + "logps/chosen": -264.19482421875, + "logps/rejected": -221.72030639648438, + "loss": 0.5192, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.19108808040618896, + "rewards/margins": 0.5844706296920776, + "rewards/rejected": -0.7755586504936218, + "step": 1530 + }, + { + "epoch": 1.59, + "learning_rate": 2.6100267891312666e-07, + "logits/chosen": -1.9263372421264648, + "logits/rejected": -1.8345693349838257, + "logps/chosen": -284.00885009765625, + "logps/rejected": -231.9293670654297, + "loss": 0.5536, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.1310052126646042, + "rewards/margins": 0.5621177554130554, + "rewards/rejected": -0.6931229829788208, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 2.590891695369307e-07, + "logits/chosen": -1.8512241840362549, + "logits/rejected": -1.612044095993042, + "logps/chosen": -242.47299194335938, + "logps/rejected": -224.6700897216797, + "loss": 0.5258, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.10790286213159561, + "rewards/margins": 0.6397491693496704, + "rewards/rejected": -0.7476519346237183, + "step": 1550 + }, + { + "epoch": 1.61, + "learning_rate": 2.571756601607348e-07, + "logits/chosen": -1.9594770669937134, + "logits/rejected": -1.8314173221588135, + "logps/chosen": -292.87005615234375, + "logps/rejected": -233.72705078125, + "loss": 0.545, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.11370010673999786, + "rewards/margins": 0.7442849278450012, + "rewards/rejected": -0.8579851388931274, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 2.5526215078453884e-07, + "logits/chosen": -1.8655388355255127, + "logits/rejected": -1.8951002359390259, + "logps/chosen": -273.2738952636719, + "logps/rejected": -235.41665649414062, + "loss": 0.537, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.058467138558626175, + "rewards/margins": 0.716724693775177, + "rewards/rejected": -0.7751919031143188, + "step": 1570 + }, + { + "epoch": 1.63, + "learning_rate": 2.533486414083429e-07, + "logits/chosen": -1.8273719549179077, + "logits/rejected": -1.8933742046356201, + "logps/chosen": -274.70074462890625, + "logps/rejected": -227.990234375, + "loss": 0.5134, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0908978134393692, + "rewards/margins": 0.746134877204895, + "rewards/rejected": -0.837032675743103, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 2.5143513203214697e-07, + "logits/chosen": -1.9826558828353882, + "logits/rejected": -1.75235915184021, + "logps/chosen": -309.63140869140625, + "logps/rejected": -242.9651336669922, + "loss": 0.5592, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06581602245569229, + "rewards/margins": 0.6739424467086792, + "rewards/rejected": -0.7397584915161133, + "step": 1590 + }, + { + "epoch": 1.65, + "learning_rate": 2.49521622655951e-07, + "logits/chosen": -1.923727035522461, + "logits/rejected": -1.7058594226837158, + "logps/chosen": -263.69439697265625, + "logps/rejected": -225.170166015625, + "loss": 0.5567, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11397922039031982, + "rewards/margins": 0.7004713416099548, + "rewards/rejected": -0.8144504427909851, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 2.4760811327975504e-07, + "logits/chosen": -1.8580553531646729, + "logits/rejected": -1.8299728631973267, + "logps/chosen": -266.441650390625, + "logps/rejected": -226.8827362060547, + "loss": 0.5355, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22734478116035461, + "rewards/margins": 0.5635603666305542, + "rewards/rejected": -0.7909051179885864, + "step": 1610 + }, + { + "epoch": 1.67, + "learning_rate": 2.456946039035591e-07, + "logits/chosen": -1.8391910791397095, + "logits/rejected": -1.8953899145126343, + "logps/chosen": -279.6722717285156, + "logps/rejected": -236.7119598388672, + "loss": 0.5685, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.15163454413414001, + "rewards/margins": 0.494490385055542, + "rewards/rejected": -0.6461249589920044, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 2.4378109452736316e-07, + "logits/chosen": -1.82537841796875, + "logits/rejected": -1.753363013267517, + "logps/chosen": -258.87847900390625, + "logps/rejected": -230.94479370117188, + "loss": 0.5298, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.04291687160730362, + "rewards/margins": 0.7452040910720825, + "rewards/rejected": -0.7881208658218384, + "step": 1630 + }, + { + "epoch": 1.69, + "learning_rate": 2.418675851511672e-07, + "logits/chosen": -1.7349640130996704, + "logits/rejected": -1.876021146774292, + "logps/chosen": -260.9759521484375, + "logps/rejected": -228.2615509033203, + "loss": 0.558, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18380671739578247, + "rewards/margins": 0.516321063041687, + "rewards/rejected": -0.7001277208328247, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 2.399540757749713e-07, + "logits/chosen": -1.8721230030059814, + "logits/rejected": -1.8479413986206055, + "logps/chosen": -266.7557373046875, + "logps/rejected": -229.310791015625, + "loss": 0.5334, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.14805713295936584, + "rewards/margins": 0.6217719912528992, + "rewards/rejected": -0.7698291540145874, + "step": 1650 + }, + { + "epoch": 1.71, + "learning_rate": 2.3804056639877535e-07, + "logits/chosen": -1.8186630010604858, + "logits/rejected": -1.6445763111114502, + "logps/chosen": -238.71884155273438, + "logps/rejected": -213.4270782470703, + "loss": 0.5523, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.046378009021282196, + "rewards/margins": 0.7334533333778381, + "rewards/rejected": -0.7798312902450562, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 2.361270570225794e-07, + "logits/chosen": -1.8854576349258423, + "logits/rejected": -1.816144585609436, + "logps/chosen": -274.6640930175781, + "logps/rejected": -220.7818603515625, + "loss": 0.502, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.016058530658483505, + "rewards/margins": 0.7327315211296082, + "rewards/rejected": -0.7487900257110596, + "step": 1670 + }, + { + "epoch": 1.73, + "learning_rate": 2.3421354764638345e-07, + "logits/chosen": -1.9531692266464233, + "logits/rejected": -1.7653286457061768, + "logps/chosen": -267.1680908203125, + "logps/rejected": -244.0019073486328, + "loss": 0.5542, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.15730252861976624, + "rewards/margins": 0.5572208166122437, + "rewards/rejected": -0.7145233154296875, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 2.323000382701875e-07, + "logits/chosen": -1.905790090560913, + "logits/rejected": -1.862587332725525, + "logps/chosen": -311.9358825683594, + "logps/rejected": -248.84475708007812, + "loss": 0.5244, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.08019833266735077, + "rewards/margins": 0.7832614183425903, + "rewards/rejected": -0.8634597659111023, + "step": 1690 + }, + { + "epoch": 1.76, + "learning_rate": 2.3038652889399157e-07, + "logits/chosen": -1.795546531677246, + "logits/rejected": -1.753016471862793, + "logps/chosen": -252.33975219726562, + "logps/rejected": -222.95034790039062, + "loss": 0.519, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.21246707439422607, + "rewards/margins": 0.604225754737854, + "rewards/rejected": -0.8166926503181458, + "step": 1700 + }, + { + "epoch": 1.77, + "learning_rate": 2.2847301951779563e-07, + "logits/chosen": -1.8621381521224976, + "logits/rejected": -1.8103243112564087, + "logps/chosen": -261.7289733886719, + "logps/rejected": -243.3286590576172, + "loss": 0.5318, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.1618395298719406, + "rewards/margins": 0.6970196962356567, + "rewards/rejected": -0.8588592410087585, + "step": 1710 + }, + { + "epoch": 1.78, + "learning_rate": 2.265595101415997e-07, + "logits/chosen": -1.7618186473846436, + "logits/rejected": -1.6055066585540771, + "logps/chosen": -262.61932373046875, + "logps/rejected": -247.23562622070312, + "loss": 0.5442, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1766400933265686, + "rewards/margins": 0.6458948850631714, + "rewards/rejected": -0.8225349187850952, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 2.2464600076540373e-07, + "logits/chosen": -1.8695290088653564, + "logits/rejected": -1.8926464319229126, + "logps/chosen": -296.50262451171875, + "logps/rejected": -231.22598266601562, + "loss": 0.54, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.19134683907032013, + "rewards/margins": 0.6846526861190796, + "rewards/rejected": -0.8759994506835938, + "step": 1730 + }, + { + "epoch": 1.8, + "learning_rate": 2.227324913892078e-07, + "logits/chosen": -1.8353866338729858, + "logits/rejected": -1.7200958728790283, + "logps/chosen": -284.80230712890625, + "logps/rejected": -229.96908569335938, + "loss": 0.5292, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.2048969715833664, + "rewards/margins": 0.651934802532196, + "rewards/rejected": -0.8568318486213684, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 2.2081898201301186e-07, + "logits/chosen": -1.9265692234039307, + "logits/rejected": -1.8183717727661133, + "logps/chosen": -290.4231872558594, + "logps/rejected": -242.4643096923828, + "loss": 0.536, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.1356951892375946, + "rewards/margins": 0.6436403393745422, + "rewards/rejected": -0.7793355584144592, + "step": 1750 + }, + { + "epoch": 1.82, + "learning_rate": 2.1890547263681592e-07, + "logits/chosen": -1.8183395862579346, + "logits/rejected": -1.819898247718811, + "logps/chosen": -268.138427734375, + "logps/rejected": -266.44384765625, + "loss": 0.5342, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.1618737280368805, + "rewards/margins": 0.6986343264579773, + "rewards/rejected": -0.860508143901825, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 2.1699196326061998e-07, + "logits/chosen": -1.846778154373169, + "logits/rejected": -1.6753194332122803, + "logps/chosen": -249.790771484375, + "logps/rejected": -246.05227661132812, + "loss": 0.5601, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24082091450691223, + "rewards/margins": 0.6031243801116943, + "rewards/rejected": -0.8439452052116394, + "step": 1770 + }, + { + "epoch": 1.84, + "learning_rate": 2.1507845388442402e-07, + "logits/chosen": -1.8025175333023071, + "logits/rejected": -1.7556968927383423, + "logps/chosen": -243.3855743408203, + "logps/rejected": -215.2812957763672, + "loss": 0.5336, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.21409931778907776, + "rewards/margins": 0.7002004384994507, + "rewards/rejected": -0.9142996668815613, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 2.1316494450822808e-07, + "logits/chosen": -1.7889652252197266, + "logits/rejected": -1.8297256231307983, + "logps/chosen": -239.4217071533203, + "logps/rejected": -202.02899169921875, + "loss": 0.5465, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.27625101804733276, + "rewards/margins": 0.6161350011825562, + "rewards/rejected": -0.8923860788345337, + "step": 1790 + }, + { + "epoch": 1.86, + "learning_rate": 2.1125143513203214e-07, + "logits/chosen": -1.8434979915618896, + "logits/rejected": -1.7568118572235107, + "logps/chosen": -280.7035827636719, + "logps/rejected": -215.9335479736328, + "loss": 0.511, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.15645402669906616, + "rewards/margins": 0.674486517906189, + "rewards/rejected": -0.8309405446052551, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 2.093379257558362e-07, + "logits/chosen": -1.7911853790283203, + "logits/rejected": -1.7433240413665771, + "logps/chosen": -260.5579528808594, + "logps/rejected": -224.52804565429688, + "loss": 0.555, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.2193847894668579, + "rewards/margins": 0.5018665194511414, + "rewards/rejected": -0.7212512493133545, + "step": 1810 + }, + { + "epoch": 1.88, + "learning_rate": 2.0742441637964026e-07, + "logits/chosen": -1.8497813940048218, + "logits/rejected": -1.7559770345687866, + "logps/chosen": -298.0724792480469, + "logps/rejected": -230.3092041015625, + "loss": 0.5112, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.0832061618566513, + "rewards/margins": 0.8249706029891968, + "rewards/rejected": -0.9081767797470093, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 2.055109070034443e-07, + "logits/chosen": -1.873970627784729, + "logits/rejected": -1.8093993663787842, + "logps/chosen": -288.2675476074219, + "logps/rejected": -234.6558074951172, + "loss": 0.5225, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.21631982922554016, + "rewards/margins": 0.590010404586792, + "rewards/rejected": -0.8063302040100098, + "step": 1830 + }, + { + "epoch": 1.9, + "learning_rate": 2.0359739762724836e-07, + "logits/chosen": -1.885053277015686, + "logits/rejected": -1.7519279718399048, + "logps/chosen": -302.2279357910156, + "logps/rejected": -237.22775268554688, + "loss": 0.5247, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11545169353485107, + "rewards/margins": 0.834151566028595, + "rewards/rejected": -0.949603259563446, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 2.0168388825105242e-07, + "logits/chosen": -1.9534156322479248, + "logits/rejected": -1.7689378261566162, + "logps/chosen": -273.7957458496094, + "logps/rejected": -261.95404052734375, + "loss": 0.5159, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12274491786956787, + "rewards/margins": 0.7097848653793335, + "rewards/rejected": -0.8325297236442566, + "step": 1850 + }, + { + "epoch": 1.92, + "learning_rate": 1.997703788748565e-07, + "logits/chosen": -1.9115434885025024, + "logits/rejected": -1.7871429920196533, + "logps/chosen": -289.39373779296875, + "logps/rejected": -199.10531616210938, + "loss": 0.5339, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.038600485771894455, + "rewards/margins": 0.8317173719406128, + "rewards/rejected": -0.8703179359436035, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 1.9785686949866055e-07, + "logits/chosen": -1.7981081008911133, + "logits/rejected": -1.7638845443725586, + "logps/chosen": -257.83685302734375, + "logps/rejected": -232.5749053955078, + "loss": 0.5597, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.1678551286458969, + "rewards/margins": 0.61128169298172, + "rewards/rejected": -0.7791367769241333, + "step": 1870 + }, + { + "epoch": 1.94, + "learning_rate": 1.9594336012246458e-07, + "logits/chosen": -1.8950119018554688, + "logits/rejected": -1.7373759746551514, + "logps/chosen": -242.224609375, + "logps/rejected": -226.2384796142578, + "loss": 0.5187, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.0838487520813942, + "rewards/margins": 0.7088004350662231, + "rewards/rejected": -0.7926491498947144, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 1.9402985074626865e-07, + "logits/chosen": -1.854583740234375, + "logits/rejected": -1.7873001098632812, + "logps/chosen": -274.5412902832031, + "logps/rejected": -230.3712615966797, + "loss": 0.5542, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.1923322081565857, + "rewards/margins": 0.7458332777023315, + "rewards/rejected": -0.938165545463562, + "step": 1890 + }, + { + "epoch": 1.96, + "learning_rate": 1.921163413700727e-07, + "logits/chosen": -1.815239667892456, + "logits/rejected": -1.717268943786621, + "logps/chosen": -262.6066589355469, + "logps/rejected": -237.44442749023438, + "loss": 0.5731, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2399211823940277, + "rewards/margins": 0.6559697389602661, + "rewards/rejected": -0.8958908915519714, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 1.9020283199387677e-07, + "logits/chosen": -1.8970063924789429, + "logits/rejected": -1.7926464080810547, + "logps/chosen": -256.3522644042969, + "logps/rejected": -222.38330078125, + "loss": 0.5527, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.08024901151657104, + "rewards/margins": 0.6879512071609497, + "rewards/rejected": -0.7682002186775208, + "step": 1910 + }, + { + "epoch": 1.98, + "learning_rate": 1.8828932261768083e-07, + "logits/chosen": -1.8556569814682007, + "logits/rejected": -1.7505378723144531, + "logps/chosen": -262.74908447265625, + "logps/rejected": -210.7177734375, + "loss": 0.5305, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.13729843497276306, + "rewards/margins": 0.8056707382202148, + "rewards/rejected": -0.9429691433906555, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 1.8637581324148487e-07, + "logits/chosen": -1.895007848739624, + "logits/rejected": -1.801287055015564, + "logps/chosen": -260.9505920410156, + "logps/rejected": -228.9954376220703, + "loss": 0.545, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.19283542037010193, + "rewards/margins": 0.7510678768157959, + "rewards/rejected": -0.9439032673835754, + "step": 1930 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -2.0746004581451416, + "eval_logits/rejected": -1.9506009817123413, + "eval_logps/chosen": -266.1286926269531, + "eval_logps/rejected": -227.90768432617188, + "eval_loss": 0.5312557220458984, + "eval_rewards/accuracies": 0.7440000176429749, + "eval_rewards/chosen": -0.14678505063056946, + "eval_rewards/margins": 0.715552568435669, + "eval_rewards/rejected": -0.8623375296592712, + "eval_runtime": 445.0864, + "eval_samples_per_second": 4.494, + "eval_steps_per_second": 0.281, + "step": 1937 + }, + { + "epoch": 2.0, + "learning_rate": 1.8446230386528893e-07, + "logits/chosen": -1.8309762477874756, + "logits/rejected": -1.7428325414657593, + "logps/chosen": -259.0466003417969, + "logps/rejected": -218.4160919189453, + "loss": 0.5391, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16953520476818085, + "rewards/margins": 0.6288765668869019, + "rewards/rejected": -0.7984118461608887, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 1.82548794489093e-07, + "logits/chosen": -1.867438554763794, + "logits/rejected": -1.8701823949813843, + "logps/chosen": -259.49285888671875, + "logps/rejected": -232.43960571289062, + "loss": 0.5413, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22325801849365234, + "rewards/margins": 0.7092195749282837, + "rewards/rejected": -0.9324776530265808, + "step": 1950 + }, + { + "epoch": 2.02, + "learning_rate": 1.8063528511289706e-07, + "logits/chosen": -1.725606918334961, + "logits/rejected": -1.762326955795288, + "logps/chosen": -260.210693359375, + "logps/rejected": -267.48980712890625, + "loss": 0.529, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.11623837798833847, + "rewards/margins": 0.7028089761734009, + "rewards/rejected": -0.8190473318099976, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 1.7872177573670112e-07, + "logits/chosen": -1.8463878631591797, + "logits/rejected": -1.8618934154510498, + "logps/chosen": -257.70281982421875, + "logps/rejected": -212.46566772460938, + "loss": 0.548, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.09152659773826599, + "rewards/margins": 0.7549413442611694, + "rewards/rejected": -0.8464679718017578, + "step": 1970 + }, + { + "epoch": 2.04, + "learning_rate": 1.7680826636050515e-07, + "logits/chosen": -1.8100248575210571, + "logits/rejected": -1.6833438873291016, + "logps/chosen": -277.1850280761719, + "logps/rejected": -232.51846313476562, + "loss": 0.5304, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.13607701659202576, + "rewards/margins": 0.7636335492134094, + "rewards/rejected": -0.8997105360031128, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 1.7489475698430921e-07, + "logits/chosen": -1.8786256313323975, + "logits/rejected": -1.735037088394165, + "logps/chosen": -268.2481384277344, + "logps/rejected": -240.97836303710938, + "loss": 0.5426, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.14459525048732758, + "rewards/margins": 0.6826692223548889, + "rewards/rejected": -0.8272644877433777, + "step": 1990 + }, + { + "epoch": 2.07, + "learning_rate": 1.7298124760811328e-07, + "logits/chosen": -1.9178975820541382, + "logits/rejected": -1.698952317237854, + "logps/chosen": -253.433837890625, + "logps/rejected": -209.8586883544922, + "loss": 0.5445, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2135821133852005, + "rewards/margins": 0.7801007032394409, + "rewards/rejected": -0.9936826825141907, + "step": 2000 + }, + { + "epoch": 2.08, + "learning_rate": 1.7106773823191734e-07, + "logits/chosen": -1.8799207210540771, + "logits/rejected": -1.713808298110962, + "logps/chosen": -269.6289978027344, + "logps/rejected": -222.32015991210938, + "loss": 0.5263, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.11016847938299179, + "rewards/margins": 0.9144603610038757, + "rewards/rejected": -1.0246288776397705, + "step": 2010 + }, + { + "epoch": 2.09, + "learning_rate": 1.691542288557214e-07, + "logits/chosen": -1.936486840248108, + "logits/rejected": -1.5898669958114624, + "logps/chosen": -282.9302673339844, + "logps/rejected": -235.68594360351562, + "loss": 0.5424, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1706642210483551, + "rewards/margins": 0.6915091276168823, + "rewards/rejected": -0.8621733784675598, + "step": 2020 + }, + { + "epoch": 2.1, + "learning_rate": 1.6724071947952544e-07, + "logits/chosen": -1.8118308782577515, + "logits/rejected": -1.885777473449707, + "logps/chosen": -242.67636108398438, + "logps/rejected": -217.92343139648438, + "loss": 0.5394, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.1947106271982193, + "rewards/margins": 0.6723589897155762, + "rewards/rejected": -0.8670696020126343, + "step": 2030 + }, + { + "epoch": 2.11, + "learning_rate": 1.653272101033295e-07, + "logits/chosen": -1.855661153793335, + "logits/rejected": -1.737422227859497, + "logps/chosen": -273.73272705078125, + "logps/rejected": -245.52420043945312, + "loss": 0.5177, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.11165623366832733, + "rewards/margins": 0.8057673573493958, + "rewards/rejected": -0.91742342710495, + "step": 2040 + }, + { + "epoch": 2.12, + "learning_rate": 1.6341370072713356e-07, + "logits/chosen": -1.8942610025405884, + "logits/rejected": -1.7890903949737549, + "logps/chosen": -276.583740234375, + "logps/rejected": -250.9980926513672, + "loss": 0.539, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.13766427338123322, + "rewards/margins": 0.6409403085708618, + "rewards/rejected": -0.7786045670509338, + "step": 2050 + }, + { + "epoch": 2.13, + "learning_rate": 1.6150019135093762e-07, + "logits/chosen": -1.9332313537597656, + "logits/rejected": -1.719761610031128, + "logps/chosen": -287.0956115722656, + "logps/rejected": -229.3668212890625, + "loss": 0.5139, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.1157849058508873, + "rewards/margins": 0.7927757501602173, + "rewards/rejected": -0.9085606336593628, + "step": 2060 + }, + { + "epoch": 2.14, + "learning_rate": 1.5958668197474169e-07, + "logits/chosen": -1.9708349704742432, + "logits/rejected": -1.8285831212997437, + "logps/chosen": -250.4374237060547, + "logps/rejected": -258.11676025390625, + "loss": 0.5489, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.053410161286592484, + "rewards/margins": 0.6914520263671875, + "rewards/rejected": -0.7448622584342957, + "step": 2070 + }, + { + "epoch": 2.15, + "learning_rate": 1.5767317259854572e-07, + "logits/chosen": -1.7705554962158203, + "logits/rejected": -1.7897024154663086, + "logps/chosen": -294.6583251953125, + "logps/rejected": -229.2305450439453, + "loss": 0.5364, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.04103150963783264, + "rewards/margins": 0.7850837111473083, + "rewards/rejected": -0.8261152505874634, + "step": 2080 + }, + { + "epoch": 2.16, + "learning_rate": 1.5575966322234978e-07, + "logits/chosen": -1.8095719814300537, + "logits/rejected": -1.8483651876449585, + "logps/chosen": -263.2212829589844, + "logps/rejected": -229.52511596679688, + "loss": 0.5266, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09365183115005493, + "rewards/margins": 0.763170063495636, + "rewards/rejected": -0.8568218946456909, + "step": 2090 + }, + { + "epoch": 2.17, + "learning_rate": 1.5384615384615385e-07, + "logits/chosen": -1.827618956565857, + "logits/rejected": -1.6979715824127197, + "logps/chosen": -249.76107788085938, + "logps/rejected": -207.6736602783203, + "loss": 0.5138, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15310709178447723, + "rewards/margins": 0.8517143130302429, + "rewards/rejected": -1.0048214197158813, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 1.519326444699579e-07, + "logits/chosen": -1.8750379085540771, + "logits/rejected": -1.7648203372955322, + "logps/chosen": -282.3287658691406, + "logps/rejected": -217.60525512695312, + "loss": 0.5142, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.026097621768712997, + "rewards/margins": 0.8535500764846802, + "rewards/rejected": -0.8796476125717163, + "step": 2110 + }, + { + "epoch": 2.19, + "learning_rate": 1.5001913509376197e-07, + "logits/chosen": -1.8425514698028564, + "logits/rejected": -1.8122365474700928, + "logps/chosen": -276.6051025390625, + "logps/rejected": -243.2883758544922, + "loss": 0.5303, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.22309927642345428, + "rewards/margins": 0.67050701379776, + "rewards/rejected": -0.8936061859130859, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 1.4810562571756603e-07, + "logits/chosen": -1.947784185409546, + "logits/rejected": -1.742669701576233, + "logps/chosen": -246.5569305419922, + "logps/rejected": -221.22140502929688, + "loss": 0.5515, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1406203657388687, + "rewards/margins": 0.774002194404602, + "rewards/rejected": -0.9146224856376648, + "step": 2130 + }, + { + "epoch": 2.21, + "learning_rate": 1.4619211634137007e-07, + "logits/chosen": -1.7936385869979858, + "logits/rejected": -1.6397230625152588, + "logps/chosen": -235.45187377929688, + "logps/rejected": -197.29556274414062, + "loss": 0.5112, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.11487104743719101, + "rewards/margins": 0.8687982559204102, + "rewards/rejected": -0.9836692810058594, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 1.4427860696517413e-07, + "logits/chosen": -1.8870512247085571, + "logits/rejected": -1.7370729446411133, + "logps/chosen": -314.30548095703125, + "logps/rejected": -258.6022033691406, + "loss": 0.5565, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.11233721673488617, + "rewards/margins": 0.721551775932312, + "rewards/rejected": -0.8338890075683594, + "step": 2150 + }, + { + "epoch": 2.23, + "learning_rate": 1.423650975889782e-07, + "logits/chosen": -1.8495047092437744, + "logits/rejected": -1.896612524986267, + "logps/chosen": -270.6409606933594, + "logps/rejected": -233.9993133544922, + "loss": 0.5218, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10107968002557755, + "rewards/margins": 0.8023698925971985, + "rewards/rejected": -0.9034496545791626, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 1.4045158821278225e-07, + "logits/chosen": -1.8769447803497314, + "logits/rejected": -1.7662044763565063, + "logps/chosen": -282.11590576171875, + "logps/rejected": -245.39773559570312, + "loss": 0.516, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10892156511545181, + "rewards/margins": 0.8461700677871704, + "rewards/rejected": -0.9550915956497192, + "step": 2170 + }, + { + "epoch": 2.25, + "learning_rate": 1.3853807883658632e-07, + "logits/chosen": -1.9545669555664062, + "logits/rejected": -1.7949491739273071, + "logps/chosen": -283.76239013671875, + "logps/rejected": -222.0093536376953, + "loss": 0.5561, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13183176517486572, + "rewards/margins": 0.7535629272460938, + "rewards/rejected": -0.8853947520256042, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 1.3662456946039035e-07, + "logits/chosen": -1.822066068649292, + "logits/rejected": -1.7269483804702759, + "logps/chosen": -245.1105499267578, + "logps/rejected": -228.8251190185547, + "loss": 0.5324, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1489664912223816, + "rewards/margins": 0.7033448815345764, + "rewards/rejected": -0.8523114323616028, + "step": 2190 + }, + { + "epoch": 2.27, + "learning_rate": 1.3471106008419441e-07, + "logits/chosen": -1.795738935470581, + "logits/rejected": -1.6468427181243896, + "logps/chosen": -230.1129913330078, + "logps/rejected": -212.0528564453125, + "loss": 0.5352, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2626517415046692, + "rewards/margins": 0.6540511846542358, + "rewards/rejected": -0.916702926158905, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 1.3279755070799848e-07, + "logits/chosen": -1.8404350280761719, + "logits/rejected": -1.65227472782135, + "logps/chosen": -288.52960205078125, + "logps/rejected": -236.2271728515625, + "loss": 0.5391, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2433219850063324, + "rewards/margins": 0.7124189138412476, + "rewards/rejected": -0.9557409286499023, + "step": 2210 + }, + { + "epoch": 2.29, + "learning_rate": 1.3088404133180254e-07, + "logits/chosen": -1.8941370248794556, + "logits/rejected": -1.7674516439437866, + "logps/chosen": -307.49566650390625, + "logps/rejected": -229.32421875, + "loss": 0.4823, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.06332740932703018, + "rewards/margins": 0.8840253949165344, + "rewards/rejected": -0.9473527669906616, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 1.289705319556066e-07, + "logits/chosen": -1.8467296361923218, + "logits/rejected": -1.7252922058105469, + "logps/chosen": -279.69232177734375, + "logps/rejected": -234.534423828125, + "loss": 0.5248, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.12596972286701202, + "rewards/margins": 0.7875716090202332, + "rewards/rejected": -0.9135414361953735, + "step": 2230 + }, + { + "epoch": 2.31, + "learning_rate": 1.2705702257941064e-07, + "logits/chosen": -1.8842837810516357, + "logits/rejected": -1.796021819114685, + "logps/chosen": -267.0751953125, + "logps/rejected": -240.4868621826172, + "loss": 0.5167, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.10872729867696762, + "rewards/margins": 0.8399373292922974, + "rewards/rejected": -0.948664665222168, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 1.251435132032147e-07, + "logits/chosen": -1.8770482540130615, + "logits/rejected": -1.7965824604034424, + "logps/chosen": -248.2671356201172, + "logps/rejected": -235.351318359375, + "loss": 0.5274, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.229864239692688, + "rewards/margins": 0.6358078122138977, + "rewards/rejected": -0.8656721115112305, + "step": 2250 + }, + { + "epoch": 2.33, + "learning_rate": 1.2323000382701873e-07, + "logits/chosen": -1.9136450290679932, + "logits/rejected": -1.7778723239898682, + "logps/chosen": -294.6365051269531, + "logps/rejected": -232.9202880859375, + "loss": 0.5574, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.14158369600772858, + "rewards/margins": 0.7766921520233154, + "rewards/rejected": -0.9182759523391724, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 1.213164944508228e-07, + "logits/chosen": -1.885565996170044, + "logits/rejected": -1.8137308359146118, + "logps/chosen": -284.42779541015625, + "logps/rejected": -246.0910186767578, + "loss": 0.5154, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.2045001983642578, + "rewards/margins": 0.7741453051567078, + "rewards/rejected": -0.9786455035209656, + "step": 2270 + }, + { + "epoch": 2.35, + "learning_rate": 1.1940298507462686e-07, + "logits/chosen": -1.7657403945922852, + "logits/rejected": -1.799722671508789, + "logps/chosen": -269.136962890625, + "logps/rejected": -220.0470733642578, + "loss": 0.5204, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.17988713085651398, + "rewards/margins": 0.7485309839248657, + "rewards/rejected": -0.9284180402755737, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 1.1748947569843092e-07, + "logits/chosen": -1.9315412044525146, + "logits/rejected": -1.7396808862686157, + "logps/chosen": -279.1973876953125, + "logps/rejected": -214.91732788085938, + "loss": 0.5089, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.15460029244422913, + "rewards/margins": 0.7415857315063477, + "rewards/rejected": -0.8961860537528992, + "step": 2290 + }, + { + "epoch": 2.37, + "learning_rate": 1.1557596632223497e-07, + "logits/chosen": -1.8266870975494385, + "logits/rejected": -1.6883373260498047, + "logps/chosen": -266.30865478515625, + "logps/rejected": -228.08285522460938, + "loss": 0.5163, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.1466519981622696, + "rewards/margins": 0.7096244096755981, + "rewards/rejected": -0.8562763929367065, + "step": 2300 + }, + { + "epoch": 2.39, + "learning_rate": 1.1366245694603903e-07, + "logits/chosen": -1.8241113424301147, + "logits/rejected": -1.8161547183990479, + "logps/chosen": -267.0104064941406, + "logps/rejected": -235.6630859375, + "loss": 0.5525, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.21230247616767883, + "rewards/margins": 0.6911984086036682, + "rewards/rejected": -0.9035008549690247, + "step": 2310 + }, + { + "epoch": 2.4, + "learning_rate": 1.1174894756984308e-07, + "logits/chosen": -1.7367159128189087, + "logits/rejected": -1.6535007953643799, + "logps/chosen": -228.43429565429688, + "logps/rejected": -225.22537231445312, + "loss": 0.5245, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.2316330373287201, + "rewards/margins": 0.6564348340034485, + "rewards/rejected": -0.8880678415298462, + "step": 2320 + }, + { + "epoch": 2.41, + "learning_rate": 1.0983543819364714e-07, + "logits/chosen": -1.896563172340393, + "logits/rejected": -1.7375274896621704, + "logps/chosen": -250.31442260742188, + "logps/rejected": -224.95828247070312, + "loss": 0.4816, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.23072636127471924, + "rewards/margins": 0.7319254279136658, + "rewards/rejected": -0.9626517295837402, + "step": 2330 + }, + { + "epoch": 2.42, + "learning_rate": 1.079219288174512e-07, + "logits/chosen": -1.7249380350112915, + "logits/rejected": -1.6061763763427734, + "logps/chosen": -230.9118194580078, + "logps/rejected": -253.45858764648438, + "loss": 0.5352, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.19668206572532654, + "rewards/margins": 0.6849542260169983, + "rewards/rejected": -0.8816363215446472, + "step": 2340 + }, + { + "epoch": 2.43, + "learning_rate": 1.0600841944125525e-07, + "logits/chosen": -1.7855503559112549, + "logits/rejected": -1.649736762046814, + "logps/chosen": -288.23529052734375, + "logps/rejected": -241.0659637451172, + "loss": 0.5518, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.1557173877954483, + "rewards/margins": 0.7118433117866516, + "rewards/rejected": -0.8675606846809387, + "step": 2350 + }, + { + "epoch": 2.44, + "learning_rate": 1.0409491006505931e-07, + "logits/chosen": -1.8088535070419312, + "logits/rejected": -1.8049001693725586, + "logps/chosen": -257.82806396484375, + "logps/rejected": -232.189453125, + "loss": 0.5429, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.138966903090477, + "rewards/margins": 0.7472674250602722, + "rewards/rejected": -0.8862342834472656, + "step": 2360 + }, + { + "epoch": 2.45, + "learning_rate": 1.0218140068886336e-07, + "logits/chosen": -1.731967568397522, + "logits/rejected": -1.6263824701309204, + "logps/chosen": -272.4329833984375, + "logps/rejected": -227.0443878173828, + "loss": 0.5573, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1651538461446762, + "rewards/margins": 0.6294243335723877, + "rewards/rejected": -0.7945781946182251, + "step": 2370 + }, + { + "epoch": 2.46, + "learning_rate": 1.0026789131266743e-07, + "logits/chosen": -1.7465593814849854, + "logits/rejected": -1.5932323932647705, + "logps/chosen": -266.763427734375, + "logps/rejected": -213.6304931640625, + "loss": 0.5402, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13140526413917542, + "rewards/margins": 0.70847487449646, + "rewards/rejected": -0.8398801684379578, + "step": 2380 + }, + { + "epoch": 2.47, + "learning_rate": 9.835438193647149e-08, + "logits/chosen": -1.8030688762664795, + "logits/rejected": -1.809012770652771, + "logps/chosen": -275.38812255859375, + "logps/rejected": -225.98660278320312, + "loss": 0.5564, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.23618058860301971, + "rewards/margins": 0.7667824029922485, + "rewards/rejected": -1.0029628276824951, + "step": 2390 + }, + { + "epoch": 2.48, + "learning_rate": 9.644087256027554e-08, + "logits/chosen": -1.827406644821167, + "logits/rejected": -1.7971584796905518, + "logps/chosen": -283.8343811035156, + "logps/rejected": -238.54214477539062, + "loss": 0.537, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.16179491579532623, + "rewards/margins": 0.7565305233001709, + "rewards/rejected": -0.9183254241943359, + "step": 2400 + }, + { + "epoch": 2.49, + "learning_rate": 9.45273631840796e-08, + "logits/chosen": -1.8009891510009766, + "logits/rejected": -1.7545578479766846, + "logps/chosen": -270.2419128417969, + "logps/rejected": -240.7090301513672, + "loss": 0.5386, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.12055712938308716, + "rewards/margins": 0.7144738435745239, + "rewards/rejected": -0.8350311517715454, + "step": 2410 + }, + { + "epoch": 2.5, + "learning_rate": 9.261385380788366e-08, + "logits/chosen": -1.7695505619049072, + "logits/rejected": -1.7582073211669922, + "logps/chosen": -265.74786376953125, + "logps/rejected": -218.88107299804688, + "loss": 0.5256, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.21400511264801025, + "rewards/margins": 0.5858114361763, + "rewards/rejected": -0.7998165488243103, + "step": 2420 + }, + { + "epoch": 2.51, + "learning_rate": 9.070034443168771e-08, + "logits/chosen": -1.8036329746246338, + "logits/rejected": -1.7557218074798584, + "logps/chosen": -261.3813781738281, + "logps/rejected": -265.0194091796875, + "loss": 0.5538, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.24859030544757843, + "rewards/margins": 0.676604151725769, + "rewards/rejected": -0.9251944422721863, + "step": 2430 + }, + { + "epoch": 2.52, + "learning_rate": 8.878683505549177e-08, + "logits/chosen": -1.825974702835083, + "logits/rejected": -1.817724585533142, + "logps/chosen": -253.3404998779297, + "logps/rejected": -253.4492645263672, + "loss": 0.5102, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.0755300372838974, + "rewards/margins": 0.8097864985466003, + "rewards/rejected": -0.8853166699409485, + "step": 2440 + }, + { + "epoch": 2.53, + "learning_rate": 8.687332567929582e-08, + "logits/chosen": -1.8062114715576172, + "logits/rejected": -1.7560087442398071, + "logps/chosen": -241.2262725830078, + "logps/rejected": -240.1786651611328, + "loss": 0.5371, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.18391039967536926, + "rewards/margins": 0.6250076293945312, + "rewards/rejected": -0.8089178800582886, + "step": 2450 + }, + { + "epoch": 2.54, + "learning_rate": 8.495981630309988e-08, + "logits/chosen": -1.7616338729858398, + "logits/rejected": -1.7940946817398071, + "logps/chosen": -291.49346923828125, + "logps/rejected": -230.8258819580078, + "loss": 0.5382, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.1201476901769638, + "rewards/margins": 0.7259420156478882, + "rewards/rejected": -0.8460898399353027, + "step": 2460 + }, + { + "epoch": 2.55, + "learning_rate": 8.304630692690395e-08, + "logits/chosen": -1.8886840343475342, + "logits/rejected": -1.8246275186538696, + "logps/chosen": -257.1405029296875, + "logps/rejected": -243.4104461669922, + "loss": 0.5278, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1191997155547142, + "rewards/margins": 0.805163562297821, + "rewards/rejected": -0.9243633151054382, + "step": 2470 + }, + { + "epoch": 2.56, + "learning_rate": 8.1132797550708e-08, + "logits/chosen": -1.9535953998565674, + "logits/rejected": -1.8544635772705078, + "logps/chosen": -301.24676513671875, + "logps/rejected": -230.9423828125, + "loss": 0.5325, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09769626706838608, + "rewards/margins": 0.8011761903762817, + "rewards/rejected": -0.8988723754882812, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 7.921928817451206e-08, + "logits/chosen": -1.8355233669281006, + "logits/rejected": -1.6367290019989014, + "logps/chosen": -260.4992980957031, + "logps/rejected": -222.38693237304688, + "loss": 0.514, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13300195336341858, + "rewards/margins": 0.7211569547653198, + "rewards/rejected": -0.854158878326416, + "step": 2490 + }, + { + "epoch": 2.58, + "learning_rate": 7.73057787983161e-08, + "logits/chosen": -1.8741086721420288, + "logits/rejected": -1.8212274312973022, + "logps/chosen": -275.5597839355469, + "logps/rejected": -240.51608276367188, + "loss": 0.5101, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10640747845172882, + "rewards/margins": 0.7899595499038696, + "rewards/rejected": -0.8963669538497925, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 7.539226942212017e-08, + "logits/chosen": -1.8639538288116455, + "logits/rejected": -1.7817881107330322, + "logps/chosen": -287.79132080078125, + "logps/rejected": -250.21066284179688, + "loss": 0.5227, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.1344916671514511, + "rewards/margins": 0.8036550283432007, + "rewards/rejected": -0.9381467700004578, + "step": 2510 + }, + { + "epoch": 2.6, + "learning_rate": 7.347876004592423e-08, + "logits/chosen": -1.8294289112091064, + "logits/rejected": -1.783036231994629, + "logps/chosen": -257.20635986328125, + "logps/rejected": -193.5693359375, + "loss": 0.5419, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.12061061710119247, + "rewards/margins": 0.7805212736129761, + "rewards/rejected": -0.9011319875717163, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 7.156525066972828e-08, + "logits/chosen": -1.771731972694397, + "logits/rejected": -1.6772375106811523, + "logps/chosen": -252.85006713867188, + "logps/rejected": -214.2803497314453, + "loss": 0.5349, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.15694710612297058, + "rewards/margins": 0.6949201226234436, + "rewards/rejected": -0.8518671989440918, + "step": 2530 + }, + { + "epoch": 2.62, + "learning_rate": 6.965174129353234e-08, + "logits/chosen": -1.8179445266723633, + "logits/rejected": -1.7768890857696533, + "logps/chosen": -291.8775939941406, + "logps/rejected": -242.23178100585938, + "loss": 0.5355, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.0762079656124115, + "rewards/margins": 0.8202627301216125, + "rewards/rejected": -0.8964706659317017, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 6.773823191733639e-08, + "logits/chosen": -1.745713233947754, + "logits/rejected": -1.7205911874771118, + "logps/chosen": -258.77825927734375, + "logps/rejected": -222.5753631591797, + "loss": 0.5238, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.08277976512908936, + "rewards/margins": 0.8328452110290527, + "rewards/rejected": -0.9156249761581421, + "step": 2550 + }, + { + "epoch": 2.64, + "learning_rate": 6.582472254114045e-08, + "logits/chosen": -1.769708275794983, + "logits/rejected": -1.8346328735351562, + "logps/chosen": -257.17034912109375, + "logps/rejected": -248.9686279296875, + "loss": 0.563, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.23198464512825012, + "rewards/margins": 0.49988383054733276, + "rewards/rejected": -0.73186856508255, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 6.391121316494451e-08, + "logits/chosen": -1.8102190494537354, + "logits/rejected": -1.7782459259033203, + "logps/chosen": -266.3741149902344, + "logps/rejected": -241.44412231445312, + "loss": 0.5278, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.2132963240146637, + "rewards/margins": 0.6920968294143677, + "rewards/rejected": -0.9053932428359985, + "step": 2570 + }, + { + "epoch": 2.66, + "learning_rate": 6.199770378874856e-08, + "logits/chosen": -1.7425388097763062, + "logits/rejected": -1.7338457107543945, + "logps/chosen": -279.8531188964844, + "logps/rejected": -242.36764526367188, + "loss": 0.5355, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.09482035785913467, + "rewards/margins": 0.7523207068443298, + "rewards/rejected": -0.8471410870552063, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 6.008419441255262e-08, + "logits/chosen": -1.8110402822494507, + "logits/rejected": -1.8718684911727905, + "logps/chosen": -271.5622253417969, + "logps/rejected": -217.5170440673828, + "loss": 0.5185, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.20837874710559845, + "rewards/margins": 0.6790444254875183, + "rewards/rejected": -0.8874232172966003, + "step": 2590 + }, + { + "epoch": 2.68, + "learning_rate": 5.817068503635668e-08, + "logits/chosen": -1.9279320240020752, + "logits/rejected": -1.8386805057525635, + "logps/chosen": -293.8493957519531, + "logps/rejected": -244.04849243164062, + "loss": 0.5146, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.12184705585241318, + "rewards/margins": 0.8180648684501648, + "rewards/rejected": -0.9399120211601257, + "step": 2600 + }, + { + "epoch": 2.69, + "learning_rate": 5.6257175660160735e-08, + "logits/chosen": -1.7666857242584229, + "logits/rejected": -1.7620136737823486, + "logps/chosen": -239.4011993408203, + "logps/rejected": -239.3124542236328, + "loss": 0.5185, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.1580074578523636, + "rewards/margins": 0.6797144412994385, + "rewards/rejected": -0.8377218246459961, + "step": 2610 + }, + { + "epoch": 2.71, + "learning_rate": 5.4343666283964784e-08, + "logits/chosen": -1.7690048217773438, + "logits/rejected": -1.6718857288360596, + "logps/chosen": -266.88946533203125, + "logps/rejected": -212.6228485107422, + "loss": 0.523, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.0628361701965332, + "rewards/margins": 0.9115892648696899, + "rewards/rejected": -0.9744253158569336, + "step": 2620 + }, + { + "epoch": 2.72, + "learning_rate": 5.243015690776884e-08, + "logits/chosen": -1.7922004461288452, + "logits/rejected": -1.658286452293396, + "logps/chosen": -279.45086669921875, + "logps/rejected": -220.2642059326172, + "loss": 0.5194, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.13799555599689484, + "rewards/margins": 0.7635752558708191, + "rewards/rejected": -0.9015709161758423, + "step": 2630 + }, + { + "epoch": 2.73, + "learning_rate": 5.05166475315729e-08, + "logits/chosen": -1.7312570810317993, + "logits/rejected": -1.7711658477783203, + "logps/chosen": -251.97451782226562, + "logps/rejected": -229.8913116455078, + "loss": 0.5334, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.22601954638957977, + "rewards/margins": 0.7322612404823303, + "rewards/rejected": -0.9582807421684265, + "step": 2640 + }, + { + "epoch": 2.74, + "learning_rate": 4.860313815537696e-08, + "logits/chosen": -1.8644888401031494, + "logits/rejected": -1.7808336019515991, + "logps/chosen": -267.78851318359375, + "logps/rejected": -227.9579620361328, + "loss": 0.539, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11055097728967667, + "rewards/margins": 0.7926282286643982, + "rewards/rejected": -0.9031792879104614, + "step": 2650 + }, + { + "epoch": 2.75, + "learning_rate": 4.668962877918101e-08, + "logits/chosen": -1.8347488641738892, + "logits/rejected": -1.7388532161712646, + "logps/chosen": -237.673095703125, + "logps/rejected": -254.39358520507812, + "loss": 0.5261, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.23437213897705078, + "rewards/margins": 0.6564434170722961, + "rewards/rejected": -0.8908154368400574, + "step": 2660 + }, + { + "epoch": 2.76, + "learning_rate": 4.477611940298507e-08, + "logits/chosen": -1.8402671813964844, + "logits/rejected": -1.7257741689682007, + "logps/chosen": -283.63519287109375, + "logps/rejected": -235.09774780273438, + "loss": 0.4881, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.05417463928461075, + "rewards/margins": 0.8945249319076538, + "rewards/rejected": -0.9486994743347168, + "step": 2670 + }, + { + "epoch": 2.77, + "learning_rate": 4.2862610026789124e-08, + "logits/chosen": -1.9866876602172852, + "logits/rejected": -1.7852119207382202, + "logps/chosen": -254.2245330810547, + "logps/rejected": -231.99362182617188, + "loss": 0.5503, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.12182004749774933, + "rewards/margins": 0.7041906714439392, + "rewards/rejected": -0.8260107040405273, + "step": 2680 + }, + { + "epoch": 2.78, + "learning_rate": 4.0949100650593186e-08, + "logits/chosen": -1.7733800411224365, + "logits/rejected": -1.7326838970184326, + "logps/chosen": -249.9972381591797, + "logps/rejected": -241.3240966796875, + "loss": 0.5399, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.13251790404319763, + "rewards/margins": 0.7282706499099731, + "rewards/rejected": -0.8607885241508484, + "step": 2690 + }, + { + "epoch": 2.79, + "learning_rate": 3.903559127439724e-08, + "logits/chosen": -1.7894248962402344, + "logits/rejected": -1.7059452533721924, + "logps/chosen": -251.04580688476562, + "logps/rejected": -203.8979034423828, + "loss": 0.5085, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10947040468454361, + "rewards/margins": 0.823805034160614, + "rewards/rejected": -0.9332754015922546, + "step": 2700 + }, + { + "epoch": 2.8, + "learning_rate": 3.71220818982013e-08, + "logits/chosen": -1.8997509479522705, + "logits/rejected": -1.7137277126312256, + "logps/chosen": -275.0599365234375, + "logps/rejected": -219.4813995361328, + "loss": 0.5397, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.23407335579395294, + "rewards/margins": 0.5349047780036926, + "rewards/rejected": -0.7689779996871948, + "step": 2710 + }, + { + "epoch": 2.81, + "learning_rate": 3.520857252200535e-08, + "logits/chosen": -1.8389432430267334, + "logits/rejected": -1.8381201028823853, + "logps/chosen": -271.7427673339844, + "logps/rejected": -260.01513671875, + "loss": 0.5358, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16513243317604065, + "rewards/margins": 0.5701314806938171, + "rewards/rejected": -0.7352639436721802, + "step": 2720 + }, + { + "epoch": 2.82, + "learning_rate": 3.3295063145809414e-08, + "logits/chosen": -1.82058846950531, + "logits/rejected": -1.7122089862823486, + "logps/chosen": -262.57904052734375, + "logps/rejected": -245.53134155273438, + "loss": 0.5516, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.16321533918380737, + "rewards/margins": 0.7112065553665161, + "rewards/rejected": -0.8744218945503235, + "step": 2730 + }, + { + "epoch": 2.83, + "learning_rate": 3.138155376961347e-08, + "logits/chosen": -1.9143041372299194, + "logits/rejected": -1.7294342517852783, + "logps/chosen": -289.9866027832031, + "logps/rejected": -258.2250671386719, + "loss": 0.5742, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.1562449038028717, + "rewards/margins": 0.8107137680053711, + "rewards/rejected": -0.9669586420059204, + "step": 2740 + }, + { + "epoch": 2.84, + "learning_rate": 2.9468044393417525e-08, + "logits/chosen": -1.8519618511199951, + "logits/rejected": -1.62540602684021, + "logps/chosen": -261.181396484375, + "logps/rejected": -222.9465789794922, + "loss": 0.5165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.051930081099271774, + "rewards/margins": 0.8730701208114624, + "rewards/rejected": -0.9250003099441528, + "step": 2750 + }, + { + "epoch": 2.85, + "learning_rate": 2.755453501722158e-08, + "logits/chosen": -1.7001488208770752, + "logits/rejected": -1.7324724197387695, + "logps/chosen": -244.74472045898438, + "logps/rejected": -230.0176239013672, + "loss": 0.5089, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1713591068983078, + "rewards/margins": 0.6921594142913818, + "rewards/rejected": -0.8635184168815613, + "step": 2760 + }, + { + "epoch": 2.86, + "learning_rate": 2.564102564102564e-08, + "logits/chosen": -1.8177697658538818, + "logits/rejected": -1.7782388925552368, + "logps/chosen": -263.41650390625, + "logps/rejected": -236.8643341064453, + "loss": 0.5166, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.07830659300088882, + "rewards/margins": 0.8659903407096863, + "rewards/rejected": -0.9442968368530273, + "step": 2770 + }, + { + "epoch": 2.87, + "learning_rate": 2.3727516264829695e-08, + "logits/chosen": -1.8631365299224854, + "logits/rejected": -1.7254209518432617, + "logps/chosen": -295.65338134765625, + "logps/rejected": -264.22210693359375, + "loss": 0.5431, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.17194876074790955, + "rewards/margins": 0.7064958810806274, + "rewards/rejected": -0.8784445524215698, + "step": 2780 + }, + { + "epoch": 2.88, + "learning_rate": 2.1814006888633754e-08, + "logits/chosen": -1.8437837362289429, + "logits/rejected": -1.7293872833251953, + "logps/chosen": -269.7587585449219, + "logps/rejected": -217.47433471679688, + "loss": 0.542, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.15066538751125336, + "rewards/margins": 0.7276660203933716, + "rewards/rejected": -0.8783313632011414, + "step": 2790 + }, + { + "epoch": 2.89, + "learning_rate": 1.990049751243781e-08, + "logits/chosen": -1.8254578113555908, + "logits/rejected": -1.7057873010635376, + "logps/chosen": -274.4022521972656, + "logps/rejected": -218.07015991210938, + "loss": 0.5428, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.17712940275669098, + "rewards/margins": 0.6617370247840881, + "rewards/rejected": -0.8388664126396179, + "step": 2800 + }, + { + "epoch": 2.9, + "learning_rate": 1.7986988136241865e-08, + "logits/chosen": -1.729116678237915, + "logits/rejected": -1.7570183277130127, + "logps/chosen": -271.8856506347656, + "logps/rejected": -264.41473388671875, + "loss": 0.4937, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.1282016783952713, + "rewards/margins": 0.6722147464752197, + "rewards/rejected": -0.8004163503646851, + "step": 2810 + }, + { + "epoch": 2.91, + "learning_rate": 1.6073478760045924e-08, + "logits/chosen": -1.8318380117416382, + "logits/rejected": -1.6401628255844116, + "logps/chosen": -267.8665771484375, + "logps/rejected": -214.05642700195312, + "loss": 0.5384, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.16904288530349731, + "rewards/margins": 0.7819380760192871, + "rewards/rejected": -0.9509809613227844, + "step": 2820 + }, + { + "epoch": 2.92, + "learning_rate": 1.4159969383849981e-08, + "logits/chosen": -1.7322664260864258, + "logits/rejected": -1.8341996669769287, + "logps/chosen": -253.93136596679688, + "logps/rejected": -220.20571899414062, + "loss": 0.5253, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.18089726567268372, + "rewards/margins": 0.6462094783782959, + "rewards/rejected": -0.8271068334579468, + "step": 2830 + }, + { + "epoch": 2.93, + "learning_rate": 1.2246460007654037e-08, + "logits/chosen": -1.8754618167877197, + "logits/rejected": -1.7015602588653564, + "logps/chosen": -278.8792419433594, + "logps/rejected": -227.3442840576172, + "loss": 0.5596, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3005698025226593, + "rewards/margins": 0.5349802374839783, + "rewards/rejected": -0.8355501294136047, + "step": 2840 + }, + { + "epoch": 2.94, + "learning_rate": 1.0332950631458094e-08, + "logits/chosen": -1.8064409494400024, + "logits/rejected": -1.8547006845474243, + "logps/chosen": -274.5710754394531, + "logps/rejected": -240.2202606201172, + "loss": 0.5451, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.141297847032547, + "rewards/margins": 0.6742552518844604, + "rewards/rejected": -0.8155530691146851, + "step": 2850 + }, + { + "epoch": 2.95, + "learning_rate": 8.419441255262151e-09, + "logits/chosen": -1.740308403968811, + "logits/rejected": -1.693386435508728, + "logps/chosen": -252.54067993164062, + "logps/rejected": -222.9604034423828, + "loss": 0.5065, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.1813466101884842, + "rewards/margins": 0.7808928489685059, + "rewards/rejected": -0.9622395634651184, + "step": 2860 + }, + { + "epoch": 2.96, + "learning_rate": 6.505931879066207e-09, + "logits/chosen": -1.7816598415374756, + "logits/rejected": -1.8385436534881592, + "logps/chosen": -259.6197509765625, + "logps/rejected": -223.94271850585938, + "loss": 0.5295, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.14454451203346252, + "rewards/margins": 0.7407659888267517, + "rewards/rejected": -0.8853104710578918, + "step": 2870 + }, + { + "epoch": 2.97, + "learning_rate": 4.592422502870264e-09, + "logits/chosen": -1.8226430416107178, + "logits/rejected": -1.821807622909546, + "logps/chosen": -271.1612854003906, + "logps/rejected": -224.535888671875, + "loss": 0.5507, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1211395114660263, + "rewards/margins": 0.6229888200759888, + "rewards/rejected": -0.7441283464431763, + "step": 2880 + }, + { + "epoch": 2.98, + "learning_rate": 2.6789131266743202e-09, + "logits/chosen": -1.853369951248169, + "logits/rejected": -1.7586214542388916, + "logps/chosen": -288.68878173828125, + "logps/rejected": -237.36465454101562, + "loss": 0.5195, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.12492711842060089, + "rewards/margins": 0.8443253636360168, + "rewards/rejected": -0.9692524671554565, + "step": 2890 + }, + { + "epoch": 2.99, + "learning_rate": 7.654037504783773e-10, + "logits/chosen": -1.6634130477905273, + "logits/rejected": -1.804986596107483, + "logps/chosen": -246.12142944335938, + "logps/rejected": -237.26327514648438, + "loss": 0.5342, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10875705629587173, + "rewards/margins": 0.7373288869857788, + "rewards/rejected": -0.8460859060287476, + "step": 2900 + }, + { + "epoch": 3.0, + "eval_logits/chosen": -2.0662753582000732, + "eval_logits/rejected": -1.9411602020263672, + "eval_logps/chosen": -266.15380859375, + "eval_logps/rejected": -228.28196716308594, + "eval_loss": 0.5263338685035706, + "eval_rewards/accuracies": 0.7480000257492065, + "eval_rewards/chosen": -0.14929771423339844, + "eval_rewards/margins": 0.7504671812057495, + "eval_rewards/rejected": -0.899764895439148, + "eval_runtime": 442.4111, + "eval_samples_per_second": 4.521, + "eval_steps_per_second": 0.283, + "step": 2904 }, { "epoch": 3.0, - "step": 726, + "step": 2904, "total_flos": 0.0, - "train_loss": 0.6037136622532668, - "train_runtime": 61083.6691, - "train_samples_per_second": 3.043, - "train_steps_per_second": 0.012 + "train_loss": 0.5640471254170105, + "train_runtime": 84492.9354, + "train_samples_per_second": 2.2, + "train_steps_per_second": 0.034 } ], "logging_steps": 10, - "max_steps": 726, + "max_steps": 2904, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0,