diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8382 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9996397045577372, + "eval_steps": 100, + "global_step": 5550, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003602954422626554, + "grad_norm": 3.3397774696350098, + "learning_rate": 9.009009009009008e-10, + "logits/chosen": -1.9418047666549683, + "logits/rejected": -1.931673288345337, + "logps/chosen": -29.41129493713379, + "logps/rejected": -34.63249206542969, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0036029544226265538, + "grad_norm": 3.7182133197784424, + "learning_rate": 9.009009009009009e-09, + "logits/chosen": -1.7383267879486084, + "logits/rejected": -1.7110590934753418, + "logps/chosen": -43.03986740112305, + "logps/rejected": -43.70203399658203, + "loss": 0.6932, + "rewards/accuracies": 0.4722222089767456, + "rewards/chosen": -0.0002388593857176602, + "rewards/margins": -0.00014030587044544518, + "rewards/rejected": -9.855348616838455e-05, + "step": 10 + }, + { + "epoch": 0.0072059088452531075, + "grad_norm": 3.187004804611206, + "learning_rate": 1.8018018018018017e-08, + "logits/chosen": -1.7016605138778687, + "logits/rejected": -1.684544324874878, + "logps/chosen": -41.77806854248047, + "logps/rejected": -44.841636657714844, + "loss": 0.6932, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -5.4537718824576586e-05, + "rewards/margins": -8.961025741882622e-05, + "rewards/rejected": 3.50725713360589e-05, + "step": 20 + }, + { + "epoch": 0.010808863267879661, + "grad_norm": 4.540226459503174, + "learning_rate": 2.7027027027027028e-08, + "logits/chosen": -1.6793386936187744, + "logits/rejected": -1.6608819961547852, + "logps/chosen": -45.18395233154297, + "logps/rejected": -48.683021545410156, + "loss": 0.6932, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -5.281760422803927e-06, + "rewards/margins": -0.00016755127580836415, + "rewards/rejected": 0.0001622694981051609, + "step": 30 + }, + { + "epoch": 0.014411817690506215, + "grad_norm": 3.5439584255218506, + "learning_rate": 3.6036036036036035e-08, + "logits/chosen": -1.7683446407318115, + "logits/rejected": -1.7544472217559814, + "logps/chosen": -42.96169662475586, + "logps/rejected": -44.480674743652344, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.9183791664545424e-05, + "rewards/margins": 0.00010197125084232539, + "rewards/rejected": -0.00012115505523979664, + "step": 40 + }, + { + "epoch": 0.01801477211313277, + "grad_norm": 4.501108646392822, + "learning_rate": 4.504504504504504e-08, + "logits/chosen": -1.7577524185180664, + "logits/rejected": -1.741100549697876, + "logps/chosen": -50.425445556640625, + "logps/rejected": -52.546630859375, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00023509531456511468, + "rewards/margins": 0.00022509522386826575, + "rewards/rejected": 1.0000122529163491e-05, + "step": 50 + }, + { + "epoch": 0.021617726535759323, + "grad_norm": 4.594651222229004, + "learning_rate": 5.4054054054054056e-08, + "logits/chosen": -1.5931284427642822, + "logits/rejected": -1.5910460948944092, + "logps/chosen": -45.28499984741211, + "logps/rejected": -49.063377380371094, + "loss": 0.6932, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.00016079492343124002, + "rewards/margins": -1.9398279619053937e-05, + "rewards/rejected": -0.00014139662380330265, + "step": 60 + }, + { + "epoch": 0.025220680958385876, + "grad_norm": 3.4112484455108643, + "learning_rate": 6.306306306306305e-08, + "logits/chosen": -1.7921539545059204, + "logits/rejected": -1.7922885417938232, + "logps/chosen": -46.665245056152344, + "logps/rejected": -48.13717269897461, + "loss": 0.6933, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -4.898941915598698e-05, + "rewards/margins": -0.0002294534060638398, + "rewards/rejected": 0.00018046401964966208, + "step": 70 + }, + { + "epoch": 0.02882363538101243, + "grad_norm": 3.629711627960205, + "learning_rate": 7.207207207207207e-08, + "logits/chosen": -1.7722715139389038, + "logits/rejected": -1.7587053775787354, + "logps/chosen": -39.46010208129883, + "logps/rejected": -42.94160461425781, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -3.910423401976004e-05, + "rewards/margins": -1.4359999113366939e-05, + "rewards/rejected": -2.4744216716499068e-05, + "step": 80 + }, + { + "epoch": 0.03242658980363899, + "grad_norm": 3.2938477993011475, + "learning_rate": 8.108108108108108e-08, + "logits/chosen": -1.6195322275161743, + "logits/rejected": -1.6349570751190186, + "logps/chosen": -43.25347900390625, + "logps/rejected": -45.54538345336914, + "loss": 0.6933, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.00023719388991594315, + "rewards/margins": -0.000220087036723271, + "rewards/rejected": 0.00045728092663921416, + "step": 90 + }, + { + "epoch": 0.03602954422626554, + "grad_norm": 4.304085731506348, + "learning_rate": 9.009009009009008e-08, + "logits/chosen": -1.690190076828003, + "logits/rejected": -1.668461799621582, + "logps/chosen": -42.72962188720703, + "logps/rejected": -44.08342361450195, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.6253164580557495e-05, + "rewards/margins": -4.533391984296031e-05, + "rewards/rejected": 9.080766176339239e-06, + "step": 100 + }, + { + "epoch": 0.039632498648892095, + "grad_norm": 4.63545560836792, + "learning_rate": 9.909909909909909e-08, + "logits/chosen": -1.753379225730896, + "logits/rejected": -1.748039960861206, + "logps/chosen": -49.486961364746094, + "logps/rejected": -52.599761962890625, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -7.084323442541063e-06, + "rewards/margins": -0.0001257166441064328, + "rewards/rejected": 0.00011863231338793412, + "step": 110 + }, + { + "epoch": 0.043235453071518645, + "grad_norm": 5.669524669647217, + "learning_rate": 1.0810810810810811e-07, + "logits/chosen": -1.7147136926651, + "logits/rejected": -1.702016830444336, + "logps/chosen": -51.8984489440918, + "logps/rejected": -51.4279899597168, + "loss": 0.6931, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 9.030712681123987e-05, + "rewards/margins": 5.703966598957777e-05, + "rewards/rejected": 3.326743535581045e-05, + "step": 120 + }, + { + "epoch": 0.0468384074941452, + "grad_norm": 3.564293146133423, + "learning_rate": 1.171171171171171e-07, + "logits/chosen": -1.665785789489746, + "logits/rejected": -1.6387412548065186, + "logps/chosen": -46.71335983276367, + "logps/rejected": -49.116981506347656, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.00015762264956720173, + "rewards/margins": 9.968892118195072e-05, + "rewards/rejected": 5.7933742937166244e-05, + "step": 130 + }, + { + "epoch": 0.05044136191677175, + "grad_norm": 3.2253429889678955, + "learning_rate": 1.261261261261261e-07, + "logits/chosen": -1.7900612354278564, + "logits/rejected": -1.8017972707748413, + "logps/chosen": -53.443931579589844, + "logps/rejected": -56.4853515625, + "loss": 0.6931, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -2.9897346394136548e-05, + "rewards/margins": 7.243625441333279e-05, + "rewards/rejected": -0.00010233358625555411, + "step": 140 + }, + { + "epoch": 0.05404431633939831, + "grad_norm": 3.723487377166748, + "learning_rate": 1.3513513513513515e-07, + "logits/chosen": -1.722447156906128, + "logits/rejected": -1.7019611597061157, + "logps/chosen": -53.44658279418945, + "logps/rejected": -54.85149383544922, + "loss": 0.6931, + "rewards/accuracies": 0.5625, + "rewards/chosen": -5.108219193061814e-05, + "rewards/margins": 0.0001519794896012172, + "rewards/rejected": -0.0002030616597039625, + "step": 150 + }, + { + "epoch": 0.05764727076202486, + "grad_norm": 5.437617301940918, + "learning_rate": 1.4414414414414414e-07, + "logits/chosen": -1.7410888671875, + "logits/rejected": -1.7434587478637695, + "logps/chosen": -48.747833251953125, + "logps/rejected": -50.5507698059082, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0003015986585523933, + "rewards/margins": 0.00039068650221452117, + "rewards/rejected": -8.908782911021262e-05, + "step": 160 + }, + { + "epoch": 0.06125022518465142, + "grad_norm": 5.302024841308594, + "learning_rate": 1.5315315315315313e-07, + "logits/chosen": -1.5808699131011963, + "logits/rejected": -1.5805413722991943, + "logps/chosen": -50.6431770324707, + "logps/rejected": -52.430580139160156, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0003786823363043368, + "rewards/margins": 0.00028472780832089484, + "rewards/rejected": 9.395449887961149e-05, + "step": 170 + }, + { + "epoch": 0.06485317960727797, + "grad_norm": 3.489992141723633, + "learning_rate": 1.6216216216216215e-07, + "logits/chosen": -1.7394211292266846, + "logits/rejected": -1.7349265813827515, + "logps/chosen": -47.33000946044922, + "logps/rejected": -49.8638801574707, + "loss": 0.6932, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.00022297371469903737, + "rewards/margins": -5.568843334913254e-05, + "rewards/rejected": 0.0002786621334962547, + "step": 180 + }, + { + "epoch": 0.06845613402990453, + "grad_norm": 3.072382688522339, + "learning_rate": 1.7117117117117117e-07, + "logits/chosen": -1.8074477910995483, + "logits/rejected": -1.7950681447982788, + "logps/chosen": -40.70524978637695, + "logps/rejected": -44.75992202758789, + "loss": 0.6932, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.00018566983635537326, + "rewards/margins": -0.00016857947048265487, + "rewards/rejected": 0.0003542492922861129, + "step": 190 + }, + { + "epoch": 0.07205908845253108, + "grad_norm": 4.253015041351318, + "learning_rate": 1.8018018018018017e-07, + "logits/chosen": -1.7659227848052979, + "logits/rejected": -1.762160062789917, + "logps/chosen": -50.597862243652344, + "logps/rejected": -52.700660705566406, + "loss": 0.6931, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.0002902823907788843, + "rewards/margins": 0.00015576460282318294, + "rewards/rejected": 0.0001345178607152775, + "step": 200 + }, + { + "epoch": 0.07566204287515763, + "grad_norm": 4.544305324554443, + "learning_rate": 1.891891891891892e-07, + "logits/chosen": -1.6628167629241943, + "logits/rejected": -1.645263433456421, + "logps/chosen": -50.96441650390625, + "logps/rejected": -52.852989196777344, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.00016514495655428618, + "rewards/margins": 0.0003304726560600102, + "rewards/rejected": -0.00016532771405763924, + "step": 210 + }, + { + "epoch": 0.07926499729778419, + "grad_norm": 2.668222427368164, + "learning_rate": 1.9819819819819818e-07, + "logits/chosen": -1.717664122581482, + "logits/rejected": -1.686693787574768, + "logps/chosen": -46.837650299072266, + "logps/rejected": -49.449607849121094, + "loss": 0.693, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.00034715054789558053, + "rewards/margins": 0.0003410983190406114, + "rewards/rejected": 6.052269782230724e-06, + "step": 220 + }, + { + "epoch": 0.08286795172041074, + "grad_norm": 3.3485231399536133, + "learning_rate": 2.072072072072072e-07, + "logits/chosen": -1.6586008071899414, + "logits/rejected": -1.6540342569351196, + "logps/chosen": -36.831581115722656, + "logps/rejected": -39.075965881347656, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0005781602812930942, + "rewards/margins": 0.0003644340031314641, + "rewards/rejected": 0.00021372627816163003, + "step": 230 + }, + { + "epoch": 0.08647090614303729, + "grad_norm": 3.4034311771392822, + "learning_rate": 2.1621621621621622e-07, + "logits/chosen": -1.6835705041885376, + "logits/rejected": -1.6781299114227295, + "logps/chosen": -51.54401779174805, + "logps/rejected": -54.11664581298828, + "loss": 0.6926, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0009162005735561252, + "rewards/margins": 0.0011217787396162748, + "rewards/rejected": -0.00020557809330057353, + "step": 240 + }, + { + "epoch": 0.09007386056566384, + "grad_norm": 3.7628700733184814, + "learning_rate": 2.2522522522522522e-07, + "logits/chosen": -1.756882905960083, + "logits/rejected": -1.7513837814331055, + "logps/chosen": -55.46052932739258, + "logps/rejected": -57.28558349609375, + "loss": 0.6928, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0012927096104249358, + "rewards/margins": 0.0006762434495612979, + "rewards/rejected": 0.0006164660444483161, + "step": 250 + }, + { + "epoch": 0.0936768149882904, + "grad_norm": 4.458310604095459, + "learning_rate": 2.342342342342342e-07, + "logits/chosen": -1.7510229349136353, + "logits/rejected": -1.7402935028076172, + "logps/chosen": -43.141502380371094, + "logps/rejected": -45.22883987426758, + "loss": 0.6928, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0007634961511939764, + "rewards/margins": 0.0006153647555038333, + "rewards/rejected": 0.00014813135203439742, + "step": 260 + }, + { + "epoch": 0.09727976941091696, + "grad_norm": 3.138791799545288, + "learning_rate": 2.4324324324324326e-07, + "logits/chosen": -1.6900430917739868, + "logits/rejected": -1.6775468587875366, + "logps/chosen": -48.70859909057617, + "logps/rejected": -53.02415084838867, + "loss": 0.6926, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0015983255580067635, + "rewards/margins": 0.0010901021305471659, + "rewards/rejected": 0.0005082233110442758, + "step": 270 + }, + { + "epoch": 0.1008827238335435, + "grad_norm": 3.4879350662231445, + "learning_rate": 2.522522522522522e-07, + "logits/chosen": -1.659205675125122, + "logits/rejected": -1.650123953819275, + "logps/chosen": -42.67085647583008, + "logps/rejected": -47.37016677856445, + "loss": 0.6926, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.0017988558392971754, + "rewards/margins": 0.001047928468324244, + "rewards/rejected": 0.0007509274873882532, + "step": 280 + }, + { + "epoch": 0.10448567825617006, + "grad_norm": 5.909618377685547, + "learning_rate": 2.6126126126126124e-07, + "logits/chosen": -1.6975767612457275, + "logits/rejected": -1.6904857158660889, + "logps/chosen": -49.704307556152344, + "logps/rejected": -53.21978759765625, + "loss": 0.6926, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.001446915091946721, + "rewards/margins": 0.001141748740337789, + "rewards/rejected": 0.0003051663807127625, + "step": 290 + }, + { + "epoch": 0.10808863267879662, + "grad_norm": 6.1678853034973145, + "learning_rate": 2.702702702702703e-07, + "logits/chosen": -1.7316814661026, + "logits/rejected": -1.7211496829986572, + "logps/chosen": -46.54294967651367, + "logps/rejected": -49.60760498046875, + "loss": 0.692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.002290563890710473, + "rewards/margins": 0.0022531836293637753, + "rewards/rejected": 3.7380214052973315e-05, + "step": 300 + }, + { + "epoch": 0.11169158710142317, + "grad_norm": 3.0397493839263916, + "learning_rate": 2.7927927927927923e-07, + "logits/chosen": -1.7990278005599976, + "logits/rejected": -1.7732198238372803, + "logps/chosen": -38.29768753051758, + "logps/rejected": -41.79767990112305, + "loss": 0.6921, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0024967927020043135, + "rewards/margins": 0.0021534208208322525, + "rewards/rejected": 0.0003433715901337564, + "step": 310 + }, + { + "epoch": 0.11529454152404972, + "grad_norm": 3.145111560821533, + "learning_rate": 2.882882882882883e-07, + "logits/chosen": -1.5970180034637451, + "logits/rejected": -1.5767868757247925, + "logps/chosen": -43.7572135925293, + "logps/rejected": -47.30242919921875, + "loss": 0.6923, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0019000060856342316, + "rewards/margins": 0.0017187722260132432, + "rewards/rejected": 0.00018123406334780157, + "step": 320 + }, + { + "epoch": 0.11889749594667627, + "grad_norm": 3.4514622688293457, + "learning_rate": 2.972972972972973e-07, + "logits/chosen": -1.7697391510009766, + "logits/rejected": -1.725419282913208, + "logps/chosen": -46.631221771240234, + "logps/rejected": -48.73198699951172, + "loss": 0.692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0027934780810028315, + "rewards/margins": 0.002238652901723981, + "rewards/rejected": 0.0005548248300328851, + "step": 330 + }, + { + "epoch": 0.12250045036930284, + "grad_norm": 4.041067123413086, + "learning_rate": 3.0630630630630627e-07, + "logits/chosen": -1.674444556236267, + "logits/rejected": -1.674968957901001, + "logps/chosen": -42.48195266723633, + "logps/rejected": -44.92963409423828, + "loss": 0.6918, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0031822682358324528, + "rewards/margins": 0.0027774940244853497, + "rewards/rejected": 0.0004047740367241204, + "step": 340 + }, + { + "epoch": 0.12610340479192939, + "grad_norm": 5.933683395385742, + "learning_rate": 3.153153153153153e-07, + "logits/chosen": -1.6941207647323608, + "logits/rejected": -1.681911826133728, + "logps/chosen": -49.30591583251953, + "logps/rejected": -51.21431350708008, + "loss": 0.6919, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0034376985859125853, + "rewards/margins": 0.002514602616429329, + "rewards/rejected": 0.0009230962023139, + "step": 350 + }, + { + "epoch": 0.12970635921455595, + "grad_norm": 4.507534027099609, + "learning_rate": 3.243243243243243e-07, + "logits/chosen": -1.7088005542755127, + "logits/rejected": -1.693945288658142, + "logps/chosen": -40.02022171020508, + "logps/rejected": -42.422332763671875, + "loss": 0.6902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0054672518745064735, + "rewards/margins": 0.005961936432868242, + "rewards/rejected": -0.0004946846747770905, + "step": 360 + }, + { + "epoch": 0.13330931363718249, + "grad_norm": 2.685657501220703, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -1.703002691268921, + "logits/rejected": -1.7116321325302124, + "logps/chosen": -41.613525390625, + "logps/rejected": -46.64238739013672, + "loss": 0.6911, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.004459428135305643, + "rewards/margins": 0.004147970117628574, + "rewards/rejected": 0.00031145798857323825, + "step": 370 + }, + { + "epoch": 0.13691226805980905, + "grad_norm": 3.4917635917663574, + "learning_rate": 3.4234234234234235e-07, + "logits/chosen": -1.8192132711410522, + "logits/rejected": -1.80001699924469, + "logps/chosen": -41.8767204284668, + "logps/rejected": -43.953147888183594, + "loss": 0.6911, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.005829638335853815, + "rewards/margins": 0.004118152894079685, + "rewards/rejected": 0.001711485325358808, + "step": 380 + }, + { + "epoch": 0.1405152224824356, + "grad_norm": 3.218789577484131, + "learning_rate": 3.5135135135135134e-07, + "logits/chosen": -1.7629024982452393, + "logits/rejected": -1.7467416524887085, + "logps/chosen": -42.2618293762207, + "logps/rejected": -44.198760986328125, + "loss": 0.6912, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.004580915905535221, + "rewards/margins": 0.003979508765041828, + "rewards/rejected": 0.000601407082285732, + "step": 390 + }, + { + "epoch": 0.14411817690506215, + "grad_norm": 6.135593414306641, + "learning_rate": 3.6036036036036033e-07, + "logits/chosen": -1.6290171146392822, + "logits/rejected": -1.6323350667953491, + "logps/chosen": -43.67991256713867, + "logps/rejected": -49.84873580932617, + "loss": 0.6911, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005232072435319424, + "rewards/margins": 0.004124890547245741, + "rewards/rejected": 0.0011071818880736828, + "step": 400 + }, + { + "epoch": 0.14772113132768872, + "grad_norm": 3.853224039077759, + "learning_rate": 3.6936936936936933e-07, + "logits/chosen": -1.7248551845550537, + "logits/rejected": -1.7100718021392822, + "logps/chosen": -38.718650817871094, + "logps/rejected": -40.440452575683594, + "loss": 0.6895, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.004796669818460941, + "rewards/margins": 0.007348486687988043, + "rewards/rejected": -0.002551817102357745, + "step": 410 + }, + { + "epoch": 0.15132408575031525, + "grad_norm": 3.83880877494812, + "learning_rate": 3.783783783783784e-07, + "logits/chosen": -1.6671268939971924, + "logits/rejected": -1.6588226556777954, + "logps/chosen": -46.447330474853516, + "logps/rejected": -46.89680862426758, + "loss": 0.6897, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004480901174247265, + "rewards/margins": 0.00695028668269515, + "rewards/rejected": -0.002469385042786598, + "step": 420 + }, + { + "epoch": 0.15492704017294182, + "grad_norm": 4.403101921081543, + "learning_rate": 3.8738738738738737e-07, + "logits/chosen": -1.7254947423934937, + "logits/rejected": -1.7298320531845093, + "logps/chosen": -47.78804016113281, + "logps/rejected": -55.29218292236328, + "loss": 0.6878, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.003573561552911997, + "rewards/margins": 0.010887719690799713, + "rewards/rejected": -0.007314157672226429, + "step": 430 + }, + { + "epoch": 0.15852999459556838, + "grad_norm": 4.868645191192627, + "learning_rate": 3.9639639639639636e-07, + "logits/chosen": -1.688886284828186, + "logits/rejected": -1.670697569847107, + "logps/chosen": -46.06879806518555, + "logps/rejected": -51.101715087890625, + "loss": 0.6875, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0051663839258253574, + "rewards/margins": 0.01150798611342907, + "rewards/rejected": -0.006341601721942425, + "step": 440 + }, + { + "epoch": 0.16213294901819492, + "grad_norm": 4.604848384857178, + "learning_rate": 4.054054054054054e-07, + "logits/chosen": -1.6967153549194336, + "logits/rejected": -1.6993358135223389, + "logps/chosen": -44.147483825683594, + "logps/rejected": -50.80498504638672, + "loss": 0.6879, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.006432811263948679, + "rewards/margins": 0.010621501132845879, + "rewards/rejected": -0.0041886912658810616, + "step": 450 + }, + { + "epoch": 0.16573590344082148, + "grad_norm": 4.497652053833008, + "learning_rate": 4.144144144144144e-07, + "logits/chosen": -1.764259696006775, + "logits/rejected": -1.7514150142669678, + "logps/chosen": -43.84403610229492, + "logps/rejected": -46.98237991333008, + "loss": 0.6873, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0029778308235108852, + "rewards/margins": 0.011888700537383556, + "rewards/rejected": -0.008910869248211384, + "step": 460 + }, + { + "epoch": 0.16933885786344802, + "grad_norm": 4.147437572479248, + "learning_rate": 4.234234234234234e-07, + "logits/chosen": -1.8267230987548828, + "logits/rejected": -1.8229129314422607, + "logps/chosen": -42.10190963745117, + "logps/rejected": -46.67747497558594, + "loss": 0.6902, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0027582012116909027, + "rewards/margins": 0.006137028336524963, + "rewards/rejected": -0.0033788266591727734, + "step": 470 + }, + { + "epoch": 0.17294181228607458, + "grad_norm": 4.121939659118652, + "learning_rate": 4.3243243243243244e-07, + "logits/chosen": -1.7018091678619385, + "logits/rejected": -1.681560754776001, + "logps/chosen": -55.833404541015625, + "logps/rejected": -57.60951614379883, + "loss": 0.6854, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0008912247722037137, + "rewards/margins": 0.016034971922636032, + "rewards/rejected": -0.015143746510148048, + "step": 480 + }, + { + "epoch": 0.17654476670870115, + "grad_norm": 3.8007025718688965, + "learning_rate": 4.414414414414414e-07, + "logits/chosen": -1.6423200368881226, + "logits/rejected": -1.6475427150726318, + "logps/chosen": -49.587913513183594, + "logps/rejected": -54.42341232299805, + "loss": 0.6843, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.0029760824982076883, + "rewards/margins": 0.018089579418301582, + "rewards/rejected": -0.015113498084247112, + "step": 490 + }, + { + "epoch": 0.18014772113132768, + "grad_norm": 6.992798328399658, + "learning_rate": 4.5045045045045043e-07, + "logits/chosen": -1.6029027700424194, + "logits/rejected": -1.576078176498413, + "logps/chosen": -49.34995651245117, + "logps/rejected": -51.18292999267578, + "loss": 0.6797, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0019840318709611893, + "rewards/margins": 0.027620479464530945, + "rewards/rejected": -0.025636449456214905, + "step": 500 + }, + { + "epoch": 0.18375067555395425, + "grad_norm": 6.653567790985107, + "learning_rate": 4.594594594594595e-07, + "logits/chosen": -1.6146643161773682, + "logits/rejected": -1.6065555810928345, + "logps/chosen": -45.69719314575195, + "logps/rejected": -49.471824645996094, + "loss": 0.6827, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 3.5562366974772885e-05, + "rewards/margins": 0.0216769240796566, + "rewards/rejected": -0.021641362458467484, + "step": 510 + }, + { + "epoch": 0.1873536299765808, + "grad_norm": 5.723504066467285, + "learning_rate": 4.684684684684684e-07, + "logits/chosen": -1.687718391418457, + "logits/rejected": -1.6782705783843994, + "logps/chosen": -55.974578857421875, + "logps/rejected": -58.01393508911133, + "loss": 0.69, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.016683388501405716, + "rewards/margins": 0.007352002896368504, + "rewards/rejected": -0.024035390466451645, + "step": 520 + }, + { + "epoch": 0.19095658439920735, + "grad_norm": 4.197725772857666, + "learning_rate": 4.774774774774775e-07, + "logits/chosen": -1.7161273956298828, + "logits/rejected": -1.7011467218399048, + "logps/chosen": -44.741886138916016, + "logps/rejected": -50.11282730102539, + "loss": 0.6836, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.013202029280364513, + "rewards/margins": 0.02028127945959568, + "rewards/rejected": -0.03348330780863762, + "step": 530 + }, + { + "epoch": 0.1945595388218339, + "grad_norm": 4.084933280944824, + "learning_rate": 4.864864864864865e-07, + "logits/chosen": -1.6937358379364014, + "logits/rejected": -1.670254111289978, + "logps/chosen": -50.84514236450195, + "logps/rejected": -55.533592224121094, + "loss": 0.6794, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.007875502109527588, + "rewards/margins": 0.02891036495566368, + "rewards/rejected": -0.03678586333990097, + "step": 540 + }, + { + "epoch": 0.19816249324446045, + "grad_norm": 6.4256911277771, + "learning_rate": 4.954954954954955e-07, + "logits/chosen": -1.5930603742599487, + "logits/rejected": -1.5785481929779053, + "logps/chosen": -49.83226776123047, + "logps/rejected": -51.831695556640625, + "loss": 0.6835, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00859083142131567, + "rewards/margins": 0.02024068497121334, + "rewards/rejected": -0.028831515461206436, + "step": 550 + }, + { + "epoch": 0.201765447667087, + "grad_norm": 5.257549285888672, + "learning_rate": 4.999987638293614e-07, + "logits/chosen": -1.688340187072754, + "logits/rejected": -1.6726744174957275, + "logps/chosen": -37.71453094482422, + "logps/rejected": -44.47661590576172, + "loss": 0.6733, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.017620814964175224, + "rewards/margins": 0.04227476567029953, + "rewards/rejected": -0.0598955862224102, + "step": 560 + }, + { + "epoch": 0.20536840208971358, + "grad_norm": 6.3576884269714355, + "learning_rate": 4.999888745376028e-07, + "logits/chosen": -1.5950968265533447, + "logits/rejected": -1.5918595790863037, + "logps/chosen": -48.646385192871094, + "logps/rejected": -54.294288635253906, + "loss": 0.6794, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03091433085501194, + "rewards/margins": 0.02918051742017269, + "rewards/rejected": -0.06009485572576523, + "step": 570 + }, + { + "epoch": 0.2089713565123401, + "grad_norm": 6.035130500793457, + "learning_rate": 4.999690963452795e-07, + "logits/chosen": -1.7354274988174438, + "logits/rejected": -1.7171862125396729, + "logps/chosen": -49.645389556884766, + "logps/rejected": -51.204010009765625, + "loss": 0.6857, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0618695430457592, + "rewards/margins": 0.016820868477225304, + "rewards/rejected": -0.07869041711091995, + "step": 580 + }, + { + "epoch": 0.21257431093496668, + "grad_norm": 7.045923233032227, + "learning_rate": 4.999394300347652e-07, + "logits/chosen": -1.5585837364196777, + "logits/rejected": -1.5490951538085938, + "logps/chosen": -58.975677490234375, + "logps/rejected": -63.8331413269043, + "loss": 0.6795, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.058443933725357056, + "rewards/margins": 0.0303555428981781, + "rewards/rejected": -0.08879947662353516, + "step": 590 + }, + { + "epoch": 0.21617726535759324, + "grad_norm": 5.241565704345703, + "learning_rate": 4.998998767795804e-07, + "logits/chosen": -1.551060676574707, + "logits/rejected": -1.5270111560821533, + "logps/chosen": -52.183326721191406, + "logps/rejected": -61.41755294799805, + "loss": 0.6659, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08854193985462189, + "rewards/margins": 0.0613928847014904, + "rewards/rejected": -0.1499348133802414, + "step": 600 + }, + { + "epoch": 0.21978021978021978, + "grad_norm": 8.864941596984863, + "learning_rate": 4.998504381443478e-07, + "logits/chosen": -1.402044653892517, + "logits/rejected": -1.4134305715560913, + "logps/chosen": -62.51812744140625, + "logps/rejected": -69.44048309326172, + "loss": 0.6708, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12022509425878525, + "rewards/margins": 0.0526595413684845, + "rewards/rejected": -0.17288464307785034, + "step": 610 + }, + { + "epoch": 0.22338317420284634, + "grad_norm": 12.818338394165039, + "learning_rate": 4.997911160847295e-07, + "logits/chosen": -1.4413927793502808, + "logits/rejected": -1.4285277128219604, + "logps/chosen": -59.63031005859375, + "logps/rejected": -68.47859191894531, + "loss": 0.6644, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1399981677532196, + "rewards/margins": 0.06519350409507751, + "rewards/rejected": -0.2051916867494583, + "step": 620 + }, + { + "epoch": 0.22698612862547288, + "grad_norm": 13.486601829528809, + "learning_rate": 4.997219129473494e-07, + "logits/chosen": -1.3681890964508057, + "logits/rejected": -1.3525458574295044, + "logps/chosen": -73.00723266601562, + "logps/rejected": -84.07337951660156, + "loss": 0.666, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.26885923743247986, + "rewards/margins": 0.06815784424543381, + "rewards/rejected": -0.3370170593261719, + "step": 630 + }, + { + "epoch": 0.23058908304809944, + "grad_norm": 11.271373748779297, + "learning_rate": 4.996428314697015e-07, + "logits/chosen": -1.2895841598510742, + "logits/rejected": -1.2766430377960205, + "logps/chosen": -70.99652862548828, + "logps/rejected": -81.04373931884766, + "loss": 0.6722, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2290995866060257, + "rewards/margins": 0.05413205549120903, + "rewards/rejected": -0.283231645822525, + "step": 640 + }, + { + "epoch": 0.234192037470726, + "grad_norm": 8.742398262023926, + "learning_rate": 4.995538747800402e-07, + "logits/chosen": -1.2628368139266968, + "logits/rejected": -1.2450822591781616, + "logps/chosen": -70.45149993896484, + "logps/rejected": -74.6968765258789, + "loss": 0.6719, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.1835000365972519, + "rewards/margins": 0.05531931668519974, + "rewards/rejected": -0.23881936073303223, + "step": 650 + }, + { + "epoch": 0.23779499189335254, + "grad_norm": 8.972411155700684, + "learning_rate": 4.994550463972576e-07, + "logits/chosen": -1.288618803024292, + "logits/rejected": -1.2918486595153809, + "logps/chosen": -69.60689544677734, + "logps/rejected": -72.79714965820312, + "loss": 0.68, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.20229852199554443, + "rewards/margins": 0.04259002208709717, + "rewards/rejected": -0.2448885142803192, + "step": 660 + }, + { + "epoch": 0.2413979463159791, + "grad_norm": 10.873099327087402, + "learning_rate": 4.99346350230744e-07, + "logits/chosen": -1.3272793292999268, + "logits/rejected": -1.3133655786514282, + "logps/chosen": -82.43290710449219, + "logps/rejected": -95.42029571533203, + "loss": 0.6403, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.23164968192577362, + "rewards/margins": 0.12773051857948303, + "rewards/rejected": -0.35938018560409546, + "step": 670 + }, + { + "epoch": 0.24500090073860567, + "grad_norm": 12.110127449035645, + "learning_rate": 4.992277905802331e-07, + "logits/chosen": -1.0249826908111572, + "logits/rejected": -1.024247407913208, + "logps/chosen": -71.12178802490234, + "logps/rejected": -87.04933166503906, + "loss": 0.6347, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2818991243839264, + "rewards/margins": 0.14090515673160553, + "rewards/rejected": -0.4228042662143707, + "step": 680 + }, + { + "epoch": 0.2486038551612322, + "grad_norm": 18.234243392944336, + "learning_rate": 4.990993721356315e-07, + "logits/chosen": -0.8412225842475891, + "logits/rejected": -0.8490460515022278, + "logps/chosen": -85.99247741699219, + "logps/rejected": -94.976806640625, + "loss": 0.6776, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.40948066115379333, + "rewards/margins": 0.04921000450849533, + "rewards/rejected": -0.45869070291519165, + "step": 690 + }, + { + "epoch": 0.25220680958385877, + "grad_norm": 13.038808822631836, + "learning_rate": 4.989610999768348e-07, + "logits/chosen": -0.7974184155464172, + "logits/rejected": -0.7845913767814636, + "logps/chosen": -83.24944305419922, + "logps/rejected": -98.57124328613281, + "loss": 0.6403, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3702521026134491, + "rewards/margins": 0.13462284207344055, + "rewards/rejected": -0.5048748850822449, + "step": 700 + }, + { + "epoch": 0.2558097640064853, + "grad_norm": 11.371808052062988, + "learning_rate": 4.988129795735248e-07, + "logits/chosen": -0.8534584045410156, + "logits/rejected": -0.8415006399154663, + "logps/chosen": -89.6837158203125, + "logps/rejected": -102.817626953125, + "loss": 0.645, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.32941603660583496, + "rewards/margins": 0.11651470512151718, + "rewards/rejected": -0.44593071937561035, + "step": 710 + }, + { + "epoch": 0.2594127184291119, + "grad_norm": 14.147513389587402, + "learning_rate": 4.986550167849537e-07, + "logits/chosen": -0.7822316884994507, + "logits/rejected": -0.7774645090103149, + "logps/chosen": -92.55155944824219, + "logps/rejected": -106.33164978027344, + "loss": 0.6527, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.41462668776512146, + "rewards/margins": 0.09965832531452179, + "rewards/rejected": -0.5142850279808044, + "step": 720 + }, + { + "epoch": 0.26301567285173844, + "grad_norm": 16.63142204284668, + "learning_rate": 4.98487217859713e-07, + "logits/chosen": -0.6910918354988098, + "logits/rejected": -0.700725793838501, + "logps/chosen": -93.5431900024414, + "logps/rejected": -109.87126159667969, + "loss": 0.6497, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.44943666458129883, + "rewards/margins": 0.1114300936460495, + "rewards/rejected": -0.5608667135238647, + "step": 730 + }, + { + "epoch": 0.26661862727436497, + "grad_norm": 19.47760772705078, + "learning_rate": 4.983095894354857e-07, + "logits/chosen": -0.6674941778182983, + "logits/rejected": -0.6683529019355774, + "logps/chosen": -103.37028503417969, + "logps/rejected": -119.79246520996094, + "loss": 0.6494, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5528812408447266, + "rewards/margins": 0.1224491149187088, + "rewards/rejected": -0.6753303408622742, + "step": 740 + }, + { + "epoch": 0.2702215816969915, + "grad_norm": 20.79136085510254, + "learning_rate": 4.981221385387837e-07, + "logits/chosen": -0.8482168316841125, + "logits/rejected": -0.8399343490600586, + "logps/chosen": -95.40355682373047, + "logps/rejected": -100.78687286376953, + "loss": 0.6708, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4453433156013489, + "rewards/margins": 0.06763346493244171, + "rewards/rejected": -0.5129767656326294, + "step": 750 + }, + { + "epoch": 0.2738245361196181, + "grad_norm": 11.2230863571167, + "learning_rate": 4.979248725846701e-07, + "logits/chosen": -1.2313950061798096, + "logits/rejected": -1.2393567562103271, + "logps/chosen": -69.79106140136719, + "logps/rejected": -82.90087890625, + "loss": 0.6528, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.2664026618003845, + "rewards/margins": 0.09638935327529907, + "rewards/rejected": -0.3627920150756836, + "step": 760 + }, + { + "epoch": 0.27742749054224464, + "grad_norm": 14.57046127319336, + "learning_rate": 4.977177993764659e-07, + "logits/chosen": -1.3615708351135254, + "logits/rejected": -1.343949556350708, + "logps/chosen": -77.51803588867188, + "logps/rejected": -86.98101806640625, + "loss": 0.6578, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2723795175552368, + "rewards/margins": 0.08878382295370102, + "rewards/rejected": -0.36116331815719604, + "step": 770 + }, + { + "epoch": 0.2810304449648712, + "grad_norm": 12.90167236328125, + "learning_rate": 4.975009271054409e-07, + "logits/chosen": -1.2985954284667969, + "logits/rejected": -1.3072916269302368, + "logps/chosen": -72.69700622558594, + "logps/rejected": -90.4751968383789, + "loss": 0.645, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.26569822430610657, + "rewards/margins": 0.11753193289041519, + "rewards/rejected": -0.38323014974594116, + "step": 780 + }, + { + "epoch": 0.28463339938749777, + "grad_norm": 13.231833457946777, + "learning_rate": 4.972742643504904e-07, + "logits/chosen": -1.2923033237457275, + "logits/rejected": -1.2837045192718506, + "logps/chosen": -73.27848052978516, + "logps/rejected": -74.68667602539062, + "loss": 0.6953, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.27220702171325684, + "rewards/margins": 0.008732852526009083, + "rewards/rejected": -0.28093987703323364, + "step": 790 + }, + { + "epoch": 0.2882363538101243, + "grad_norm": 12.5423002243042, + "learning_rate": 4.970378200777948e-07, + "logits/chosen": -1.3273861408233643, + "logits/rejected": -1.3253200054168701, + "logps/chosen": -69.21495056152344, + "logps/rejected": -80.0616683959961, + "loss": 0.6575, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.2545456290245056, + "rewards/margins": 0.08964885026216507, + "rewards/rejected": -0.3441944718360901, + "step": 800 + }, + { + "epoch": 0.29183930823275084, + "grad_norm": 27.842836380004883, + "learning_rate": 4.967916036404664e-07, + "logits/chosen": -1.2187349796295166, + "logits/rejected": -1.2240660190582275, + "logps/chosen": -74.37635803222656, + "logps/rejected": -85.68818664550781, + "loss": 0.673, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3206605315208435, + "rewards/margins": 0.07676301896572113, + "rewards/rejected": -0.3974235951900482, + "step": 810 + }, + { + "epoch": 0.29544226265537743, + "grad_norm": 16.682466506958008, + "learning_rate": 4.965356247781778e-07, + "logits/chosen": -1.3194482326507568, + "logits/rejected": -1.3158743381500244, + "logps/chosen": -88.14268493652344, + "logps/rejected": -96.72323608398438, + "loss": 0.6606, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3540545701980591, + "rewards/margins": 0.08011126518249512, + "rewards/rejected": -0.4341658651828766, + "step": 820 + }, + { + "epoch": 0.29904521707800397, + "grad_norm": 11.893294334411621, + "learning_rate": 4.962698936167778e-07, + "logits/chosen": -1.3116481304168701, + "logits/rejected": -1.3138093948364258, + "logps/chosen": -78.41636657714844, + "logps/rejected": -91.9908447265625, + "loss": 0.6513, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.32214006781578064, + "rewards/margins": 0.10692934691905975, + "rewards/rejected": -0.4290694296360016, + "step": 830 + }, + { + "epoch": 0.3026481715006305, + "grad_norm": 16.89285659790039, + "learning_rate": 4.959944206678903e-07, + "logits/chosen": -1.1496102809906006, + "logits/rejected": -1.1655839681625366, + "logps/chosen": -85.49686431884766, + "logps/rejected": -94.6566390991211, + "loss": 0.6752, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.40377917885780334, + "rewards/margins": 0.054061077535152435, + "rewards/rejected": -0.4578402638435364, + "step": 840 + }, + { + "epoch": 0.3062511259232571, + "grad_norm": 15.432175636291504, + "learning_rate": 4.957092168284986e-07, + "logits/chosen": -1.125409483909607, + "logits/rejected": -1.1296653747558594, + "logps/chosen": -84.02685546875, + "logps/rejected": -89.34877014160156, + "loss": 0.6686, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.35803866386413574, + "rewards/margins": 0.059441715478897095, + "rewards/rejected": -0.41748037934303284, + "step": 850 + }, + { + "epoch": 0.30985408034588363, + "grad_norm": 14.408708572387695, + "learning_rate": 4.954142933805145e-07, + "logits/chosen": -0.9184282422065735, + "logits/rejected": -0.9147431254386902, + "logps/chosen": -91.42207336425781, + "logps/rejected": -102.40242004394531, + "loss": 0.6539, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.47459083795547485, + "rewards/margins": 0.10091207176446915, + "rewards/rejected": -0.5755028128623962, + "step": 860 + }, + { + "epoch": 0.31345703476851017, + "grad_norm": 12.480093955993652, + "learning_rate": 4.951096619903317e-07, + "logits/chosen": -0.9125533103942871, + "logits/rejected": -0.913791835308075, + "logps/chosen": -91.58851623535156, + "logps/rejected": -107.85359191894531, + "loss": 0.6569, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.46451687812805176, + "rewards/margins": 0.10352380573749542, + "rewards/rejected": -0.568040668964386, + "step": 870 + }, + { + "epoch": 0.31705998919113676, + "grad_norm": 12.55040168762207, + "learning_rate": 4.947953347083645e-07, + "logits/chosen": -1.222707986831665, + "logits/rejected": -1.2129634618759155, + "logps/chosen": -86.14817810058594, + "logps/rejected": -98.69571685791016, + "loss": 0.6519, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.39243194460868835, + "rewards/margins": 0.11647919565439224, + "rewards/rejected": -0.5089111328125, + "step": 880 + }, + { + "epoch": 0.3206629436137633, + "grad_norm": 14.883810997009277, + "learning_rate": 4.944713239685713e-07, + "logits/chosen": -1.4141329526901245, + "logits/rejected": -1.3852875232696533, + "logps/chosen": -95.36436462402344, + "logps/rejected": -96.96843719482422, + "loss": 0.6809, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3810012936592102, + "rewards/margins": 0.0413370355963707, + "rewards/rejected": -0.4223383069038391, + "step": 890 + }, + { + "epoch": 0.32426589803638983, + "grad_norm": 10.3993501663208, + "learning_rate": 4.941376425879623e-07, + "logits/chosen": -1.3478530645370483, + "logits/rejected": -1.336745023727417, + "logps/chosen": -88.2947998046875, + "logps/rejected": -102.8171615600586, + "loss": 0.6299, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3908626437187195, + "rewards/margins": 0.15889796614646912, + "rewards/rejected": -0.549760639667511, + "step": 900 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 14.404094696044922, + "learning_rate": 4.93794303766093e-07, + "logits/chosen": -1.1831821203231812, + "logits/rejected": -1.1733192205429077, + "logps/chosen": -92.18428802490234, + "logps/rejected": -102.34025573730469, + "loss": 0.6588, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4617028832435608, + "rewards/margins": 0.08420713245868683, + "rewards/rejected": -0.5459100008010864, + "step": 910 + }, + { + "epoch": 0.33147180688164296, + "grad_norm": 13.073732376098633, + "learning_rate": 4.934413210845417e-07, + "logits/chosen": -1.1828866004943848, + "logits/rejected": -1.1633810997009277, + "logps/chosen": -102.12568664550781, + "logps/rejected": -116.79570007324219, + "loss": 0.6401, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4046645164489746, + "rewards/margins": 0.1273595094680786, + "rewards/rejected": -0.5320240259170532, + "step": 920 + }, + { + "epoch": 0.3350747613042695, + "grad_norm": 16.678611755371094, + "learning_rate": 4.930787085063722e-07, + "logits/chosen": -1.1912957429885864, + "logits/rejected": -1.1653717756271362, + "logps/chosen": -96.25555419921875, + "logps/rejected": -109.46162414550781, + "loss": 0.6369, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4989704489707947, + "rewards/margins": 0.13963612914085388, + "rewards/rejected": -0.638606607913971, + "step": 930 + }, + { + "epoch": 0.33867771572689603, + "grad_norm": 17.838438034057617, + "learning_rate": 4.927064803755819e-07, + "logits/chosen": -1.061514973640442, + "logits/rejected": -1.0441539287567139, + "logps/chosen": -104.01774597167969, + "logps/rejected": -120.51075744628906, + "loss": 0.6323, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.45036545395851135, + "rewards/margins": 0.14668992161750793, + "rewards/rejected": -0.5970553159713745, + "step": 940 + }, + { + "epoch": 0.3422806701495226, + "grad_norm": 13.769434928894043, + "learning_rate": 4.923246514165338e-07, + "logits/chosen": -0.9698660969734192, + "logits/rejected": -0.949456512928009, + "logps/chosen": -89.14216613769531, + "logps/rejected": -103.09718322753906, + "loss": 0.6471, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.46100878715515137, + "rewards/margins": 0.11799683421850204, + "rewards/rejected": -0.5790055990219116, + "step": 950 + }, + { + "epoch": 0.34588362457214916, + "grad_norm": 13.711132049560547, + "learning_rate": 4.919332367333748e-07, + "logits/chosen": -0.7421332597732544, + "logits/rejected": -0.737074077129364, + "logps/chosen": -93.01615905761719, + "logps/rejected": -108.7341537475586, + "loss": 0.6456, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5092485547065735, + "rewards/margins": 0.12116242945194244, + "rewards/rejected": -0.6304109692573547, + "step": 960 + }, + { + "epoch": 0.3494865789947757, + "grad_norm": 13.020487785339355, + "learning_rate": 4.915322518094369e-07, + "logits/chosen": -0.666183590888977, + "logits/rejected": -0.6703653931617737, + "logps/chosen": -102.07322692871094, + "logps/rejected": -115.79020690917969, + "loss": 0.6512, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5666999816894531, + "rewards/margins": 0.11544916778802872, + "rewards/rejected": -0.6821491122245789, + "step": 970 + }, + { + "epoch": 0.3530895334174023, + "grad_norm": 15.92390251159668, + "learning_rate": 4.911217125066267e-07, + "logits/chosen": -0.7013139724731445, + "logits/rejected": -0.6965673565864563, + "logps/chosen": -99.9740219116211, + "logps/rejected": -125.32454681396484, + "loss": 0.6062, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5242640376091003, + "rewards/margins": 0.21401219069957733, + "rewards/rejected": -0.7382762432098389, + "step": 980 + }, + { + "epoch": 0.3566924878400288, + "grad_norm": 22.443201065063477, + "learning_rate": 4.90701635064796e-07, + "logits/chosen": -0.6783886551856995, + "logits/rejected": -0.6558721661567688, + "logps/chosen": -104.22843933105469, + "logps/rejected": -125.3359375, + "loss": 0.6275, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5865387320518494, + "rewards/margins": 0.16658911108970642, + "rewards/rejected": -0.7531278729438782, + "step": 990 + }, + { + "epoch": 0.36029544226265536, + "grad_norm": 16.392536163330078, + "learning_rate": 4.902720361011007e-07, + "logits/chosen": -0.5944384336471558, + "logits/rejected": -0.5874532461166382, + "logps/chosen": -97.95899963378906, + "logps/rejected": -116.11844635009766, + "loss": 0.6374, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5813181400299072, + "rewards/margins": 0.16180863976478577, + "rewards/rejected": -0.7431267499923706, + "step": 1000 + }, + { + "epoch": 0.36389839668528196, + "grad_norm": 15.579933166503906, + "learning_rate": 4.898329326093426e-07, + "logits/chosen": -0.7319918870925903, + "logits/rejected": -0.7233752012252808, + "logps/chosen": -96.44636535644531, + "logps/rejected": -106.9745864868164, + "loss": 0.6682, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5093886256217957, + "rewards/margins": 0.08220504969358444, + "rewards/rejected": -0.5915936827659607, + "step": 1010 + }, + { + "epoch": 0.3675013511079085, + "grad_norm": 15.561565399169922, + "learning_rate": 4.893843419592977e-07, + "logits/chosen": -0.7630731463432312, + "logits/rejected": -0.7514520287513733, + "logps/chosen": -105.5160903930664, + "logps/rejected": -120.8371810913086, + "loss": 0.64, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5894420742988586, + "rewards/margins": 0.1372137814760208, + "rewards/rejected": -0.7266558408737183, + "step": 1020 + }, + { + "epoch": 0.37110430553053503, + "grad_norm": 18.01420021057129, + "learning_rate": 4.889262818960293e-07, + "logits/chosen": -0.817065417766571, + "logits/rejected": -0.8111998438835144, + "logps/chosen": -103.73628997802734, + "logps/rejected": -110.2214584350586, + "loss": 0.6694, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.510665774345398, + "rewards/margins": 0.0809985101222992, + "rewards/rejected": -0.5916643738746643, + "step": 1030 + }, + { + "epoch": 0.3747072599531616, + "grad_norm": 10.637860298156738, + "learning_rate": 4.884587705391851e-07, + "logits/chosen": -0.9392485618591309, + "logits/rejected": -0.9319330453872681, + "logps/chosen": -100.20902252197266, + "logps/rejected": -122.7046127319336, + "loss": 0.638, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4494263529777527, + "rewards/margins": 0.16700775921344757, + "rewards/rejected": -0.6164341568946838, + "step": 1040 + }, + { + "epoch": 0.37831021437578816, + "grad_norm": 13.383898735046387, + "learning_rate": 4.879818263822816e-07, + "logits/chosen": -0.7321975231170654, + "logits/rejected": -0.7081071138381958, + "logps/chosen": -94.94624328613281, + "logps/rejected": -108.52220153808594, + "loss": 0.6477, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.48819559812545776, + "rewards/margins": 0.1281278431415558, + "rewards/rejected": -0.6163234114646912, + "step": 1050 + }, + { + "epoch": 0.3819131687984147, + "grad_norm": 13.881481170654297, + "learning_rate": 4.874954682919718e-07, + "logits/chosen": -0.7877952456474304, + "logits/rejected": -0.7841506004333496, + "logps/chosen": -96.11666107177734, + "logps/rejected": -111.68910217285156, + "loss": 0.6458, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4922049045562744, + "rewards/margins": 0.14194722473621368, + "rewards/rejected": -0.6341521739959717, + "step": 1060 + }, + { + "epoch": 0.38551612322104123, + "grad_norm": 9.143404006958008, + "learning_rate": 4.869997155072988e-07, + "logits/chosen": -0.9350810050964355, + "logits/rejected": -0.9262323379516602, + "logps/chosen": -84.93910217285156, + "logps/rejected": -100.52149963378906, + "loss": 0.6481, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.43665748834609985, + "rewards/margins": 0.1451665163040161, + "rewards/rejected": -0.5818240642547607, + "step": 1070 + }, + { + "epoch": 0.3891190776436678, + "grad_norm": 21.315027236938477, + "learning_rate": 4.864945876389356e-07, + "logits/chosen": -1.0451407432556152, + "logits/rejected": -1.0247318744659424, + "logps/chosen": -100.16545104980469, + "logps/rejected": -116.10115814208984, + "loss": 0.6461, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4950154423713684, + "rewards/margins": 0.13210979104042053, + "rewards/rejected": -0.6271252036094666, + "step": 1080 + }, + { + "epoch": 0.39272203206629436, + "grad_norm": 11.744331359863281, + "learning_rate": 4.859801046684082e-07, + "logits/chosen": -1.0961428880691528, + "logits/rejected": -1.0880721807479858, + "logps/chosen": -86.44023895263672, + "logps/rejected": -95.47077178955078, + "loss": 0.6537, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3389251232147217, + "rewards/margins": 0.10952061414718628, + "rewards/rejected": -0.44844573736190796, + "step": 1090 + }, + { + "epoch": 0.3963249864889209, + "grad_norm": 16.391756057739258, + "learning_rate": 4.854562869473063e-07, + "logits/chosen": -0.8601453900337219, + "logits/rejected": -0.8658612370491028, + "logps/chosen": -91.15989685058594, + "logps/rejected": -108.34666442871094, + "loss": 0.6385, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4680752754211426, + "rewards/margins": 0.15381982922554016, + "rewards/rejected": -0.6218951344490051, + "step": 1100 + }, + { + "epoch": 0.3999279409115475, + "grad_norm": 21.71736717224121, + "learning_rate": 4.849231551964771e-07, + "logits/chosen": -0.6439577341079712, + "logits/rejected": -0.6424895524978638, + "logps/chosen": -99.82119750976562, + "logps/rejected": -113.4087905883789, + "loss": 0.6658, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5158894062042236, + "rewards/margins": 0.10277016460895538, + "rewards/rejected": -0.6186595559120178, + "step": 1110 + }, + { + "epoch": 0.403530895334174, + "grad_norm": 19.188968658447266, + "learning_rate": 4.843807305052068e-07, + "logits/chosen": -0.5007362365722656, + "logits/rejected": -0.5037881135940552, + "logps/chosen": -101.67707824707031, + "logps/rejected": -116.08494567871094, + "loss": 0.6444, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5278171300888062, + "rewards/margins": 0.14936643838882446, + "rewards/rejected": -0.6771835088729858, + "step": 1120 + }, + { + "epoch": 0.40713384975680056, + "grad_norm": 13.658629417419434, + "learning_rate": 4.838290343303857e-07, + "logits/chosen": -0.6048570871353149, + "logits/rejected": -0.5855607986450195, + "logps/chosen": -109.92044830322266, + "logps/rejected": -127.4346923828125, + "loss": 0.6403, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5725590586662292, + "rewards/margins": 0.16029593348503113, + "rewards/rejected": -0.732854962348938, + "step": 1130 + }, + { + "epoch": 0.41073680417942715, + "grad_norm": 22.05428123474121, + "learning_rate": 4.832680884956593e-07, + "logits/chosen": -0.4781038761138916, + "logits/rejected": -0.4626520574092865, + "logps/chosen": -124.91099548339844, + "logps/rejected": -144.09007263183594, + "loss": 0.6293, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7242249846458435, + "rewards/margins": 0.16776317358016968, + "rewards/rejected": -0.8919881582260132, + "step": 1140 + }, + { + "epoch": 0.4143397586020537, + "grad_norm": 18.160388946533203, + "learning_rate": 4.826979151905655e-07, + "logits/chosen": -0.478691965341568, + "logits/rejected": -0.47778424620628357, + "logps/chosen": -108.81644439697266, + "logps/rejected": -129.92697143554688, + "loss": 0.6229, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6904457807540894, + "rewards/margins": 0.18578965961933136, + "rewards/rejected": -0.8762354850769043, + "step": 1150 + }, + { + "epoch": 0.4179427130246802, + "grad_norm": 18.417818069458008, + "learning_rate": 4.821185369696564e-07, + "logits/chosen": -0.35122150182724, + "logits/rejected": -0.3426826000213623, + "logps/chosen": -127.5208511352539, + "logps/rejected": -142.33328247070312, + "loss": 0.6416, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8188759088516235, + "rewards/margins": 0.14513902366161346, + "rewards/rejected": -0.9640148878097534, + "step": 1160 + }, + { + "epoch": 0.4215456674473068, + "grad_norm": 22.334169387817383, + "learning_rate": 4.815299767516065e-07, + "logits/chosen": -0.45871931314468384, + "logits/rejected": -0.4563066065311432, + "logps/chosen": -123.07237243652344, + "logps/rejected": -150.54190063476562, + "loss": 0.627, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7688714265823364, + "rewards/margins": 0.20027267932891846, + "rewards/rejected": -0.9691441655158997, + "step": 1170 + }, + { + "epoch": 0.42514862186993335, + "grad_norm": 26.37111473083496, + "learning_rate": 4.809322578183055e-07, + "logits/chosen": -0.5161224007606506, + "logits/rejected": -0.5265077352523804, + "logps/chosen": -116.867431640625, + "logps/rejected": -133.74142456054688, + "loss": 0.6467, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.6781013607978821, + "rewards/margins": 0.13253547251224518, + "rewards/rejected": -0.8106366991996765, + "step": 1180 + }, + { + "epoch": 0.4287515762925599, + "grad_norm": 13.813657760620117, + "learning_rate": 4.803254038139385e-07, + "logits/chosen": -0.6843366026878357, + "logits/rejected": -0.6704198122024536, + "logps/chosen": -122.2076187133789, + "logps/rejected": -136.3664093017578, + "loss": 0.6609, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.7395728826522827, + "rewards/margins": 0.11493418365716934, + "rewards/rejected": -0.8545069694519043, + "step": 1190 + }, + { + "epoch": 0.4323545307151865, + "grad_norm": 22.220705032348633, + "learning_rate": 4.79709438744049e-07, + "logits/chosen": -0.7626785635948181, + "logits/rejected": -0.7570369839668274, + "logps/chosen": -108.4599609375, + "logps/rejected": -138.76187133789062, + "loss": 0.5998, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6571046113967896, + "rewards/margins": 0.2574872076511383, + "rewards/rejected": -0.9145916700363159, + "step": 1200 + }, + { + "epoch": 0.435957485137813, + "grad_norm": 18.7367000579834, + "learning_rate": 4.790843869745907e-07, + "logits/chosen": -0.6571752429008484, + "logits/rejected": -0.646159291267395, + "logps/chosen": -111.79692077636719, + "logps/rejected": -133.7711181640625, + "loss": 0.6151, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6365227699279785, + "rewards/margins": 0.21131476759910583, + "rewards/rejected": -0.8478374481201172, + "step": 1210 + }, + { + "epoch": 0.43956043956043955, + "grad_norm": 16.725400924682617, + "learning_rate": 4.784502732309633e-07, + "logits/chosen": -0.8071640133857727, + "logits/rejected": -0.7955895662307739, + "logps/chosen": -108.87413024902344, + "logps/rejected": -112.9708480834961, + "loss": 0.6868, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.5596252679824829, + "rewards/margins": 0.037677209824323654, + "rewards/rejected": -0.5973024964332581, + "step": 1220 + }, + { + "epoch": 0.4431633939830661, + "grad_norm": 21.371368408203125, + "learning_rate": 4.778071225970339e-07, + "logits/chosen": -0.6365097165107727, + "logits/rejected": -0.6329114437103271, + "logps/chosen": -108.97966003417969, + "logps/rejected": -123.94303131103516, + "loss": 0.649, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6522558331489563, + "rewards/margins": 0.14867492020130157, + "rewards/rejected": -0.8009306788444519, + "step": 1230 + }, + { + "epoch": 0.4467663484056927, + "grad_norm": 18.487884521484375, + "learning_rate": 4.771549605141455e-07, + "logits/chosen": -0.7710438966751099, + "logits/rejected": -0.7733847498893738, + "logps/chosen": -102.35829162597656, + "logps/rejected": -111.59107971191406, + "loss": 0.6823, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6114463806152344, + "rewards/margins": 0.05293966457247734, + "rewards/rejected": -0.6643860340118408, + "step": 1240 + }, + { + "epoch": 0.4503693028283192, + "grad_norm": 23.91623878479004, + "learning_rate": 4.764938127801099e-07, + "logits/chosen": -0.8593417406082153, + "logits/rejected": -0.8609575033187866, + "logps/chosen": -111.25724792480469, + "logps/rejected": -130.29165649414062, + "loss": 0.637, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6470273733139038, + "rewards/margins": 0.15380927920341492, + "rewards/rejected": -0.8008366823196411, + "step": 1250 + }, + { + "epoch": 0.45397225725094575, + "grad_norm": 13.975992202758789, + "learning_rate": 4.7582370554818805e-07, + "logits/chosen": -0.8937684893608093, + "logits/rejected": -0.8855475187301636, + "logps/chosen": -93.56584167480469, + "logps/rejected": -113.898681640625, + "loss": 0.6287, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5378883481025696, + "rewards/margins": 0.17181679606437683, + "rewards/rejected": -0.709705114364624, + "step": 1260 + }, + { + "epoch": 0.45757521167357235, + "grad_norm": 17.672836303710938, + "learning_rate": 4.7514466532605457e-07, + "logits/chosen": -0.8986455798149109, + "logits/rejected": -0.9085055589675903, + "logps/chosen": -97.85987854003906, + "logps/rejected": -112.6082534790039, + "loss": 0.6393, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.49343031644821167, + "rewards/margins": 0.1343407779932022, + "rewards/rejected": -0.6277710795402527, + "step": 1270 + }, + { + "epoch": 0.4611781660961989, + "grad_norm": 13.817961692810059, + "learning_rate": 4.744567189747498e-07, + "logits/chosen": -0.9279476404190063, + "logits/rejected": -0.9124320149421692, + "logps/chosen": -99.10932159423828, + "logps/rejected": -119.8096694946289, + "loss": 0.641, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5664225816726685, + "rewards/margins": 0.1693536341190338, + "rewards/rejected": -0.7357761859893799, + "step": 1280 + }, + { + "epoch": 0.4647811205188254, + "grad_norm": 10.271222114562988, + "learning_rate": 4.7375989370761695e-07, + "logits/chosen": -1.136845350265503, + "logits/rejected": -1.1129014492034912, + "logps/chosen": -105.83380126953125, + "logps/rejected": -125.69932556152344, + "loss": 0.6223, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5844363570213318, + "rewards/margins": 0.20007582008838654, + "rewards/rejected": -0.7845122218132019, + "step": 1290 + }, + { + "epoch": 0.468384074941452, + "grad_norm": 14.582992553710938, + "learning_rate": 4.7305421708922594e-07, + "logits/chosen": -1.0308201313018799, + "logits/rejected": -1.0145576000213623, + "logps/chosen": -100.9151611328125, + "logps/rejected": -112.91084289550781, + "loss": 0.6483, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5713368654251099, + "rewards/margins": 0.121244415640831, + "rewards/rejected": -0.692581295967102, + "step": 1300 + }, + { + "epoch": 0.47198702936407855, + "grad_norm": 22.442279815673828, + "learning_rate": 4.7233971703428253e-07, + "logits/chosen": -0.9299166798591614, + "logits/rejected": -0.9262005090713501, + "logps/chosen": -99.68379211425781, + "logps/rejected": -122.30403900146484, + "loss": 0.6232, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5639885663986206, + "rewards/margins": 0.20361557602882385, + "rewards/rejected": -0.7676041126251221, + "step": 1310 + }, + { + "epoch": 0.4755899837867051, + "grad_norm": 24.155487060546875, + "learning_rate": 4.7161642180652463e-07, + "logits/chosen": -0.7510775327682495, + "logits/rejected": -0.7490900754928589, + "logps/chosen": -122.33967590332031, + "logps/rejected": -141.98947143554688, + "loss": 0.6294, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7110903859138489, + "rewards/margins": 0.16365990042686462, + "rewards/rejected": -0.8747503161430359, + "step": 1320 + }, + { + "epoch": 0.4791929382093317, + "grad_norm": 21.514019012451172, + "learning_rate": 4.708843600176038e-07, + "logits/chosen": -0.6898788213729858, + "logits/rejected": -0.6778839230537415, + "logps/chosen": -122.47212219238281, + "logps/rejected": -131.47409057617188, + "loss": 0.663, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7074660062789917, + "rewards/margins": 0.08889790624380112, + "rewards/rejected": -0.796363890171051, + "step": 1330 + }, + { + "epoch": 0.4827958926319582, + "grad_norm": 17.670543670654297, + "learning_rate": 4.7014356062595364e-07, + "logits/chosen": -0.7467285990715027, + "logits/rejected": -0.7351511716842651, + "logps/chosen": -120.872802734375, + "logps/rejected": -142.69851684570312, + "loss": 0.6231, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7269989252090454, + "rewards/margins": 0.19409465789794922, + "rewards/rejected": -0.9210936427116394, + "step": 1340 + }, + { + "epoch": 0.48639884705458475, + "grad_norm": 21.745296478271484, + "learning_rate": 4.693940529356444e-07, + "logits/chosen": -0.7076036334037781, + "logits/rejected": -0.6931304931640625, + "logps/chosen": -115.3056411743164, + "logps/rejected": -131.36289978027344, + "loss": 0.6321, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6344283819198608, + "rewards/margins": 0.1647411286830902, + "rewards/rejected": -0.7991694211959839, + "step": 1350 + }, + { + "epoch": 0.49000180147721134, + "grad_norm": 13.729718208312988, + "learning_rate": 4.6863586659522353e-07, + "logits/chosen": -0.7514731287956238, + "logits/rejected": -0.7414983510971069, + "logps/chosen": -115.65132141113281, + "logps/rejected": -134.80738830566406, + "loss": 0.623, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6774997711181641, + "rewards/margins": 0.19184240698814392, + "rewards/rejected": -0.8693421483039856, + "step": 1360 + }, + { + "epoch": 0.4936047558998379, + "grad_norm": 16.541501998901367, + "learning_rate": 4.678690315965431e-07, + "logits/chosen": -0.9065626263618469, + "logits/rejected": -0.889665424823761, + "logps/chosen": -106.21175384521484, + "logps/rejected": -117.76409912109375, + "loss": 0.6473, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.616832971572876, + "rewards/margins": 0.12367036193609238, + "rewards/rejected": -0.7405033111572266, + "step": 1370 + }, + { + "epoch": 0.4972077103224644, + "grad_norm": 38.636070251464844, + "learning_rate": 4.6709357827357316e-07, + "logits/chosen": -0.9566957354545593, + "logits/rejected": -0.9666906595230103, + "logps/chosen": -125.11567687988281, + "logps/rejected": -149.4060516357422, + "loss": 0.6209, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7375609874725342, + "rewards/margins": 0.18065717816352844, + "rewards/rejected": -0.9182182550430298, + "step": 1380 + }, + { + "epoch": 0.500810664745091, + "grad_norm": 15.660926818847656, + "learning_rate": 4.66309537301202e-07, + "logits/chosen": -1.0064380168914795, + "logits/rejected": -1.0011049509048462, + "logps/chosen": -116.38114929199219, + "logps/rejected": -130.26541137695312, + "loss": 0.6501, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6288421154022217, + "rewards/margins": 0.11721490323543549, + "rewards/rejected": -0.7460570335388184, + "step": 1390 + }, + { + "epoch": 0.5044136191677175, + "grad_norm": 12.651188850402832, + "learning_rate": 4.655169396940228e-07, + "logits/chosen": -0.9192444086074829, + "logits/rejected": -0.908920407295227, + "logps/chosen": -97.5096206665039, + "logps/rejected": -124.25872802734375, + "loss": 0.6087, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5784972906112671, + "rewards/margins": 0.23468546569347382, + "rewards/rejected": -0.8131827116012573, + "step": 1400 + }, + { + "epoch": 0.5080165735903441, + "grad_norm": 13.578185081481934, + "learning_rate": 4.647158168051065e-07, + "logits/chosen": -0.7523177266120911, + "logits/rejected": -0.7385509610176086, + "logps/chosen": -96.3977279663086, + "logps/rejected": -120.1695785522461, + "loss": 0.6198, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5647341012954712, + "rewards/margins": 0.2241365611553192, + "rewards/rejected": -0.7888704538345337, + "step": 1410 + }, + { + "epoch": 0.5116195280129706, + "grad_norm": 9.682286262512207, + "learning_rate": 4.6390620032476165e-07, + "logits/chosen": -0.8279479742050171, + "logits/rejected": -0.8157111406326294, + "logps/chosen": -111.77999114990234, + "logps/rejected": -131.52413940429688, + "loss": 0.636, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6873849034309387, + "rewards/margins": 0.17221364378929138, + "rewards/rejected": -0.8595984578132629, + "step": 1420 + }, + { + "epoch": 0.5152224824355972, + "grad_norm": 18.07782554626465, + "learning_rate": 4.6308812227928097e-07, + "logits/chosen": -0.7680369019508362, + "logits/rejected": -0.7670931816101074, + "logps/chosen": -117.39216613769531, + "logps/rejected": -135.0698699951172, + "loss": 0.6481, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.627860426902771, + "rewards/margins": 0.14340507984161377, + "rewards/rejected": -0.7712655663490295, + "step": 1430 + }, + { + "epoch": 0.5188254368582238, + "grad_norm": 11.54117488861084, + "learning_rate": 4.622616150296744e-07, + "logits/chosen": -0.8040092587471008, + "logits/rejected": -0.785892128944397, + "logps/chosen": -109.7896957397461, + "logps/rejected": -128.6796417236328, + "loss": 0.6359, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6554213762283325, + "rewards/margins": 0.16084139049053192, + "rewards/rejected": -0.8162628412246704, + "step": 1440 + }, + { + "epoch": 0.5224283912808503, + "grad_norm": 17.470726013183594, + "learning_rate": 4.61426711270389e-07, + "logits/chosen": -0.7985628247261047, + "logits/rejected": -0.7914355397224426, + "logps/chosen": -122.31645202636719, + "logps/rejected": -141.3916473388672, + "loss": 0.6334, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6971213221549988, + "rewards/margins": 0.17073138058185577, + "rewards/rejected": -0.8678528070449829, + "step": 1450 + }, + { + "epoch": 0.5260313457034769, + "grad_norm": 18.514780044555664, + "learning_rate": 4.605834440280154e-07, + "logits/chosen": -0.5825433135032654, + "logits/rejected": -0.571456789970398, + "logps/chosen": -104.1174545288086, + "logps/rejected": -122.6390380859375, + "loss": 0.6368, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.585911214351654, + "rewards/margins": 0.16434451937675476, + "rewards/rejected": -0.7502557039260864, + "step": 1460 + }, + { + "epoch": 0.5296343001261034, + "grad_norm": 15.687395095825195, + "learning_rate": 4.5973184665998184e-07, + "logits/chosen": -0.6224468350410461, + "logits/rejected": -0.6154752969741821, + "logps/chosen": -105.5159683227539, + "logps/rejected": -120.9432601928711, + "loss": 0.6397, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5748180747032166, + "rewards/margins": 0.14824509620666504, + "rewards/rejected": -0.7230631709098816, + "step": 1470 + }, + { + "epoch": 0.5332372545487299, + "grad_norm": 27.54644203186035, + "learning_rate": 4.588719528532341e-07, + "logits/chosen": -0.49367189407348633, + "logits/rejected": -0.5038386583328247, + "logps/chosen": -114.81755065917969, + "logps/rejected": -122.81019592285156, + "loss": 0.6636, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.691633939743042, + "rewards/margins": 0.0962417870759964, + "rewards/rejected": -0.7878756523132324, + "step": 1480 + }, + { + "epoch": 0.5368402089713565, + "grad_norm": 24.379594802856445, + "learning_rate": 4.580037966229033e-07, + "logits/chosen": -0.5394241213798523, + "logits/rejected": -0.5318597555160522, + "logps/chosen": -112.4027099609375, + "logps/rejected": -135.67672729492188, + "loss": 0.6245, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6120216250419617, + "rewards/margins": 0.21039803326129913, + "rewards/rejected": -0.8224196434020996, + "step": 1490 + }, + { + "epoch": 0.540443163393983, + "grad_norm": 16.60763168334961, + "learning_rate": 4.571274123109605e-07, + "logits/chosen": -0.2344832718372345, + "logits/rejected": -0.22624747455120087, + "logps/chosen": -114.34037780761719, + "logps/rejected": -133.9674530029297, + "loss": 0.6215, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6741780042648315, + "rewards/margins": 0.19762232899665833, + "rewards/rejected": -0.871800422668457, + "step": 1500 + }, + { + "epoch": 0.5440461178166096, + "grad_norm": 21.019861221313477, + "learning_rate": 4.5624283458485753e-07, + "logits/chosen": -0.23028871417045593, + "logits/rejected": -0.23043613135814667, + "logps/chosen": -99.55493927001953, + "logps/rejected": -118.29109954833984, + "loss": 0.6355, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5799737572669983, + "rewards/margins": 0.1384848803281784, + "rewards/rejected": -0.7184587121009827, + "step": 1510 + }, + { + "epoch": 0.5476490722392362, + "grad_norm": 17.716548919677734, + "learning_rate": 4.553500984361563e-07, + "logits/chosen": -0.0724945068359375, + "logits/rejected": -0.08050285279750824, + "logps/chosen": -112.50837707519531, + "logps/rejected": -126.670166015625, + "loss": 0.6481, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6275107264518738, + "rewards/margins": 0.1303718388080597, + "rewards/rejected": -0.7578826546669006, + "step": 1520 + }, + { + "epoch": 0.5512520266618627, + "grad_norm": 10.604673385620117, + "learning_rate": 4.5444923917914444e-07, + "logits/chosen": -0.1800159364938736, + "logits/rejected": -0.17468824982643127, + "logps/chosen": -110.12767028808594, + "logps/rejected": -129.58322143554688, + "loss": 0.6307, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7002070546150208, + "rewards/margins": 0.1697019636631012, + "rewards/rejected": -0.8699091076850891, + "step": 1530 + }, + { + "epoch": 0.5548549810844893, + "grad_norm": 11.269343376159668, + "learning_rate": 4.5354029244943814e-07, + "logits/chosen": -0.3396390676498413, + "logits/rejected": -0.33701950311660767, + "logps/chosen": -108.12324523925781, + "logps/rejected": -131.24703979492188, + "loss": 0.6193, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6141228675842285, + "rewards/margins": 0.20558467507362366, + "rewards/rejected": -0.8197075724601746, + "step": 1540 + }, + { + "epoch": 0.5584579355071159, + "grad_norm": 22.327653884887695, + "learning_rate": 4.5262329420257293e-07, + "logits/chosen": -0.43441152572631836, + "logits/rejected": -0.4208933413028717, + "logps/chosen": -111.65888977050781, + "logps/rejected": -131.4414825439453, + "loss": 0.622, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5926408171653748, + "rewards/margins": 0.18177416920661926, + "rewards/rejected": -0.7744150161743164, + "step": 1550 + }, + { + "epoch": 0.5620608899297423, + "grad_norm": 21.105283737182617, + "learning_rate": 4.516982807125811e-07, + "logits/chosen": -0.07212761789560318, + "logits/rejected": -0.07756079733371735, + "logps/chosen": -109.25408935546875, + "logps/rejected": -126.6335220336914, + "loss": 0.643, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.644785463809967, + "rewards/margins": 0.1567734330892563, + "rewards/rejected": -0.8015588521957397, + "step": 1560 + }, + { + "epoch": 0.5656638443523689, + "grad_norm": 19.152124404907227, + "learning_rate": 4.507652885705564e-07, + "logits/chosen": -0.08397520333528519, + "logits/rejected": -0.07510174810886383, + "logps/chosen": -124.69132232666016, + "logps/rejected": -134.48948669433594, + "loss": 0.6805, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.7654344439506531, + "rewards/margins": 0.07131467014551163, + "rewards/rejected": -0.8367490768432617, + "step": 1570 + }, + { + "epoch": 0.5692667987749955, + "grad_norm": 18.12226104736328, + "learning_rate": 4.4982435468320757e-07, + "logits/chosen": -0.06124221533536911, + "logits/rejected": -0.04790102690458298, + "logps/chosen": -123.93861389160156, + "logps/rejected": -152.51882934570312, + "loss": 0.6041, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7189621925354004, + "rewards/margins": 0.24458980560302734, + "rewards/rejected": -0.9635521173477173, + "step": 1580 + }, + { + "epoch": 0.572869753197622, + "grad_norm": 25.4002742767334, + "learning_rate": 4.488755162713975e-07, + "logits/chosen": -0.2089006006717682, + "logits/rejected": -0.20480160415172577, + "logps/chosen": -109.4067611694336, + "logps/rejected": -129.50677490234375, + "loss": 0.6268, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6752637028694153, + "rewards/margins": 0.1842677891254425, + "rewards/rejected": -0.8595314025878906, + "step": 1590 + }, + { + "epoch": 0.5764727076202486, + "grad_norm": 21.045480728149414, + "learning_rate": 4.4791881086867133e-07, + "logits/chosen": -0.20409497618675232, + "logits/rejected": -0.2089627981185913, + "logps/chosen": -113.8995132446289, + "logps/rejected": -125.38606262207031, + "loss": 0.6564, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.6167466640472412, + "rewards/margins": 0.1140073910355568, + "rewards/rejected": -0.730754017829895, + "step": 1600 + }, + { + "epoch": 0.5800756620428752, + "grad_norm": 33.92525100708008, + "learning_rate": 4.469542763197717e-07, + "logits/chosen": -0.2622045874595642, + "logits/rejected": -0.2459571808576584, + "logps/chosen": -110.90557861328125, + "logps/rejected": -135.90432739257812, + "loss": 0.6245, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6229537129402161, + "rewards/margins": 0.2116650640964508, + "rewards/rejected": -0.8346187472343445, + "step": 1610 + }, + { + "epoch": 0.5836786164655017, + "grad_norm": 12.207719802856445, + "learning_rate": 4.459819507791414e-07, + "logits/chosen": -0.2609815299510956, + "logits/rejected": -0.27185386419296265, + "logps/chosen": -104.3377914428711, + "logps/rejected": -121.11332702636719, + "loss": 0.6477, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6319000124931335, + "rewards/margins": 0.14542250335216522, + "rewards/rejected": -0.7773224711418152, + "step": 1620 + }, + { + "epoch": 0.5872815708881283, + "grad_norm": 27.201845169067383, + "learning_rate": 4.450018727094146e-07, + "logits/chosen": -0.27219557762145996, + "logits/rejected": -0.2478889524936676, + "logps/chosen": -123.58785247802734, + "logps/rejected": -139.5199432373047, + "loss": 0.6442, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7428973913192749, + "rewards/margins": 0.1621369570493698, + "rewards/rejected": -0.9050344228744507, + "step": 1630 + }, + { + "epoch": 0.5908845253107549, + "grad_norm": 21.65987205505371, + "learning_rate": 4.4401408087989475e-07, + "logits/chosen": -0.18356722593307495, + "logits/rejected": -0.1742212474346161, + "logps/chosen": -106.0003662109375, + "logps/rejected": -124.266357421875, + "loss": 0.6408, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6102654337882996, + "rewards/margins": 0.1706978976726532, + "rewards/rejected": -0.7809633612632751, + "step": 1640 + }, + { + "epoch": 0.5944874797333813, + "grad_norm": 19.172645568847656, + "learning_rate": 4.4301861436502155e-07, + "logits/chosen": -0.09111490100622177, + "logits/rejected": -0.08929113298654556, + "logps/chosen": -104.4176025390625, + "logps/rejected": -128.53921508789062, + "loss": 0.6126, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6372945308685303, + "rewards/margins": 0.2133907973766327, + "rewards/rejected": -0.8506854176521301, + "step": 1650 + }, + { + "epoch": 0.5980904341560079, + "grad_norm": 23.5217342376709, + "learning_rate": 4.420155125428249e-07, + "logits/chosen": -0.23100826144218445, + "logits/rejected": -0.22501571476459503, + "logps/chosen": -116.77998352050781, + "logps/rejected": -138.1328582763672, + "loss": 0.6313, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7341734766960144, + "rewards/margins": 0.18791931867599487, + "rewards/rejected": -0.9220927357673645, + "step": 1660 + }, + { + "epoch": 0.6016933885786345, + "grad_norm": 28.95935821533203, + "learning_rate": 4.4100481509336727e-07, + "logits/chosen": -0.33691197633743286, + "logits/rejected": -0.3338681161403656, + "logps/chosen": -124.83697509765625, + "logps/rejected": -145.29898071289062, + "loss": 0.6312, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7636333703994751, + "rewards/margins": 0.19949015974998474, + "rewards/rejected": -0.9631235003471375, + "step": 1670 + }, + { + "epoch": 0.605296343001261, + "grad_norm": 13.79696273803711, + "learning_rate": 4.3998656199717433e-07, + "logits/chosen": -0.16279760003089905, + "logits/rejected": -0.15554110705852509, + "logps/chosen": -123.12381744384766, + "logps/rejected": -146.53384399414062, + "loss": 0.6095, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7670550346374512, + "rewards/margins": 0.23701255023479462, + "rewards/rejected": -1.0040675401687622, + "step": 1680 + }, + { + "epoch": 0.6088992974238876, + "grad_norm": 32.479736328125, + "learning_rate": 4.38960793533653e-07, + "logits/chosen": -0.1557508260011673, + "logits/rejected": -0.1466209590435028, + "logps/chosen": -115.62281799316406, + "logps/rejected": -147.75576782226562, + "loss": 0.5807, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7074651718139648, + "rewards/margins": 0.3161167502403259, + "rewards/rejected": -1.023581862449646, + "step": 1690 + }, + { + "epoch": 0.6125022518465142, + "grad_norm": 13.241331100463867, + "learning_rate": 4.379275502794983e-07, + "logits/chosen": -0.31330347061157227, + "logits/rejected": -0.30750396847724915, + "logps/chosen": -119.7376937866211, + "logps/rejected": -142.54885864257812, + "loss": 0.6352, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6978413462638855, + "rewards/margins": 0.1720629334449768, + "rewards/rejected": -0.8699043393135071, + "step": 1700 + }, + { + "epoch": 0.6161052062691407, + "grad_norm": 18.612834930419922, + "learning_rate": 4.368868731070884e-07, + "logits/chosen": -0.3954562842845917, + "logits/rejected": -0.3834627866744995, + "logps/chosen": -130.12315368652344, + "logps/rejected": -147.21743774414062, + "loss": 0.6209, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6318448185920715, + "rewards/margins": 0.1988515406847, + "rewards/rejected": -0.8306962847709656, + "step": 1710 + }, + { + "epoch": 0.6197081606917673, + "grad_norm": 23.820240020751953, + "learning_rate": 4.358388031828675e-07, + "logits/chosen": -0.4271976053714752, + "logits/rejected": -0.41857513785362244, + "logps/chosen": -111.3486099243164, + "logps/rejected": -128.419189453125, + "loss": 0.6427, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6416288614273071, + "rewards/margins": 0.1477826088666916, + "rewards/rejected": -0.7894114851951599, + "step": 1720 + }, + { + "epoch": 0.6233111151143939, + "grad_norm": 19.6106014251709, + "learning_rate": 4.3478338196571774e-07, + "logits/chosen": -0.592642605304718, + "logits/rejected": -0.5813112258911133, + "logps/chosen": -105.65118408203125, + "logps/rejected": -126.93350982666016, + "loss": 0.6237, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6063104867935181, + "rewards/margins": 0.18938353657722473, + "rewards/rejected": -0.7956939935684204, + "step": 1730 + }, + { + "epoch": 0.6269140695370203, + "grad_norm": 26.405860900878906, + "learning_rate": 4.3372065120531896e-07, + "logits/chosen": -0.6595714688301086, + "logits/rejected": -0.6470447778701782, + "logps/chosen": -116.03114318847656, + "logps/rejected": -144.71456909179688, + "loss": 0.6025, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6541780829429626, + "rewards/margins": 0.2541576027870178, + "rewards/rejected": -0.9083356857299805, + "step": 1740 + }, + { + "epoch": 0.6305170239596469, + "grad_norm": 21.304424285888672, + "learning_rate": 4.326506529404972e-07, + "logits/chosen": -0.4566799998283386, + "logits/rejected": -0.4474863111972809, + "logps/chosen": -122.94720458984375, + "logps/rejected": -147.1781463623047, + "loss": 0.6345, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7978330850601196, + "rewards/margins": 0.20655818283557892, + "rewards/rejected": -1.0043913125991821, + "step": 1750 + }, + { + "epoch": 0.6341199783822735, + "grad_norm": 27.25872230529785, + "learning_rate": 4.3157342949756176e-07, + "logits/chosen": -0.5053147077560425, + "logits/rejected": -0.5010774731636047, + "logps/chosen": -123.2105712890625, + "logps/rejected": -130.94049072265625, + "loss": 0.6711, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.730686604976654, + "rewards/margins": 0.0933484211564064, + "rewards/rejected": -0.8240349888801575, + "step": 1760 + }, + { + "epoch": 0.6377229328049, + "grad_norm": 13.439959526062012, + "learning_rate": 4.3048902348863106e-07, + "logits/chosen": -0.5893678665161133, + "logits/rejected": -0.5871925354003906, + "logps/chosen": -92.13633728027344, + "logps/rejected": -117.0339584350586, + "loss": 0.617, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5311223268508911, + "rewards/margins": 0.20927007496356964, + "rewards/rejected": -0.7403924465179443, + "step": 1770 + }, + { + "epoch": 0.6413258872275266, + "grad_norm": 14.695592880249023, + "learning_rate": 4.2939747780994696e-07, + "logits/chosen": -0.7266982197761536, + "logits/rejected": -0.7190583944320679, + "logps/chosen": -110.9961929321289, + "logps/rejected": -130.0135498046875, + "loss": 0.6243, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6118022203445435, + "rewards/margins": 0.2004629671573639, + "rewards/rejected": -0.8122652173042297, + "step": 1780 + }, + { + "epoch": 0.6449288416501531, + "grad_norm": 17.60588836669922, + "learning_rate": 4.2829883564017755e-07, + "logits/chosen": -0.6400290727615356, + "logits/rejected": -0.6384719014167786, + "logps/chosen": -111.03657531738281, + "logps/rejected": -124.96858978271484, + "loss": 0.6603, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.649237871170044, + "rewards/margins": 0.11901859939098358, + "rewards/rejected": -0.7682564854621887, + "step": 1790 + }, + { + "epoch": 0.6485317960727797, + "grad_norm": 37.39412307739258, + "learning_rate": 4.2719314043870956e-07, + "logits/chosen": -0.6858614683151245, + "logits/rejected": -0.6850418448448181, + "logps/chosen": -115.84126281738281, + "logps/rejected": -136.5898895263672, + "loss": 0.6471, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5579274892807007, + "rewards/margins": 0.1683448851108551, + "rewards/rejected": -0.726272463798523, + "step": 1800 + }, + { + "epoch": 0.6521347504954063, + "grad_norm": 32.6982536315918, + "learning_rate": 4.260804359439291e-07, + "logits/chosen": -0.4608843922615051, + "logits/rejected": -0.4496752619743347, + "logps/chosen": -121.73396301269531, + "logps/rejected": -141.05642700195312, + "loss": 0.6373, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6710211634635925, + "rewards/margins": 0.17933639883995056, + "rewards/rejected": -0.8503575325012207, + "step": 1810 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 15.985424041748047, + "learning_rate": 4.2496076617149134e-07, + "logits/chosen": -0.22309105098247528, + "logits/rejected": -0.22793006896972656, + "logps/chosen": -118.05805969238281, + "logps/rejected": -141.6474609375, + "loss": 0.615, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7154508233070374, + "rewards/margins": 0.22133982181549072, + "rewards/rejected": -0.9367905855178833, + "step": 1820 + }, + { + "epoch": 0.6593406593406593, + "grad_norm": 23.936935424804688, + "learning_rate": 4.238341754125795e-07, + "logits/chosen": -0.20337358117103577, + "logits/rejected": -0.1934824436903, + "logps/chosen": -133.47775268554688, + "logps/rejected": -165.22193908691406, + "loss": 0.6023, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.967487633228302, + "rewards/margins": 0.28225192427635193, + "rewards/rejected": -1.249739646911621, + "step": 1830 + }, + { + "epoch": 0.6629436137632859, + "grad_norm": 34.532161712646484, + "learning_rate": 4.2270070823215275e-07, + "logits/chosen": -0.20488937199115753, + "logits/rejected": -0.2023644894361496, + "logps/chosen": -129.74929809570312, + "logps/rejected": -150.7936553955078, + "loss": 0.6268, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8183504939079285, + "rewards/margins": 0.21211810410022736, + "rewards/rejected": -1.0304687023162842, + "step": 1840 + }, + { + "epoch": 0.6665465681859124, + "grad_norm": 22.62578773498535, + "learning_rate": 4.2156040946718343e-07, + "logits/chosen": -0.37808769941329956, + "logits/rejected": -0.3699187636375427, + "logps/chosen": -115.9354248046875, + "logps/rejected": -139.9909210205078, + "loss": 0.6198, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7293509244918823, + "rewards/margins": 0.1937699019908905, + "rewards/rejected": -0.9231207966804504, + "step": 1850 + }, + { + "epoch": 0.670149522608539, + "grad_norm": 25.69864845275879, + "learning_rate": 4.204133242248832e-07, + "logits/chosen": -0.40861696004867554, + "logits/rejected": -0.40402859449386597, + "logps/chosen": -118.96281433105469, + "logps/rejected": -133.44906616210938, + "loss": 0.6493, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6875437498092651, + "rewards/margins": 0.14636746048927307, + "rewards/rejected": -0.8339112401008606, + "step": 1860 + }, + { + "epoch": 0.6737524770311656, + "grad_norm": 15.054122924804688, + "learning_rate": 4.1925949788091907e-07, + "logits/chosen": -0.4089645743370056, + "logits/rejected": -0.40489277243614197, + "logps/chosen": -111.43312072753906, + "logps/rejected": -127.93333435058594, + "loss": 0.6398, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6141126751899719, + "rewards/margins": 0.15593598783016205, + "rewards/rejected": -0.7700486779212952, + "step": 1870 + }, + { + "epoch": 0.6773554314537921, + "grad_norm": 36.226905822753906, + "learning_rate": 4.1809897607761814e-07, + "logits/chosen": -0.4524051547050476, + "logits/rejected": -0.4494766294956207, + "logps/chosen": -135.3714141845703, + "logps/rejected": -156.458984375, + "loss": 0.6346, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7505577802658081, + "rewards/margins": 0.20968285202980042, + "rewards/rejected": -0.9602406620979309, + "step": 1880 + }, + { + "epoch": 0.6809583858764187, + "grad_norm": 38.99123001098633, + "learning_rate": 4.169318047221621e-07, + "logits/chosen": -0.25047487020492554, + "logits/rejected": -0.24576766788959503, + "logps/chosen": -122.78657531738281, + "logps/rejected": -141.0259552001953, + "loss": 0.6485, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.722198486328125, + "rewards/margins": 0.13076011836528778, + "rewards/rejected": -0.8529586791992188, + "step": 1890 + }, + { + "epoch": 0.6845613402990453, + "grad_norm": 23.391263961791992, + "learning_rate": 4.157580299847717e-07, + "logits/chosen": -0.11810042709112167, + "logits/rejected": -0.10987571626901627, + "logps/chosen": -133.02330017089844, + "logps/rejected": -162.12051391601562, + "loss": 0.6082, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9049941301345825, + "rewards/margins": 0.2531413733959198, + "rewards/rejected": -1.1581355333328247, + "step": 1900 + }, + { + "epoch": 0.6881642947216717, + "grad_norm": 26.655284881591797, + "learning_rate": 4.145776982968797e-07, + "logits/chosen": -0.06702479720115662, + "logits/rejected": -0.06362093985080719, + "logps/chosen": -139.19175720214844, + "logps/rejected": -151.38427734375, + "loss": 0.6659, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.8984958529472351, + "rewards/margins": 0.10784594714641571, + "rewards/rejected": -1.006341814994812, + "step": 1910 + }, + { + "epoch": 0.6917672491442983, + "grad_norm": 22.961259841918945, + "learning_rate": 4.1339085634929485e-07, + "logits/chosen": -0.07665994018316269, + "logits/rejected": -0.07137580215930939, + "logps/chosen": -140.45701599121094, + "logps/rejected": -164.6908416748047, + "loss": 0.6227, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9155426025390625, + "rewards/margins": 0.21152743697166443, + "rewards/rejected": -1.1270700693130493, + "step": 1920 + }, + { + "epoch": 0.6953702035669249, + "grad_norm": 24.024768829345703, + "learning_rate": 4.1219755109035423e-07, + "logits/chosen": -0.04297425225377083, + "logits/rejected": -0.029266545549035072, + "logps/chosen": -125.0589599609375, + "logps/rejected": -154.24522399902344, + "loss": 0.6034, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8275787234306335, + "rewards/margins": 0.25777915120124817, + "rewards/rejected": -1.085357904434204, + "step": 1930 + }, + { + "epoch": 0.6989731579895514, + "grad_norm": 26.384260177612305, + "learning_rate": 4.1099782972406703e-07, + "logits/chosen": -0.04297472909092903, + "logits/rejected": -0.02821093238890171, + "logps/chosen": -108.7773666381836, + "logps/rejected": -138.58921813964844, + "loss": 0.6024, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.615945041179657, + "rewards/margins": 0.2714507281780243, + "rewards/rejected": -0.8873957395553589, + "step": 1940 + }, + { + "epoch": 0.702576112412178, + "grad_norm": 19.373769760131836, + "learning_rate": 4.097917397082462e-07, + "logits/chosen": -0.1805184781551361, + "logits/rejected": -0.17657090723514557, + "logps/chosen": -118.64015197753906, + "logps/rejected": -136.58822631835938, + "loss": 0.6442, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6089118123054504, + "rewards/margins": 0.13456028699874878, + "rewards/rejected": -0.7434720993041992, + "step": 1950 + }, + { + "epoch": 0.7061790668348046, + "grad_norm": 15.632219314575195, + "learning_rate": 4.085793287526319e-07, + "logits/chosen": -0.018003929406404495, + "logits/rejected": -0.0015068978536874056, + "logps/chosen": -116.03665924072266, + "logps/rejected": -139.82785034179688, + "loss": 0.6434, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.719316303730011, + "rewards/margins": 0.18363362550735474, + "rewards/rejected": -0.9029499292373657, + "step": 1960 + }, + { + "epoch": 0.7097820212574311, + "grad_norm": 26.803855895996094, + "learning_rate": 4.0736064481700396e-07, + "logits/chosen": 0.03305846452713013, + "logits/rejected": 0.03875232860445976, + "logps/chosen": -110.02363586425781, + "logps/rejected": -128.6298828125, + "loss": 0.6317, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6139898896217346, + "rewards/margins": 0.1760130375623703, + "rewards/rejected": -0.7900028228759766, + "step": 1970 + }, + { + "epoch": 0.7133849756800577, + "grad_norm": 13.6882963180542, + "learning_rate": 4.0613573610928477e-07, + "logits/chosen": 0.2014857977628708, + "logits/rejected": 0.2058900147676468, + "logps/chosen": -119.346435546875, + "logps/rejected": -143.44493103027344, + "loss": 0.6174, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6651163101196289, + "rewards/margins": 0.2247789204120636, + "rewards/rejected": -0.8898951411247253, + "step": 1980 + }, + { + "epoch": 0.7169879301026842, + "grad_norm": 54.356929779052734, + "learning_rate": 4.0490465108363213e-07, + "logits/chosen": 0.4385454058647156, + "logits/rejected": 0.4460521340370178, + "logps/chosen": -135.75320434570312, + "logps/rejected": -148.539794921875, + "loss": 0.6576, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.8634354472160339, + "rewards/margins": 0.12716665863990784, + "rewards/rejected": -0.9906021952629089, + "step": 1990 + }, + { + "epoch": 0.7205908845253107, + "grad_norm": 35.139137268066406, + "learning_rate": 4.036674384385231e-07, + "logits/chosen": 0.5260182619094849, + "logits/rejected": 0.5229222178459167, + "logps/chosen": -133.07302856445312, + "logps/rejected": -154.39791870117188, + "loss": 0.6436, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8775787353515625, + "rewards/margins": 0.17777515947818756, + "rewards/rejected": -1.0553538799285889, + "step": 2000 + }, + { + "epoch": 0.7241938389479373, + "grad_norm": 24.67560577392578, + "learning_rate": 4.0242414711482673e-07, + "logits/chosen": 0.4192674160003662, + "logits/rejected": 0.42251092195510864, + "logps/chosen": -135.64370727539062, + "logps/rejected": -154.87327575683594, + "loss": 0.6399, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9285451769828796, + "rewards/margins": 0.16990457475185394, + "rewards/rejected": -1.0984498262405396, + "step": 2010 + }, + { + "epoch": 0.7277967933705639, + "grad_norm": 21.97782325744629, + "learning_rate": 4.0117482629386884e-07, + "logits/chosen": 0.5319436192512512, + "logits/rejected": 0.5361469984054565, + "logps/chosen": -129.13584899902344, + "logps/rejected": -156.07022094726562, + "loss": 0.6011, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8532201647758484, + "rewards/margins": 0.26026397943496704, + "rewards/rejected": -1.1134841442108154, + "step": 2020 + }, + { + "epoch": 0.7313997477931904, + "grad_norm": 37.31975173950195, + "learning_rate": 3.9991952539548616e-07, + "logits/chosen": 0.46142640709877014, + "logits/rejected": 0.47692495584487915, + "logps/chosen": -147.18496704101562, + "logps/rejected": -165.7284698486328, + "loss": 0.656, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0022342205047607, + "rewards/margins": 0.16152818500995636, + "rewards/rejected": -1.1637624502182007, + "step": 2030 + }, + { + "epoch": 0.735002702215817, + "grad_norm": 22.876155853271484, + "learning_rate": 3.9865829407607166e-07, + "logits/chosen": 0.4777229428291321, + "logits/rejected": 0.4776241183280945, + "logps/chosen": -138.6183319091797, + "logps/rejected": -151.57363891601562, + "loss": 0.6566, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.9196840524673462, + "rewards/margins": 0.11917316913604736, + "rewards/rejected": -1.0388572216033936, + "step": 2040 + }, + { + "epoch": 0.7386056566384436, + "grad_norm": 24.542051315307617, + "learning_rate": 3.9739118222660983e-07, + "logits/chosen": 0.40690964460372925, + "logits/rejected": 0.41948142647743225, + "logps/chosen": -150.92098999023438, + "logps/rejected": -181.51345825195312, + "loss": 0.6036, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9969013929367065, + "rewards/margins": 0.2801375389099121, + "rewards/rejected": -1.2770389318466187, + "step": 2050 + }, + { + "epoch": 0.7422086110610701, + "grad_norm": 15.832708358764648, + "learning_rate": 3.961182399707037e-07, + "logits/chosen": 0.4410591721534729, + "logits/rejected": 0.4448552131652832, + "logps/chosen": -142.0412139892578, + "logps/rejected": -171.34796142578125, + "loss": 0.6038, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0556821823120117, + "rewards/margins": 0.25854361057281494, + "rewards/rejected": -1.3142259120941162, + "step": 2060 + }, + { + "epoch": 0.7458115654836966, + "grad_norm": 30.200489044189453, + "learning_rate": 3.9483951766259174e-07, + "logits/chosen": 0.19090792536735535, + "logits/rejected": 0.1829378306865692, + "logps/chosen": -144.0277557373047, + "logps/rejected": -155.68121337890625, + "loss": 0.6687, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9700512886047363, + "rewards/margins": 0.10892186313867569, + "rewards/rejected": -1.0789731740951538, + "step": 2070 + }, + { + "epoch": 0.7494145199063232, + "grad_norm": 16.804433822631836, + "learning_rate": 3.9355506588515587e-07, + "logits/chosen": 0.358919233083725, + "logits/rejected": 0.3698652684688568, + "logps/chosen": -143.83709716796875, + "logps/rejected": -174.1533966064453, + "loss": 0.594, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9783726930618286, + "rewards/margins": 0.2765392065048218, + "rewards/rejected": -1.2549118995666504, + "step": 2080 + }, + { + "epoch": 0.7530174743289497, + "grad_norm": 30.392404556274414, + "learning_rate": 3.922649354479209e-07, + "logits/chosen": 0.514492392539978, + "logits/rejected": 0.5247939825057983, + "logps/chosen": -149.9234161376953, + "logps/rejected": -183.11795043945312, + "loss": 0.5925, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.0595362186431885, + "rewards/margins": 0.30505871772766113, + "rewards/rejected": -1.3645950555801392, + "step": 2090 + }, + { + "epoch": 0.7566204287515763, + "grad_norm": 27.8906192779541, + "learning_rate": 3.9096917738504444e-07, + "logits/chosen": 0.5503655672073364, + "logits/rejected": 0.5556210875511169, + "logps/chosen": -157.77444458007812, + "logps/rejected": -168.21670532226562, + "loss": 0.697, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.185608148574829, + "rewards/margins": 0.07584402710199356, + "rewards/rejected": -1.261452078819275, + "step": 2100 + }, + { + "epoch": 0.7602233831742028, + "grad_norm": 16.32037925720215, + "learning_rate": 3.89667842953298e-07, + "logits/chosen": 0.3441751003265381, + "logits/rejected": 0.34723028540611267, + "logps/chosen": -156.60671997070312, + "logps/rejected": -182.07041931152344, + "loss": 0.6121, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.087058663368225, + "rewards/margins": 0.24376948177814484, + "rewards/rejected": -1.330828070640564, + "step": 2110 + }, + { + "epoch": 0.7638263375968294, + "grad_norm": 21.35797119140625, + "learning_rate": 3.8836098363003966e-07, + "logits/chosen": 0.24712154269218445, + "logits/rejected": 0.2610171139240265, + "logps/chosen": -148.96975708007812, + "logps/rejected": -174.50173950195312, + "loss": 0.6262, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.0182521343231201, + "rewards/margins": 0.24236364662647247, + "rewards/rejected": -1.2606159448623657, + "step": 2120 + }, + { + "epoch": 0.767429292019456, + "grad_norm": 17.814006805419922, + "learning_rate": 3.8704865111117746e-07, + "logits/chosen": 0.08137331157922745, + "logits/rejected": 0.08755414187908173, + "logps/chosen": -154.65521240234375, + "logps/rejected": -178.48974609375, + "loss": 0.6247, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9661283493041992, + "rewards/margins": 0.23251216113567352, + "rewards/rejected": -1.1986405849456787, + "step": 2130 + }, + { + "epoch": 0.7710322464420825, + "grad_norm": 47.21306610107422, + "learning_rate": 3.8573089730912486e-07, + "logits/chosen": 0.10043950378894806, + "logits/rejected": 0.09298092871904373, + "logps/chosen": -155.009033203125, + "logps/rejected": -180.91134643554688, + "loss": 0.6346, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0485827922821045, + "rewards/margins": 0.19028575718402863, + "rewards/rejected": -1.2388685941696167, + "step": 2140 + }, + { + "epoch": 0.774635200864709, + "grad_norm": 27.711566925048828, + "learning_rate": 3.8440777435074677e-07, + "logits/chosen": 0.06114129349589348, + "logits/rejected": 0.07233893126249313, + "logps/chosen": -147.94363403320312, + "logps/rejected": -179.6943359375, + "loss": 0.5764, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0121772289276123, + "rewards/margins": 0.3053652346134186, + "rewards/rejected": -1.3175424337387085, + "step": 2150 + }, + { + "epoch": 0.7782381552873356, + "grad_norm": 46.265296936035156, + "learning_rate": 3.8307933457529803e-07, + "logits/chosen": -0.10898490250110626, + "logits/rejected": -0.10529482364654541, + "logps/chosen": -149.0132293701172, + "logps/rejected": -181.5883026123047, + "loss": 0.5966, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.987149715423584, + "rewards/margins": 0.28507328033447266, + "rewards/rejected": -1.2722227573394775, + "step": 2160 + }, + { + "epoch": 0.7818411097099621, + "grad_norm": 33.7895393371582, + "learning_rate": 3.8174563053235244e-07, + "logits/chosen": 0.034177035093307495, + "logits/rejected": 0.03491184115409851, + "logps/chosen": -132.97091674804688, + "logps/rejected": -162.5496826171875, + "loss": 0.5929, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8792848587036133, + "rewards/margins": 0.26454678177833557, + "rewards/rejected": -1.1438316106796265, + "step": 2170 + }, + { + "epoch": 0.7854440641325887, + "grad_norm": 26.706649780273438, + "learning_rate": 3.804067149797244e-07, + "logits/chosen": 0.025342971086502075, + "logits/rejected": 0.042313508689403534, + "logps/chosen": -153.59963989257812, + "logps/rejected": -181.06259155273438, + "loss": 0.6178, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9308815002441406, + "rewards/margins": 0.2412043809890747, + "rewards/rejected": -1.1720860004425049, + "step": 2180 + }, + { + "epoch": 0.7890470185552153, + "grad_norm": 22.888463973999023, + "learning_rate": 3.790626408813822e-07, + "logits/chosen": 0.09760870039463043, + "logits/rejected": 0.11327888816595078, + "logps/chosen": -149.55943298339844, + "logps/rejected": -168.8435516357422, + "loss": 0.6433, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.0106549263000488, + "rewards/margins": 0.19457527995109558, + "rewards/rejected": -1.2052302360534668, + "step": 2190 + }, + { + "epoch": 0.7926499729778418, + "grad_norm": 16.771135330200195, + "learning_rate": 3.7771346140535214e-07, + "logits/chosen": 0.08035653084516525, + "logits/rejected": 0.10142220556735992, + "logps/chosen": -150.521484375, + "logps/rejected": -182.3574981689453, + "loss": 0.6012, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0371589660644531, + "rewards/margins": 0.2926921248435974, + "rewards/rejected": -1.3298509120941162, + "step": 2200 + }, + { + "epoch": 0.7962529274004684, + "grad_norm": 21.914215087890625, + "learning_rate": 3.763592299216161e-07, + "logits/chosen": 0.14479905366897583, + "logits/rejected": 0.16052904725074768, + "logps/chosen": -140.46218872070312, + "logps/rejected": -165.92160034179688, + "loss": 0.6112, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9338615536689758, + "rewards/margins": 0.2532302141189575, + "rewards/rejected": -1.1870917081832886, + "step": 2210 + }, + { + "epoch": 0.799855881823095, + "grad_norm": 20.929664611816406, + "learning_rate": 3.75e-07, + "logits/chosen": 0.1540573537349701, + "logits/rejected": 0.15413573384284973, + "logps/chosen": -127.8952865600586, + "logps/rejected": -146.11402893066406, + "loss": 0.6447, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8141363263130188, + "rewards/margins": 0.145987868309021, + "rewards/rejected": -0.9601241946220398, + "step": 2220 + }, + { + "epoch": 0.8034588362457215, + "grad_norm": 14.718154907226562, + "learning_rate": 3.7363582540805473e-07, + "logits/chosen": 0.19897550344467163, + "logits/rejected": 0.20991668105125427, + "logps/chosen": -136.5651397705078, + "logps/rejected": -160.22506713867188, + "loss": 0.6231, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8688238859176636, + "rewards/margins": 0.23662717640399933, + "rewards/rejected": -1.105450987815857, + "step": 2230 + }, + { + "epoch": 0.807061790668348, + "grad_norm": 25.30946159362793, + "learning_rate": 3.722667601089292e-07, + "logits/chosen": 0.3127673864364624, + "logits/rejected": 0.3147187829017639, + "logps/chosen": -128.371337890625, + "logps/rejected": -145.5037078857422, + "loss": 0.6542, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8618924021720886, + "rewards/margins": 0.15794196724891663, + "rewards/rejected": -1.019834280014038, + "step": 2240 + }, + { + "epoch": 0.8106647450909746, + "grad_norm": 23.615474700927734, + "learning_rate": 3.7089285825923613e-07, + "logits/chosen": 0.2622937560081482, + "logits/rejected": 0.27894237637519836, + "logps/chosen": -140.64035034179688, + "logps/rejected": -158.76181030273438, + "loss": 0.655, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9477087259292603, + "rewards/margins": 0.16048268973827362, + "rewards/rejected": -1.1081912517547607, + "step": 2250 + }, + { + "epoch": 0.8142676995136011, + "grad_norm": 18.702375411987305, + "learning_rate": 3.69514174206909e-07, + "logits/chosen": 0.2706855833530426, + "logits/rejected": 0.27766889333724976, + "logps/chosen": -120.02188873291016, + "logps/rejected": -126.6515121459961, + "loss": 0.6858, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7856532335281372, + "rewards/margins": 0.05522824451327324, + "rewards/rejected": -0.8408814668655396, + "step": 2260 + }, + { + "epoch": 0.8178706539362277, + "grad_norm": 26.36187744140625, + "learning_rate": 3.6813076248905296e-07, + "logits/chosen": 0.08216744661331177, + "logits/rejected": 0.08492139726877213, + "logps/chosen": -157.15606689453125, + "logps/rejected": -181.71217346191406, + "loss": 0.6458, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.082599401473999, + "rewards/margins": 0.22989189624786377, + "rewards/rejected": -1.3124912977218628, + "step": 2270 + }, + { + "epoch": 0.8214736083588543, + "grad_norm": 13.744657516479492, + "learning_rate": 3.66742677829787e-07, + "logits/chosen": 0.02621442638337612, + "logits/rejected": 0.043502770364284515, + "logps/chosen": -130.1800537109375, + "logps/rejected": -156.7100372314453, + "loss": 0.6069, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8457847833633423, + "rewards/margins": 0.24271400272846222, + "rewards/rejected": -1.088498830795288, + "step": 2280 + }, + { + "epoch": 0.8250765627814808, + "grad_norm": 27.42162322998047, + "learning_rate": 3.6534997513807933e-07, + "logits/chosen": 0.24865540862083435, + "logits/rejected": 0.25898540019989014, + "logps/chosen": -127.29747009277344, + "logps/rejected": -148.6510009765625, + "loss": 0.6276, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7310788035392761, + "rewards/margins": 0.2048756331205368, + "rewards/rejected": -0.9359544515609741, + "step": 2290 + }, + { + "epoch": 0.8286795172041074, + "grad_norm": 18.887805938720703, + "learning_rate": 3.639527095055753e-07, + "logits/chosen": 0.11071660369634628, + "logits/rejected": 0.12018336355686188, + "logps/chosen": -124.1941909790039, + "logps/rejected": -148.11679077148438, + "loss": 0.6363, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8029730916023254, + "rewards/margins": 0.21025899052619934, + "rewards/rejected": -1.0132321119308472, + "step": 2300 + }, + { + "epoch": 0.832282471626734, + "grad_norm": 18.347463607788086, + "learning_rate": 3.625509362044183e-07, + "logits/chosen": 0.2225940227508545, + "logits/rejected": 0.22224624454975128, + "logps/chosen": -137.62863159179688, + "logps/rejected": -155.7116241455078, + "loss": 0.655, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8755800127983093, + "rewards/margins": 0.14910510182380676, + "rewards/rejected": -1.024685025215149, + "step": 2310 + }, + { + "epoch": 0.8358854260493604, + "grad_norm": 21.459951400756836, + "learning_rate": 3.6114471068506315e-07, + "logits/chosen": 0.22385597229003906, + "logits/rejected": 0.24280044436454773, + "logps/chosen": -137.79945373535156, + "logps/rejected": -149.93775939941406, + "loss": 0.6715, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.859727680683136, + "rewards/margins": 0.14403629302978516, + "rewards/rejected": -1.0037639141082764, + "step": 2320 + }, + { + "epoch": 0.839488380471987, + "grad_norm": 13.235859870910645, + "learning_rate": 3.5973408857408263e-07, + "logits/chosen": 0.2293127030134201, + "logits/rejected": 0.2375296801328659, + "logps/chosen": -114.64483642578125, + "logps/rejected": -139.06341552734375, + "loss": 0.6137, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7316290140151978, + "rewards/margins": 0.23739755153656006, + "rewards/rejected": -0.9690265655517578, + "step": 2330 + }, + { + "epoch": 0.8430913348946136, + "grad_norm": 24.24904441833496, + "learning_rate": 3.5831912567196717e-07, + "logits/chosen": 0.15491041541099548, + "logits/rejected": 0.16376709938049316, + "logps/chosen": -143.97264099121094, + "logps/rejected": -166.48516845703125, + "loss": 0.6167, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.9123767614364624, + "rewards/margins": 0.236104816198349, + "rewards/rejected": -1.1484816074371338, + "step": 2340 + }, + { + "epoch": 0.8466942893172401, + "grad_norm": 20.42092514038086, + "learning_rate": 3.568998779509173e-07, + "logits/chosen": 0.20104598999023438, + "logits/rejected": 0.2010325938463211, + "logps/chosen": -132.96450805664062, + "logps/rejected": -158.20303344726562, + "loss": 0.6197, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8694693446159363, + "rewards/margins": 0.22495004534721375, + "rewards/rejected": -1.0944193601608276, + "step": 2350 + }, + { + "epoch": 0.8502972437398667, + "grad_norm": 34.526084899902344, + "learning_rate": 3.5547640155262984e-07, + "logits/chosen": 0.10567860305309296, + "logits/rejected": 0.11500003188848495, + "logps/chosen": -153.9732208251953, + "logps/rejected": -160.86959838867188, + "loss": 0.692, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0203911066055298, + "rewards/margins": 0.08179245889186859, + "rewards/rejected": -1.1021835803985596, + "step": 2360 + }, + { + "epoch": 0.8539001981624933, + "grad_norm": 14.429137229919434, + "learning_rate": 3.5404875278607685e-07, + "logits/chosen": -0.07767397910356522, + "logits/rejected": -0.05828147381544113, + "logps/chosen": -143.14736938476562, + "logps/rejected": -166.0102081298828, + "loss": 0.6237, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9092574119567871, + "rewards/margins": 0.21947212517261505, + "rewards/rejected": -1.1287293434143066, + "step": 2370 + }, + { + "epoch": 0.8575031525851198, + "grad_norm": 18.862186431884766, + "learning_rate": 3.5261698812527847e-07, + "logits/chosen": -0.05472679063677788, + "logits/rejected": -0.04479784518480301, + "logps/chosen": -126.60133361816406, + "logps/rejected": -149.07211303710938, + "loss": 0.608, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8011096119880676, + "rewards/margins": 0.21337732672691345, + "rewards/rejected": -1.0144869089126587, + "step": 2380 + }, + { + "epoch": 0.8611061070077464, + "grad_norm": 25.388795852661133, + "learning_rate": 3.511811642070684e-07, + "logits/chosen": -0.014821426942944527, + "logits/rejected": 0.0019395619165152311, + "logps/chosen": -143.45809936523438, + "logps/rejected": -159.7408447265625, + "loss": 0.6523, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.913117527961731, + "rewards/margins": 0.1669197529554367, + "rewards/rejected": -1.080037236213684, + "step": 2390 + }, + { + "epoch": 0.864709061430373, + "grad_norm": 16.353431701660156, + "learning_rate": 3.4974133782885407e-07, + "logits/chosen": 0.17188173532485962, + "logits/rejected": 0.17839708924293518, + "logps/chosen": -126.90568542480469, + "logps/rejected": -164.10240173339844, + "loss": 0.5844, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7514697313308716, + "rewards/margins": 0.33532294631004333, + "rewards/rejected": -1.0867927074432373, + "step": 2400 + }, + { + "epoch": 0.8683120158529994, + "grad_norm": 28.33049964904785, + "learning_rate": 3.482975659463697e-07, + "logits/chosen": 0.23143163323402405, + "logits/rejected": 0.23064498603343964, + "logps/chosen": -144.11611938476562, + "logps/rejected": -165.2440643310547, + "loss": 0.6236, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9507433176040649, + "rewards/margins": 0.20142099261283875, + "rewards/rejected": -1.1521642208099365, + "step": 2410 + }, + { + "epoch": 0.871914970275626, + "grad_norm": 19.122085571289062, + "learning_rate": 3.4684990567142326e-07, + "logits/chosen": 0.2861153483390808, + "logits/rejected": 0.3029120862483978, + "logps/chosen": -135.65419006347656, + "logps/rejected": -157.39295959472656, + "loss": 0.6447, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9725810885429382, + "rewards/margins": 0.17101779580116272, + "rewards/rejected": -1.1435989141464233, + "step": 2420 + }, + { + "epoch": 0.8755179246982525, + "grad_norm": 25.838401794433594, + "learning_rate": 3.4539841426963714e-07, + "logits/chosen": 0.27975279092788696, + "logits/rejected": 0.2869497239589691, + "logps/chosen": -138.52810668945312, + "logps/rejected": -165.72518920898438, + "loss": 0.6253, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8694036602973938, + "rewards/margins": 0.24049177765846252, + "rewards/rejected": -1.1098954677581787, + "step": 2430 + }, + { + "epoch": 0.8791208791208791, + "grad_norm": 21.262428283691406, + "learning_rate": 3.43943149158183e-07, + "logits/chosen": 0.41645437479019165, + "logits/rejected": 0.41389793157577515, + "logps/chosen": -127.73988342285156, + "logps/rejected": -157.58078002929688, + "loss": 0.6083, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8539397120475769, + "rewards/margins": 0.26630404591560364, + "rewards/rejected": -1.120243787765503, + "step": 2440 + }, + { + "epoch": 0.8827238335435057, + "grad_norm": 14.900445938110352, + "learning_rate": 3.4248416790351084e-07, + "logits/chosen": 0.21880879998207092, + "logits/rejected": 0.2357928305864334, + "logps/chosen": -148.3535614013672, + "logps/rejected": -178.9004669189453, + "loss": 0.6129, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.9314918518066406, + "rewards/margins": 0.276610791683197, + "rewards/rejected": -1.2081027030944824, + "step": 2450 + }, + { + "epoch": 0.8863267879661322, + "grad_norm": 15.81990909576416, + "learning_rate": 3.4102152821907094e-07, + "logits/chosen": 0.2893638610839844, + "logits/rejected": 0.2972751259803772, + "logps/chosen": -119.5505142211914, + "logps/rejected": -146.17257690429688, + "loss": 0.6135, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7644268870353699, + "rewards/margins": 0.26205748319625854, + "rewards/rejected": -1.0264842510223389, + "step": 2460 + }, + { + "epoch": 0.8899297423887588, + "grad_norm": 18.757734298706055, + "learning_rate": 3.395552879630318e-07, + "logits/chosen": 0.45842042565345764, + "logits/rejected": 0.44962722063064575, + "logps/chosen": -130.2977752685547, + "logps/rejected": -158.32522583007812, + "loss": 0.5978, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.893252968788147, + "rewards/margins": 0.27786940336227417, + "rewards/rejected": -1.171122431755066, + "step": 2470 + }, + { + "epoch": 0.8935326968113854, + "grad_norm": 31.11212730407715, + "learning_rate": 3.380855051359911e-07, + "logits/chosen": 0.47310179471969604, + "logits/rejected": 0.4903450608253479, + "logps/chosen": -135.8131866455078, + "logps/rejected": -170.66873168945312, + "loss": 0.5922, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9029496312141418, + "rewards/margins": 0.297838032245636, + "rewards/rejected": -1.2007876634597778, + "step": 2480 + }, + { + "epoch": 0.8971356512340118, + "grad_norm": 40.82261276245117, + "learning_rate": 3.366122378786809e-07, + "logits/chosen": 0.5346935987472534, + "logits/rejected": 0.5293043255805969, + "logps/chosen": -159.31536865234375, + "logps/rejected": -183.24658203125, + "loss": 0.6357, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.0518207550048828, + "rewards/margins": 0.2071865350008011, + "rewards/rejected": -1.259007215499878, + "step": 2490 + }, + { + "epoch": 0.9007386056566384, + "grad_norm": 31.10675621032715, + "learning_rate": 3.351355444696684e-07, + "logits/chosen": 0.4077302813529968, + "logits/rejected": 0.40855541825294495, + "logps/chosen": -150.89895629882812, + "logps/rejected": -178.08035278320312, + "loss": 0.6147, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0311760902404785, + "rewards/margins": 0.24248750507831573, + "rewards/rejected": -1.2736636400222778, + "step": 2500 + }, + { + "epoch": 0.904341560079265, + "grad_norm": 20.22747802734375, + "learning_rate": 3.336554833230504e-07, + "logits/chosen": 0.32154136896133423, + "logits/rejected": 0.31612735986709595, + "logps/chosen": -149.76686096191406, + "logps/rejected": -174.6000213623047, + "loss": 0.6399, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9929599761962891, + "rewards/margins": 0.18228769302368164, + "rewards/rejected": -1.1752477884292603, + "step": 2510 + }, + { + "epoch": 0.9079445145018915, + "grad_norm": 21.268247604370117, + "learning_rate": 3.3217211298614225e-07, + "logits/chosen": 0.4436109662055969, + "logits/rejected": 0.43255481123924255, + "logps/chosen": -143.47824096679688, + "logps/rejected": -167.93153381347656, + "loss": 0.6506, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.0040336847305298, + "rewards/margins": 0.17765824496746063, + "rewards/rejected": -1.1816918849945068, + "step": 2520 + }, + { + "epoch": 0.9115474689245181, + "grad_norm": 29.47437858581543, + "learning_rate": 3.306854921371623e-07, + "logits/chosen": 0.32976576685905457, + "logits/rejected": 0.33072155714035034, + "logps/chosen": -149.66160583496094, + "logps/rejected": -180.8374481201172, + "loss": 0.6016, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0591691732406616, + "rewards/margins": 0.2985631823539734, + "rewards/rejected": -1.3577325344085693, + "step": 2530 + }, + { + "epoch": 0.9151504233471447, + "grad_norm": 15.695672035217285, + "learning_rate": 3.291956795829107e-07, + "logits/chosen": 0.21910643577575684, + "logits/rejected": 0.21998150646686554, + "logps/chosen": -149.00985717773438, + "logps/rejected": -182.81680297851562, + "loss": 0.5892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0104659795761108, + "rewards/margins": 0.3092425763607025, + "rewards/rejected": -1.3197085857391357, + "step": 2540 + }, + { + "epoch": 0.9187533777697712, + "grad_norm": 36.10321044921875, + "learning_rate": 3.277027342564428e-07, + "logits/chosen": 0.2809387147426605, + "logits/rejected": 0.29367387294769287, + "logps/chosen": -140.8743438720703, + "logps/rejected": -170.73983764648438, + "loss": 0.613, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.974753737449646, + "rewards/margins": 0.25796306133270264, + "rewards/rejected": -1.2327167987823486, + "step": 2550 + }, + { + "epoch": 0.9223563321923978, + "grad_norm": 50.42454528808594, + "learning_rate": 3.262067152147383e-07, + "logits/chosen": 0.2883756756782532, + "logits/rejected": 0.2923746705055237, + "logps/chosen": -167.38450622558594, + "logps/rejected": -207.9312286376953, + "loss": 0.5722, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1744787693023682, + "rewards/margins": 0.38665971159935, + "rewards/rejected": -1.5611385107040405, + "step": 2560 + }, + { + "epoch": 0.9259592866150244, + "grad_norm": 15.738551139831543, + "learning_rate": 3.247076816363649e-07, + "logits/chosen": 0.5385319590568542, + "logits/rejected": 0.550905704498291, + "logps/chosen": -145.3175506591797, + "logps/rejected": -181.95762634277344, + "loss": 0.5829, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9679282903671265, + "rewards/margins": 0.34122833609580994, + "rewards/rejected": -1.3091566562652588, + "step": 2570 + }, + { + "epoch": 0.9295622410376508, + "grad_norm": 34.513187408447266, + "learning_rate": 3.2320569281913754e-07, + "logits/chosen": 0.5941283106803894, + "logits/rejected": 0.600369930267334, + "logps/chosen": -156.93695068359375, + "logps/rejected": -183.19618225097656, + "loss": 0.6259, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1337244510650635, + "rewards/margins": 0.22543902695178986, + "rewards/rejected": -1.3591634035110474, + "step": 2580 + }, + { + "epoch": 0.9331651954602774, + "grad_norm": 59.64002227783203, + "learning_rate": 3.2170080817777257e-07, + "logits/chosen": 0.6295598745346069, + "logits/rejected": 0.6174293756484985, + "logps/chosen": -164.24746704101562, + "logps/rejected": -174.8365936279297, + "loss": 0.6714, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.202728509902954, + "rewards/margins": 0.09689317643642426, + "rewards/rejected": -1.29962158203125, + "step": 2590 + }, + { + "epoch": 0.936768149882904, + "grad_norm": 14.286884307861328, + "learning_rate": 3.2019308724153736e-07, + "logits/chosen": 0.48095375299453735, + "logits/rejected": 0.47925907373428345, + "logps/chosen": -157.25457763671875, + "logps/rejected": -174.47314453125, + "loss": 0.66, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.0621507167816162, + "rewards/margins": 0.1528588831424713, + "rewards/rejected": -1.2150094509124756, + "step": 2600 + }, + { + "epoch": 0.9403711043055305, + "grad_norm": 22.402284622192383, + "learning_rate": 3.186825896518958e-07, + "logits/chosen": 0.3799865245819092, + "logits/rejected": 0.3996959328651428, + "logps/chosen": -141.32208251953125, + "logps/rejected": -182.70462036132812, + "loss": 0.5751, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0077296495437622, + "rewards/margins": 0.37155282497406006, + "rewards/rejected": -1.3792823553085327, + "step": 2610 + }, + { + "epoch": 0.9439740587281571, + "grad_norm": 42.79711151123047, + "learning_rate": 3.171693751601486e-07, + "logits/chosen": 0.4481154978275299, + "logits/rejected": 0.453838974237442, + "logps/chosen": -152.2375030517578, + "logps/rejected": -168.63235473632812, + "loss": 0.6532, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0837265253067017, + "rewards/margins": 0.14884427189826965, + "rewards/rejected": -1.2325708866119385, + "step": 2620 + }, + { + "epoch": 0.9475770131507837, + "grad_norm": 25.737060546875, + "learning_rate": 3.156535036250705e-07, + "logits/chosen": 0.44727545976638794, + "logits/rejected": 0.4579724669456482, + "logps/chosen": -133.33920288085938, + "logps/rejected": -167.18508911132812, + "loss": 0.6, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8165891766548157, + "rewards/margins": 0.2997525632381439, + "rewards/rejected": -1.1163415908813477, + "step": 2630 + }, + { + "epoch": 0.9511799675734102, + "grad_norm": 18.539308547973633, + "learning_rate": 3.141350350105413e-07, + "logits/chosen": 0.5598092675209045, + "logits/rejected": 0.5695661306381226, + "logps/chosen": -134.55953979492188, + "logps/rejected": -171.2039031982422, + "loss": 0.573, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9224777221679688, + "rewards/margins": 0.3509232997894287, + "rewards/rejected": -1.273400902748108, + "step": 2640 + }, + { + "epoch": 0.9547829219960368, + "grad_norm": 20.745208740234375, + "learning_rate": 3.126140293831746e-07, + "logits/chosen": 0.5451745986938477, + "logits/rejected": 0.5272158980369568, + "logps/chosen": -166.62710571289062, + "logps/rejected": -186.47088623046875, + "loss": 0.6488, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1236568689346313, + "rewards/margins": 0.18489673733711243, + "rewards/rejected": -1.3085535764694214, + "step": 2650 + }, + { + "epoch": 0.9583858764186634, + "grad_norm": 30.563156127929688, + "learning_rate": 3.1109054690994175e-07, + "logits/chosen": 0.7699509859085083, + "logits/rejected": 0.7913106679916382, + "logps/chosen": -189.5458221435547, + "logps/rejected": -222.76651000976562, + "loss": 0.6159, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4004586935043335, + "rewards/margins": 0.3266792893409729, + "rewards/rejected": -1.7271379232406616, + "step": 2660 + }, + { + "epoch": 0.9619888308412898, + "grad_norm": 18.110149383544922, + "learning_rate": 3.095646478557912e-07, + "logits/chosen": 0.760633111000061, + "logits/rejected": 0.7680120468139648, + "logps/chosen": -154.74481201171875, + "logps/rejected": -184.4192352294922, + "loss": 0.63, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.055311918258667, + "rewards/margins": 0.264528751373291, + "rewards/rejected": -1.3198407888412476, + "step": 2670 + }, + { + "epoch": 0.9655917852639164, + "grad_norm": 32.540931701660156, + "learning_rate": 3.0803639258126533e-07, + "logits/chosen": 0.7593638300895691, + "logits/rejected": 0.7703729271888733, + "logps/chosen": -142.2159423828125, + "logps/rejected": -161.55844116210938, + "loss": 0.6595, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9925827980041504, + "rewards/margins": 0.16774709522724152, + "rewards/rejected": -1.160329818725586, + "step": 2680 + }, + { + "epoch": 0.969194739686543, + "grad_norm": 29.07330894470215, + "learning_rate": 3.0650584154011226e-07, + "logits/chosen": 0.6375503540039062, + "logits/rejected": 0.6571865081787109, + "logps/chosen": -135.99136352539062, + "logps/rejected": -157.39926147460938, + "loss": 0.6373, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8374320864677429, + "rewards/margins": 0.20494922995567322, + "rewards/rejected": -1.0423814058303833, + "step": 2690 + }, + { + "epoch": 0.9727976941091695, + "grad_norm": 12.809856414794922, + "learning_rate": 3.049730552768944e-07, + "logits/chosen": 0.5168333649635315, + "logits/rejected": 0.5086256265640259, + "logps/chosen": -136.57504272460938, + "logps/rejected": -155.1215057373047, + "loss": 0.6371, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8350626826286316, + "rewards/margins": 0.18451206386089325, + "rewards/rejected": -1.019574761390686, + "step": 2700 + }, + { + "epoch": 0.9764006485317961, + "grad_norm": 29.495820999145508, + "learning_rate": 3.034380944245939e-07, + "logits/chosen": 0.5153013467788696, + "logits/rejected": 0.5032767057418823, + "logps/chosen": -120.64376068115234, + "logps/rejected": -144.13934326171875, + "loss": 0.6334, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7888122200965881, + "rewards/margins": 0.2317841351032257, + "rewards/rejected": -1.0205962657928467, + "step": 2710 + }, + { + "epoch": 0.9800036029544227, + "grad_norm": 16.63632583618164, + "learning_rate": 3.0190101970221383e-07, + "logits/chosen": 0.5460031628608704, + "logits/rejected": 0.5601615905761719, + "logps/chosen": -151.58692932128906, + "logps/rejected": -186.6927490234375, + "loss": 0.5919, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0471632480621338, + "rewards/margins": 0.3273688852787018, + "rewards/rejected": -1.3745321035385132, + "step": 2720 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 19.62331199645996, + "learning_rate": 3.0036189191237625e-07, + "logits/chosen": 0.5685732960700989, + "logits/rejected": 0.5920356512069702, + "logps/chosen": -140.84339904785156, + "logps/rejected": -171.97987365722656, + "loss": 0.6112, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9551582336425781, + "rewards/margins": 0.2565491199493408, + "rewards/rejected": -1.211707353591919, + "step": 2730 + }, + { + "epoch": 0.9872095117996758, + "grad_norm": 28.855976104736328, + "learning_rate": 2.9882077193891746e-07, + "logits/chosen": 0.6157333254814148, + "logits/rejected": 0.6215740442276001, + "logps/chosen": -139.96463012695312, + "logps/rejected": -158.04739379882812, + "loss": 0.6528, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9794723391532898, + "rewards/margins": 0.1677200049161911, + "rewards/rejected": -1.147192358970642, + "step": 2740 + }, + { + "epoch": 0.9908124662223022, + "grad_norm": 13.104145050048828, + "learning_rate": 2.972777207444791e-07, + "logits/chosen": 0.955846905708313, + "logits/rejected": 0.9601390957832336, + "logps/chosen": -132.74032592773438, + "logps/rejected": -150.71829223632812, + "loss": 0.6473, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8894306421279907, + "rewards/margins": 0.15613070130348206, + "rewards/rejected": -1.0455615520477295, + "step": 2750 + }, + { + "epoch": 0.9944154206449288, + "grad_norm": 26.207012176513672, + "learning_rate": 2.9573279936809665e-07, + "logits/chosen": 1.0859369039535522, + "logits/rejected": 1.0927445888519287, + "logps/chosen": -141.00888061523438, + "logps/rejected": -158.0570068359375, + "loss": 0.6478, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9025720357894897, + "rewards/margins": 0.15658636391162872, + "rewards/rejected": -1.0591583251953125, + "step": 2760 + }, + { + "epoch": 0.9980183750675554, + "grad_norm": 27.988555908203125, + "learning_rate": 2.941860689227854e-07, + "logits/chosen": 1.4826513528823853, + "logits/rejected": 1.5037438869476318, + "logps/chosen": -163.47537231445312, + "logps/rejected": -182.8646240234375, + "loss": 0.6467, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.173262119293213, + "rewards/margins": 0.19181916117668152, + "rewards/rejected": -1.3650814294815063, + "step": 2770 + }, + { + "epoch": 1.001621329490182, + "grad_norm": 16.333715438842773, + "learning_rate": 2.9263759059312243e-07, + "logits/chosen": 1.7516014575958252, + "logits/rejected": 1.7535407543182373, + "logps/chosen": -157.38601684570312, + "logps/rejected": -180.1123504638672, + "loss": 0.6446, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1189873218536377, + "rewards/margins": 0.19123554229736328, + "rewards/rejected": -1.3102229833602905, + "step": 2780 + }, + { + "epoch": 1.0052242839128085, + "grad_norm": 18.72308349609375, + "learning_rate": 2.910874256328265e-07, + "logits/chosen": 1.6995985507965088, + "logits/rejected": 1.6985797882080078, + "logps/chosen": -158.37786865234375, + "logps/rejected": -183.59176635742188, + "loss": 0.6184, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1016285419464111, + "rewards/margins": 0.22949960827827454, + "rewards/rejected": -1.3311281204223633, + "step": 2790 + }, + { + "epoch": 1.008827238335435, + "grad_norm": 26.921524047851562, + "learning_rate": 2.895356353623352e-07, + "logits/chosen": 1.3525912761688232, + "logits/rejected": 1.3631973266601562, + "logps/chosen": -141.93482971191406, + "logps/rejected": -191.4202423095703, + "loss": 0.5353, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9720309972763062, + "rewards/margins": 0.47538915276527405, + "rewards/rejected": -1.447420358657837, + "step": 2800 + }, + { + "epoch": 1.0124301927580617, + "grad_norm": 17.440500259399414, + "learning_rate": 2.8798228116637895e-07, + "logits/chosen": 1.2329285144805908, + "logits/rejected": 1.2448673248291016, + "logps/chosen": -143.15548706054688, + "logps/rejected": -182.87033081054688, + "loss": 0.5705, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9814394116401672, + "rewards/margins": 0.378670334815979, + "rewards/rejected": -1.360109806060791, + "step": 2810 + }, + { + "epoch": 1.0160331471806883, + "grad_norm": 21.634838104248047, + "learning_rate": 2.8642742449155284e-07, + "logits/chosen": 1.0341891050338745, + "logits/rejected": 1.0338951349258423, + "logps/chosen": -134.21636962890625, + "logps/rejected": -170.69606018066406, + "loss": 0.5857, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8977562785148621, + "rewards/margins": 0.31924647092819214, + "rewards/rejected": -1.2170026302337646, + "step": 2820 + }, + { + "epoch": 1.0196361016033146, + "grad_norm": 30.021751403808594, + "learning_rate": 2.8487112684388637e-07, + "logits/chosen": 1.0514543056488037, + "logits/rejected": 1.0618269443511963, + "logps/chosen": -148.8506317138672, + "logps/rejected": -187.91366577148438, + "loss": 0.5703, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9830052256584167, + "rewards/margins": 0.3607930541038513, + "rewards/rejected": -1.3437983989715576, + "step": 2830 + }, + { + "epoch": 1.0232390560259412, + "grad_norm": 19.15184783935547, + "learning_rate": 2.8331344978640993e-07, + "logits/chosen": 1.1980868577957153, + "logits/rejected": 1.2043404579162598, + "logps/chosen": -135.40493774414062, + "logps/rejected": -177.10235595703125, + "loss": 0.5614, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8613203763961792, + "rewards/margins": 0.4097590446472168, + "rewards/rejected": -1.271079421043396, + "step": 2840 + }, + { + "epoch": 1.0268420104485678, + "grad_norm": 37.32911682128906, + "learning_rate": 2.8175445493671966e-07, + "logits/chosen": 1.3482367992401123, + "logits/rejected": 1.3609994649887085, + "logps/chosen": -138.9056396484375, + "logps/rejected": -172.74681091308594, + "loss": 0.6008, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0595502853393555, + "rewards/margins": 0.3141409754753113, + "rewards/rejected": -1.373691439628601, + "step": 2850 + }, + { + "epoch": 1.0304449648711944, + "grad_norm": 47.66082763671875, + "learning_rate": 2.801942039645403e-07, + "logits/chosen": 1.2990331649780273, + "logits/rejected": 1.3113834857940674, + "logps/chosen": -154.54800415039062, + "logps/rejected": -191.79144287109375, + "loss": 0.5699, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1076867580413818, + "rewards/margins": 0.37979164719581604, + "rewards/rejected": -1.487478494644165, + "step": 2860 + }, + { + "epoch": 1.034047919293821, + "grad_norm": 26.355426788330078, + "learning_rate": 2.7863275858928527e-07, + "logits/chosen": 1.304172396659851, + "logits/rejected": 1.31345534324646, + "logps/chosen": -170.28895568847656, + "logps/rejected": -190.68533325195312, + "loss": 0.6352, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.1143049001693726, + "rewards/margins": 0.21352222561836243, + "rewards/rejected": -1.3278272151947021, + "step": 2870 + }, + { + "epoch": 1.0376508737164474, + "grad_norm": 25.115501403808594, + "learning_rate": 2.7707018057761543e-07, + "logits/chosen": 1.344402551651001, + "logits/rejected": 1.3745404481887817, + "logps/chosen": -170.15597534179688, + "logps/rejected": -207.5067138671875, + "loss": 0.5719, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.150154948234558, + "rewards/margins": 0.36334314942359924, + "rewards/rejected": -1.513498067855835, + "step": 2880 + }, + { + "epoch": 1.041253828139074, + "grad_norm": 24.93830680847168, + "learning_rate": 2.7550653174099604e-07, + "logits/chosen": 1.3183590173721313, + "logits/rejected": 1.3358465433120728, + "logps/chosen": -182.6671600341797, + "logps/rejected": -229.6891326904297, + "loss": 0.5773, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.382752537727356, + "rewards/margins": 0.40944772958755493, + "rewards/rejected": -1.7922000885009766, + "step": 2890 + }, + { + "epoch": 1.0448567825617006, + "grad_norm": 32.1740608215332, + "learning_rate": 2.73941873933251e-07, + "logits/chosen": 1.2893548011779785, + "logits/rejected": 1.303473711013794, + "logps/chosen": -163.84793090820312, + "logps/rejected": -199.80706787109375, + "loss": 0.5972, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2317514419555664, + "rewards/margins": 0.30811601877212524, + "rewards/rejected": -1.5398674011230469, + "step": 2900 + }, + { + "epoch": 1.0484597369843272, + "grad_norm": 47.1985969543457, + "learning_rate": 2.723762690481167e-07, + "logits/chosen": 1.1547422409057617, + "logits/rejected": 1.1732709407806396, + "logps/chosen": -161.27716064453125, + "logps/rejected": -200.41769409179688, + "loss": 0.5814, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1469686031341553, + "rewards/margins": 0.3593052923679352, + "rewards/rejected": -1.5062739849090576, + "step": 2910 + }, + { + "epoch": 1.0520626914069537, + "grad_norm": 19.92612075805664, + "learning_rate": 2.708097790167932e-07, + "logits/chosen": 1.213963508605957, + "logits/rejected": 1.225260615348816, + "logps/chosen": -160.59677124023438, + "logps/rejected": -218.94448852539062, + "loss": 0.5272, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1272809505462646, + "rewards/margins": 0.5488572120666504, + "rewards/rejected": -1.676138162612915, + "step": 2920 + }, + { + "epoch": 1.0556656458295803, + "grad_norm": 27.217994689941406, + "learning_rate": 2.692424658054948e-07, + "logits/chosen": 1.2021996974945068, + "logits/rejected": 1.2243382930755615, + "logps/chosen": -150.98605346679688, + "logps/rejected": -191.73977661132812, + "loss": 0.5703, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0560792684555054, + "rewards/margins": 0.38107022643089294, + "rewards/rejected": -1.4371494054794312, + "step": 2930 + }, + { + "epoch": 1.0592686002522067, + "grad_norm": 23.613012313842773, + "learning_rate": 2.676743914129986e-07, + "logits/chosen": 1.4036846160888672, + "logits/rejected": 1.398654818534851, + "logps/chosen": -151.34022521972656, + "logps/rejected": -191.13412475585938, + "loss": 0.568, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0999627113342285, + "rewards/margins": 0.3745970129966736, + "rewards/rejected": -1.4745595455169678, + "step": 2940 + }, + { + "epoch": 1.0628715546748333, + "grad_norm": 20.741044998168945, + "learning_rate": 2.66105617868192e-07, + "logits/chosen": 1.2105770111083984, + "logits/rejected": 1.2305647134780884, + "logps/chosen": -150.4530487060547, + "logps/rejected": -188.18063354492188, + "loss": 0.5884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0379226207733154, + "rewards/margins": 0.35215944051742554, + "rewards/rejected": -1.3900820016860962, + "step": 2950 + }, + { + "epoch": 1.0664745090974599, + "grad_norm": 24.45860481262207, + "learning_rate": 2.6453620722761895e-07, + "logits/chosen": 1.2178564071655273, + "logits/rejected": 1.2193264961242676, + "logps/chosen": -141.83470153808594, + "logps/rejected": -169.4464569091797, + "loss": 0.604, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9304254651069641, + "rewards/margins": 0.2524339556694031, + "rewards/rejected": -1.1828593015670776, + "step": 2960 + }, + { + "epoch": 1.0700774635200865, + "grad_norm": 16.757301330566406, + "learning_rate": 2.629662215730253e-07, + "logits/chosen": 1.0107862949371338, + "logits/rejected": 1.0186793804168701, + "logps/chosen": -154.68051147460938, + "logps/rejected": -184.76063537597656, + "loss": 0.5993, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.013504981994629, + "rewards/margins": 0.2779073417186737, + "rewards/rejected": -1.291412353515625, + "step": 2970 + }, + { + "epoch": 1.073680417942713, + "grad_norm": 18.887826919555664, + "learning_rate": 2.6139572300890284e-07, + "logits/chosen": 0.9930673837661743, + "logits/rejected": 1.0053017139434814, + "logps/chosen": -149.2520751953125, + "logps/rejected": -189.97689819335938, + "loss": 0.5617, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9238582849502563, + "rewards/margins": 0.4110753536224365, + "rewards/rejected": -1.3349335193634033, + "step": 2980 + }, + { + "epoch": 1.0772833723653397, + "grad_norm": 45.729061126708984, + "learning_rate": 2.598247736600328e-07, + "logits/chosen": 1.266531229019165, + "logits/rejected": 1.2912251949310303, + "logps/chosen": -156.42117309570312, + "logps/rejected": -194.59445190429688, + "loss": 0.5698, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1178277730941772, + "rewards/margins": 0.35697516798973083, + "rewards/rejected": -1.47480309009552, + "step": 2990 + }, + { + "epoch": 1.080886326787966, + "grad_norm": 33.51045608520508, + "learning_rate": 2.5825343566902833e-07, + "logits/chosen": 1.4589413404464722, + "logits/rejected": 1.462807536125183, + "logps/chosen": -165.98814392089844, + "logps/rejected": -203.2148895263672, + "loss": 0.5743, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2189103364944458, + "rewards/margins": 0.35195571184158325, + "rewards/rejected": -1.5708658695220947, + "step": 3000 + }, + { + "epoch": 1.0844892812105926, + "grad_norm": 25.705270767211914, + "learning_rate": 2.5668177119387617e-07, + "logits/chosen": 1.634394884109497, + "logits/rejected": 1.6530296802520752, + "logps/chosen": -167.57522583007812, + "logps/rejected": -193.9906005859375, + "loss": 0.6154, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.272362470626831, + "rewards/margins": 0.2553406059741974, + "rewards/rejected": -1.5277031660079956, + "step": 3010 + }, + { + "epoch": 1.0880922356332192, + "grad_norm": 34.7142219543457, + "learning_rate": 2.5510984240547787e-07, + "logits/chosen": 1.3826122283935547, + "logits/rejected": 1.4002482891082764, + "logps/chosen": -175.1686553955078, + "logps/rejected": -225.0598907470703, + "loss": 0.5373, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.293283224105835, + "rewards/margins": 0.46684423089027405, + "rewards/rejected": -1.7601274251937866, + "step": 3020 + }, + { + "epoch": 1.0916951900558458, + "grad_norm": 17.66757583618164, + "learning_rate": 2.535377114851905e-07, + "logits/chosen": 1.4605615139007568, + "logits/rejected": 1.4629911184310913, + "logps/chosen": -181.341796875, + "logps/rejected": -200.5535888671875, + "loss": 0.634, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2361996173858643, + "rewards/margins": 0.20834848284721375, + "rewards/rejected": -1.4445480108261108, + "step": 3030 + }, + { + "epoch": 1.0952981444784724, + "grad_norm": 18.307947158813477, + "learning_rate": 2.5196544062236707e-07, + "logits/chosen": 1.6120392084121704, + "logits/rejected": 1.6377861499786377, + "logps/chosen": -172.27444458007812, + "logps/rejected": -202.8404083251953, + "loss": 0.5851, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2176176309585571, + "rewards/margins": 0.3250825107097626, + "rewards/rejected": -1.542700171470642, + "step": 3040 + }, + { + "epoch": 1.098901098901099, + "grad_norm": 19.542686462402344, + "learning_rate": 2.503930920118961e-07, + "logits/chosen": 1.651121735572815, + "logits/rejected": 1.6678673028945923, + "logps/chosen": -187.13082885742188, + "logps/rejected": -230.998291015625, + "loss": 0.5629, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3362236022949219, + "rewards/margins": 0.4135264754295349, + "rewards/rejected": -1.7497501373291016, + "step": 3050 + }, + { + "epoch": 1.1025040533237254, + "grad_norm": 27.288055419921875, + "learning_rate": 2.4882072785174194e-07, + "logits/chosen": 1.6910793781280518, + "logits/rejected": 1.7082237005233765, + "logps/chosen": -180.62667846679688, + "logps/rejected": -218.1063995361328, + "loss": 0.5924, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3642722368240356, + "rewards/margins": 0.32754629850387573, + "rewards/rejected": -1.6918185949325562, + "step": 3060 + }, + { + "epoch": 1.106107007746352, + "grad_norm": 23.36140251159668, + "learning_rate": 2.472484103404839e-07, + "logits/chosen": 1.8909047842025757, + "logits/rejected": 1.9143555164337158, + "logps/chosen": -177.67005920410156, + "logps/rejected": -226.27023315429688, + "loss": 0.5493, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3711577653884888, + "rewards/margins": 0.4601329267024994, + "rewards/rejected": -1.831290602684021, + "step": 3070 + }, + { + "epoch": 1.1097099621689785, + "grad_norm": 23.70610809326172, + "learning_rate": 2.456762016748556e-07, + "logits/chosen": 1.6471796035766602, + "logits/rejected": 1.6680177450180054, + "logps/chosen": -182.0051727294922, + "logps/rejected": -212.8391876220703, + "loss": 0.6011, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.307997226715088, + "rewards/margins": 0.28122976422309875, + "rewards/rejected": -1.5892269611358643, + "step": 3080 + }, + { + "epoch": 1.1133129165916051, + "grad_norm": 29.763378143310547, + "learning_rate": 2.441041640472858e-07, + "logits/chosen": 1.7746632099151611, + "logits/rejected": 1.7879133224487305, + "logps/chosen": -177.87359619140625, + "logps/rejected": -214.80038452148438, + "loss": 0.5802, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3018782138824463, + "rewards/margins": 0.3463166654109955, + "rewards/rejected": -1.6481950283050537, + "step": 3090 + }, + { + "epoch": 1.1169158710142317, + "grad_norm": 18.889694213867188, + "learning_rate": 2.4253235964343674e-07, + "logits/chosen": 1.5324766635894775, + "logits/rejected": 1.5299230813980103, + "logps/chosen": -167.12747192382812, + "logps/rejected": -210.18603515625, + "loss": 0.5655, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2045719623565674, + "rewards/margins": 0.4194146990776062, + "rewards/rejected": -1.623986840248108, + "step": 3100 + }, + { + "epoch": 1.1205188254368583, + "grad_norm": 25.045333862304688, + "learning_rate": 2.409608506397452e-07, + "logits/chosen": 1.4209635257720947, + "logits/rejected": 1.4399478435516357, + "logps/chosen": -172.46139526367188, + "logps/rejected": -226.7716064453125, + "loss": 0.5278, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2924270629882812, + "rewards/margins": 0.5143944621086121, + "rewards/rejected": -1.8068214654922485, + "step": 3110 + }, + { + "epoch": 1.1241217798594847, + "grad_norm": 39.801109313964844, + "learning_rate": 2.3938969920096296e-07, + "logits/chosen": 1.3100801706314087, + "logits/rejected": 1.3263208866119385, + "logps/chosen": -182.75791931152344, + "logps/rejected": -214.8870086669922, + "loss": 0.5971, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2580931186676025, + "rewards/margins": 0.2964404225349426, + "rewards/rejected": -1.55453360080719, + "step": 3120 + }, + { + "epoch": 1.1277247342821113, + "grad_norm": 39.32272720336914, + "learning_rate": 2.3781896747769694e-07, + "logits/chosen": 1.4089020490646362, + "logits/rejected": 1.3984390497207642, + "logps/chosen": -169.61061096191406, + "logps/rejected": -199.11709594726562, + "loss": 0.6259, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2158353328704834, + "rewards/margins": 0.2781444191932678, + "rewards/rejected": -1.4939799308776855, + "step": 3130 + }, + { + "epoch": 1.1313276887047379, + "grad_norm": 28.20737648010254, + "learning_rate": 2.3624871760395174e-07, + "logits/chosen": 1.1554807424545288, + "logits/rejected": 1.1675853729248047, + "logps/chosen": -168.42044067382812, + "logps/rejected": -200.41329956054688, + "loss": 0.6016, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2163656949996948, + "rewards/margins": 0.28965193033218384, + "rewards/rejected": -1.5060179233551025, + "step": 3140 + }, + { + "epoch": 1.1349306431273645, + "grad_norm": 20.062538146972656, + "learning_rate": 2.3467901169467096e-07, + "logits/chosen": 0.9233118295669556, + "logits/rejected": 0.9547305107116699, + "logps/chosen": -155.99371337890625, + "logps/rejected": -204.6820068359375, + "loss": 0.5564, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0997803211212158, + "rewards/margins": 0.4566062092781067, + "rewards/rejected": -1.5563864707946777, + "step": 3150 + }, + { + "epoch": 1.138533597549991, + "grad_norm": 28.119457244873047, + "learning_rate": 2.331099118432804e-07, + "logits/chosen": 0.9703793525695801, + "logits/rejected": 0.9870656728744507, + "logps/chosen": -161.81423950195312, + "logps/rejected": -210.0536346435547, + "loss": 0.5446, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0635930299758911, + "rewards/margins": 0.4630206227302551, + "rewards/rejected": -1.5266135931015015, + "step": 3160 + }, + { + "epoch": 1.1421365519726177, + "grad_norm": 25.566730499267578, + "learning_rate": 2.3154148011923205e-07, + "logits/chosen": 1.0091289281845093, + "logits/rejected": 1.018500804901123, + "logps/chosen": -149.37063598632812, + "logps/rejected": -185.6090545654297, + "loss": 0.5847, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0620778799057007, + "rewards/margins": 0.33241063356399536, + "rewards/rejected": -1.3944883346557617, + "step": 3170 + }, + { + "epoch": 1.145739506395244, + "grad_norm": 19.381229400634766, + "learning_rate": 2.299737785655482e-07, + "logits/chosen": 0.9534355401992798, + "logits/rejected": 0.9737855195999146, + "logps/chosen": -136.06202697753906, + "logps/rejected": -177.0410614013672, + "loss": 0.5696, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8820406198501587, + "rewards/margins": 0.3690762519836426, + "rewards/rejected": -1.2511168718338013, + "step": 3180 + }, + { + "epoch": 1.1493424608178706, + "grad_norm": 22.582590103149414, + "learning_rate": 2.284068691963679e-07, + "logits/chosen": 0.9719399213790894, + "logits/rejected": 0.9772558212280273, + "logps/chosen": -156.23695373535156, + "logps/rejected": -189.47659301757812, + "loss": 0.6057, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9947269558906555, + "rewards/margins": 0.333953320980072, + "rewards/rejected": -1.3286802768707275, + "step": 3190 + }, + { + "epoch": 1.1529454152404972, + "grad_norm": 36.40176773071289, + "learning_rate": 2.2684081399449323e-07, + "logits/chosen": 1.103335976600647, + "logits/rejected": 1.128285527229309, + "logps/chosen": -155.7313995361328, + "logps/rejected": -195.7241973876953, + "loss": 0.5751, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1343395709991455, + "rewards/margins": 0.3672861158847809, + "rewards/rejected": -1.5016257762908936, + "step": 3200 + }, + { + "epoch": 1.1565483696631238, + "grad_norm": 21.671106338500977, + "learning_rate": 2.2527567490893755e-07, + "logits/chosen": 1.2460180521011353, + "logits/rejected": 1.2542035579681396, + "logps/chosen": -171.56942749023438, + "logps/rejected": -207.84957885742188, + "loss": 0.5866, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2231214046478271, + "rewards/margins": 0.34527188539505005, + "rewards/rejected": -1.5683931112289429, + "step": 3210 + }, + { + "epoch": 1.1601513240857504, + "grad_norm": 43.77824020385742, + "learning_rate": 2.2371151385247544e-07, + "logits/chosen": 1.3986154794692993, + "logits/rejected": 1.405017375946045, + "logps/chosen": -173.19020080566406, + "logps/rejected": -201.94764709472656, + "loss": 0.6082, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.268537163734436, + "rewards/margins": 0.2790360450744629, + "rewards/rejected": -1.547573208808899, + "step": 3220 + }, + { + "epoch": 1.1637542785083768, + "grad_norm": 37.33060073852539, + "learning_rate": 2.2214839269919288e-07, + "logits/chosen": 1.2926814556121826, + "logits/rejected": 1.3005971908569336, + "logps/chosen": -177.24127197265625, + "logps/rejected": -223.33432006835938, + "loss": 0.5431, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.298537015914917, + "rewards/margins": 0.4248967170715332, + "rewards/rejected": -1.7234338521957397, + "step": 3230 + }, + { + "epoch": 1.1673572329310034, + "grad_norm": 18.869747161865234, + "learning_rate": 2.205863732820404e-07, + "logits/chosen": 1.297871708869934, + "logits/rejected": 1.309443473815918, + "logps/chosen": -170.31053161621094, + "logps/rejected": -194.85159301757812, + "loss": 0.6298, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1644783020019531, + "rewards/margins": 0.21012084186077118, + "rewards/rejected": -1.3745992183685303, + "step": 3240 + }, + { + "epoch": 1.17096018735363, + "grad_norm": 43.628421783447266, + "learning_rate": 2.1902551739038622e-07, + "logits/chosen": 1.1812689304351807, + "logits/rejected": 1.2138346433639526, + "logps/chosen": -169.75918579101562, + "logps/rejected": -204.258056640625, + "loss": 0.5745, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1383635997772217, + "rewards/margins": 0.37113919854164124, + "rewards/rejected": -1.50950288772583, + "step": 3250 + }, + { + "epoch": 1.1745631417762565, + "grad_norm": 33.38032913208008, + "learning_rate": 2.1746588676757308e-07, + "logits/chosen": 1.4970940351486206, + "logits/rejected": 1.4973194599151611, + "logps/chosen": -152.75123596191406, + "logps/rejected": -183.58334350585938, + "loss": 0.5906, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0627615451812744, + "rewards/margins": 0.29873785376548767, + "rewards/rejected": -1.361499547958374, + "step": 3260 + }, + { + "epoch": 1.1781660961988831, + "grad_norm": 38.73574447631836, + "learning_rate": 2.1590754310847508e-07, + "logits/chosen": 1.5429904460906982, + "logits/rejected": 1.5525925159454346, + "logps/chosen": -171.74215698242188, + "logps/rejected": -208.65048217773438, + "loss": 0.5845, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2257620096206665, + "rewards/margins": 0.37997758388519287, + "rewards/rejected": -1.6057395935058594, + "step": 3270 + }, + { + "epoch": 1.1817690506215097, + "grad_norm": 26.747926712036133, + "learning_rate": 2.143505480570573e-07, + "logits/chosen": 1.7026646137237549, + "logits/rejected": 1.706345796585083, + "logps/chosen": -172.7478485107422, + "logps/rejected": -202.12100219726562, + "loss": 0.6002, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.265164852142334, + "rewards/margins": 0.2747200131416321, + "rewards/rejected": -1.5398849248886108, + "step": 3280 + }, + { + "epoch": 1.1853720050441363, + "grad_norm": 18.19614601135254, + "learning_rate": 2.1279496320393779e-07, + "logits/chosen": 1.7919038534164429, + "logits/rejected": 1.8115230798721313, + "logps/chosen": -169.95596313476562, + "logps/rejected": -199.9071807861328, + "loss": 0.5977, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2951703071594238, + "rewards/margins": 0.2859644293785095, + "rewards/rejected": -1.5811350345611572, + "step": 3290 + }, + { + "epoch": 1.1889749594667627, + "grad_norm": 34.03652572631836, + "learning_rate": 2.112408500839505e-07, + "logits/chosen": 1.735272765159607, + "logits/rejected": 1.7567336559295654, + "logps/chosen": -174.78927612304688, + "logps/rejected": -207.58273315429688, + "loss": 0.5929, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.353682279586792, + "rewards/margins": 0.32816195487976074, + "rewards/rejected": -1.6818439960479736, + "step": 3300 + }, + { + "epoch": 1.1925779138893893, + "grad_norm": 24.930068969726562, + "learning_rate": 2.0968827017371192e-07, + "logits/chosen": 1.8864787817001343, + "logits/rejected": 1.8942277431488037, + "logps/chosen": -179.2598876953125, + "logps/rejected": -224.4481964111328, + "loss": 0.5468, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3099626302719116, + "rewards/margins": 0.4144628047943115, + "rewards/rejected": -1.7244255542755127, + "step": 3310 + }, + { + "epoch": 1.1961808683120159, + "grad_norm": 29.145889282226562, + "learning_rate": 2.0813728488918848e-07, + "logits/chosen": 1.9467144012451172, + "logits/rejected": 1.9601352214813232, + "logps/chosen": -193.031494140625, + "logps/rejected": -219.81509399414062, + "loss": 0.6417, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4769078493118286, + "rewards/margins": 0.26800230145454407, + "rewards/rejected": -1.7449100017547607, + "step": 3320 + }, + { + "epoch": 1.1997838227346425, + "grad_norm": 42.68914031982422, + "learning_rate": 2.065879555832674e-07, + "logits/chosen": 1.8014549016952515, + "logits/rejected": 1.8138593435287476, + "logps/chosen": -180.00479125976562, + "logps/rejected": -225.0125274658203, + "loss": 0.5526, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3541065454483032, + "rewards/margins": 0.42764395475387573, + "rewards/rejected": -1.7817504405975342, + "step": 3330 + }, + { + "epoch": 1.203386777157269, + "grad_norm": 34.69449996948242, + "learning_rate": 2.0504034354333004e-07, + "logits/chosen": 1.8517045974731445, + "logits/rejected": 1.8728597164154053, + "logps/chosen": -181.0088348388672, + "logps/rejected": -217.8776092529297, + "loss": 0.5963, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3582488298416138, + "rewards/margins": 0.3246374726295471, + "rewards/rejected": -1.6828863620758057, + "step": 3340 + }, + { + "epoch": 1.2069897315798954, + "grad_norm": 32.25773239135742, + "learning_rate": 2.0349450998882698e-07, + "logits/chosen": 1.6548973321914673, + "logits/rejected": 1.6580450534820557, + "logps/chosen": -173.4766082763672, + "logps/rejected": -219.5607452392578, + "loss": 0.546, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3156660795211792, + "rewards/margins": 0.4414525032043457, + "rewards/rejected": -1.757118582725525, + "step": 3350 + }, + { + "epoch": 1.210592686002522, + "grad_norm": 23.346769332885742, + "learning_rate": 2.0195051606885681e-07, + "logits/chosen": 1.716970443725586, + "logits/rejected": 1.7288544178009033, + "logps/chosen": -179.84616088867188, + "logps/rejected": -232.7053985595703, + "loss": 0.5229, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2913398742675781, + "rewards/margins": 0.4949992597103119, + "rewards/rejected": -1.7863390445709229, + "step": 3360 + }, + { + "epoch": 1.2141956404251486, + "grad_norm": 41.44517517089844, + "learning_rate": 2.0040842285974683e-07, + "logits/chosen": 1.8047672510147095, + "logits/rejected": 1.8259674310684204, + "logps/chosen": -167.1689453125, + "logps/rejected": -208.96798706054688, + "loss": 0.572, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2015494108200073, + "rewards/margins": 0.40487393736839294, + "rewards/rejected": -1.6064231395721436, + "step": 3370 + }, + { + "epoch": 1.2177985948477752, + "grad_norm": 30.084402084350586, + "learning_rate": 1.9886829136263728e-07, + "logits/chosen": 1.648328185081482, + "logits/rejected": 1.664375901222229, + "logps/chosen": -181.54995727539062, + "logps/rejected": -223.4890594482422, + "loss": 0.5653, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2826671600341797, + "rewards/margins": 0.3996722102165222, + "rewards/rejected": -1.6823394298553467, + "step": 3380 + }, + { + "epoch": 1.2214015492704018, + "grad_norm": 36.883480072021484, + "learning_rate": 1.973301825010685e-07, + "logits/chosen": 1.5796527862548828, + "logits/rejected": 1.6008110046386719, + "logps/chosen": -179.32435607910156, + "logps/rejected": -232.6096649169922, + "loss": 0.536, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2281485795974731, + "rewards/margins": 0.506437361240387, + "rewards/rejected": -1.7345860004425049, + "step": 3390 + }, + { + "epoch": 1.2250045036930284, + "grad_norm": 24.782224655151367, + "learning_rate": 1.9579415711857016e-07, + "logits/chosen": 1.7208601236343384, + "logits/rejected": 1.7337154150009155, + "logps/chosen": -171.23948669433594, + "logps/rejected": -222.1886749267578, + "loss": 0.5463, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2556393146514893, + "rewards/margins": 0.48701682686805725, + "rewards/rejected": -1.7426563501358032, + "step": 3400 + }, + { + "epoch": 1.2286074581156547, + "grad_norm": 24.2948055267334, + "learning_rate": 1.9426027597625572e-07, + "logits/chosen": 1.6595081090927124, + "logits/rejected": 1.6720672845840454, + "logps/chosen": -154.27455139160156, + "logps/rejected": -200.77243041992188, + "loss": 0.5543, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1199270486831665, + "rewards/margins": 0.42178797721862793, + "rewards/rejected": -1.5417152643203735, + "step": 3410 + }, + { + "epoch": 1.2322104125382813, + "grad_norm": 42.27910614013672, + "learning_rate": 1.9272859975041752e-07, + "logits/chosen": 1.641770362854004, + "logits/rejected": 1.6533361673355103, + "logps/chosen": -157.47933959960938, + "logps/rejected": -201.40994262695312, + "loss": 0.5535, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.150720477104187, + "rewards/margins": 0.40662074089050293, + "rewards/rejected": -1.5573410987854004, + "step": 3420 + }, + { + "epoch": 1.235813366960908, + "grad_norm": 16.248056411743164, + "learning_rate": 1.911991890301275e-07, + "logits/chosen": 1.603314995765686, + "logits/rejected": 1.5976498126983643, + "logps/chosen": -158.0511474609375, + "logps/rejected": -201.87158203125, + "loss": 0.5674, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0927891731262207, + "rewards/margins": 0.4202328622341156, + "rewards/rejected": -1.5130221843719482, + "step": 3430 + }, + { + "epoch": 1.2394163213835345, + "grad_norm": 29.743022918701172, + "learning_rate": 1.896721043148402e-07, + "logits/chosen": 1.6673412322998047, + "logits/rejected": 1.6792166233062744, + "logps/chosen": -160.25917053222656, + "logps/rejected": -200.7294921875, + "loss": 0.5779, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2035908699035645, + "rewards/margins": 0.39745160937309265, + "rewards/rejected": -1.6010425090789795, + "step": 3440 + }, + { + "epoch": 1.2430192758061611, + "grad_norm": 27.176912307739258, + "learning_rate": 1.881474060119994e-07, + "logits/chosen": 1.6526479721069336, + "logits/rejected": 1.6897213459014893, + "logps/chosen": -177.30697631835938, + "logps/rejected": -229.65560913085938, + "loss": 0.5329, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3172175884246826, + "rewards/margins": 0.48564568161964417, + "rewards/rejected": -1.8028631210327148, + "step": 3450 + }, + { + "epoch": 1.2466222302287875, + "grad_norm": 36.72285079956055, + "learning_rate": 1.866251544346488e-07, + "logits/chosen": 1.756397008895874, + "logits/rejected": 1.7641305923461914, + "logps/chosen": -195.5851287841797, + "logps/rejected": -234.1574249267578, + "loss": 0.6014, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4670528173446655, + "rewards/margins": 0.36591073870658875, + "rewards/rejected": -1.8329633474349976, + "step": 3460 + }, + { + "epoch": 1.250225184651414, + "grad_norm": 22.320993423461914, + "learning_rate": 1.8510540979904617e-07, + "logits/chosen": 1.8513895273208618, + "logits/rejected": 1.8651978969573975, + "logps/chosen": -179.46591186523438, + "logps/rejected": -221.010986328125, + "loss": 0.5725, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3116358518600464, + "rewards/margins": 0.3873592019081116, + "rewards/rejected": -1.6989952325820923, + "step": 3470 + }, + { + "epoch": 1.2538281390740407, + "grad_norm": 23.453344345092773, + "learning_rate": 1.8358823222228096e-07, + "logits/chosen": 1.7122529745101929, + "logits/rejected": 1.7139049768447876, + "logps/chosen": -181.01031494140625, + "logps/rejected": -227.14730834960938, + "loss": 0.5398, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3143945932388306, + "rewards/margins": 0.46285495162010193, + "rewards/rejected": -1.7772495746612549, + "step": 3480 + }, + { + "epoch": 1.2574310934966673, + "grad_norm": 19.974374771118164, + "learning_rate": 1.820736817198969e-07, + "logits/chosen": 1.8285239934921265, + "logits/rejected": 1.845882773399353, + "logps/chosen": -195.367431640625, + "logps/rejected": -251.32760620117188, + "loss": 0.5016, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4235846996307373, + "rewards/margins": 0.6087952256202698, + "rewards/rejected": -2.0323803424835205, + "step": 3490 + }, + { + "epoch": 1.2610340479192939, + "grad_norm": 31.72352409362793, + "learning_rate": 1.8056181820351735e-07, + "logits/chosen": 1.8216755390167236, + "logits/rejected": 1.8578245639801025, + "logps/chosen": -192.22169494628906, + "logps/rejected": -249.3321990966797, + "loss": 0.5411, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.42820143699646, + "rewards/margins": 0.5573378801345825, + "rewards/rejected": -1.985539197921753, + "step": 3500 + }, + { + "epoch": 1.2646370023419204, + "grad_norm": 26.99710464477539, + "learning_rate": 1.790527014784758e-07, + "logits/chosen": 1.7859792709350586, + "logits/rejected": 1.8002761602401733, + "logps/chosen": -193.13339233398438, + "logps/rejected": -230.9226531982422, + "loss": 0.5891, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4562654495239258, + "rewards/margins": 0.3704061210155487, + "rewards/rejected": -1.8266716003417969, + "step": 3510 + }, + { + "epoch": 1.268239956764547, + "grad_norm": 24.12320899963379, + "learning_rate": 1.7754639124144977e-07, + "logits/chosen": 1.3900184631347656, + "logits/rejected": 1.4246901273727417, + "logps/chosen": -177.86846923828125, + "logps/rejected": -229.09835815429688, + "loss": 0.5582, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2260959148406982, + "rewards/margins": 0.4854293763637543, + "rewards/rejected": -1.711525321006775, + "step": 3520 + }, + { + "epoch": 1.2718429111871734, + "grad_norm": 22.157583236694336, + "learning_rate": 1.760429470780994e-07, + "logits/chosen": 1.6068884134292603, + "logits/rejected": 1.6049220561981201, + "logps/chosen": -195.89663696289062, + "logps/rejected": -228.62124633789062, + "loss": 0.602, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4787858724594116, + "rewards/margins": 0.31004488468170166, + "rewards/rejected": -1.7888309955596924, + "step": 3530 + }, + { + "epoch": 1.2754458656098, + "grad_norm": 28.59613609313965, + "learning_rate": 1.7454242846071082e-07, + "logits/chosen": 1.819340467453003, + "logits/rejected": 1.8372827768325806, + "logps/chosen": -177.56573486328125, + "logps/rejected": -217.68765258789062, + "loss": 0.581, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3727514743804932, + "rewards/margins": 0.3638874888420105, + "rewards/rejected": -1.7366390228271484, + "step": 3540 + }, + { + "epoch": 1.2790488200324266, + "grad_norm": 13.975077629089355, + "learning_rate": 1.7304489474584304e-07, + "logits/chosen": 1.8392823934555054, + "logits/rejected": 1.8703858852386475, + "logps/chosen": -188.02796936035156, + "logps/rejected": -227.65371704101562, + "loss": 0.5815, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.389987826347351, + "rewards/margins": 0.3787449598312378, + "rewards/rejected": -1.7687326669692993, + "step": 3550 + }, + { + "epoch": 1.2826517744550532, + "grad_norm": 30.64284324645996, + "learning_rate": 1.715504051719804e-07, + "logits/chosen": 1.9738810062408447, + "logits/rejected": 2.007951498031616, + "logps/chosen": -186.0692138671875, + "logps/rejected": -219.5050811767578, + "loss": 0.5717, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2764512300491333, + "rewards/margins": 0.3545674681663513, + "rewards/rejected": -1.6310186386108398, + "step": 3560 + }, + { + "epoch": 1.2862547288776798, + "grad_norm": 42.76008605957031, + "learning_rate": 1.7005901885718867e-07, + "logits/chosen": 2.18925142288208, + "logits/rejected": 2.197112560272217, + "logps/chosen": -211.40823364257812, + "logps/rejected": -231.84521484375, + "loss": 0.6725, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6586414575576782, + "rewards/margins": 0.19599632918834686, + "rewards/rejected": -1.854637861251831, + "step": 3570 + }, + { + "epoch": 1.2898576833003061, + "grad_norm": 27.870746612548828, + "learning_rate": 1.6857079479677737e-07, + "logits/chosen": 2.225771427154541, + "logits/rejected": 2.2617292404174805, + "logps/chosen": -181.1365966796875, + "logps/rejected": -224.7388916015625, + "loss": 0.5556, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3274881839752197, + "rewards/margins": 0.4119059443473816, + "rewards/rejected": -1.739394187927246, + "step": 3580 + }, + { + "epoch": 1.2934606377229327, + "grad_norm": 48.422306060791016, + "learning_rate": 1.670857918609653e-07, + "logits/chosen": 2.1853294372558594, + "logits/rejected": 2.2130117416381836, + "logps/chosen": -202.51756286621094, + "logps/rejected": -249.8184051513672, + "loss": 0.5853, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5900542736053467, + "rewards/margins": 0.42613738775253296, + "rewards/rejected": -2.0161914825439453, + "step": 3590 + }, + { + "epoch": 1.2970635921455593, + "grad_norm": 37.77787780761719, + "learning_rate": 1.656040687925519e-07, + "logits/chosen": 1.9480301141738892, + "logits/rejected": 1.9540107250213623, + "logps/chosen": -200.03726196289062, + "logps/rejected": -242.9154510498047, + "loss": 0.591, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5215644836425781, + "rewards/margins": 0.3655272424221039, + "rewards/rejected": -1.8870916366577148, + "step": 3600 + }, + { + "epoch": 1.300666546568186, + "grad_norm": 25.00056266784668, + "learning_rate": 1.641256842045942e-07, + "logits/chosen": 2.2472565174102783, + "logits/rejected": 2.2626872062683105, + "logps/chosen": -182.2674102783203, + "logps/rejected": -223.0990447998047, + "loss": 0.541, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3268883228302002, + "rewards/margins": 0.4107748866081238, + "rewards/rejected": -1.7376632690429688, + "step": 3610 + }, + { + "epoch": 1.3042695009908125, + "grad_norm": 25.82343292236328, + "learning_rate": 1.6265069657808728e-07, + "logits/chosen": 2.3760194778442383, + "logits/rejected": 2.397968053817749, + "logps/chosen": -194.54000854492188, + "logps/rejected": -240.69888305664062, + "loss": 0.5759, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5028674602508545, + "rewards/margins": 0.40088844299316406, + "rewards/rejected": -1.903755784034729, + "step": 3620 + }, + { + "epoch": 1.307872455413439, + "grad_norm": 19.218997955322266, + "learning_rate": 1.6117916425965157e-07, + "logits/chosen": 2.402834892272949, + "logits/rejected": 2.4300312995910645, + "logps/chosen": -207.3802490234375, + "logps/rejected": -253.8350830078125, + "loss": 0.5656, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6533803939819336, + "rewards/margins": 0.43287163972854614, + "rewards/rejected": -2.086251735687256, + "step": 3630 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 37.13401794433594, + "learning_rate": 1.5971114545922475e-07, + "logits/chosen": 2.393101453781128, + "logits/rejected": 2.4327123165130615, + "logps/chosen": -183.045166015625, + "logps/rejected": -239.2936553955078, + "loss": 0.5543, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4144227504730225, + "rewards/margins": 0.5289689898490906, + "rewards/rejected": -1.943392038345337, + "step": 3640 + }, + { + "epoch": 1.315078364258692, + "grad_norm": 21.490629196166992, + "learning_rate": 1.5824669824775866e-07, + "logits/chosen": 2.260507822036743, + "logits/rejected": 2.273637056350708, + "logps/chosen": -193.7061004638672, + "logps/rejected": -234.71377563476562, + "loss": 0.5882, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4560885429382324, + "rewards/margins": 0.4170869290828705, + "rewards/rejected": -1.8731753826141357, + "step": 3650 + }, + { + "epoch": 1.3186813186813187, + "grad_norm": 23.327041625976562, + "learning_rate": 1.5678588055492286e-07, + "logits/chosen": 2.1389849185943604, + "logits/rejected": 2.1763854026794434, + "logps/chosen": -172.3188018798828, + "logps/rejected": -217.666259765625, + "loss": 0.5715, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2966365814208984, + "rewards/margins": 0.4119698405265808, + "rewards/rejected": -1.708606481552124, + "step": 3660 + }, + { + "epoch": 1.3222842731039453, + "grad_norm": 33.65090560913086, + "learning_rate": 1.5532875016681247e-07, + "logits/chosen": 2.0638270378112793, + "logits/rejected": 2.0853710174560547, + "logps/chosen": -195.6360321044922, + "logps/rejected": -230.1496124267578, + "loss": 0.5884, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.422572374343872, + "rewards/margins": 0.35662734508514404, + "rewards/rejected": -1.7791998386383057, + "step": 3670 + }, + { + "epoch": 1.3258872275265718, + "grad_norm": 27.418167114257812, + "learning_rate": 1.5387536472366275e-07, + "logits/chosen": 1.9518663883209229, + "logits/rejected": 1.9620939493179321, + "logps/chosen": -174.151123046875, + "logps/rejected": -207.51699829101562, + "loss": 0.6117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3327616453170776, + "rewards/margins": 0.31053417921066284, + "rewards/rejected": -1.6432958841323853, + "step": 3680 + }, + { + "epoch": 1.3294901819491982, + "grad_norm": 24.672998428344727, + "learning_rate": 1.5242578171756864e-07, + "logits/chosen": 1.8104326725006104, + "logits/rejected": 1.8332993984222412, + "logps/chosen": -172.15753173828125, + "logps/rejected": -208.52212524414062, + "loss": 0.5907, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3121302127838135, + "rewards/margins": 0.3558919131755829, + "rewards/rejected": -1.6680221557617188, + "step": 3690 + }, + { + "epoch": 1.3330931363718248, + "grad_norm": 26.359895706176758, + "learning_rate": 1.5098005849021078e-07, + "logits/chosen": 1.8233158588409424, + "logits/rejected": 1.8709800243377686, + "logps/chosen": -188.22592163085938, + "logps/rejected": -236.8144073486328, + "loss": 0.5635, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4036380052566528, + "rewards/margins": 0.44843512773513794, + "rewards/rejected": -1.852073073387146, + "step": 3700 + }, + { + "epoch": 1.3366960907944514, + "grad_norm": 29.816444396972656, + "learning_rate": 1.495382522305872e-07, + "logits/chosen": 1.8397347927093506, + "logits/rejected": 1.8576186895370483, + "logps/chosen": -191.6383819580078, + "logps/rejected": -232.81326293945312, + "loss": 0.5587, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.405597448348999, + "rewards/margins": 0.3850526213645935, + "rewards/rejected": -1.7906501293182373, + "step": 3710 + }, + { + "epoch": 1.340299045217078, + "grad_norm": 23.615833282470703, + "learning_rate": 1.4810041997275092e-07, + "logits/chosen": 1.8601493835449219, + "logits/rejected": 1.8741000890731812, + "logps/chosen": -184.04672241210938, + "logps/rejected": -241.630126953125, + "loss": 0.5135, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3591305017471313, + "rewards/margins": 0.5281540155410767, + "rewards/rejected": -1.887284517288208, + "step": 3720 + }, + { + "epoch": 1.3439019996397046, + "grad_norm": 20.799488067626953, + "learning_rate": 1.4666661859355404e-07, + "logits/chosen": 2.1675801277160645, + "logits/rejected": 2.1676344871520996, + "logps/chosen": -183.47018432617188, + "logps/rejected": -213.0686492919922, + "loss": 0.6005, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4248483180999756, + "rewards/margins": 0.26402515172958374, + "rewards/rejected": -1.688873529434204, + "step": 3730 + }, + { + "epoch": 1.3475049540623312, + "grad_norm": 28.859333038330078, + "learning_rate": 1.452369048103976e-07, + "logits/chosen": 1.9734678268432617, + "logits/rejected": 1.9968979358673096, + "logps/chosen": -192.2145538330078, + "logps/rejected": -248.0688018798828, + "loss": 0.5195, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4926023483276367, + "rewards/margins": 0.5251239538192749, + "rewards/rejected": -2.017726421356201, + "step": 3740 + }, + { + "epoch": 1.3511079084849578, + "grad_norm": 18.08772087097168, + "learning_rate": 1.4381133517898803e-07, + "logits/chosen": 2.115337610244751, + "logits/rejected": 2.1246225833892822, + "logps/chosen": -196.42152404785156, + "logps/rejected": -234.7654266357422, + "loss": 0.5767, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4249114990234375, + "rewards/margins": 0.39205366373062134, + "rewards/rejected": -1.816965103149414, + "step": 3750 + }, + { + "epoch": 1.3547108629075844, + "grad_norm": 33.767269134521484, + "learning_rate": 1.423899660911005e-07, + "logits/chosen": 1.9451555013656616, + "logits/rejected": 1.9652153253555298, + "logps/chosen": -189.8253631591797, + "logps/rejected": -238.01486206054688, + "loss": 0.5512, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4437839984893799, + "rewards/margins": 0.4582470953464508, + "rewards/rejected": -1.9020313024520874, + "step": 3760 + }, + { + "epoch": 1.3583138173302107, + "grad_norm": 21.92230796813965, + "learning_rate": 1.4097285377234724e-07, + "logits/chosen": 1.7935683727264404, + "logits/rejected": 1.811952829360962, + "logps/chosen": -189.38926696777344, + "logps/rejected": -227.97006225585938, + "loss": 0.5781, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.397295355796814, + "rewards/margins": 0.3482345938682556, + "rewards/rejected": -1.7455298900604248, + "step": 3770 + }, + { + "epoch": 1.3619167717528373, + "grad_norm": 27.76797103881836, + "learning_rate": 1.395600542799542e-07, + "logits/chosen": 2.068068742752075, + "logits/rejected": 2.0708858966827393, + "logps/chosen": -191.26097106933594, + "logps/rejected": -233.2442169189453, + "loss": 0.5504, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4574092626571655, + "rewards/margins": 0.39677366614341736, + "rewards/rejected": -1.8541829586029053, + "step": 3780 + }, + { + "epoch": 1.365519726175464, + "grad_norm": 27.542654037475586, + "learning_rate": 1.381516235005433e-07, + "logits/chosen": 2.1321825981140137, + "logits/rejected": 2.1544833183288574, + "logps/chosen": -203.79705810546875, + "logps/rejected": -246.9839324951172, + "loss": 0.5738, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5344202518463135, + "rewards/margins": 0.42250028252601624, + "rewards/rejected": -1.9569206237792969, + "step": 3790 + }, + { + "epoch": 1.3691226805980905, + "grad_norm": 24.526832580566406, + "learning_rate": 1.367476171479215e-07, + "logits/chosen": 2.1942076683044434, + "logits/rejected": 2.220576524734497, + "logps/chosen": -180.60507202148438, + "logps/rejected": -225.52011108398438, + "loss": 0.5746, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4094579219818115, + "rewards/margins": 0.4005855917930603, + "rewards/rejected": -1.8100435733795166, + "step": 3800 + }, + { + "epoch": 1.3727256350207169, + "grad_norm": 44.125572204589844, + "learning_rate": 1.3534809076087732e-07, + "logits/chosen": 1.9495807886123657, + "logits/rejected": 1.999224066734314, + "logps/chosen": -195.178466796875, + "logps/rejected": -241.162109375, + "loss": 0.5579, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4358845949172974, + "rewards/margins": 0.4407084584236145, + "rewards/rejected": -1.876592993736267, + "step": 3810 + }, + { + "epoch": 1.3763285894433435, + "grad_norm": 24.335865020751953, + "learning_rate": 1.3395309970098342e-07, + "logits/chosen": 2.3719897270202637, + "logits/rejected": 2.4065325260162354, + "logps/chosen": -202.8979034423828, + "logps/rejected": -255.8293914794922, + "loss": 0.5364, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.622004747390747, + "rewards/margins": 0.4795509874820709, + "rewards/rejected": -2.101555347442627, + "step": 3820 + }, + { + "epoch": 1.37993154386597, + "grad_norm": 18.380468368530273, + "learning_rate": 1.3256269915040736e-07, + "logits/chosen": 2.3298912048339844, + "logits/rejected": 2.3397748470306396, + "logps/chosen": -200.35983276367188, + "logps/rejected": -234.7159881591797, + "loss": 0.5998, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5939356088638306, + "rewards/margins": 0.3117136061191559, + "rewards/rejected": -1.905648946762085, + "step": 3830 + }, + { + "epoch": 1.3835344982885966, + "grad_norm": 27.52941131591797, + "learning_rate": 1.3117694410972747e-07, + "logits/chosen": 2.283259868621826, + "logits/rejected": 2.3234059810638428, + "logps/chosen": -183.7685546875, + "logps/rejected": -246.74740600585938, + "loss": 0.529, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4562674760818481, + "rewards/margins": 0.5746394991874695, + "rewards/rejected": -2.030907154083252, + "step": 3840 + }, + { + "epoch": 1.3871374527112232, + "grad_norm": 31.34552001953125, + "learning_rate": 1.2979588939575878e-07, + "logits/chosen": 2.1807518005371094, + "logits/rejected": 2.2002217769622803, + "logps/chosen": -208.7446746826172, + "logps/rejected": -253.20889282226562, + "loss": 0.584, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6211105585098267, + "rewards/margins": 0.4426586627960205, + "rewards/rejected": -2.0637693405151367, + "step": 3850 + }, + { + "epoch": 1.3907404071338498, + "grad_norm": 49.54113006591797, + "learning_rate": 1.2841958963938338e-07, + "logits/chosen": 2.1136391162872314, + "logits/rejected": 2.0968828201293945, + "logps/chosen": -206.9363250732422, + "logps/rejected": -228.25143432617188, + "loss": 0.6461, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4934473037719727, + "rewards/margins": 0.2033785581588745, + "rewards/rejected": -1.6968257427215576, + "step": 3860 + }, + { + "epoch": 1.3943433615564764, + "grad_norm": 37.7128791809082, + "learning_rate": 1.2704809928338957e-07, + "logits/chosen": 2.224104404449463, + "logits/rejected": 2.2668509483337402, + "logps/chosen": -191.69552612304688, + "logps/rejected": -232.6199951171875, + "loss": 0.5649, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.503103494644165, + "rewards/margins": 0.4015834331512451, + "rewards/rejected": -1.9046871662139893, + "step": 3870 + }, + { + "epoch": 1.3979463159791028, + "grad_norm": 25.900611877441406, + "learning_rate": 1.2568147258031897e-07, + "logits/chosen": 1.8822847604751587, + "logits/rejected": 1.9027903079986572, + "logps/chosen": -187.94454956054688, + "logps/rejected": -226.4380340576172, + "loss": 0.5925, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.40830659866333, + "rewards/margins": 0.36093783378601074, + "rewards/rejected": -1.7692444324493408, + "step": 3880 + }, + { + "epoch": 1.4015492704017294, + "grad_norm": 26.54643440246582, + "learning_rate": 1.2431976359031955e-07, + "logits/chosen": 1.8965444564819336, + "logits/rejected": 1.9167616367340088, + "logps/chosen": -186.21804809570312, + "logps/rejected": -238.50595092773438, + "loss": 0.5416, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4133446216583252, + "rewards/margins": 0.4906320571899414, + "rewards/rejected": -1.9039767980575562, + "step": 3890 + }, + { + "epoch": 1.405152224824356, + "grad_norm": 20.78183937072754, + "learning_rate": 1.2296302617900768e-07, + "logits/chosen": 2.143287181854248, + "logits/rejected": 2.1438541412353516, + "logps/chosen": -198.9349365234375, + "logps/rejected": -233.54049682617188, + "loss": 0.6188, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.544627070426941, + "rewards/margins": 0.3243458569049835, + "rewards/rejected": -1.8689727783203125, + "step": 3900 + }, + { + "epoch": 1.4087551792469826, + "grad_norm": 25.700965881347656, + "learning_rate": 1.216113140153371e-07, + "logits/chosen": 2.0946247577667236, + "logits/rejected": 2.117295742034912, + "logps/chosen": -196.2801513671875, + "logps/rejected": -242.2544403076172, + "loss": 0.5527, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5123510360717773, + "rewards/margins": 0.4487608075141907, + "rewards/rejected": -1.9611120223999023, + "step": 3910 + }, + { + "epoch": 1.4123581336696092, + "grad_norm": 33.97287368774414, + "learning_rate": 1.2026468056947606e-07, + "logits/chosen": 2.178783655166626, + "logits/rejected": 2.2043185234069824, + "logps/chosen": -201.91873168945312, + "logps/rejected": -244.51504516601562, + "loss": 0.6021, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.573336124420166, + "rewards/margins": 0.3733164072036743, + "rewards/rejected": -1.9466526508331299, + "step": 3920 + }, + { + "epoch": 1.4159610880922355, + "grad_norm": 45.7540397644043, + "learning_rate": 1.189231791106921e-07, + "logits/chosen": 2.130858898162842, + "logits/rejected": 2.180258274078369, + "logps/chosen": -201.24057006835938, + "logps/rejected": -257.926025390625, + "loss": 0.5199, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6287132501602173, + "rewards/margins": 0.5473740696907043, + "rewards/rejected": -2.1760871410369873, + "step": 3930 + }, + { + "epoch": 1.4195640425148621, + "grad_norm": 27.437519073486328, + "learning_rate": 1.1758686270524482e-07, + "logits/chosen": 2.3916800022125244, + "logits/rejected": 2.4437804222106934, + "logps/chosen": -217.48770141601562, + "logps/rejected": -262.7460632324219, + "loss": 0.5762, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7124818563461304, + "rewards/margins": 0.4045422077178955, + "rewards/rejected": -2.1170241832733154, + "step": 3940 + }, + { + "epoch": 1.4231669969374887, + "grad_norm": 28.69942855834961, + "learning_rate": 1.1625578421428714e-07, + "logits/chosen": 2.472696304321289, + "logits/rejected": 2.467665433883667, + "logps/chosen": -226.68984985351562, + "logps/rejected": -254.80654907226562, + "loss": 0.6573, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.8046505451202393, + "rewards/margins": 0.23238249123096466, + "rewards/rejected": -2.0370326042175293, + "step": 3950 + }, + { + "epoch": 1.4267699513601153, + "grad_norm": 30.90325355529785, + "learning_rate": 1.149299962917733e-07, + "logits/chosen": 2.5207648277282715, + "logits/rejected": 2.524153470993042, + "logps/chosen": -208.6819610595703, + "logps/rejected": -243.366943359375, + "loss": 0.6122, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.645262360572815, + "rewards/margins": 0.32572299242019653, + "rewards/rejected": -1.9709851741790771, + "step": 3960 + }, + { + "epoch": 1.430372905782742, + "grad_norm": 28.970237731933594, + "learning_rate": 1.1360955138237699e-07, + "logits/chosen": 2.6872897148132324, + "logits/rejected": 2.7066454887390137, + "logps/chosen": -217.373046875, + "logps/rejected": -254.2002716064453, + "loss": 0.5834, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.753053069114685, + "rewards/margins": 0.342947393655777, + "rewards/rejected": -2.0960006713867188, + "step": 3970 + }, + { + "epoch": 1.4339758602053685, + "grad_norm": 40.99421310424805, + "learning_rate": 1.1229450171941657e-07, + "logits/chosen": 2.7422609329223633, + "logits/rejected": 2.7955024242401123, + "logps/chosen": -219.7352294921875, + "logps/rejected": -280.3626708984375, + "loss": 0.5177, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7397339344024658, + "rewards/margins": 0.6051377058029175, + "rewards/rejected": -2.3448715209960938, + "step": 3980 + }, + { + "epoch": 1.437578814627995, + "grad_norm": 30.1541748046875, + "learning_rate": 1.109848993227881e-07, + "logits/chosen": 2.5172581672668457, + "logits/rejected": 2.5636343955993652, + "logps/chosen": -218.80911254882812, + "logps/rejected": -269.53973388671875, + "loss": 0.5514, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7507011890411377, + "rewards/margins": 0.4547833800315857, + "rewards/rejected": -2.205484390258789, + "step": 3990 + }, + { + "epoch": 1.4411817690506215, + "grad_norm": 34.5133056640625, + "learning_rate": 1.0968079599690872e-07, + "logits/chosen": 2.8047642707824707, + "logits/rejected": 2.835019826889038, + "logps/chosen": -232.9544219970703, + "logps/rejected": -268.1980285644531, + "loss": 0.6157, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.9134280681610107, + "rewards/margins": 0.30228549242019653, + "rewards/rejected": -2.2157135009765625, + "step": 4000 + }, + { + "epoch": 1.444784723473248, + "grad_norm": 37.54106140136719, + "learning_rate": 1.083822433286666e-07, + "logits/chosen": 2.6304919719696045, + "logits/rejected": 2.652879476547241, + "logps/chosen": -206.3196258544922, + "logps/rejected": -249.50363159179688, + "loss": 0.5796, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6504005193710327, + "rewards/margins": 0.40803390741348267, + "rewards/rejected": -2.058434247970581, + "step": 4010 + }, + { + "epoch": 1.4483876778958746, + "grad_norm": 19.438796997070312, + "learning_rate": 1.0708929268538034e-07, + "logits/chosen": 2.2060623168945312, + "logits/rejected": 2.231776714324951, + "logps/chosen": -215.81393432617188, + "logps/rejected": -249.65823364257812, + "loss": 0.6158, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6890411376953125, + "rewards/margins": 0.3048761487007141, + "rewards/rejected": -1.9939172267913818, + "step": 4020 + }, + { + "epoch": 1.4519906323185012, + "grad_norm": 30.540685653686523, + "learning_rate": 1.0580199521276759e-07, + "logits/chosen": 2.5210537910461426, + "logits/rejected": 2.549612045288086, + "logps/chosen": -206.7338409423828, + "logps/rejected": -244.7657012939453, + "loss": 0.5878, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6120946407318115, + "rewards/margins": 0.36665600538253784, + "rewards/rejected": -1.9787505865097046, + "step": 4030 + }, + { + "epoch": 1.4555935867411276, + "grad_norm": 23.23627471923828, + "learning_rate": 1.0452040183292124e-07, + "logits/chosen": 2.3309028148651123, + "logits/rejected": 2.3343138694763184, + "logps/chosen": -208.15908813476562, + "logps/rejected": -247.7962188720703, + "loss": 0.5666, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.594066858291626, + "rewards/margins": 0.38210588693618774, + "rewards/rejected": -1.9761728048324585, + "step": 4040 + }, + { + "epoch": 1.4591965411637542, + "grad_norm": 21.690418243408203, + "learning_rate": 1.0324456324229536e-07, + "logits/chosen": 2.2826833724975586, + "logits/rejected": 2.3174784183502197, + "logps/chosen": -195.55740356445312, + "logps/rejected": -230.21023559570312, + "loss": 0.6036, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4546035528182983, + "rewards/margins": 0.3235613703727722, + "rewards/rejected": -1.7781648635864258, + "step": 4050 + }, + { + "epoch": 1.4627994955863808, + "grad_norm": 24.094581604003906, + "learning_rate": 1.0197452990969976e-07, + "logits/chosen": 2.2778327465057373, + "logits/rejected": 2.343728542327881, + "logps/chosen": -198.56021118164062, + "logps/rejected": -256.511962890625, + "loss": 0.5293, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5641119480133057, + "rewards/margins": 0.533170223236084, + "rewards/rejected": -2.0972819328308105, + "step": 4060 + }, + { + "epoch": 1.4664024500090074, + "grad_norm": 23.411500930786133, + "learning_rate": 1.007103520743035e-07, + "logits/chosen": 2.2860405445098877, + "logits/rejected": 2.3017578125, + "logps/chosen": -208.67245483398438, + "logps/rejected": -249.4423370361328, + "loss": 0.596, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6604540348052979, + "rewards/margins": 0.3682071268558502, + "rewards/rejected": -2.0286612510681152, + "step": 4070 + }, + { + "epoch": 1.470005404431634, + "grad_norm": 29.012025833129883, + "learning_rate": 9.945207974364767e-08, + "logits/chosen": 2.3106510639190674, + "logits/rejected": 2.344446897506714, + "logps/chosen": -243.06356811523438, + "logps/rejected": -285.7310791015625, + "loss": 0.6411, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8531125783920288, + "rewards/margins": 0.3922838568687439, + "rewards/rejected": -2.245396614074707, + "step": 4080 + }, + { + "epoch": 1.4736083588542606, + "grad_norm": 31.737157821655273, + "learning_rate": 9.819976269166704e-08, + "logits/chosen": 2.3555893898010254, + "logits/rejected": 2.340897798538208, + "logps/chosen": -219.1803436279297, + "logps/rejected": -248.2200927734375, + "loss": 0.603, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6209685802459717, + "rewards/margins": 0.2904869616031647, + "rewards/rejected": -1.911455512046814, + "step": 4090 + }, + { + "epoch": 1.4772113132768872, + "grad_norm": 37.465389251708984, + "learning_rate": 9.695345045672165e-08, + "logits/chosen": 2.235996961593628, + "logits/rejected": 2.2625551223754883, + "logps/chosen": -198.73208618164062, + "logps/rejected": -233.3943328857422, + "loss": 0.5865, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5233110189437866, + "rewards/margins": 0.3367288410663605, + "rewards/rejected": -1.8600399494171143, + "step": 4100 + }, + { + "epoch": 1.4808142676995135, + "grad_norm": 28.036596298217773, + "learning_rate": 9.571319233963626e-08, + "logits/chosen": 2.2205023765563965, + "logits/rejected": 2.242121934890747, + "logps/chosen": -189.00392150878906, + "logps/rejected": -233.1159210205078, + "loss": 0.5559, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4463272094726562, + "rewards/margins": 0.44084176421165466, + "rewards/rejected": -1.8871688842773438, + "step": 4110 + }, + { + "epoch": 1.4844172221221401, + "grad_norm": 36.313133239746094, + "learning_rate": 9.447903740175098e-08, + "logits/chosen": 2.470679759979248, + "logits/rejected": 2.494894504547119, + "logps/chosen": -187.43682861328125, + "logps/rejected": -241.8269805908203, + "loss": 0.5278, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4297406673431396, + "rewards/margins": 0.5476259589195251, + "rewards/rejected": -1.9773668050765991, + "step": 4120 + }, + { + "epoch": 1.4880201765447667, + "grad_norm": 32.384525299072266, + "learning_rate": 9.325103446298038e-08, + "logits/chosen": 2.352541208267212, + "logits/rejected": 2.365428924560547, + "logps/chosen": -214.4003448486328, + "logps/rejected": -244.2249298095703, + "loss": 0.633, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.6720874309539795, + "rewards/margins": 0.27814793586730957, + "rewards/rejected": -1.9502356052398682, + "step": 4130 + }, + { + "epoch": 1.4916231309673933, + "grad_norm": 36.610294342041016, + "learning_rate": 9.202923209988197e-08, + "logits/chosen": 2.142672538757324, + "logits/rejected": 2.1541152000427246, + "logps/chosen": -198.27978515625, + "logps/rejected": -231.7624053955078, + "loss": 0.5759, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.429612398147583, + "rewards/margins": 0.342256098985672, + "rewards/rejected": -1.7718684673309326, + "step": 4140 + }, + { + "epoch": 1.4952260853900199, + "grad_norm": 20.335296630859375, + "learning_rate": 9.081367864373488e-08, + "logits/chosen": 2.2970311641693115, + "logits/rejected": 2.3052754402160645, + "logps/chosen": -192.21348571777344, + "logps/rejected": -225.3434295654297, + "loss": 0.6121, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3674556016921997, + "rewards/margins": 0.31879502534866333, + "rewards/rejected": -1.6862504482269287, + "step": 4150 + }, + { + "epoch": 1.4988290398126463, + "grad_norm": 25.644750595092773, + "learning_rate": 8.960442217862795e-08, + "logits/chosen": 2.054680347442627, + "logits/rejected": 2.0955567359924316, + "logps/chosen": -194.7618408203125, + "logps/rejected": -247.1341094970703, + "loss": 0.5412, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.515158772468567, + "rewards/margins": 0.46768832206726074, + "rewards/rejected": -1.9828474521636963, + "step": 4160 + }, + { + "epoch": 1.5024319942352728, + "grad_norm": 38.148651123046875, + "learning_rate": 8.840151053955772e-08, + "logits/chosen": 2.0653910636901855, + "logits/rejected": 2.0864741802215576, + "logps/chosen": -205.56201171875, + "logps/rejected": -238.72677612304688, + "loss": 0.6198, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6179357767105103, + "rewards/margins": 0.32318201661109924, + "rewards/rejected": -1.9411180019378662, + "step": 4170 + }, + { + "epoch": 1.5060349486578994, + "grad_norm": 28.420183181762695, + "learning_rate": 8.720499131053611e-08, + "logits/chosen": 2.002610921859741, + "logits/rejected": 2.0557868480682373, + "logps/chosen": -190.8556671142578, + "logps/rejected": -230.8711395263672, + "loss": 0.589, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4824676513671875, + "rewards/margins": 0.3686225414276123, + "rewards/rejected": -1.8510901927947998, + "step": 4180 + }, + { + "epoch": 1.509637903080526, + "grad_norm": 23.488773345947266, + "learning_rate": 8.601491182270812e-08, + "logits/chosen": 2.133180618286133, + "logits/rejected": 2.159903049468994, + "logps/chosen": -210.1831512451172, + "logps/rejected": -258.10443115234375, + "loss": 0.5667, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6593990325927734, + "rewards/margins": 0.46231454610824585, + "rewards/rejected": -2.121713638305664, + "step": 4190 + }, + { + "epoch": 1.5132408575031526, + "grad_norm": 25.544527053833008, + "learning_rate": 8.483131915247967e-08, + "logits/chosen": 2.097330093383789, + "logits/rejected": 2.1371865272521973, + "logps/chosen": -188.57901000976562, + "logps/rejected": -234.85470581054688, + "loss": 0.5559, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4768812656402588, + "rewards/margins": 0.4339800775051117, + "rewards/rejected": -1.910861611366272, + "step": 4200 + }, + { + "epoch": 1.5168438119257792, + "grad_norm": 19.057374954223633, + "learning_rate": 8.365426011965512e-08, + "logits/chosen": 2.0573668479919434, + "logits/rejected": 2.072871685028076, + "logps/chosen": -207.1567840576172, + "logps/rejected": -246.02230834960938, + "loss": 0.6044, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5855404138565063, + "rewards/margins": 0.3767652213573456, + "rewards/rejected": -1.9623054265975952, + "step": 4210 + }, + { + "epoch": 1.5204467663484058, + "grad_norm": 27.889890670776367, + "learning_rate": 8.248378128558564e-08, + "logits/chosen": 2.236063003540039, + "logits/rejected": 2.2808563709259033, + "logps/chosen": -213.96591186523438, + "logps/rejected": -264.50653076171875, + "loss": 0.5562, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.730071783065796, + "rewards/margins": 0.4675087332725525, + "rewards/rejected": -2.1975808143615723, + "step": 4220 + }, + { + "epoch": 1.5240497207710324, + "grad_norm": 20.943641662597656, + "learning_rate": 8.131992895132692e-08, + "logits/chosen": 1.8917369842529297, + "logits/rejected": 1.8827037811279297, + "logps/chosen": -192.46511840820312, + "logps/rejected": -225.7458953857422, + "loss": 0.5849, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3797410726547241, + "rewards/margins": 0.3336414694786072, + "rewards/rejected": -1.7133827209472656, + "step": 4230 + }, + { + "epoch": 1.5276526751936588, + "grad_norm": 26.538253784179688, + "learning_rate": 8.016274915580753e-08, + "logits/chosen": 2.1079792976379395, + "logits/rejected": 2.124335289001465, + "logps/chosen": -211.16259765625, + "logps/rejected": -263.89190673828125, + "loss": 0.5333, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5972684621810913, + "rewards/margins": 0.5083234906196594, + "rewards/rejected": -2.1055920124053955, + "step": 4240 + }, + { + "epoch": 1.5312556296162854, + "grad_norm": 32.481163024902344, + "learning_rate": 7.901228767400858e-08, + "logits/chosen": 2.065208911895752, + "logits/rejected": 2.0380001068115234, + "logps/chosen": -196.34857177734375, + "logps/rejected": -219.82095336914062, + "loss": 0.6259, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4603230953216553, + "rewards/margins": 0.22774550318717957, + "rewards/rejected": -1.6880686283111572, + "step": 4250 + }, + { + "epoch": 1.534858584038912, + "grad_norm": 21.735862731933594, + "learning_rate": 7.786859001515195e-08, + "logits/chosen": 1.9773706197738647, + "logits/rejected": 2.003389835357666, + "logps/chosen": -187.12619018554688, + "logps/rejected": -237.146484375, + "loss": 0.5525, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.378788709640503, + "rewards/margins": 0.4737245440483093, + "rewards/rejected": -1.852513313293457, + "step": 4260 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 26.040084838867188, + "learning_rate": 7.673170142090075e-08, + "logits/chosen": 1.9840046167373657, + "logits/rejected": 2.0193214416503906, + "logps/chosen": -206.76101684570312, + "logps/rejected": -261.0201110839844, + "loss": 0.5485, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6018749475479126, + "rewards/margins": 0.5224472284317017, + "rewards/rejected": -2.1243224143981934, + "step": 4270 + }, + { + "epoch": 1.542064492884165, + "grad_norm": 31.891942977905273, + "learning_rate": 7.560166686356928e-08, + "logits/chosen": 1.8930866718292236, + "logits/rejected": 1.939308762550354, + "logps/chosen": -208.3704833984375, + "logps/rejected": -253.8648223876953, + "loss": 0.5499, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4735658168792725, + "rewards/margins": 0.44235682487487793, + "rewards/rejected": -1.9159224033355713, + "step": 4280 + }, + { + "epoch": 1.5456674473067915, + "grad_norm": 26.410696029663086, + "learning_rate": 7.447853104434438e-08, + "logits/chosen": 1.9173721075057983, + "logits/rejected": 1.9585317373275757, + "logps/chosen": -216.7724151611328, + "logps/rejected": -256.5429992675781, + "loss": 0.5841, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6213337182998657, + "rewards/margins": 0.3730444312095642, + "rewards/rejected": -1.9943780899047852, + "step": 4290 + }, + { + "epoch": 1.549270401729418, + "grad_norm": 27.23445701599121, + "learning_rate": 7.336233839151692e-08, + "logits/chosen": 2.2593255043029785, + "logits/rejected": 2.3057758808135986, + "logps/chosen": -213.1826934814453, + "logps/rejected": -269.57177734375, + "loss": 0.5471, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6645898818969727, + "rewards/margins": 0.5269854664802551, + "rewards/rejected": -2.191575527191162, + "step": 4300 + }, + { + "epoch": 1.5528733561520447, + "grad_norm": 25.672992706298828, + "learning_rate": 7.225313305872438e-08, + "logits/chosen": 2.176313877105713, + "logits/rejected": 2.2026896476745605, + "logps/chosen": -217.30252075195312, + "logps/rejected": -263.0103759765625, + "loss": 0.556, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6082637310028076, + "rewards/margins": 0.454536110162735, + "rewards/rejected": -2.0627999305725098, + "step": 4310 + }, + { + "epoch": 1.5564763105746713, + "grad_norm": 26.70905876159668, + "learning_rate": 7.115095892320455e-08, + "logits/chosen": 2.4024858474731445, + "logits/rejected": 2.4109010696411133, + "logps/chosen": -224.5513458251953, + "logps/rejected": -261.7074279785156, + "loss": 0.5765, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7521905899047852, + "rewards/margins": 0.3836340308189392, + "rewards/rejected": -2.135824680328369, + "step": 4320 + }, + { + "epoch": 1.5600792649972979, + "grad_norm": 22.603137969970703, + "learning_rate": 7.005585958405916e-08, + "logits/chosen": 2.3563637733459473, + "logits/rejected": 2.3754074573516846, + "logps/chosen": -221.39016723632812, + "logps/rejected": -266.6629638671875, + "loss": 0.5719, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7421430349349976, + "rewards/margins": 0.42091941833496094, + "rewards/rejected": -2.163062572479248, + "step": 4330 + }, + { + "epoch": 1.5636822194199245, + "grad_norm": 47.77539825439453, + "learning_rate": 6.896787836052992e-08, + "logits/chosen": 2.3739821910858154, + "logits/rejected": 2.3638415336608887, + "logps/chosen": -212.63833618164062, + "logps/rejected": -248.03408813476562, + "loss": 0.6251, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7163009643554688, + "rewards/margins": 0.32989609241485596, + "rewards/rejected": -2.0461974143981934, + "step": 4340 + }, + { + "epoch": 1.5672851738425508, + "grad_norm": 22.239276885986328, + "learning_rate": 6.788705829028482e-08, + "logits/chosen": 2.0460848808288574, + "logits/rejected": 2.074023485183716, + "logps/chosen": -205.8352508544922, + "logps/rejected": -244.93984985351562, + "loss": 0.5871, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4822351932525635, + "rewards/margins": 0.38295218348503113, + "rewards/rejected": -1.8651872873306274, + "step": 4350 + }, + { + "epoch": 1.5708881282651774, + "grad_norm": 28.040822982788086, + "learning_rate": 6.681344212771506e-08, + "logits/chosen": 2.334414005279541, + "logits/rejected": 2.349910020828247, + "logps/chosen": -198.33810424804688, + "logps/rejected": -238.9496307373047, + "loss": 0.6032, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5019086599349976, + "rewards/margins": 0.3424648642539978, + "rewards/rejected": -1.8443737030029297, + "step": 4360 + }, + { + "epoch": 1.574491082687804, + "grad_norm": 32.481414794921875, + "learning_rate": 6.574707234224466e-08, + "logits/chosen": 2.114077568054199, + "logits/rejected": 2.119075298309326, + "logps/chosen": -201.87498474121094, + "logps/rejected": -227.68698120117188, + "loss": 0.6725, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5666872262954712, + "rewards/margins": 0.22483928501605988, + "rewards/rejected": -1.791526436805725, + "step": 4370 + }, + { + "epoch": 1.5780940371104304, + "grad_norm": 22.595666885375977, + "learning_rate": 6.468799111665003e-08, + "logits/chosen": 2.0177810192108154, + "logits/rejected": 2.061736583709717, + "logps/chosen": -200.3575439453125, + "logps/rejected": -249.4969940185547, + "loss": 0.5581, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4850465059280396, + "rewards/margins": 0.43845659494400024, + "rewards/rejected": -1.9235031604766846, + "step": 4380 + }, + { + "epoch": 1.581696991533057, + "grad_norm": 33.4980354309082, + "learning_rate": 6.363624034539097e-08, + "logits/chosen": 2.0352072715759277, + "logits/rejected": 2.0526909828186035, + "logps/chosen": -211.1829833984375, + "logps/rejected": -253.79928588867188, + "loss": 0.5572, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5847043991088867, + "rewards/margins": 0.398946613073349, + "rewards/rejected": -1.9836509227752686, + "step": 4390 + }, + { + "epoch": 1.5852999459556836, + "grad_norm": 18.39958953857422, + "learning_rate": 6.259186163295438e-08, + "logits/chosen": 2.5003159046173096, + "logits/rejected": 2.5333242416381836, + "logps/chosen": -197.83883666992188, + "logps/rejected": -253.9361572265625, + "loss": 0.5233, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5224320888519287, + "rewards/margins": 0.5308869481086731, + "rewards/rejected": -2.053318977355957, + "step": 4400 + }, + { + "epoch": 1.5889029003783102, + "grad_norm": 30.835023880004883, + "learning_rate": 6.155489629220764e-08, + "logits/chosen": 2.2001023292541504, + "logits/rejected": 2.235380172729492, + "logps/chosen": -195.66595458984375, + "logps/rejected": -237.34512329101562, + "loss": 0.5547, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4750816822052002, + "rewards/margins": 0.4272097051143646, + "rewards/rejected": -1.9022915363311768, + "step": 4410 + }, + { + "epoch": 1.5925058548009368, + "grad_norm": 28.24887466430664, + "learning_rate": 6.052538534276477e-08, + "logits/chosen": 2.1940560340881348, + "logits/rejected": 2.260021686553955, + "logps/chosen": -194.35580444335938, + "logps/rejected": -256.4817810058594, + "loss": 0.5122, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4525573253631592, + "rewards/margins": 0.603963315486908, + "rewards/rejected": -2.056520700454712, + "step": 4420 + }, + { + "epoch": 1.5961088092235634, + "grad_norm": 28.577600479125977, + "learning_rate": 5.9503369509363774e-08, + "logits/chosen": 2.009065866470337, + "logits/rejected": 2.0373148918151855, + "logps/chosen": -209.2212677001953, + "logps/rejected": -250.5079803466797, + "loss": 0.5754, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.497430443763733, + "rewards/margins": 0.37567323446273804, + "rewards/rejected": -1.8731034994125366, + "step": 4430 + }, + { + "epoch": 1.59971176364619, + "grad_norm": 43.857566833496094, + "learning_rate": 5.848888922025552e-08, + "logits/chosen": 2.3759193420410156, + "logits/rejected": 2.396794319152832, + "logps/chosen": -204.43545532226562, + "logps/rejected": -258.35150146484375, + "loss": 0.5784, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5501829385757446, + "rewards/margins": 0.4931480288505554, + "rewards/rejected": -2.043330669403076, + "step": 4440 + }, + { + "epoch": 1.6033147180688165, + "grad_norm": 34.31081008911133, + "learning_rate": 5.748198460560475e-08, + "logits/chosen": 2.1870875358581543, + "logits/rejected": 2.210813522338867, + "logps/chosen": -226.90933227539062, + "logps/rejected": -270.5358581542969, + "loss": 0.6091, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7758804559707642, + "rewards/margins": 0.4144059121608734, + "rewards/rejected": -2.19028639793396, + "step": 4450 + }, + { + "epoch": 1.6069176724914431, + "grad_norm": 42.93614959716797, + "learning_rate": 5.648269549590232e-08, + "logits/chosen": 2.306990146636963, + "logits/rejected": 2.348219156265259, + "logps/chosen": -213.69424438476562, + "logps/rejected": -251.96255493164062, + "loss": 0.6047, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6652253866195679, + "rewards/margins": 0.3560051918029785, + "rewards/rejected": -2.021230697631836, + "step": 4460 + }, + { + "epoch": 1.6105206269140695, + "grad_norm": 28.934322357177734, + "learning_rate": 5.5491061420390174e-08, + "logits/chosen": 2.3856258392333984, + "logits/rejected": 2.384432315826416, + "logps/chosen": -220.46621704101562, + "logps/rejected": -264.6500549316406, + "loss": 0.5652, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7540113925933838, + "rewards/margins": 0.39775171875953674, + "rewards/rejected": -2.1517632007598877, + "step": 4470 + }, + { + "epoch": 1.614123581336696, + "grad_norm": 27.708112716674805, + "learning_rate": 5.4507121605496726e-08, + "logits/chosen": 2.4679114818573, + "logits/rejected": 2.505674362182617, + "logps/chosen": -178.9739227294922, + "logps/rejected": -243.4705810546875, + "loss": 0.5198, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.408857822418213, + "rewards/margins": 0.6008615493774414, + "rewards/rejected": -2.0097193717956543, + "step": 4480 + }, + { + "epoch": 1.6177265357593227, + "grad_norm": 43.39290237426758, + "learning_rate": 5.353091497328627e-08, + "logits/chosen": 2.1443047523498535, + "logits/rejected": 2.1748502254486084, + "logps/chosen": -196.84353637695312, + "logps/rejected": -246.3999481201172, + "loss": 0.5834, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5358701944351196, + "rewards/margins": 0.4669255316257477, + "rewards/rejected": -2.002795696258545, + "step": 4490 + }, + { + "epoch": 1.621329490181949, + "grad_norm": 35.661983489990234, + "learning_rate": 5.256248013991857e-08, + "logits/chosen": 2.0213568210601807, + "logits/rejected": 2.0241341590881348, + "logps/chosen": -203.96005249023438, + "logps/rejected": -242.9303741455078, + "loss": 0.6014, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4897011518478394, + "rewards/margins": 0.3591596186161041, + "rewards/rejected": -1.8488609790802002, + "step": 4500 + }, + { + "epoch": 1.6249324446045756, + "grad_norm": 21.74781608581543, + "learning_rate": 5.1601855414121295e-08, + "logits/chosen": 2.185598373413086, + "logits/rejected": 2.2083213329315186, + "logps/chosen": -185.43930053710938, + "logps/rejected": -227.32546997070312, + "loss": 0.5827, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.39814293384552, + "rewards/margins": 0.3859577178955078, + "rewards/rejected": -1.7841007709503174, + "step": 4510 + }, + { + "epoch": 1.6285353990272022, + "grad_norm": 21.03277587890625, + "learning_rate": 5.064907879567526e-08, + "logits/chosen": 2.1720356941223145, + "logits/rejected": 2.1843976974487305, + "logps/chosen": -183.34188842773438, + "logps/rejected": -229.08407592773438, + "loss": 0.55, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3856041431427002, + "rewards/margins": 0.4459781050682068, + "rewards/rejected": -1.8315823078155518, + "step": 4520 + }, + { + "epoch": 1.6321383534498288, + "grad_norm": 29.443302154541016, + "learning_rate": 4.9704187973910624e-08, + "logits/chosen": 2.2658417224884033, + "logits/rejected": 2.2824199199676514, + "logps/chosen": -208.009033203125, + "logps/rejected": -248.68954467773438, + "loss": 0.5848, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6338236331939697, + "rewards/margins": 0.3810350000858307, + "rewards/rejected": -2.0148587226867676, + "step": 4530 + }, + { + "epoch": 1.6357413078724554, + "grad_norm": 32.813499450683594, + "learning_rate": 4.87672203262163e-08, + "logits/chosen": 2.2142345905303955, + "logits/rejected": 2.2316298484802246, + "logps/chosen": -198.617431640625, + "logps/rejected": -236.8822784423828, + "loss": 0.5863, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5111747980117798, + "rewards/margins": 0.37488654255867004, + "rewards/rejected": -1.8860610723495483, + "step": 4540 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 70.49163818359375, + "learning_rate": 4.7838212916561285e-08, + "logits/chosen": 2.2292842864990234, + "logits/rejected": 2.255504608154297, + "logps/chosen": -216.76748657226562, + "logps/rejected": -242.6904296875, + "loss": 0.6585, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.6865644454956055, + "rewards/margins": 0.22057318687438965, + "rewards/rejected": -1.9071376323699951, + "step": 4550 + }, + { + "epoch": 1.6429472167177086, + "grad_norm": 26.967416763305664, + "learning_rate": 4.691720249402856e-08, + "logits/chosen": 1.9996013641357422, + "logits/rejected": 2.0293033123016357, + "logps/chosen": -189.83139038085938, + "logps/rejected": -233.23388671875, + "loss": 0.5648, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3463690280914307, + "rewards/margins": 0.4225892126560211, + "rewards/rejected": -1.768958330154419, + "step": 4560 + }, + { + "epoch": 1.6465501711403352, + "grad_norm": 19.056949615478516, + "learning_rate": 4.600422549136137e-08, + "logits/chosen": 2.1994102001190186, + "logits/rejected": 2.231595039367676, + "logps/chosen": -199.77468872070312, + "logps/rejected": -232.41598510742188, + "loss": 0.5955, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4999521970748901, + "rewards/margins": 0.3468267619609833, + "rewards/rejected": -1.8467791080474854, + "step": 4570 + }, + { + "epoch": 1.6501531255629618, + "grad_norm": 28.43031883239746, + "learning_rate": 4.50993180235221e-08, + "logits/chosen": 2.098292350769043, + "logits/rejected": 2.1165525913238525, + "logps/chosen": -199.96417236328125, + "logps/rejected": -235.9426727294922, + "loss": 0.5796, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.478283166885376, + "rewards/margins": 0.3656356930732727, + "rewards/rejected": -1.843918800354004, + "step": 4580 + }, + { + "epoch": 1.6537560799855882, + "grad_norm": 58.72772979736328, + "learning_rate": 4.4202515886263725e-08, + "logits/chosen": 2.0434346199035645, + "logits/rejected": 2.074063301086426, + "logps/chosen": -190.58383178710938, + "logps/rejected": -238.36990356445312, + "loss": 0.5493, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4896175861358643, + "rewards/margins": 0.4576866626739502, + "rewards/rejected": -1.947304368019104, + "step": 4590 + }, + { + "epoch": 1.6573590344082147, + "grad_norm": 36.73121643066406, + "learning_rate": 4.331385455471345e-08, + "logits/chosen": 2.244147539138794, + "logits/rejected": 2.270183801651001, + "logps/chosen": -186.93829345703125, + "logps/rejected": -219.9781951904297, + "loss": 0.6086, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.395939588546753, + "rewards/margins": 0.3456757068634033, + "rewards/rejected": -1.7416152954101562, + "step": 4600 + }, + { + "epoch": 1.6609619888308413, + "grad_norm": 30.627487182617188, + "learning_rate": 4.24333691819698e-08, + "logits/chosen": 2.131470203399658, + "logits/rejected": 2.138305187225342, + "logps/chosen": -183.6460418701172, + "logps/rejected": -226.09512329101562, + "loss": 0.557, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2923738956451416, + "rewards/margins": 0.4197824001312256, + "rewards/rejected": -1.7121562957763672, + "step": 4610 + }, + { + "epoch": 1.6645649432534677, + "grad_norm": 30.62818145751953, + "learning_rate": 4.156109459771215e-08, + "logits/chosen": 2.113499879837036, + "logits/rejected": 2.1298279762268066, + "logps/chosen": -193.1403045654297, + "logps/rejected": -234.14138793945312, + "loss": 0.5866, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3887965679168701, + "rewards/margins": 0.40222710371017456, + "rewards/rejected": -1.7910236120224, + "step": 4620 + }, + { + "epoch": 1.6681678976760943, + "grad_norm": 31.726381301879883, + "learning_rate": 4.069706530682232e-08, + "logits/chosen": 1.8225421905517578, + "logits/rejected": 1.8487510681152344, + "logps/chosen": -185.66934204101562, + "logps/rejected": -235.89761352539062, + "loss": 0.5964, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3873560428619385, + "rewards/margins": 0.4824633002281189, + "rewards/rejected": -1.8698192834854126, + "step": 4630 + }, + { + "epoch": 1.671770852098721, + "grad_norm": 30.31300163269043, + "learning_rate": 3.984131548802047e-08, + "logits/chosen": 2.002676248550415, + "logits/rejected": 2.043703556060791, + "logps/chosen": -178.9305877685547, + "logps/rejected": -227.54110717773438, + "loss": 0.5445, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3820874691009521, + "rewards/margins": 0.4529193937778473, + "rewards/rejected": -1.8350070714950562, + "step": 4640 + }, + { + "epoch": 1.6753738065213475, + "grad_norm": 42.443992614746094, + "learning_rate": 3.899387899251241e-08, + "logits/chosen": 2.156654119491577, + "logits/rejected": 2.172597646713257, + "logps/chosen": -218.76962280273438, + "logps/rejected": -264.9418640136719, + "loss": 0.5746, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7126739025115967, + "rewards/margins": 0.4262749254703522, + "rewards/rejected": -2.138948917388916, + "step": 4650 + }, + { + "epoch": 1.678976760943974, + "grad_norm": 19.931983947753906, + "learning_rate": 3.8154789342650955e-08, + "logits/chosen": 2.096226692199707, + "logits/rejected": 2.1523149013519287, + "logps/chosen": -206.48068237304688, + "logps/rejected": -258.7948913574219, + "loss": 0.5776, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7011810541152954, + "rewards/margins": 0.5019563436508179, + "rewards/rejected": -2.2031373977661133, + "step": 4660 + }, + { + "epoch": 1.6825797153666007, + "grad_norm": 25.14623260498047, + "learning_rate": 3.732407973060964e-08, + "logits/chosen": 2.0920228958129883, + "logits/rejected": 2.109365940093994, + "logps/chosen": -191.2772674560547, + "logps/rejected": -234.9695281982422, + "loss": 0.5615, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5086921453475952, + "rewards/margins": 0.4210229516029358, + "rewards/rejected": -1.9297151565551758, + "step": 4670 + }, + { + "epoch": 1.6861826697892273, + "grad_norm": 24.882339477539062, + "learning_rate": 3.6501783017069823e-08, + "logits/chosen": 2.0588390827178955, + "logits/rejected": 2.108018159866333, + "logps/chosen": -195.41616821289062, + "logps/rejected": -265.99078369140625, + "loss": 0.4962, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5025618076324463, + "rewards/margins": 0.675110936164856, + "rewards/rejected": -2.1776726245880127, + "step": 4680 + }, + { + "epoch": 1.6897856242118539, + "grad_norm": 42.46844482421875, + "learning_rate": 3.5687931729920825e-08, + "logits/chosen": 2.229187488555908, + "logits/rejected": 2.2838330268859863, + "logps/chosen": -199.95668029785156, + "logps/rejected": -246.3704376220703, + "loss": 0.5611, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4860082864761353, + "rewards/margins": 0.44565218687057495, + "rewards/rejected": -1.9316606521606445, + "step": 4690 + }, + { + "epoch": 1.6933885786344802, + "grad_norm": 31.575082778930664, + "learning_rate": 3.488255806297311e-08, + "logits/chosen": 2.2462501525878906, + "logits/rejected": 2.277681589126587, + "logps/chosen": -220.80087280273438, + "logps/rejected": -274.53033447265625, + "loss": 0.5585, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6551021337509155, + "rewards/margins": 0.5508934855461121, + "rewards/rejected": -2.205995559692383, + "step": 4700 + }, + { + "epoch": 1.6969915330571068, + "grad_norm": 37.62460708618164, + "learning_rate": 3.408569387468488e-08, + "logits/chosen": 2.237877130508423, + "logits/rejected": 2.329503059387207, + "logps/chosen": -186.99349975585938, + "logps/rejected": -279.9960632324219, + "loss": 0.4353, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4569637775421143, + "rewards/margins": 0.8750921487808228, + "rewards/rejected": -2.3320560455322266, + "step": 4710 + }, + { + "epoch": 1.7005944874797334, + "grad_norm": 34.484046936035156, + "learning_rate": 3.3297370686901834e-08, + "logits/chosen": 2.1620168685913086, + "logits/rejected": 2.177347421646118, + "logps/chosen": -207.050048828125, + "logps/rejected": -247.9182891845703, + "loss": 0.5654, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5944541692733765, + "rewards/margins": 0.38728809356689453, + "rewards/rejected": -1.9817421436309814, + "step": 4720 + }, + { + "epoch": 1.7041974419023598, + "grad_norm": 30.381084442138672, + "learning_rate": 3.2517619683610084e-08, + "logits/chosen": 2.3808302879333496, + "logits/rejected": 2.377898693084717, + "logps/chosen": -217.5401611328125, + "logps/rejected": -243.4801788330078, + "loss": 0.6114, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.6837667226791382, + "rewards/margins": 0.27860647439956665, + "rewards/rejected": -1.96237313747406, + "step": 4730 + }, + { + "epoch": 1.7078003963249864, + "grad_norm": 34.90840148925781, + "learning_rate": 3.174647170970296e-08, + "logits/chosen": 2.1793081760406494, + "logits/rejected": 2.217268466949463, + "logps/chosen": -209.4663848876953, + "logps/rejected": -256.37432861328125, + "loss": 0.5456, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.565969705581665, + "rewards/margins": 0.471188485622406, + "rewards/rejected": -2.037158250808716, + "step": 4740 + }, + { + "epoch": 1.711403350747613, + "grad_norm": 27.377452850341797, + "learning_rate": 3.0983957269760496e-08, + "logits/chosen": 2.2203779220581055, + "logits/rejected": 2.252307415008545, + "logps/chosen": -224.4615478515625, + "logps/rejected": -269.70343017578125, + "loss": 0.6179, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7553997039794922, + "rewards/margins": 0.43910399079322815, + "rewards/rejected": -2.1945037841796875, + "step": 4750 + }, + { + "epoch": 1.7150063051702396, + "grad_norm": 25.391387939453125, + "learning_rate": 3.023010652684277e-08, + "logits/chosen": 2.140564441680908, + "logits/rejected": 2.171532154083252, + "logps/chosen": -222.165771484375, + "logps/rejected": -273.32904052734375, + "loss": 0.5767, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7117284536361694, + "rewards/margins": 0.4794246554374695, + "rewards/rejected": -2.191153049468994, + "step": 4760 + }, + { + "epoch": 1.7186092595928661, + "grad_norm": 33.57196044921875, + "learning_rate": 2.9484949301297163e-08, + "logits/chosen": 2.471834897994995, + "logits/rejected": 2.542451858520508, + "logps/chosen": -205.19735717773438, + "logps/rejected": -266.74700927734375, + "loss": 0.5333, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.626854658126831, + "rewards/margins": 0.5961812734603882, + "rewards/rejected": -2.223036050796509, + "step": 4770 + }, + { + "epoch": 1.7222122140154927, + "grad_norm": 35.74752426147461, + "learning_rate": 2.874851506957815e-08, + "logits/chosen": 2.269533157348633, + "logits/rejected": 2.283372402191162, + "logps/chosen": -209.5435333251953, + "logps/rejected": -235.3410186767578, + "loss": 0.6232, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.60595703125, + "rewards/margins": 0.23182055354118347, + "rewards/rejected": -1.8377774953842163, + "step": 4780 + }, + { + "epoch": 1.7258151684381193, + "grad_norm": 34.54887771606445, + "learning_rate": 2.8020832963081774e-08, + "logits/chosen": 2.1971676349639893, + "logits/rejected": 2.2423644065856934, + "logps/chosen": -210.47946166992188, + "logps/rejected": -253.7831573486328, + "loss": 0.5701, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5512707233428955, + "rewards/margins": 0.4195014536380768, + "rewards/rejected": -1.97077214717865, + "step": 4790 + }, + { + "epoch": 1.729418122860746, + "grad_norm": 36.821964263916016, + "learning_rate": 2.7301931766992913e-08, + "logits/chosen": 2.4112257957458496, + "logits/rejected": 2.466730833053589, + "logps/chosen": -228.7645721435547, + "logps/rejected": -291.2528076171875, + "loss": 0.5122, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.857338547706604, + "rewards/margins": 0.5832494497299194, + "rewards/rejected": -2.4405877590179443, + "step": 4800 + }, + { + "epoch": 1.7330210772833725, + "grad_norm": 34.31934356689453, + "learning_rate": 2.659183991914696e-08, + "logits/chosen": 2.199960470199585, + "logits/rejected": 2.2429251670837402, + "logps/chosen": -218.9283905029297, + "logps/rejected": -259.98016357421875, + "loss": 0.5876, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.725171685218811, + "rewards/margins": 0.38315922021865845, + "rewards/rejected": -2.108330726623535, + "step": 4810 + }, + { + "epoch": 1.7366240317059989, + "grad_norm": 21.790250778198242, + "learning_rate": 2.5890585508904578e-08, + "logits/chosen": 2.387810230255127, + "logits/rejected": 2.4175963401794434, + "logps/chosen": -209.4477996826172, + "logps/rejected": -250.9538116455078, + "loss": 0.5913, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6332486867904663, + "rewards/margins": 0.3776470124721527, + "rewards/rejected": -2.0108959674835205, + "step": 4820 + }, + { + "epoch": 1.7402269861286255, + "grad_norm": 25.149259567260742, + "learning_rate": 2.519819627604078e-08, + "logits/chosen": 2.5020625591278076, + "logits/rejected": 2.5205273628234863, + "logps/chosen": -220.3013458251953, + "logps/rejected": -256.93267822265625, + "loss": 0.6108, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.761733055114746, + "rewards/margins": 0.36151832342147827, + "rewards/rejected": -2.123251438140869, + "step": 4830 + }, + { + "epoch": 1.743829940551252, + "grad_norm": 22.088537216186523, + "learning_rate": 2.4514699609647637e-08, + "logits/chosen": 2.372863292694092, + "logits/rejected": 2.4213290214538574, + "logps/chosen": -214.58407592773438, + "logps/rejected": -275.7231750488281, + "loss": 0.525, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.712545394897461, + "rewards/margins": 0.5982009172439575, + "rewards/rejected": -2.310746431350708, + "step": 4840 + }, + { + "epoch": 1.7474328949738784, + "grad_norm": 40.956947326660156, + "learning_rate": 2.3840122547050478e-08, + "logits/chosen": 2.3160908222198486, + "logits/rejected": 2.3820090293884277, + "logps/chosen": -228.3021697998047, + "logps/rejected": -283.82330322265625, + "loss": 0.5659, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8226938247680664, + "rewards/margins": 0.5088420510292053, + "rewards/rejected": -2.331535816192627, + "step": 4850 + }, + { + "epoch": 1.751035849396505, + "grad_norm": 25.419803619384766, + "learning_rate": 2.317449177273889e-08, + "logits/chosen": 2.342552661895752, + "logits/rejected": 2.3440239429473877, + "logps/chosen": -203.36520385742188, + "logps/rejected": -241.7134552001953, + "loss": 0.6071, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.56822669506073, + "rewards/margins": 0.3357328176498413, + "rewards/rejected": -1.9039596319198608, + "step": 4860 + }, + { + "epoch": 1.7546388038191316, + "grad_norm": 58.29960250854492, + "learning_rate": 2.2517833617310855e-08, + "logits/chosen": 2.3843963146209717, + "logits/rejected": 2.4130828380584717, + "logps/chosen": -209.44369506835938, + "logps/rejected": -253.1976776123047, + "loss": 0.5787, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.635347604751587, + "rewards/margins": 0.4348020553588867, + "rewards/rejected": -2.0701496601104736, + "step": 4870 + }, + { + "epoch": 1.7582417582417582, + "grad_norm": 30.26983070373535, + "learning_rate": 2.1870174056430962e-08, + "logits/chosen": 2.278834819793701, + "logits/rejected": 2.3181471824645996, + "logps/chosen": -221.67153930664062, + "logps/rejected": -263.6853942871094, + "loss": 0.5966, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7323360443115234, + "rewards/margins": 0.3889417052268982, + "rewards/rejected": -2.1212778091430664, + "step": 4880 + }, + { + "epoch": 1.7618447126643848, + "grad_norm": 30.307025909423828, + "learning_rate": 2.1231538709803488e-08, + "logits/chosen": 2.2428359985351562, + "logits/rejected": 2.2310616970062256, + "logps/chosen": -211.75650024414062, + "logps/rejected": -231.0408935546875, + "loss": 0.6407, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.603784203529358, + "rewards/margins": 0.200571209192276, + "rewards/rejected": -1.8043553829193115, + "step": 4890 + }, + { + "epoch": 1.7654476670870114, + "grad_norm": 33.48064422607422, + "learning_rate": 2.0601952840158364e-08, + "logits/chosen": 2.2225899696350098, + "logits/rejected": 2.257615566253662, + "logps/chosen": -206.1726531982422, + "logps/rejected": -253.6875, + "loss": 0.5507, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.616708517074585, + "rewards/margins": 0.46390867233276367, + "rewards/rejected": -2.0806171894073486, + "step": 4900 + }, + { + "epoch": 1.769050621509638, + "grad_norm": 34.95090866088867, + "learning_rate": 1.9981441352252187e-08, + "logits/chosen": 2.3539671897888184, + "logits/rejected": 2.405588150024414, + "logps/chosen": -202.20733642578125, + "logps/rejected": -255.64749145507812, + "loss": 0.5593, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6130344867706299, + "rewards/margins": 0.5116347670555115, + "rewards/rejected": -2.124669313430786, + "step": 4910 + }, + { + "epoch": 1.7726535759322646, + "grad_norm": 64.15029907226562, + "learning_rate": 1.9370028791882847e-08, + "logits/chosen": 2.24306583404541, + "logits/rejected": 2.266782283782959, + "logps/chosen": -221.64199829101562, + "logps/rejected": -263.9620056152344, + "loss": 0.574, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7340548038482666, + "rewards/margins": 0.4460780620574951, + "rewards/rejected": -2.1801326274871826, + "step": 4920 + }, + { + "epoch": 1.7762565303548912, + "grad_norm": 40.552127838134766, + "learning_rate": 1.8767739344918737e-08, + "logits/chosen": 2.121140956878662, + "logits/rejected": 2.1537022590637207, + "logps/chosen": -209.78128051757812, + "logps/rejected": -248.19454956054688, + "loss": 0.5906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5313321352005005, + "rewards/margins": 0.3886438012123108, + "rewards/rejected": -1.9199758768081665, + "step": 4930 + }, + { + "epoch": 1.7798594847775175, + "grad_norm": 39.017791748046875, + "learning_rate": 1.8174596836341928e-08, + "logits/chosen": 2.2085771560668945, + "logits/rejected": 2.248157024383545, + "logps/chosen": -212.8624725341797, + "logps/rejected": -260.98992919921875, + "loss": 0.581, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.622436761856079, + "rewards/margins": 0.4501896798610687, + "rewards/rejected": -2.0726265907287598, + "step": 4940 + }, + { + "epoch": 1.7834624392001441, + "grad_norm": 35.74030303955078, + "learning_rate": 1.75906247293057e-08, + "logits/chosen": 2.171400547027588, + "logits/rejected": 2.202230215072632, + "logps/chosen": -209.862060546875, + "logps/rejected": -261.1230773925781, + "loss": 0.5805, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5814688205718994, + "rewards/margins": 0.5131391286849976, + "rewards/rejected": -2.0946078300476074, + "step": 4950 + }, + { + "epoch": 1.7870653936227707, + "grad_norm": 18.447336196899414, + "learning_rate": 1.7015846124206535e-08, + "logits/chosen": 2.213371753692627, + "logits/rejected": 2.2472574710845947, + "logps/chosen": -217.60348510742188, + "logps/rejected": -272.07012939453125, + "loss": 0.5278, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7204748392105103, + "rewards/margins": 0.5277159810066223, + "rewards/rejected": -2.2481908798217773, + "step": 4960 + }, + { + "epoch": 1.790668348045397, + "grad_norm": 20.83319091796875, + "learning_rate": 1.6450283757770077e-08, + "logits/chosen": 2.417060375213623, + "logits/rejected": 2.4168012142181396, + "logps/chosen": -191.89654541015625, + "logps/rejected": -241.69448852539062, + "loss": 0.5303, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4415607452392578, + "rewards/margins": 0.49853330850601196, + "rewards/rejected": -1.940093994140625, + "step": 4970 + }, + { + "epoch": 1.7942713024680237, + "grad_norm": 41.57540512084961, + "learning_rate": 1.58939600021519e-08, + "logits/chosen": 2.1984307765960693, + "logits/rejected": 2.2478766441345215, + "logps/chosen": -198.91310119628906, + "logps/rejected": -249.40933227539062, + "loss": 0.5479, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5529348850250244, + "rewards/margins": 0.47851094603538513, + "rewards/rejected": -2.0314459800720215, + "step": 4980 + }, + { + "epoch": 1.7978742568906503, + "grad_norm": 28.625106811523438, + "learning_rate": 1.5346896864052716e-08, + "logits/chosen": 2.232046127319336, + "logits/rejected": 2.2994542121887207, + "logps/chosen": -210.97714233398438, + "logps/rejected": -279.3887023925781, + "loss": 0.5097, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6779661178588867, + "rewards/margins": 0.6225734949111938, + "rewards/rejected": -2.300539493560791, + "step": 4990 + }, + { + "epoch": 1.8014772113132769, + "grad_norm": 26.74551010131836, + "learning_rate": 1.4809115983847265e-08, + "logits/chosen": 2.254472255706787, + "logits/rejected": 2.2978768348693848, + "logps/chosen": -209.3938751220703, + "logps/rejected": -251.89694213867188, + "loss": 0.5982, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6444065570831299, + "rewards/margins": 0.393837034702301, + "rewards/rejected": -2.038243532180786, + "step": 5000 + }, + { + "epoch": 1.8050801657359035, + "grad_norm": 25.60332489013672, + "learning_rate": 1.4280638634728948e-08, + "logits/chosen": 2.3615641593933105, + "logits/rejected": 2.374450206756592, + "logps/chosen": -217.30313110351562, + "logps/rejected": -250.3795623779297, + "loss": 0.6175, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7042795419692993, + "rewards/margins": 0.3368242084980011, + "rewards/rejected": -2.0411038398742676, + "step": 5010 + }, + { + "epoch": 1.80868312015853, + "grad_norm": 58.32925796508789, + "learning_rate": 1.3761485721867971e-08, + "logits/chosen": 2.223480463027954, + "logits/rejected": 2.2535018920898438, + "logps/chosen": -213.029541015625, + "logps/rejected": -260.0022277832031, + "loss": 0.5707, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.604480504989624, + "rewards/margins": 0.4418935179710388, + "rewards/rejected": -2.0463738441467285, + "step": 5020 + }, + { + "epoch": 1.8122860745811566, + "grad_norm": 28.643110275268555, + "learning_rate": 1.3251677781584175e-08, + "logits/chosen": 2.3958420753479004, + "logits/rejected": 2.4055347442626953, + "logps/chosen": -219.23513793945312, + "logps/rejected": -259.5078125, + "loss": 0.5868, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7029615640640259, + "rewards/margins": 0.3803695738315582, + "rewards/rejected": -2.0833308696746826, + "step": 5030 + }, + { + "epoch": 1.8158890290037832, + "grad_norm": 23.900922775268555, + "learning_rate": 1.2751234980535318e-08, + "logits/chosen": 2.322385311126709, + "logits/rejected": 2.367161512374878, + "logps/chosen": -218.500244140625, + "logps/rejected": -268.5885314941406, + "loss": 0.5338, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6854898929595947, + "rewards/margins": 0.4811466336250305, + "rewards/rejected": -2.1666367053985596, + "step": 5040 + }, + { + "epoch": 1.8194919834264096, + "grad_norm": 21.290176391601562, + "learning_rate": 1.2260177114918668e-08, + "logits/chosen": 2.3270578384399414, + "logits/rejected": 2.421325445175171, + "logps/chosen": -185.88186645507812, + "logps/rejected": -258.69708251953125, + "loss": 0.4842, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4692976474761963, + "rewards/margins": 0.7032708525657654, + "rewards/rejected": -2.1725687980651855, + "step": 5050 + }, + { + "epoch": 1.8230949378490362, + "grad_norm": 31.082626342773438, + "learning_rate": 1.1778523609688313e-08, + "logits/chosen": 2.322352647781372, + "logits/rejected": 2.3377273082733154, + "logps/chosen": -198.61251831054688, + "logps/rejected": -242.9839630126953, + "loss": 0.5889, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.474416971206665, + "rewards/margins": 0.43179792165756226, + "rewards/rejected": -1.9062144756317139, + "step": 5060 + }, + { + "epoch": 1.8266978922716628, + "grad_norm": 35.41000747680664, + "learning_rate": 1.1306293517786613e-08, + "logits/chosen": 2.469616413116455, + "logits/rejected": 2.534945487976074, + "logps/chosen": -222.2001953125, + "logps/rejected": -296.61187744140625, + "loss": 0.5151, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8318326473236084, + "rewards/margins": 0.7023247480392456, + "rewards/rejected": -2.5341572761535645, + "step": 5070 + }, + { + "epoch": 1.8303008466942892, + "grad_norm": 17.69406509399414, + "learning_rate": 1.0843505519390588e-08, + "logits/chosen": 2.348323345184326, + "logits/rejected": 2.386146306991577, + "logps/chosen": -209.5731658935547, + "logps/rejected": -267.9241943359375, + "loss": 0.572, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6910864114761353, + "rewards/margins": 0.5442907214164734, + "rewards/rejected": -2.2353768348693848, + "step": 5080 + }, + { + "epoch": 1.8339038011169158, + "grad_norm": 56.48704528808594, + "learning_rate": 1.039017792117286e-08, + "logits/chosen": 2.2373058795928955, + "logits/rejected": 2.278470993041992, + "logps/chosen": -213.70553588867188, + "logps/rejected": -262.6347961425781, + "loss": 0.5841, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7332690954208374, + "rewards/margins": 0.4511147439479828, + "rewards/rejected": -2.1843838691711426, + "step": 5090 + }, + { + "epoch": 1.8375067555395423, + "grad_norm": 25.610595703125, + "learning_rate": 9.946328655577624e-09, + "logits/chosen": 2.4761364459991455, + "logits/rejected": 2.5152907371520996, + "logps/chosen": -200.73094177246094, + "logps/rejected": -257.61474609375, + "loss": 0.5157, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5974736213684082, + "rewards/margins": 0.5294596552848816, + "rewards/rejected": -2.1269335746765137, + "step": 5100 + }, + { + "epoch": 1.841109709962169, + "grad_norm": 32.335365295410156, + "learning_rate": 9.511975280111329e-09, + "logits/chosen": 2.2728469371795654, + "logits/rejected": 2.3433430194854736, + "logps/chosen": -205.5606231689453, + "logps/rejected": -245.2855224609375, + "loss": 0.5799, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5625890493392944, + "rewards/margins": 0.39551809430122375, + "rewards/rejected": -1.9581069946289062, + "step": 5110 + }, + { + "epoch": 1.8447126643847955, + "grad_norm": 25.807748794555664, + "learning_rate": 9.087134976647815e-09, + "logits/chosen": 2.431424379348755, + "logits/rejected": 2.4303934574127197, + "logps/chosen": -213.15231323242188, + "logps/rejected": -237.0194854736328, + "loss": 0.6707, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.705770492553711, + "rewards/margins": 0.2457417994737625, + "rewards/rejected": -1.951512336730957, + "step": 5120 + }, + { + "epoch": 1.8483156188074221, + "grad_norm": 28.861064910888672, + "learning_rate": 8.671824550749164e-09, + "logits/chosen": 2.2334659099578857, + "logits/rejected": 2.2756643295288086, + "logps/chosen": -218.04867553710938, + "logps/rejected": -268.18658447265625, + "loss": 0.542, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6735773086547852, + "rewards/margins": 0.47903457283973694, + "rewards/rejected": -2.15261173248291, + "step": 5130 + }, + { + "epoch": 1.8519185732300487, + "grad_norm": 40.80805969238281, + "learning_rate": 8.266060431000448e-09, + "logits/chosen": 2.0608906745910645, + "logits/rejected": 2.1089887619018555, + "logps/chosen": -228.1378936767578, + "logps/rejected": -280.6064147949219, + "loss": 0.5458, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8084704875946045, + "rewards/margins": 0.4998833239078522, + "rewards/rejected": -2.308354139328003, + "step": 5140 + }, + { + "epoch": 1.8555215276526753, + "grad_norm": 19.78631591796875, + "learning_rate": 7.86985866836004e-09, + "logits/chosen": 2.2697739601135254, + "logits/rejected": 2.2894599437713623, + "logps/chosen": -208.9029998779297, + "logps/rejected": -248.81808471679688, + "loss": 0.5753, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5741163492202759, + "rewards/margins": 0.4058263301849365, + "rewards/rejected": -1.9799423217773438, + "step": 5150 + }, + { + "epoch": 1.859124482075302, + "grad_norm": 23.30661964416504, + "learning_rate": 7.483234935524802e-09, + "logits/chosen": 2.202749729156494, + "logits/rejected": 2.246129274368286, + "logps/chosen": -207.0511016845703, + "logps/rejected": -258.9788818359375, + "loss": 0.5434, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6408188343048096, + "rewards/margins": 0.5060772895812988, + "rewards/rejected": -2.1468958854675293, + "step": 5160 + }, + { + "epoch": 1.8627274364979283, + "grad_norm": 23.1809024810791, + "learning_rate": 7.106204526309944e-09, + "logits/chosen": 2.3511722087860107, + "logits/rejected": 2.3740787506103516, + "logps/chosen": -203.56642150878906, + "logps/rejected": -248.69482421875, + "loss": 0.5664, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5973598957061768, + "rewards/margins": 0.43096810579299927, + "rewards/rejected": -2.0283281803131104, + "step": 5170 + }, + { + "epoch": 1.8663303909205549, + "grad_norm": 25.39525032043457, + "learning_rate": 6.738782355044048e-09, + "logits/chosen": 2.307525157928467, + "logits/rejected": 2.318244695663452, + "logps/chosen": -210.04916381835938, + "logps/rejected": -256.23248291015625, + "loss": 0.561, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.671134352684021, + "rewards/margins": 0.44288238883018494, + "rewards/rejected": -2.1140167713165283, + "step": 5180 + }, + { + "epoch": 1.8699333453431815, + "grad_norm": 34.74847412109375, + "learning_rate": 6.380982955979192e-09, + "logits/chosen": 2.2530720233917236, + "logits/rejected": 2.2938835620880127, + "logps/chosen": -206.26876831054688, + "logps/rejected": -263.15106201171875, + "loss": 0.549, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.588684320449829, + "rewards/margins": 0.5294872522354126, + "rewards/rejected": -2.118171453475952, + "step": 5190 + }, + { + "epoch": 1.8735362997658078, + "grad_norm": 41.298126220703125, + "learning_rate": 6.032820482716e-09, + "logits/chosen": 2.365787982940674, + "logits/rejected": 2.3662896156311035, + "logps/chosen": -184.18885803222656, + "logps/rejected": -222.0736541748047, + "loss": 0.5642, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4184465408325195, + "rewards/margins": 0.386150062084198, + "rewards/rejected": -1.8045965433120728, + "step": 5200 + }, + { + "epoch": 1.8771392541884344, + "grad_norm": 18.131206512451172, + "learning_rate": 5.694308707643619e-09, + "logits/chosen": 2.307661294937134, + "logits/rejected": 2.3559043407440186, + "logps/chosen": -202.79563903808594, + "logps/rejected": -245.22146606445312, + "loss": 0.563, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5787583589553833, + "rewards/margins": 0.4313790202140808, + "rewards/rejected": -2.0101375579833984, + "step": 5210 + }, + { + "epoch": 1.880742208611061, + "grad_norm": 27.24417495727539, + "learning_rate": 5.365461021395095e-09, + "logits/chosen": 2.4612069129943848, + "logits/rejected": 2.503018379211426, + "logps/chosen": -223.376708984375, + "logps/rejected": -276.1731872558594, + "loss": 0.5456, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7949035167694092, + "rewards/margins": 0.49945777654647827, + "rewards/rejected": -2.2943613529205322, + "step": 5220 + }, + { + "epoch": 1.8843451630336876, + "grad_norm": 25.800756454467773, + "learning_rate": 5.046290432317663e-09, + "logits/chosen": 2.2512588500976562, + "logits/rejected": 2.3021240234375, + "logps/chosen": -230.3580322265625, + "logps/rejected": -296.2758483886719, + "loss": 0.5168, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.814835786819458, + "rewards/margins": 0.640101969242096, + "rewards/rejected": -2.454937696456909, + "step": 5230 + }, + { + "epoch": 1.8879481174563142, + "grad_norm": 30.53504753112793, + "learning_rate": 4.736809565958011e-09, + "logits/chosen": 2.399794340133667, + "logits/rejected": 2.405836582183838, + "logps/chosen": -200.7362060546875, + "logps/rejected": -243.2240753173828, + "loss": 0.6108, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5796430110931396, + "rewards/margins": 0.4164234697818756, + "rewards/rejected": -1.996066689491272, + "step": 5240 + }, + { + "epoch": 1.8915510718789408, + "grad_norm": 39.20377731323242, + "learning_rate": 4.437030664562968e-09, + "logits/chosen": 2.1366195678710938, + "logits/rejected": 2.1596555709838867, + "logps/chosen": -233.42648315429688, + "logps/rejected": -277.30462646484375, + "loss": 0.5978, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8780971765518188, + "rewards/margins": 0.41063255071640015, + "rewards/rejected": -2.288729667663574, + "step": 5250 + }, + { + "epoch": 1.8951540263015674, + "grad_norm": 39.66998291015625, + "learning_rate": 4.14696558659533e-09, + "logits/chosen": 2.115039110183716, + "logits/rejected": 2.146724224090576, + "logps/chosen": -214.2760009765625, + "logps/rejected": -255.70059204101562, + "loss": 0.5649, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5169379711151123, + "rewards/margins": 0.4280829429626465, + "rewards/rejected": -1.9450209140777588, + "step": 5260 + }, + { + "epoch": 1.898756980724194, + "grad_norm": 23.76442527770996, + "learning_rate": 3.8666258062645116e-09, + "logits/chosen": 2.3175015449523926, + "logits/rejected": 2.3244643211364746, + "logps/chosen": -207.5288848876953, + "logps/rejected": -244.6178741455078, + "loss": 0.5804, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5870716571807861, + "rewards/margins": 0.3724648356437683, + "rewards/rejected": -1.9595365524291992, + "step": 5270 + }, + { + "epoch": 1.9023599351468206, + "grad_norm": 27.673847198486328, + "learning_rate": 3.5960224130728858e-09, + "logits/chosen": 2.3952476978302, + "logits/rejected": 2.428849458694458, + "logps/chosen": -227.6666259765625, + "logps/rejected": -280.7477722167969, + "loss": 0.5494, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7906230688095093, + "rewards/margins": 0.5157279372215271, + "rewards/rejected": -2.3063511848449707, + "step": 5280 + }, + { + "epoch": 1.905962889569447, + "grad_norm": 34.72603988647461, + "learning_rate": 3.3351661113769914e-09, + "logits/chosen": 2.1342246532440186, + "logits/rejected": 2.1574902534484863, + "logps/chosen": -207.9198455810547, + "logps/rejected": -265.78875732421875, + "loss": 0.531, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4568283557891846, + "rewards/margins": 0.5453433990478516, + "rewards/rejected": -2.002171754837036, + "step": 5290 + }, + { + "epoch": 1.9095658439920735, + "grad_norm": 22.57170867919922, + "learning_rate": 3.0840672199641815e-09, + "logits/chosen": 2.3103671073913574, + "logits/rejected": 2.350656509399414, + "logps/chosen": -206.5009765625, + "logps/rejected": -256.1815185546875, + "loss": 0.5614, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5754340887069702, + "rewards/margins": 0.4603274464607239, + "rewards/rejected": -2.035761594772339, + "step": 5300 + }, + { + "epoch": 1.9131687984147, + "grad_norm": 24.769725799560547, + "learning_rate": 2.842735671644336e-09, + "logits/chosen": 2.4594054222106934, + "logits/rejected": 2.4655635356903076, + "logps/chosen": -198.40814208984375, + "logps/rejected": -242.697021484375, + "loss": 0.5656, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4905879497528076, + "rewards/margins": 0.4295724928379059, + "rewards/rejected": -1.9201602935791016, + "step": 5310 + }, + { + "epoch": 1.9167717528373265, + "grad_norm": 40.51364517211914, + "learning_rate": 2.6111810128570386e-09, + "logits/chosen": 2.204953908920288, + "logits/rejected": 2.233264207839966, + "logps/chosen": -214.0225830078125, + "logps/rejected": -269.80133056640625, + "loss": 0.5394, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.673193335533142, + "rewards/margins": 0.5253087282180786, + "rewards/rejected": -2.1985020637512207, + "step": 5320 + }, + { + "epoch": 1.920374707259953, + "grad_norm": 25.24776840209961, + "learning_rate": 2.38941240329385e-09, + "logits/chosen": 2.290570020675659, + "logits/rejected": 2.324573040008545, + "logps/chosen": -220.32772827148438, + "logps/rejected": -272.10137939453125, + "loss": 0.5699, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7210849523544312, + "rewards/margins": 0.48146161437034607, + "rewards/rejected": -2.2025465965270996, + "step": 5330 + }, + { + "epoch": 1.9239776616825797, + "grad_norm": 31.25950813293457, + "learning_rate": 2.1774386155361537e-09, + "logits/chosen": 2.2316412925720215, + "logits/rejected": 2.26287841796875, + "logps/chosen": -199.40396118164062, + "logps/rejected": -251.3515625, + "loss": 0.5369, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.561570405960083, + "rewards/margins": 0.49641767144203186, + "rewards/rejected": -2.057988166809082, + "step": 5340 + }, + { + "epoch": 1.9275806161052063, + "grad_norm": 44.36783981323242, + "learning_rate": 1.9752680347078774e-09, + "logits/chosen": 2.3788223266601562, + "logits/rejected": 2.3906877040863037, + "logps/chosen": -230.75460815429688, + "logps/rejected": -258.0755310058594, + "loss": 0.6261, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.8339112997055054, + "rewards/margins": 0.23464472591876984, + "rewards/rejected": -2.0685558319091797, + "step": 5350 + }, + { + "epoch": 1.9311835705278328, + "grad_norm": 28.646312713623047, + "learning_rate": 1.7829086581440667e-09, + "logits/chosen": 2.3322763442993164, + "logits/rejected": 2.3718745708465576, + "logps/chosen": -203.08871459960938, + "logps/rejected": -251.14956665039062, + "loss": 0.5678, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5246843099594116, + "rewards/margins": 0.44258731603622437, + "rewards/rejected": -1.9672715663909912, + "step": 5360 + }, + { + "epoch": 1.9347865249504594, + "grad_norm": 26.19532012939453, + "learning_rate": 1.6003680950742726e-09, + "logits/chosen": 2.258373260498047, + "logits/rejected": 2.3165993690490723, + "logps/chosen": -203.53799438476562, + "logps/rejected": -256.0785827636719, + "loss": 0.5388, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5047380924224854, + "rewards/margins": 0.5257495641708374, + "rewards/rejected": -2.030487537384033, + "step": 5370 + }, + { + "epoch": 1.938389479373086, + "grad_norm": 26.616071701049805, + "learning_rate": 1.4276535663217682e-09, + "logits/chosen": 2.248194456100464, + "logits/rejected": 2.282585620880127, + "logps/chosen": -198.4256591796875, + "logps/rejected": -242.58901977539062, + "loss": 0.5664, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4669153690338135, + "rewards/margins": 0.4206790328025818, + "rewards/rejected": -1.8875945806503296, + "step": 5380 + }, + { + "epoch": 1.9419924337957126, + "grad_norm": 31.801549911499023, + "learning_rate": 1.264771904017803e-09, + "logits/chosen": 2.246044635772705, + "logits/rejected": 2.3059990406036377, + "logps/chosen": -221.56613159179688, + "logps/rejected": -281.8612365722656, + "loss": 0.5373, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7499691247940063, + "rewards/margins": 0.5678530931472778, + "rewards/rejected": -2.317822217941284, + "step": 5390 + }, + { + "epoch": 1.945595388218339, + "grad_norm": 43.92122268676758, + "learning_rate": 1.1117295513313473e-09, + "logits/chosen": 2.4421546459198, + "logits/rejected": 2.4628496170043945, + "logps/chosen": -227.49951171875, + "logps/rejected": -257.37713623046875, + "loss": 0.6306, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7916418313980103, + "rewards/margins": 0.3117103576660156, + "rewards/rejected": -2.1033523082733154, + "step": 5400 + }, + { + "epoch": 1.9491983426409656, + "grad_norm": 31.931947708129883, + "learning_rate": 9.685325622142692e-10, + "logits/chosen": 2.235248327255249, + "logits/rejected": 2.2608213424682617, + "logps/chosen": -201.73275756835938, + "logps/rejected": -240.254150390625, + "loss": 0.5679, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.563132405281067, + "rewards/margins": 0.3913213610649109, + "rewards/rejected": -1.9544538259506226, + "step": 5410 + }, + { + "epoch": 1.9528012970635922, + "grad_norm": 55.04080581665039, + "learning_rate": 8.351866011617748e-10, + "logits/chosen": 2.243534564971924, + "logits/rejected": 2.2653114795684814, + "logps/chosen": -208.999755859375, + "logps/rejected": -250.69638061523438, + "loss": 0.5978, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5816562175750732, + "rewards/margins": 0.3955465853214264, + "rewards/rejected": -1.9772027730941772, + "step": 5420 + }, + { + "epoch": 1.9564042514862185, + "grad_norm": 26.250028610229492, + "learning_rate": 7.116969429883934e-10, + "logits/chosen": 2.1618504524230957, + "logits/rejected": 2.240119457244873, + "logps/chosen": -201.18685913085938, + "logps/rejected": -257.50616455078125, + "loss": 0.5461, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.609142541885376, + "rewards/margins": 0.511391818523407, + "rewards/rejected": -2.1205344200134277, + "step": 5430 + }, + { + "epoch": 1.9600072059088451, + "grad_norm": 30.596418380737305, + "learning_rate": 5.980684726193397e-10, + "logits/chosen": 2.4325242042541504, + "logits/rejected": 2.4933700561523438, + "logps/chosen": -200.2987823486328, + "logps/rejected": -264.40252685546875, + "loss": 0.5397, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5802122354507446, + "rewards/margins": 0.6055213212966919, + "rewards/rejected": -2.1857335567474365, + "step": 5440 + }, + { + "epoch": 1.9636101603314717, + "grad_norm": 24.75813865661621, + "learning_rate": 4.943056848972226e-10, + "logits/chosen": 2.3395333290100098, + "logits/rejected": 2.367810010910034, + "logps/chosen": -192.31149291992188, + "logps/rejected": -250.6825408935547, + "loss": 0.5491, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5191051959991455, + "rewards/margins": 0.5737230777740479, + "rewards/rejected": -2.0928282737731934, + "step": 5450 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 38.39195251464844, + "learning_rate": 4.0041268440424434e-10, + "logits/chosen": 2.3370611667633057, + "logits/rejected": 2.3738551139831543, + "logps/chosen": -219.6514892578125, + "logps/rejected": -255.0663299560547, + "loss": 0.6021, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.6766411066055298, + "rewards/margins": 0.3405107855796814, + "rewards/rejected": -2.0171518325805664, + "step": 5460 + }, + { + "epoch": 1.970816069176725, + "grad_norm": 33.038150787353516, + "learning_rate": 3.163931852998569e-10, + "logits/chosen": 2.4940686225891113, + "logits/rejected": 2.526848316192627, + "logps/chosen": -223.1959228515625, + "logps/rejected": -278.0130920410156, + "loss": 0.5404, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7955191135406494, + "rewards/margins": 0.5243655443191528, + "rewards/rejected": -2.319884777069092, + "step": 5470 + }, + { + "epoch": 1.9744190235993515, + "grad_norm": 56.82768630981445, + "learning_rate": 2.4225051117390817e-10, + "logits/chosen": 2.2985916137695312, + "logits/rejected": 2.349461317062378, + "logps/chosen": -239.0338134765625, + "logps/rejected": -290.59857177734375, + "loss": 0.5749, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9805402755737305, + "rewards/margins": 0.48385435342788696, + "rewards/rejected": -2.4643945693969727, + "step": 5480 + }, + { + "epoch": 1.978021978021978, + "grad_norm": 32.097084045410156, + "learning_rate": 1.779875949149967e-10, + "logits/chosen": 2.401785373687744, + "logits/rejected": 2.4758224487304688, + "logps/chosen": -204.2624969482422, + "logps/rejected": -258.7240905761719, + "loss": 0.5555, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6019967794418335, + "rewards/margins": 0.47049999237060547, + "rewards/rejected": -2.0724966526031494, + "step": 5490 + }, + { + "epoch": 1.9816249324446047, + "grad_norm": 29.596235275268555, + "learning_rate": 1.2360697859462033e-10, + "logits/chosen": 2.2443838119506836, + "logits/rejected": 2.2928102016448975, + "logps/chosen": -213.0929412841797, + "logps/rejected": -258.2647705078125, + "loss": 0.5449, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6028426885604858, + "rewards/margins": 0.44829025864601135, + "rewards/rejected": -2.051133155822754, + "step": 5500 + }, + { + "epoch": 1.9852278868672313, + "grad_norm": 29.45462989807129, + "learning_rate": 7.911081336656189e-11, + "logits/chosen": 2.351747512817383, + "logits/rejected": 2.353079319000244, + "logps/chosen": -203.29656982421875, + "logps/rejected": -239.415771484375, + "loss": 0.6046, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.591109037399292, + "rewards/margins": 0.346746027469635, + "rewards/rejected": -1.9378551244735718, + "step": 5510 + }, + { + "epoch": 1.9888308412898577, + "grad_norm": 50.27937316894531, + "learning_rate": 4.4500859381707553e-11, + "logits/chosen": 2.1259727478027344, + "logits/rejected": 2.1634602546691895, + "logps/chosen": -221.07583618164062, + "logps/rejected": -272.86737060546875, + "loss": 0.5746, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6605325937271118, + "rewards/margins": 0.4804542660713196, + "rewards/rejected": -2.140986919403076, + "step": 5520 + }, + { + "epoch": 1.9924337957124842, + "grad_norm": 29.29747772216797, + "learning_rate": 1.9778485718630056e-11, + "logits/chosen": 2.165703058242798, + "logits/rejected": 2.1946606636047363, + "logps/chosen": -227.1041259765625, + "logps/rejected": -273.04376220703125, + "loss": 0.5728, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7531017065048218, + "rewards/margins": 0.45691174268722534, + "rewards/rejected": -2.2100133895874023, + "step": 5530 + }, + { + "epoch": 1.9960367501351108, + "grad_norm": 27.07058334350586, + "learning_rate": 4.944670329187772e-12, + "logits/chosen": 2.43207049369812, + "logits/rejected": 2.4914469718933105, + "logps/chosen": -199.6597137451172, + "logps/rejected": -240.5431671142578, + "loss": 0.5931, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5773383378982544, + "rewards/margins": 0.39172571897506714, + "rewards/rejected": -1.9690639972686768, + "step": 5540 + }, + { + "epoch": 1.9996397045577372, + "grad_norm": 23.37407112121582, + "learning_rate": 0.0, + "logits/chosen": 2.2600417137145996, + "logits/rejected": 2.296538829803467, + "logps/chosen": -213.81613159179688, + "logps/rejected": -258.1305847167969, + "loss": 0.5881, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6341050863265991, + "rewards/margins": 0.4021049439907074, + "rewards/rejected": -2.036210060119629, + "step": 5550 + }, + { + "epoch": 1.9996397045577372, + "step": 5550, + "total_flos": 0.0, + "train_loss": 0.6103729385513443, + "train_runtime": 10082.6993, + "train_samples_per_second": 4.404, + "train_steps_per_second": 0.55 + } + ], + "logging_steps": 10, + "max_steps": 5550, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}