{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022935779816513763, "grad_norm": 8.800843666556295, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.6664202213287354, "logits/rejected": -2.5855507850646973, "logps/chosen": -290.6496276855469, "logps/rejected": -275.44122314453125, "loss": 0.6931, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": 0.00033013062784448266, "rewards/margins": 0.00021790717437397689, "rewards/rejected": 0.00011222347529837862, "step": 10 }, { "epoch": 0.045871559633027525, "grad_norm": 8.175869900678, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.6562132835388184, "logits/rejected": -2.5835225582122803, "logps/chosen": -250.1300048828125, "logps/rejected": -232.3563995361328, "loss": 0.6921, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0010141785023733974, "rewards/margins": 0.002109699649736285, "rewards/rejected": -0.001095521030947566, "step": 20 }, { "epoch": 0.06880733944954129, "grad_norm": 7.970196521590508, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.6696975231170654, "logits/rejected": -2.6478660106658936, "logps/chosen": -275.9632263183594, "logps/rejected": -266.75457763671875, "loss": 0.6876, "rewards/accuracies": 0.59375, "rewards/chosen": 0.00607306556776166, "rewards/margins": 0.009838912636041641, "rewards/rejected": -0.003765846835449338, "step": 30 }, { "epoch": 0.09174311926605505, "grad_norm": 8.67000085322498, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.595798969268799, "logits/rejected": -2.5203123092651367, "logps/chosen": -286.19683837890625, "logps/rejected": -249.7381591796875, "loss": 0.6763, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.028836142271757126, "rewards/margins": 0.03639785572886467, "rewards/rejected": -0.007561707403510809, "step": 40 }, { "epoch": 0.11467889908256881, "grad_norm": 13.337487209496269, "learning_rate": 4.997110275491701e-07, "logits/chosen": -2.536128520965576, "logits/rejected": -2.4935240745544434, "logps/chosen": -288.22052001953125, "logps/rejected": -272.15167236328125, "loss": 0.6558, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.006760952528566122, "rewards/margins": 0.0824705958366394, "rewards/rejected": -0.07570964097976685, "step": 50 }, { "epoch": 0.11467889908256881, "eval_logits/chosen": -2.5882346630096436, "eval_logits/rejected": -2.4863200187683105, "eval_logps/chosen": -283.5689697265625, "eval_logps/rejected": -255.30152893066406, "eval_loss": 0.6454581022262573, "eval_rewards/accuracies": 0.681034505367279, "eval_rewards/chosen": 0.00436991173774004, "eval_rewards/margins": 0.10091028362512589, "eval_rewards/rejected": -0.09654037654399872, "eval_runtime": 90.5984, "eval_samples_per_second": 20.067, "eval_steps_per_second": 0.32, "step": 50 }, { "epoch": 0.13761467889908258, "grad_norm": 11.707830850446049, "learning_rate": 4.979475034558115e-07, "logits/chosen": -2.5649328231811523, "logits/rejected": -2.482309341430664, "logps/chosen": -296.80865478515625, "logps/rejected": -277.1481628417969, "loss": 0.635, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.026310335844755173, "rewards/margins": 0.16292977333068848, "rewards/rejected": -0.18924009799957275, "step": 60 }, { "epoch": 0.16055045871559634, "grad_norm": 15.087943087489696, "learning_rate": 4.945923025551788e-07, "logits/chosen": -2.5654821395874023, "logits/rejected": -2.473752498626709, "logps/chosen": -302.06201171875, "logps/rejected": -300.6982421875, "loss": 0.6205, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1509528011083603, "rewards/margins": 0.2617207169532776, "rewards/rejected": -0.4126734733581543, "step": 70 }, { "epoch": 0.1834862385321101, "grad_norm": 12.495701165097614, "learning_rate": 4.896669632591651e-07, "logits/chosen": -2.513960361480713, "logits/rejected": -2.4418704509735107, "logps/chosen": -310.827392578125, "logps/rejected": -304.6408996582031, "loss": 0.6186, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3509863018989563, "rewards/margins": 0.16184711456298828, "rewards/rejected": -0.5128334760665894, "step": 80 }, { "epoch": 0.20642201834862386, "grad_norm": 18.19180173568921, "learning_rate": 4.832031033425662e-07, "logits/chosen": -2.5777266025543213, "logits/rejected": -2.5101935863494873, "logps/chosen": -290.23126220703125, "logps/rejected": -271.0259704589844, "loss": 0.5969, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.19850726425647736, "rewards/margins": 0.29140809178352356, "rewards/rejected": -0.4899153709411621, "step": 90 }, { "epoch": 0.22935779816513763, "grad_norm": 20.909839179480045, "learning_rate": 4.752422169756047e-07, "logits/chosen": -2.548067808151245, "logits/rejected": -2.48946475982666, "logps/chosen": -313.51727294921875, "logps/rejected": -314.1280212402344, "loss": 0.5907, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2900985777378082, "rewards/margins": 0.3498932421207428, "rewards/rejected": -0.6399917602539062, "step": 100 }, { "epoch": 0.22935779816513763, "eval_logits/chosen": -2.590954065322876, "eval_logits/rejected": -2.465471029281616, "eval_logps/chosen": -307.22003173828125, "eval_logps/rejected": -299.41168212890625, "eval_loss": 0.5893865823745728, "eval_rewards/accuracies": 0.7068965435028076, "eval_rewards/chosen": -0.2321406602859497, "eval_rewards/margins": 0.30550122261047363, "eval_rewards/rejected": -0.5376418828964233, "eval_runtime": 91.8942, "eval_samples_per_second": 19.784, "eval_steps_per_second": 0.316, "step": 100 }, { "epoch": 0.25229357798165136, "grad_norm": 21.128492078002974, "learning_rate": 4.658354083558188e-07, "logits/chosen": -2.2998366355895996, "logits/rejected": -2.1539642810821533, "logps/chosen": -310.5582580566406, "logps/rejected": -343.615234375, "loss": 0.58, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4958949089050293, "rewards/margins": 0.41762369871139526, "rewards/rejected": -0.9135186076164246, "step": 110 }, { "epoch": 0.27522935779816515, "grad_norm": 22.76420803543292, "learning_rate": 4.550430636492389e-07, "logits/chosen": -1.5430848598480225, "logits/rejected": -1.318800926208496, "logps/chosen": -358.74884033203125, "logps/rejected": -403.6099548339844, "loss": 0.576, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6421557664871216, "rewards/margins": 0.5780781507492065, "rewards/rejected": -1.2202337980270386, "step": 120 }, { "epoch": 0.2981651376146789, "grad_norm": 18.25021654810524, "learning_rate": 4.429344633468004e-07, "logits/chosen": -1.3487799167633057, "logits/rejected": -1.0795713663101196, "logps/chosen": -370.9684143066406, "logps/rejected": -413.46026611328125, "loss": 0.5503, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7266704440116882, "rewards/margins": 0.6452904939651489, "rewards/rejected": -1.371960997581482, "step": 130 }, { "epoch": 0.3211009174311927, "grad_norm": 26.911406799310377, "learning_rate": 4.2958733752443187e-07, "logits/chosen": -1.0087560415267944, "logits/rejected": -0.8604623079299927, "logps/chosen": -331.37322998046875, "logps/rejected": -340.6692810058594, "loss": 0.5608, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7133186459541321, "rewards/margins": 0.42135268449783325, "rewards/rejected": -1.1346712112426758, "step": 140 }, { "epoch": 0.3440366972477064, "grad_norm": 21.7192249273613, "learning_rate": 4.150873668617898e-07, "logits/chosen": -1.0606236457824707, "logits/rejected": -0.8972805738449097, "logps/chosen": -345.69580078125, "logps/rejected": -376.9483947753906, "loss": 0.5657, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6985127329826355, "rewards/margins": 0.4048144221305847, "rewards/rejected": -1.1033271551132202, "step": 150 }, { "epoch": 0.3440366972477064, "eval_logits/chosen": -1.0350327491760254, "eval_logits/rejected": -0.6546276807785034, "eval_logps/chosen": -335.6878662109375, "eval_logps/rejected": -348.57501220703125, "eval_loss": 0.5474189519882202, "eval_rewards/accuracies": 0.7198275923728943, "eval_rewards/chosen": -0.5168190598487854, "eval_rewards/margins": 0.5124561190605164, "eval_rewards/rejected": -1.0292751789093018, "eval_runtime": 90.811, "eval_samples_per_second": 20.02, "eval_steps_per_second": 0.319, "step": 150 }, { "epoch": 0.3669724770642202, "grad_norm": 20.42265728106707, "learning_rate": 3.9952763262280397e-07, "logits/chosen": -0.5147255063056946, "logits/rejected": -0.3840788006782532, "logps/chosen": -278.23956298828125, "logps/rejected": -349.45599365234375, "loss": 0.557, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5052815675735474, "rewards/margins": 0.5687552690505981, "rewards/rejected": -1.0740368366241455, "step": 160 }, { "epoch": 0.38990825688073394, "grad_norm": 27.546124980926876, "learning_rate": 3.8300801912883414e-07, "logits/chosen": -0.2255803644657135, "logits/rejected": 0.25452661514282227, "logps/chosen": -356.0308837890625, "logps/rejected": -378.4934997558594, "loss": 0.5436, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6629082560539246, "rewards/margins": 0.6058169007301331, "rewards/rejected": -1.268725037574768, "step": 170 }, { "epoch": 0.41284403669724773, "grad_norm": 26.686484538643338, "learning_rate": 3.6563457256020884e-07, "logits/chosen": -0.08900181949138641, "logits/rejected": 0.5574623942375183, "logps/chosen": -388.6288146972656, "logps/rejected": -392.36724853515625, "loss": 0.5357, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8669939041137695, "rewards/margins": 0.6011493802070618, "rewards/rejected": -1.4681432247161865, "step": 180 }, { "epoch": 0.43577981651376146, "grad_norm": 26.88225056627165, "learning_rate": 3.475188202022617e-07, "logits/chosen": -0.923051655292511, "logits/rejected": -0.5179102420806885, "logps/chosen": -328.10980224609375, "logps/rejected": -390.62261962890625, "loss": 0.5355, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5608564615249634, "rewards/margins": 0.6695758104324341, "rewards/rejected": -1.2304322719573975, "step": 190 }, { "epoch": 0.45871559633027525, "grad_norm": 28.83910063760549, "learning_rate": 3.287770545059052e-07, "logits/chosen": 0.1071801632642746, "logits/rejected": 0.8112713098526001, "logps/chosen": -398.0206604003906, "logps/rejected": -412.6197814941406, "loss": 0.5303, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0966007709503174, "rewards/margins": 0.6390595436096191, "rewards/rejected": -1.735660195350647, "step": 200 }, { "epoch": 0.45871559633027525, "eval_logits/chosen": 0.07066183537244797, "eval_logits/rejected": 0.7245904207229614, "eval_logps/chosen": -390.5937194824219, "eval_logps/rejected": -417.4531555175781, "eval_loss": 0.5413529872894287, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -1.0658774375915527, "eval_rewards/margins": 0.6521791815757751, "eval_rewards/rejected": -1.7180566787719727, "eval_runtime": 91.7218, "eval_samples_per_second": 19.821, "eval_steps_per_second": 0.316, "step": 200 }, { "epoch": 0.481651376146789, "grad_norm": 22.60643364808249, "learning_rate": 3.0952958655864954e-07, "logits/chosen": -0.1810363531112671, "logits/rejected": 0.5873329043388367, "logps/chosen": -370.56695556640625, "logps/rejected": -393.52349853515625, "loss": 0.52, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.848964512348175, "rewards/margins": 0.7796515226364136, "rewards/rejected": -1.6286159753799438, "step": 210 }, { "epoch": 0.5045871559633027, "grad_norm": 25.261348780053293, "learning_rate": 2.898999737583448e-07, "logits/chosen": 0.12911781668663025, "logits/rejected": 0.8453273773193359, "logps/chosen": -356.42694091796875, "logps/rejected": -401.22625732421875, "loss": 0.5152, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7875608801841736, "rewards/margins": 0.8164966702461243, "rewards/rejected": -1.6040576696395874, "step": 220 }, { "epoch": 0.5275229357798165, "grad_norm": 27.60361008184946, "learning_rate": 2.7001422664752333e-07, "logits/chosen": 0.1694536954164505, "logits/rejected": 0.745651125907898, "logps/chosen": -374.00701904296875, "logps/rejected": -404.31109619140625, "loss": 0.5324, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8075408935546875, "rewards/margins": 0.6526550054550171, "rewards/rejected": -1.4601958990097046, "step": 230 }, { "epoch": 0.5504587155963303, "grad_norm": 26.900360490730417, "learning_rate": 2.5e-07, "logits/chosen": 0.5097543001174927, "logits/rejected": 1.0124820470809937, "logps/chosen": -356.8577575683594, "logps/rejected": -414.31085205078125, "loss": 0.5326, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8185293078422546, "rewards/margins": 0.7143917083740234, "rewards/rejected": -1.5329210758209229, "step": 240 }, { "epoch": 0.573394495412844, "grad_norm": 21.852049902948178, "learning_rate": 2.2998577335247667e-07, "logits/chosen": 0.41708582639694214, "logits/rejected": 1.3239974975585938, "logps/chosen": -396.1962890625, "logps/rejected": -424.10107421875, "loss": 0.5472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8911197781562805, "rewards/margins": 0.6929961442947388, "rewards/rejected": -1.5841158628463745, "step": 250 }, { "epoch": 0.573394495412844, "eval_logits/chosen": 0.42131370306015015, "eval_logits/rejected": 1.265686273574829, "eval_logps/chosen": -364.9606018066406, "eval_logps/rejected": -392.8293762207031, "eval_loss": 0.5268200635910034, "eval_rewards/accuracies": 0.7155172228813171, "eval_rewards/chosen": -0.8095463514328003, "eval_rewards/margins": 0.6622725129127502, "eval_rewards/rejected": -1.4718190431594849, "eval_runtime": 91.4885, "eval_samples_per_second": 19.871, "eval_steps_per_second": 0.317, "step": 250 }, { "epoch": 0.5963302752293578, "grad_norm": 23.156708624000842, "learning_rate": 2.1010002624165524e-07, "logits/chosen": 0.08591889590024948, "logits/rejected": 0.6472758054733276, "logps/chosen": -366.85699462890625, "logps/rejected": -412.9854431152344, "loss": 0.5217, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7147234678268433, "rewards/margins": 0.6098002195358276, "rewards/rejected": -1.324523687362671, "step": 260 }, { "epoch": 0.6192660550458715, "grad_norm": 26.15521858032112, "learning_rate": 1.9047041344135043e-07, "logits/chosen": 0.2586960792541504, "logits/rejected": 1.1518855094909668, "logps/chosen": -362.0467224121094, "logps/rejected": -416.77716064453125, "loss": 0.5074, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6773337125778198, "rewards/margins": 0.9160529375076294, "rewards/rejected": -1.5933866500854492, "step": 270 }, { "epoch": 0.6422018348623854, "grad_norm": 22.25288149573626, "learning_rate": 1.7122294549409482e-07, "logits/chosen": 0.54390949010849, "logits/rejected": 1.2391692399978638, "logps/chosen": -429.69769287109375, "logps/rejected": -436.3062438964844, "loss": 0.5485, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0754292011260986, "rewards/margins": 0.619503915309906, "rewards/rejected": -1.6949331760406494, "step": 280 }, { "epoch": 0.6651376146788991, "grad_norm": 23.40120757809907, "learning_rate": 1.524811797977383e-07, "logits/chosen": 0.3535264730453491, "logits/rejected": 1.0907752513885498, "logps/chosen": -380.4679260253906, "logps/rejected": -438.02178955078125, "loss": 0.5205, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9396567344665527, "rewards/margins": 0.6416970491409302, "rewards/rejected": -1.581353783607483, "step": 290 }, { "epoch": 0.6880733944954128, "grad_norm": 28.751131516456226, "learning_rate": 1.3436542743979125e-07, "logits/chosen": 0.7159294486045837, "logits/rejected": 1.3497685194015503, "logps/chosen": -370.821533203125, "logps/rejected": -406.04010009765625, "loss": 0.5517, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1309765577316284, "rewards/margins": 0.6069232821464539, "rewards/rejected": -1.7378997802734375, "step": 300 }, { "epoch": 0.6880733944954128, "eval_logits/chosen": 0.29943859577178955, "eval_logits/rejected": 1.3137139081954956, "eval_logps/chosen": -373.143798828125, "eval_logps/rejected": -407.0939636230469, "eval_loss": 0.5283679962158203, "eval_rewards/accuracies": 0.7112069129943848, "eval_rewards/chosen": -0.8913781642913818, "eval_rewards/margins": 0.723086416721344, "eval_rewards/rejected": -1.6144647598266602, "eval_runtime": 91.561, "eval_samples_per_second": 19.856, "eval_steps_per_second": 0.317, "step": 300 }, { "epoch": 0.7110091743119266, "grad_norm": 31.58697998681577, "learning_rate": 1.1699198087116588e-07, "logits/chosen": 0.399633526802063, "logits/rejected": 1.2564690113067627, "logps/chosen": -335.18218994140625, "logps/rejected": -429.2320861816406, "loss": 0.5096, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8466620445251465, "rewards/margins": 0.8323869705200195, "rewards/rejected": -1.6790491342544556, "step": 310 }, { "epoch": 0.7339449541284404, "grad_norm": 25.23384166425554, "learning_rate": 1.00472367377196e-07, "logits/chosen": 0.13118520379066467, "logits/rejected": 0.912071704864502, "logps/chosen": -374.3765869140625, "logps/rejected": -386.69384765625, "loss": 0.5351, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8689004182815552, "rewards/margins": 0.6572145223617554, "rewards/rejected": -1.5261149406433105, "step": 320 }, { "epoch": 0.7568807339449541, "grad_norm": 23.043674301052096, "learning_rate": 8.49126331382102e-08, "logits/chosen": 0.2135738581418991, "logits/rejected": 1.1428295373916626, "logps/chosen": -332.0506286621094, "logps/rejected": -378.5418701171875, "loss": 0.5351, "rewards/accuracies": 0.75, "rewards/chosen": -0.7963577508926392, "rewards/margins": 0.6999517679214478, "rewards/rejected": -1.4963096380233765, "step": 330 }, { "epoch": 0.7798165137614679, "grad_norm": 22.897014093425614, "learning_rate": 7.041266247556812e-08, "logits/chosen": 0.228415846824646, "logits/rejected": 1.0023858547210693, "logps/chosen": -356.47955322265625, "logps/rejected": -407.20928955078125, "loss": 0.5292, "rewards/accuracies": 0.71875, "rewards/chosen": -0.906414806842804, "rewards/margins": 0.5302497744560242, "rewards/rejected": -1.4366645812988281, "step": 340 }, { "epoch": 0.8027522935779816, "grad_norm": 24.081238898664065, "learning_rate": 5.706553665319955e-08, "logits/chosen": 0.30141282081604004, "logits/rejected": 1.139885663986206, "logps/chosen": -382.5127868652344, "logps/rejected": -426.60723876953125, "loss": 0.4943, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8278465270996094, "rewards/margins": 0.7722162008285522, "rewards/rejected": -1.6000627279281616, "step": 350 }, { "epoch": 0.8027522935779816, "eval_logits/chosen": 0.40436258912086487, "eval_logits/rejected": 1.4252060651779175, "eval_logps/chosen": -367.28948974609375, "eval_logps/rejected": -402.3227233886719, "eval_loss": 0.5236544609069824, "eval_rewards/accuracies": 0.7112069129943848, "eval_rewards/chosen": -0.8328355550765991, "eval_rewards/margins": 0.7339165806770325, "eval_rewards/rejected": -1.5667520761489868, "eval_runtime": 91.5336, "eval_samples_per_second": 19.862, "eval_steps_per_second": 0.317, "step": 350 }, { "epoch": 0.8256880733944955, "grad_norm": 25.9302171157121, "learning_rate": 4.4956936350761005e-08, "logits/chosen": 0.17887906730175018, "logits/rejected": 1.2261936664581299, "logps/chosen": -403.9292297363281, "logps/rejected": -456.3939514160156, "loss": 0.5292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9248501062393188, "rewards/margins": 0.827883243560791, "rewards/rejected": -1.7527334690093994, "step": 360 }, { "epoch": 0.8486238532110092, "grad_norm": 25.758246129198, "learning_rate": 3.416459164418123e-08, "logits/chosen": 0.7486616373062134, "logits/rejected": 1.5815865993499756, "logps/chosen": -384.26458740234375, "logps/rejected": -433.817626953125, "loss": 0.5355, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9148642420768738, "rewards/margins": 0.8338562846183777, "rewards/rejected": -1.7487205266952515, "step": 370 }, { "epoch": 0.8715596330275229, "grad_norm": 27.707849331024978, "learning_rate": 2.475778302439524e-08, "logits/chosen": 0.8750450015068054, "logits/rejected": 1.3391921520233154, "logps/chosen": -381.8013610839844, "logps/rejected": -423.6830139160156, "loss": 0.5201, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9749946594238281, "rewards/margins": 0.6640049815177917, "rewards/rejected": -1.638999581336975, "step": 380 }, { "epoch": 0.8944954128440367, "grad_norm": 26.946369171438246, "learning_rate": 1.6796896657433805e-08, "logits/chosen": 0.8233477473258972, "logits/rejected": 1.701456069946289, "logps/chosen": -368.91595458984375, "logps/rejected": -411.06951904296875, "loss": 0.5176, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.964392364025116, "rewards/margins": 0.733443021774292, "rewards/rejected": -1.6978353261947632, "step": 390 }, { "epoch": 0.9174311926605505, "grad_norm": 28.888256646435288, "learning_rate": 1.0333036740834855e-08, "logits/chosen": 0.8198953866958618, "logits/rejected": 1.1675937175750732, "logps/chosen": -357.7641296386719, "logps/rejected": -417.57720947265625, "loss": 0.5335, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8717674016952515, "rewards/margins": 0.6356672644615173, "rewards/rejected": -1.5074348449707031, "step": 400 }, { "epoch": 0.9174311926605505, "eval_logits/chosen": 0.3891567885875702, "eval_logits/rejected": 1.407207727432251, "eval_logps/chosen": -365.5233154296875, "eval_logps/rejected": -399.57244873046875, "eval_loss": 0.5228918790817261, "eval_rewards/accuracies": 0.7068965435028076, "eval_rewards/chosen": -0.8151733875274658, "eval_rewards/margins": 0.7240758538246155, "eval_rewards/rejected": -1.5392491817474365, "eval_runtime": 90.7936, "eval_samples_per_second": 20.023, "eval_steps_per_second": 0.319, "step": 400 }, { "epoch": 0.9403669724770642, "grad_norm": 26.17469084665507, "learning_rate": 5.4076974448211685e-09, "logits/chosen": 0.33993715047836304, "logits/rejected": 1.2628642320632935, "logps/chosen": -358.8694152832031, "logps/rejected": -421.8302307128906, "loss": 0.536, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8047143220901489, "rewards/margins": 0.8802136182785034, "rewards/rejected": -1.6849279403686523, "step": 410 }, { "epoch": 0.963302752293578, "grad_norm": 21.435784540966043, "learning_rate": 2.052496544188487e-09, "logits/chosen": 0.5830007791519165, "logits/rejected": 1.3784797191619873, "logps/chosen": -336.01025390625, "logps/rejected": -399.4895324707031, "loss": 0.5098, "rewards/accuracies": 0.75, "rewards/chosen": -0.856100857257843, "rewards/margins": 0.8189223408699036, "rewards/rejected": -1.675023078918457, "step": 420 }, { "epoch": 0.9862385321100917, "grad_norm": 33.824936700213314, "learning_rate": 2.889724508297886e-10, "logits/chosen": 0.7273231744766235, "logits/rejected": 1.6111886501312256, "logps/chosen": -351.79278564453125, "logps/rejected": -395.43890380859375, "loss": 0.5241, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8372980356216431, "rewards/margins": 0.6832476258277893, "rewards/rejected": -1.5205457210540771, "step": 430 }, { "epoch": 1.0, "step": 436, "total_flos": 0.0, "train_loss": 0.5600318115785581, "train_runtime": 11378.3731, "train_samples_per_second": 4.9, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 436, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }