{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 25, "global_step": 608, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 8.19672131147541e-09, "logits/generated": -3.1101083755493164, "logits/real": -3.0836105346679688, "logps/generated": -234.708984375, "logps/real": -348.7855224609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/generated": 0.0, "rewards/margins": 0.0, "rewards/real": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 8.196721311475409e-08, "logits/generated": -3.041771173477173, "logits/real": -3.0358164310455322, "logps/generated": -259.9490051269531, "logps/real": -409.8074951171875, "loss": 0.6715, "rewards/accuracies": 0.6597222089767456, "rewards/generated": -0.009591154754161835, "rewards/margins": 0.04917733371257782, "rewards/real": 0.039586178958415985, "step": 10 }, { "epoch": 0.07, "learning_rate": 1.6393442622950818e-07, "logits/generated": -3.025602102279663, "logits/real": -3.008653163909912, "logps/generated": -291.55126953125, "logps/real": -418.98614501953125, "loss": 0.4944, "rewards/accuracies": 0.9437500238418579, "rewards/generated": -0.1885037124156952, "rewards/margins": 0.5224509239196777, "rewards/real": 0.33394715189933777, "step": 20 }, { "epoch": 0.08, "eval_logits/generated": -3.0068743228912354, "eval_logits/real": -3.014744520187378, "eval_logps/generated": -285.1263732910156, "eval_logps/real": -406.0350036621094, "eval_loss": 0.2565726339817047, "eval_rewards/accuracies": 0.9761029481887817, "eval_rewards/generated": -0.8426953554153442, "eval_rewards/margins": 1.5071492195129395, "eval_rewards/real": 0.6644538044929504, "eval_runtime": 272.5132, "eval_samples_per_second": 7.923, "eval_steps_per_second": 0.25, "step": 25 }, { "epoch": 0.1, "learning_rate": 2.4590163934426226e-07, "logits/generated": -3.019906997680664, "logits/real": -3.0092620849609375, "logps/generated": -305.70733642578125, "logps/real": -410.2085876464844, "loss": 0.2752, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.7856386303901672, "rewards/margins": 1.4234931468963623, "rewards/real": 0.6378546357154846, "step": 30 }, { "epoch": 0.13, "learning_rate": 3.2786885245901637e-07, "logits/generated": -2.9752182960510254, "logits/real": -2.9877312183380127, "logps/generated": -290.79388427734375, "logps/real": -412.19482421875, "loss": 0.1601, "rewards/accuracies": 1.0, "rewards/generated": -1.8102967739105225, "rewards/margins": 2.52372407913208, "rewards/real": 0.7134272456169128, "step": 40 }, { "epoch": 0.16, "learning_rate": 4.0983606557377047e-07, "logits/generated": -2.939542531967163, "logits/real": -2.9472873210906982, "logps/generated": -315.44427490234375, "logps/real": -429.36614990234375, "loss": 0.092, "rewards/accuracies": 0.9937499761581421, "rewards/generated": -3.036115884780884, "rewards/margins": 3.7522292137145996, "rewards/real": 0.7161135077476501, "step": 50 }, { "epoch": 0.16, "eval_logits/generated": -2.9427242279052734, "eval_logits/real": -2.955735921859741, "eval_logps/generated": -314.470458984375, "eval_logps/real": -408.6964111328125, "eval_loss": 0.08381339907646179, "eval_rewards/accuracies": 0.9889705777168274, "eval_rewards/generated": -3.777100086212158, "eval_rewards/margins": 4.175417900085449, "eval_rewards/real": 0.39831778407096863, "eval_runtime": 271.2109, "eval_samples_per_second": 7.961, "eval_steps_per_second": 0.251, "step": 50 }, { "epoch": 0.2, "learning_rate": 4.918032786885245e-07, "logits/generated": -2.946568250656128, "logits/real": -2.9412360191345215, "logps/generated": -343.0057678222656, "logps/real": -417.2606506347656, "loss": 0.0779, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.043205261230469, "rewards/margins": 4.313161849975586, "rewards/real": 0.26995649933815, "step": 60 }, { "epoch": 0.23, "learning_rate": 4.917733089579524e-07, "logits/generated": -2.9332735538482666, "logits/real": -2.9507241249084473, "logps/generated": -310.9266662597656, "logps/real": -392.91436767578125, "loss": 0.0601, "rewards/accuracies": 0.987500011920929, "rewards/generated": -4.882044792175293, "rewards/margins": 4.866286277770996, "rewards/real": -0.015758510679006577, "step": 70 }, { "epoch": 0.25, "eval_logits/generated": -2.9205291271209717, "eval_logits/real": -2.9325790405273438, "eval_logps/generated": -333.0871276855469, "eval_logps/real": -410.1153564453125, "eval_loss": 0.04572594165802002, "eval_rewards/accuracies": 0.9963235259056091, "eval_rewards/generated": -5.638766288757324, "eval_rewards/margins": 5.895185470581055, "eval_rewards/real": 0.2564197778701782, "eval_runtime": 279.8658, "eval_samples_per_second": 7.714, "eval_steps_per_second": 0.243, "step": 75 }, { "epoch": 0.26, "learning_rate": 4.826325411334552e-07, "logits/generated": -2.923539400100708, "logits/real": -2.924189805984497, "logps/generated": -344.7599792480469, "logps/real": -426.96954345703125, "loss": 0.037, "rewards/accuracies": 0.9937499761581421, "rewards/generated": -5.42771053314209, "rewards/margins": 5.462244987487793, "rewards/real": 0.03453410416841507, "step": 80 }, { "epoch": 0.3, "learning_rate": 4.7349177330895793e-07, "logits/generated": -2.9185070991516113, "logits/real": -2.923430919647217, "logps/generated": -323.52618408203125, "logps/real": -421.53558349609375, "loss": 0.0381, "rewards/accuracies": 0.9937499761581421, "rewards/generated": -6.292209625244141, "rewards/margins": 6.230466365814209, "rewards/real": -0.0617423951625824, "step": 90 }, { "epoch": 0.33, "learning_rate": 4.6435100548446064e-07, "logits/generated": -2.940284490585327, "logits/real": -2.9340343475341797, "logps/generated": -348.3626708984375, "logps/real": -378.1487731933594, "loss": 0.0437, "rewards/accuracies": 0.987500011920929, "rewards/generated": -6.817117214202881, "rewards/margins": 6.638771057128906, "rewards/real": -0.17834754288196564, "step": 100 }, { "epoch": 0.33, "eval_logits/generated": -2.888258218765259, "eval_logits/real": -2.906193733215332, "eval_logps/generated": -349.15032958984375, "eval_logps/real": -414.5328369140625, "eval_loss": 0.03355773165822029, "eval_rewards/accuracies": 0.9963235259056091, "eval_rewards/generated": -7.245093822479248, "eval_rewards/margins": 7.059763431549072, "eval_rewards/real": -0.18532894551753998, "eval_runtime": 281.2041, "eval_samples_per_second": 7.678, "eval_steps_per_second": 0.242, "step": 100 }, { "epoch": 0.36, "learning_rate": 4.552102376599634e-07, "logits/generated": -2.912867307662964, "logits/real": -2.9122469425201416, "logps/generated": -353.8576965332031, "logps/real": -416.38262939453125, "loss": 0.0385, "rewards/accuracies": 0.987500011920929, "rewards/generated": -7.3175201416015625, "rewards/margins": 6.935573577880859, "rewards/real": -0.38194605708122253, "step": 110 }, { "epoch": 0.39, "learning_rate": 4.460694698354662e-07, "logits/generated": -2.916935920715332, "logits/real": -2.90993070602417, "logps/generated": -347.873046875, "logps/real": -423.1348571777344, "loss": 0.036, "rewards/accuracies": 0.981249988079071, "rewards/generated": -7.8421173095703125, "rewards/margins": 7.608695030212402, "rewards/real": -0.23342163860797882, "step": 120 }, { "epoch": 0.41, "eval_logits/generated": -2.881693124771118, "eval_logits/real": -2.9014151096343994, "eval_logps/generated": -354.1070556640625, "eval_logps/real": -414.3309326171875, "eval_loss": 0.027073556557297707, "eval_rewards/accuracies": 0.9944853186607361, "eval_rewards/generated": -7.7407612800598145, "eval_rewards/margins": 7.575623512268066, "eval_rewards/real": -0.16513808071613312, "eval_runtime": 282.9385, "eval_samples_per_second": 7.631, "eval_steps_per_second": 0.24, "step": 125 }, { "epoch": 0.43, "learning_rate": 4.3692870201096885e-07, "logits/generated": -2.886507272720337, "logits/real": -2.8951663970947266, "logps/generated": -376.47540283203125, "logps/real": -435.31024169921875, "loss": 0.0309, "rewards/accuracies": 0.9937499761581421, "rewards/generated": -8.095990180969238, "rewards/margins": 7.625959873199463, "rewards/real": -0.47002920508384705, "step": 130 }, { "epoch": 0.46, "learning_rate": 4.277879341864716e-07, "logits/generated": -2.866490602493286, "logits/real": -2.878412961959839, "logps/generated": -375.51739501953125, "logps/real": -430.4481506347656, "loss": 0.0318, "rewards/accuracies": 0.981249988079071, "rewards/generated": -8.508157730102539, "rewards/margins": 8.17884635925293, "rewards/real": -0.32931095361709595, "step": 140 }, { "epoch": 0.49, "learning_rate": 4.186471663619744e-07, "logits/generated": -2.8697776794433594, "logits/real": -2.87200927734375, "logps/generated": -344.39154052734375, "logps/real": -417.9952697753906, "loss": 0.0373, "rewards/accuracies": 0.987500011920929, "rewards/generated": -8.700019836425781, "rewards/margins": 8.087128639221191, "rewards/real": -0.6128913760185242, "step": 150 }, { "epoch": 0.49, "eval_logits/generated": -2.827141761779785, "eval_logits/real": -2.8543221950531006, "eval_logps/generated": -355.01171875, "eval_logps/real": -415.0633850097656, "eval_loss": 0.026445312425494194, "eval_rewards/accuracies": 0.9908088445663452, "eval_rewards/generated": -7.831226825714111, "eval_rewards/margins": 7.5928425788879395, "eval_rewards/real": -0.23838478326797485, "eval_runtime": 279.7973, "eval_samples_per_second": 7.716, "eval_steps_per_second": 0.243, "step": 150 }, { "epoch": 0.53, "learning_rate": 4.0950639853747716e-07, "logits/generated": -2.832282543182373, "logits/real": -2.8597447872161865, "logps/generated": -383.411376953125, "logps/real": -424.3670959472656, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/generated": -8.395051002502441, "rewards/margins": 8.07539176940918, "rewards/real": -0.319659024477005, "step": 160 }, { "epoch": 0.56, "learning_rate": 4.0036563071297983e-07, "logits/generated": -2.811795234680176, "logits/real": -2.841276168823242, "logps/generated": -379.37359619140625, "logps/real": -428.99407958984375, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/generated": -8.731819152832031, "rewards/margins": 8.355070114135742, "rewards/real": -0.37674885988235474, "step": 170 }, { "epoch": 0.58, "eval_logits/generated": -2.8051629066467285, "eval_logits/real": -2.832581043243408, "eval_logps/generated": -376.1680603027344, "eval_logps/real": -421.83154296875, "eval_loss": 0.02137557417154312, "eval_rewards/accuracies": 0.9908088445663452, "eval_rewards/generated": -9.94686222076416, "eval_rewards/margins": 9.031662940979004, "eval_rewards/real": -0.9151991009712219, "eval_runtime": 281.297, "eval_samples_per_second": 7.675, "eval_steps_per_second": 0.242, "step": 175 }, { "epoch": 0.59, "learning_rate": 3.912248628884826e-07, "logits/generated": -2.8207595348358154, "logits/real": -2.8399100303649902, "logps/generated": -393.3479919433594, "logps/real": -451.2169494628906, "loss": 0.022, "rewards/accuracies": 0.9937499761581421, "rewards/generated": -9.940328598022461, "rewards/margins": 9.013021469116211, "rewards/real": -0.92730712890625, "step": 180 }, { "epoch": 0.62, "learning_rate": 3.8208409506398537e-07, "logits/generated": -2.8636739253997803, "logits/real": -2.867056369781494, "logps/generated": -404.6825256347656, "logps/real": -445.5357971191406, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/generated": -10.549575805664062, "rewards/margins": 9.651226043701172, "rewards/real": -0.8983511924743652, "step": 190 }, { "epoch": 0.66, "learning_rate": 3.7294332723948814e-07, "logits/generated": -2.8503365516662598, "logits/real": -2.8629379272460938, "logps/generated": -380.40264892578125, "logps/real": -451.1651916503906, "loss": 0.0426, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -10.779017448425293, "rewards/margins": 9.16708755493164, "rewards/real": -1.6119304895401, "step": 200 }, { "epoch": 0.66, "eval_logits/generated": -2.8450069427490234, "eval_logits/real": -2.85876727104187, "eval_logps/generated": -367.72100830078125, "eval_logps/real": -422.42657470703125, "eval_loss": 0.02509322017431259, "eval_rewards/accuracies": 0.9908088445663452, "eval_rewards/generated": -9.10215950012207, "eval_rewards/margins": 8.127457618713379, "eval_rewards/real": -0.9747023582458496, "eval_runtime": 279.5471, "eval_samples_per_second": 7.723, "eval_steps_per_second": 0.243, "step": 200 }, { "epoch": 0.69, "learning_rate": 3.638025594149908e-07, "logits/generated": -2.840203285217285, "logits/real": -2.83565354347229, "logps/generated": -398.70989990234375, "logps/real": -414.3641662597656, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/generated": -9.996566772460938, "rewards/margins": 8.856243133544922, "rewards/real": -1.1403248310089111, "step": 210 }, { "epoch": 0.72, "learning_rate": 3.5466179159049357e-07, "logits/generated": -2.82039213180542, "logits/real": -2.8223772048950195, "logps/generated": -382.1441955566406, "logps/real": -446.4202575683594, "loss": 0.0262, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -9.665162086486816, "rewards/margins": 8.764172554016113, "rewards/real": -0.9009901881217957, "step": 220 }, { "epoch": 0.74, "eval_logits/generated": -2.800882339477539, "eval_logits/real": -2.8208842277526855, "eval_logps/generated": -376.0171813964844, "eval_logps/real": -421.0940246582031, "eval_loss": 0.018937036395072937, "eval_rewards/accuracies": 0.9926470518112183, "eval_rewards/generated": -9.93177318572998, "eval_rewards/margins": 9.090324401855469, "eval_rewards/real": -0.8414493203163147, "eval_runtime": 281.9616, "eval_samples_per_second": 7.657, "eval_steps_per_second": 0.241, "step": 225 }, { "epoch": 0.76, "learning_rate": 3.4552102376599634e-07, "logits/generated": -2.808319568634033, "logits/real": -2.802628993988037, "logps/generated": -363.14227294921875, "logps/real": -395.32305908203125, "loss": 0.0226, "rewards/accuracies": 0.987500011920929, "rewards/generated": -9.69061279296875, "rewards/margins": 8.589911460876465, "rewards/real": -1.1007012128829956, "step": 230 }, { "epoch": 0.79, "learning_rate": 3.3638025594149906e-07, "logits/generated": -2.8294272422790527, "logits/real": -2.8311877250671387, "logps/generated": -398.45635986328125, "logps/real": -404.5653991699219, "loss": 0.0297, "rewards/accuracies": 0.987500011920929, "rewards/generated": -9.038873672485352, "rewards/margins": 8.15873908996582, "rewards/real": -0.8801354169845581, "step": 240 }, { "epoch": 0.82, "learning_rate": 3.272394881170018e-07, "logits/generated": -2.816089153289795, "logits/real": -2.839442729949951, "logps/generated": -382.88311767578125, "logps/real": -406.73297119140625, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/generated": -9.536883354187012, "rewards/margins": 8.913406372070312, "rewards/real": -0.6234776973724365, "step": 250 }, { "epoch": 0.82, "eval_logits/generated": -2.7973484992980957, "eval_logits/real": -2.8200957775115967, "eval_logps/generated": -377.7586364746094, "eval_logps/real": -419.83355712890625, "eval_loss": 0.01662140153348446, "eval_rewards/accuracies": 0.9944853186607361, "eval_rewards/generated": -10.105918884277344, "eval_rewards/margins": 9.390520095825195, "eval_rewards/real": -0.7153997421264648, "eval_runtime": 282.5094, "eval_samples_per_second": 7.642, "eval_steps_per_second": 0.241, "step": 250 }, { "epoch": 0.86, "learning_rate": 3.1809872029250455e-07, "logits/generated": -2.802105665206909, "logits/real": -2.824450731277466, "logps/generated": -371.94012451171875, "logps/real": -423.9705505371094, "loss": 0.0143, "rewards/accuracies": 0.987500011920929, "rewards/generated": -10.57664680480957, "rewards/margins": 9.595219612121582, "rewards/real": -0.9814273715019226, "step": 260 }, { "epoch": 0.89, "learning_rate": 3.089579524680073e-07, "logits/generated": -2.7837929725646973, "logits/real": -2.7973639965057373, "logps/generated": -388.8419189453125, "logps/real": -429.37054443359375, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/generated": -10.401695251464844, "rewards/margins": 9.380131721496582, "rewards/real": -1.021564245223999, "step": 270 }, { "epoch": 0.9, "eval_logits/generated": -2.7641427516937256, "eval_logits/real": -2.7935867309570312, "eval_logps/generated": -385.756103515625, "eval_logps/real": -423.5848693847656, "eval_loss": 0.018939225003123283, "eval_rewards/accuracies": 0.9944853186607361, "eval_rewards/generated": -10.90566635131836, "eval_rewards/margins": 9.815133094787598, "eval_rewards/real": -1.0905327796936035, "eval_runtime": 281.011, "eval_samples_per_second": 7.683, "eval_steps_per_second": 0.242, "step": 275 }, { "epoch": 0.92, "learning_rate": 2.9981718464351004e-07, "logits/generated": -2.762982130050659, "logits/real": -2.7803285121917725, "logps/generated": -370.44073486328125, "logps/real": -432.51708984375, "loss": 0.025, "rewards/accuracies": 0.987500011920929, "rewards/generated": -10.256260871887207, "rewards/margins": 9.053407669067383, "rewards/real": -1.202852487564087, "step": 280 }, { "epoch": 0.95, "learning_rate": 2.906764168190128e-07, "logits/generated": -2.7955307960510254, "logits/real": -2.818641185760498, "logps/generated": -406.474853515625, "logps/real": -446.6991271972656, "loss": 0.0378, "rewards/accuracies": 0.9937499761581421, "rewards/generated": -10.223650932312012, "rewards/margins": 8.97282600402832, "rewards/real": -1.250824213027954, "step": 290 }, { "epoch": 0.99, "learning_rate": 2.815356489945155e-07, "logits/generated": -2.7767701148986816, "logits/real": -2.79323410987854, "logps/generated": -376.298828125, "logps/real": -418.32611083984375, "loss": 0.0333, "rewards/accuracies": 0.981249988079071, "rewards/generated": -10.532678604125977, "rewards/margins": 9.659395217895508, "rewards/real": -0.8732837438583374, "step": 300 }, { "epoch": 0.99, "eval_logits/generated": -2.7972822189331055, "eval_logits/real": -2.8230373859405518, "eval_logps/generated": -391.56549072265625, "eval_logps/real": -425.4764709472656, "eval_loss": 0.01681051403284073, "eval_rewards/accuracies": 0.9963235259056091, "eval_rewards/generated": -11.48660659790039, "eval_rewards/margins": 10.206913948059082, "eval_rewards/real": -1.2796927690505981, "eval_runtime": 282.0432, "eval_samples_per_second": 7.655, "eval_steps_per_second": 0.241, "step": 300 }, { "epoch": 1.02, "learning_rate": 2.7239488117001824e-07, "logits/generated": -2.804527997970581, "logits/real": -2.8110270500183105, "logps/generated": -392.09259033203125, "logps/real": -416.8059997558594, "loss": 0.0115, "rewards/accuracies": 0.9937499761581421, "rewards/generated": -12.317480087280273, "rewards/margins": 10.796090126037598, "rewards/real": -1.521390676498413, "step": 310 }, { "epoch": 1.05, "learning_rate": 2.63254113345521e-07, "logits/generated": -2.7929205894470215, "logits/real": -2.8038601875305176, "logps/generated": -406.0953063964844, "logps/real": -417.6265563964844, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/generated": -12.046645164489746, "rewards/margins": 11.529437065124512, "rewards/real": -0.5172096490859985, "step": 320 }, { "epoch": 1.07, "eval_logits/generated": -2.797431468963623, "eval_logits/real": -2.82306170463562, "eval_logps/generated": -388.5797424316406, "eval_logps/real": -424.7586669921875, "eval_loss": 0.015748905017971992, "eval_rewards/accuracies": 0.9944853186607361, "eval_rewards/generated": -11.188030242919922, "eval_rewards/margins": 9.980118751525879, "eval_rewards/real": -1.2079119682312012, "eval_runtime": 274.6049, "eval_samples_per_second": 7.862, "eval_steps_per_second": 0.248, "step": 325 }, { "epoch": 1.09, "learning_rate": 2.541133455210238e-07, "logits/generated": -2.783512592315674, "logits/real": -2.7885773181915283, "logps/generated": -410.649658203125, "logps/real": -451.8507385253906, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/generated": -11.628830909729004, "rewards/margins": 10.77421760559082, "rewards/real": -0.854611873626709, "step": 330 }, { "epoch": 1.12, "learning_rate": 2.449725776965265e-07, "logits/generated": -2.8123326301574707, "logits/real": -2.8071532249450684, "logps/generated": -417.04833984375, "logps/real": -433.6940002441406, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/generated": -11.209399223327637, "rewards/margins": 10.593694686889648, "rewards/real": -0.615705132484436, "step": 340 }, { "epoch": 1.15, "learning_rate": 2.3583180987202925e-07, "logits/generated": -2.80556058883667, "logits/real": -2.816408157348633, "logps/generated": -402.67120361328125, "logps/real": -396.2184753417969, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/generated": -10.71699047088623, "rewards/margins": 10.27141284942627, "rewards/real": -0.4455786347389221, "step": 350 }, { "epoch": 1.15, "eval_logits/generated": -2.7852790355682373, "eval_logits/real": -2.8128247261047363, "eval_logps/generated": -389.1376037597656, "eval_logps/real": -423.37457275390625, "eval_loss": 0.015182293951511383, "eval_rewards/accuracies": 0.9908088445663452, "eval_rewards/generated": -11.243818283081055, "eval_rewards/margins": 10.17431640625, "eval_rewards/real": -1.0695013999938965, "eval_runtime": 277.2781, "eval_samples_per_second": 7.786, "eval_steps_per_second": 0.245, "step": 350 }, { "epoch": 1.18, "learning_rate": 2.26691042047532e-07, "logits/generated": -2.795651912689209, "logits/real": -2.7973437309265137, "logps/generated": -416.8365783691406, "logps/real": -424.8717346191406, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/generated": -11.76781940460205, "rewards/margins": 10.914695739746094, "rewards/real": -0.853122889995575, "step": 360 }, { "epoch": 1.22, "learning_rate": 2.1755027422303473e-07, "logits/generated": -2.8003101348876953, "logits/real": -2.8173794746398926, "logps/generated": -425.969482421875, "logps/real": -427.357421875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/generated": -12.280854225158691, "rewards/margins": 11.536186218261719, "rewards/real": -0.7446659803390503, "step": 370 }, { "epoch": 1.23, "eval_logits/generated": -2.7750558853149414, "eval_logits/real": -2.8028876781463623, "eval_logps/generated": -393.3175048828125, "eval_logps/real": -424.44647216796875, "eval_loss": 0.014840944670140743, "eval_rewards/accuracies": 0.9908088445663452, "eval_rewards/generated": -11.66180419921875, "eval_rewards/margins": 10.485115051269531, "eval_rewards/real": -1.1766891479492188, "eval_runtime": 278.3753, "eval_samples_per_second": 7.756, "eval_steps_per_second": 0.244, "step": 375 }, { "epoch": 1.25, "learning_rate": 2.0840950639853748e-07, "logits/generated": -2.779252529144287, "logits/real": -2.804440975189209, "logps/generated": -388.8800354003906, "logps/real": -422.3763122558594, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/generated": -12.17228889465332, "rewards/margins": 11.443912506103516, "rewards/real": -0.7283763289451599, "step": 380 }, { "epoch": 1.28, "learning_rate": 1.9926873857404022e-07, "logits/generated": -2.7903225421905518, "logits/real": -2.7983546257019043, "logps/generated": -407.7205810546875, "logps/real": -410.11627197265625, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/generated": -11.748414993286133, "rewards/margins": 11.07917594909668, "rewards/real": -0.6692394614219666, "step": 390 }, { "epoch": 1.32, "learning_rate": 1.9012797074954297e-07, "logits/generated": -2.756133794784546, "logits/real": -2.773040771484375, "logps/generated": -429.03436279296875, "logps/real": -455.53497314453125, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/generated": -12.553096771240234, "rewards/margins": 12.007298469543457, "rewards/real": -0.5457991361618042, "step": 400 }, { "epoch": 1.32, "eval_logits/generated": -2.77026104927063, "eval_logits/real": -2.7975828647613525, "eval_logps/generated": -395.00494384765625, "eval_logps/real": -423.63067626953125, "eval_loss": 0.013821952044963837, "eval_rewards/accuracies": 0.9963235259056091, "eval_rewards/generated": -11.830552101135254, "eval_rewards/margins": 10.735437393188477, "eval_rewards/real": -1.095115065574646, "eval_runtime": 280.6653, "eval_samples_per_second": 7.692, "eval_steps_per_second": 0.242, "step": 400 }, { "epoch": 1.35, "learning_rate": 1.8098720292504568e-07, "logits/generated": -2.7702574729919434, "logits/real": -2.804114818572998, "logps/generated": -426.2373046875, "logps/real": -466.58599853515625, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/generated": -12.460441589355469, "rewards/margins": 11.789602279663086, "rewards/real": -0.6708400249481201, "step": 410 }, { "epoch": 1.38, "learning_rate": 1.7184643510054845e-07, "logits/generated": -2.7784767150878906, "logits/real": -2.801504373550415, "logps/generated": -416.26678466796875, "logps/real": -468.1026916503906, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/generated": -12.879318237304688, "rewards/margins": 11.978571891784668, "rewards/real": -0.9007464647293091, "step": 420 }, { "epoch": 1.4, "eval_logits/generated": -2.7573304176330566, "eval_logits/real": -2.785118818283081, "eval_logps/generated": -401.373291015625, "eval_logps/real": -425.8589172363281, "eval_loss": 0.013551454059779644, "eval_rewards/accuracies": 0.9963235259056091, "eval_rewards/generated": -12.467382431030273, "eval_rewards/margins": 11.149442672729492, "eval_rewards/real": -1.3179404735565186, "eval_runtime": 278.2775, "eval_samples_per_second": 7.758, "eval_steps_per_second": 0.244, "step": 425 }, { "epoch": 1.41, "learning_rate": 1.6270566727605117e-07, "logits/generated": -2.7749133110046387, "logits/real": -2.795462131500244, "logps/generated": -417.4508361816406, "logps/real": -445.52728271484375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/generated": -12.883870124816895, "rewards/margins": 11.809028625488281, "rewards/real": -1.0748413801193237, "step": 430 }, { "epoch": 1.45, "learning_rate": 1.5356489945155394e-07, "logits/generated": -2.7696471214294434, "logits/real": -2.778109312057495, "logps/generated": -392.8078918457031, "logps/real": -403.84417724609375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/generated": -12.799501419067383, "rewards/margins": 12.152459144592285, "rewards/real": -0.6470428705215454, "step": 440 }, { "epoch": 1.48, "learning_rate": 1.4442413162705666e-07, "logits/generated": -2.7640178203582764, "logits/real": -2.7718958854675293, "logps/generated": -417.58770751953125, "logps/real": -419.9620056152344, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/generated": -12.79851245880127, "rewards/margins": 11.963459014892578, "rewards/real": -0.835052490234375, "step": 450 }, { "epoch": 1.48, "eval_logits/generated": -2.7543997764587402, "eval_logits/real": -2.7815120220184326, "eval_logps/generated": -403.600341796875, "eval_logps/real": -426.4503479003906, "eval_loss": 0.0138690285384655, "eval_rewards/accuracies": 0.9963235259056091, "eval_rewards/generated": -12.690093994140625, "eval_rewards/margins": 11.313016891479492, "eval_rewards/real": -1.3770767450332642, "eval_runtime": 277.1873, "eval_samples_per_second": 7.789, "eval_steps_per_second": 0.245, "step": 450 }, { "epoch": 1.51, "learning_rate": 1.3528336380255943e-07, "logits/generated": -2.752661943435669, "logits/real": -2.7650394439697266, "logps/generated": -416.608154296875, "logps/real": -423.7530212402344, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/generated": -13.213157653808594, "rewards/margins": 12.553186416625977, "rewards/real": -0.6599710583686829, "step": 460 }, { "epoch": 1.55, "learning_rate": 1.2614259597806215e-07, "logits/generated": -2.745286464691162, "logits/real": -2.7696480751037598, "logps/generated": -412.99420166015625, "logps/real": -438.382568359375, "loss": 0.0039, "rewards/accuracies": 0.9937499761581421, "rewards/generated": -12.78739070892334, "rewards/margins": 11.881449699401855, "rewards/real": -0.9059404134750366, "step": 470 }, { "epoch": 1.56, "eval_logits/generated": -2.7446181774139404, "eval_logits/real": -2.7735180854797363, "eval_logps/generated": -404.79119873046875, "eval_logps/real": -426.5647888183594, "eval_loss": 0.013367247767746449, "eval_rewards/accuracies": 0.9963235259056091, "eval_rewards/generated": -12.809175491333008, "eval_rewards/margins": 11.42065143585205, "eval_rewards/real": -1.3885247707366943, "eval_runtime": 276.6137, "eval_samples_per_second": 7.805, "eval_steps_per_second": 0.246, "step": 475 }, { "epoch": 1.58, "learning_rate": 1.1700182815356489e-07, "logits/generated": -2.7754604816436768, "logits/real": -2.7696869373321533, "logps/generated": -402.7199401855469, "logps/real": -402.69091796875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/generated": -13.116182327270508, "rewards/margins": 11.969404220581055, "rewards/real": -1.146780252456665, "step": 480 }, { "epoch": 1.61, "learning_rate": 1.0786106032906764e-07, "logits/generated": -2.76755952835083, "logits/real": -2.7802929878234863, "logps/generated": -425.44757080078125, "logps/real": -418.4566345214844, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/generated": -13.474451065063477, "rewards/margins": 12.222225189208984, "rewards/real": -1.2522268295288086, "step": 490 }, { "epoch": 1.64, "learning_rate": 9.872029250457038e-08, "logits/generated": -2.7439827919006348, "logits/real": -2.7600913047790527, "logps/generated": -400.67523193359375, "logps/real": -404.1493225097656, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/generated": -12.943422317504883, "rewards/margins": 11.872589111328125, "rewards/real": -1.0708342790603638, "step": 500 }, { "epoch": 1.64, "eval_logits/generated": -2.7403690814971924, "eval_logits/real": -2.7701306343078613, "eval_logps/generated": -406.73699951171875, "eval_logps/real": -427.05706787109375, "eval_loss": 0.013619424775242805, "eval_rewards/accuracies": 0.9963235259056091, "eval_rewards/generated": -13.003759384155273, "eval_rewards/margins": 11.566007614135742, "eval_rewards/real": -1.4377506971359253, "eval_runtime": 277.1256, "eval_samples_per_second": 7.791, "eval_steps_per_second": 0.245, "step": 500 }, { "epoch": 1.68, "learning_rate": 8.957952468007312e-08, "logits/generated": -2.746847629547119, "logits/real": -2.7727770805358887, "logps/generated": -415.89227294921875, "logps/real": -417.8184509277344, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/generated": -13.302165031433105, "rewards/margins": 11.868834495544434, "rewards/real": -1.4333298206329346, "step": 510 }, { "epoch": 1.71, "learning_rate": 8.043875685557587e-08, "logits/generated": -2.7504754066467285, "logits/real": -2.757420063018799, "logps/generated": -405.920166015625, "logps/real": -403.3671875, "loss": 0.0059, "rewards/accuracies": 0.9937499761581421, "rewards/generated": -13.80413818359375, "rewards/margins": 12.69081974029541, "rewards/real": -1.1133191585540771, "step": 520 }, { "epoch": 1.73, "eval_logits/generated": -2.7293171882629395, "eval_logits/real": -2.7628748416900635, "eval_logps/generated": -410.86712646484375, "eval_logps/real": -428.60345458984375, "eval_loss": 0.013887421227991581, "eval_rewards/accuracies": 0.9944853186607361, "eval_rewards/generated": -13.416768074035645, "eval_rewards/margins": 11.824378967285156, "eval_rewards/real": -1.59238862991333, "eval_runtime": 280.0429, "eval_samples_per_second": 7.71, "eval_steps_per_second": 0.243, "step": 525 }, { "epoch": 1.74, "learning_rate": 7.12979890310786e-08, "logits/generated": -2.7278664112091064, "logits/real": -2.767366886138916, "logps/generated": -398.7755432128906, "logps/real": -448.11932373046875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/generated": -13.456060409545898, "rewards/margins": 11.94404411315918, "rewards/real": -1.51201593875885, "step": 530 }, { "epoch": 1.78, "learning_rate": 6.215722120658136e-08, "logits/generated": -2.743837356567383, "logits/real": -2.776067018508911, "logps/generated": -408.1714782714844, "logps/real": -454.8717346191406, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/generated": -13.510284423828125, "rewards/margins": 12.263895988464355, "rewards/real": -1.2463879585266113, "step": 540 }, { "epoch": 1.81, "learning_rate": 5.301645338208409e-08, "logits/generated": -2.7187867164611816, "logits/real": -2.753685474395752, "logps/generated": -414.6969299316406, "logps/real": -451.0293884277344, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/generated": -13.869306564331055, "rewards/margins": 12.921630859375, "rewards/real": -0.9476770162582397, "step": 550 }, { "epoch": 1.81, "eval_logits/generated": -2.7283482551574707, "eval_logits/real": -2.762289524078369, "eval_logps/generated": -410.68316650390625, "eval_logps/real": -427.815673828125, "eval_loss": 0.013605926185846329, "eval_rewards/accuracies": 0.9963235259056091, "eval_rewards/generated": -13.398374557495117, "eval_rewards/margins": 11.884763717651367, "eval_rewards/real": -1.5136103630065918, "eval_runtime": 279.4319, "eval_samples_per_second": 7.726, "eval_steps_per_second": 0.243, "step": 550 }, { "epoch": 1.84, "learning_rate": 4.387568555758683e-08, "logits/generated": -2.7442142963409424, "logits/real": -2.7617897987365723, "logps/generated": -428.95721435546875, "logps/real": -429.8484802246094, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/generated": -14.423190116882324, "rewards/margins": 13.05412483215332, "rewards/real": -1.3690690994262695, "step": 560 }, { "epoch": 1.88, "learning_rate": 3.4734917733089575e-08, "logits/generated": -2.720496654510498, "logits/real": -2.742316722869873, "logps/generated": -408.7587890625, "logps/real": -418.96612548828125, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/generated": -14.030049324035645, "rewards/margins": 12.879358291625977, "rewards/real": -1.1506898403167725, "step": 570 }, { "epoch": 1.89, "eval_logits/generated": -2.730926752090454, "eval_logits/real": -2.7644882202148438, "eval_logps/generated": -410.0223693847656, "eval_logps/real": -427.5704040527344, "eval_loss": 0.013477620668709278, "eval_rewards/accuracies": 0.9963235259056091, "eval_rewards/generated": -13.332295417785645, "eval_rewards/margins": 11.843212127685547, "eval_rewards/real": -1.4890828132629395, "eval_runtime": 275.7027, "eval_samples_per_second": 7.831, "eval_steps_per_second": 0.247, "step": 575 }, { "epoch": 1.91, "learning_rate": 2.5594149908592323e-08, "logits/generated": -2.7408628463745117, "logits/real": -2.7660305500030518, "logps/generated": -404.4073486328125, "logps/real": -441.6896057128906, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/generated": -13.855819702148438, "rewards/margins": 12.916915893554688, "rewards/real": -0.9389039874076843, "step": 580 }, { "epoch": 1.94, "learning_rate": 1.6453382084095063e-08, "logits/generated": -2.7413947582244873, "logits/real": -2.7642316818237305, "logps/generated": -462.260986328125, "logps/real": -425.875244140625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/generated": -14.152559280395508, "rewards/margins": 12.800895690917969, "rewards/real": -1.3516645431518555, "step": 590 }, { "epoch": 1.97, "learning_rate": 7.312614259597806e-09, "logits/generated": -2.7413601875305176, "logits/real": -2.767582416534424, "logps/generated": -410.154052734375, "logps/real": -414.71295166015625, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/generated": -13.285085678100586, "rewards/margins": 12.160585403442383, "rewards/real": -1.1245028972625732, "step": 600 }, { "epoch": 1.97, "eval_logits/generated": -2.7305350303649902, "eval_logits/real": -2.7643280029296875, "eval_logps/generated": -410.0756530761719, "eval_logps/real": -427.497802734375, "eval_loss": 0.01349999662488699, "eval_rewards/accuracies": 0.9963235259056091, "eval_rewards/generated": -13.337620735168457, "eval_rewards/margins": 11.855793952941895, "eval_rewards/real": -1.4818273782730103, "eval_runtime": 276.8139, "eval_samples_per_second": 7.799, "eval_steps_per_second": 0.246, "step": 600 }, { "epoch": 2.0, "step": 608, "total_flos": 0.0, "train_loss": 0.04328188924946038, "train_runtime": 16735.9464, "train_samples_per_second": 2.321, "train_steps_per_second": 0.036 } ], "logging_steps": 10, "max_steps": 608, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }