{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9159905783826223, "eval_steps": 500, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 50.2683219909668, "kl": 0.0, "learning_rate": 4.998691442030882e-07, "logps/chosen": -205.0401611328125, "logps/rejected": -172.3661346435547, "loss": 0.4375, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 52.03371810913086, "kl": 0.8273243308067322, "learning_rate": 4.997382884061764e-07, "logps/chosen": -228.83558654785156, "logps/rejected": -244.7559051513672, "loss": 0.3796, "rewards/chosen": 0.06499359011650085, "rewards/margins": 0.07474303990602493, "rewards/rejected": -0.009749448858201504, "step": 2 }, { "epoch": 0.0, "grad_norm": 47.92995834350586, "kl": 0.0, "learning_rate": 4.996074326092646e-07, "logps/chosen": -222.45486450195312, "logps/rejected": -224.06056213378906, "loss": 0.2934, "rewards/chosen": 0.3142276406288147, "rewards/margins": 0.35068318247795105, "rewards/rejected": -0.036455538123846054, "step": 3 }, { "epoch": 0.0, "grad_norm": 48.66846466064453, "kl": 0.03168666362762451, "learning_rate": 4.994765768123528e-07, "logps/chosen": -215.8352508544922, "logps/rejected": -168.82923889160156, "loss": 0.4269, "rewards/chosen": 0.29763078689575195, "rewards/margins": -0.13886231184005737, "rewards/rejected": 0.4364930987358093, "step": 4 }, { "epoch": 0.0, "grad_norm": 49.69014358520508, "kl": 0.0, "learning_rate": 4.99345721015441e-07, "logps/chosen": -260.2557067871094, "logps/rejected": -195.94869995117188, "loss": 0.3903, "rewards/chosen": -0.442703515291214, "rewards/margins": 0.013102918863296509, "rewards/rejected": -0.4558064341545105, "step": 5 }, { "epoch": 0.0, "grad_norm": 59.5657958984375, "kl": 0.0, "learning_rate": 4.992148652185292e-07, "logps/chosen": -278.39373779296875, "logps/rejected": -198.203125, "loss": 0.562, "rewards/chosen": -0.8296091556549072, "rewards/margins": -0.6596475839614868, "rewards/rejected": -0.1699615716934204, "step": 6 }, { "epoch": 0.0, "grad_norm": 43.889366149902344, "kl": 0.0, "learning_rate": 4.990840094216174e-07, "logps/chosen": -270.32000732421875, "logps/rejected": -178.87966918945312, "loss": 0.494, "rewards/chosen": -0.641089677810669, "rewards/margins": -1.1119567155838013, "rewards/rejected": 0.47086700797080994, "step": 7 }, { "epoch": 0.0, "grad_norm": 47.326114654541016, "kl": 0.1817803978919983, "learning_rate": 4.989531536247055e-07, "logps/chosen": -211.3541259765625, "logps/rejected": -253.94935607910156, "loss": 0.4547, "rewards/chosen": -0.5205121040344238, "rewards/margins": -0.384090781211853, "rewards/rejected": -0.136421337723732, "step": 8 }, { "epoch": 0.0, "grad_norm": 48.22457504272461, "kl": 0.0, "learning_rate": 4.988222978277937e-07, "logps/chosen": -273.5563659667969, "logps/rejected": -226.06747436523438, "loss": 0.3257, "rewards/chosen": -0.0483764223754406, "rewards/margins": 0.6438623666763306, "rewards/rejected": -0.6922388076782227, "step": 9 }, { "epoch": 0.0, "grad_norm": 42.13737869262695, "kl": 0.0, "learning_rate": 4.986914420308819e-07, "logps/chosen": -192.82940673828125, "logps/rejected": -182.11260986328125, "loss": 0.4647, "rewards/chosen": -0.48483985662460327, "rewards/margins": 0.11868029832839966, "rewards/rejected": -0.6035201549530029, "step": 10 }, { "epoch": 0.0, "grad_norm": 46.52305603027344, "kl": 0.0, "learning_rate": 4.985605862339701e-07, "logps/chosen": -257.1351318359375, "logps/rejected": -213.33688354492188, "loss": 0.3875, "rewards/chosen": 0.08887295424938202, "rewards/margins": 0.19311153888702393, "rewards/rejected": -0.1042385920882225, "step": 11 }, { "epoch": 0.0, "grad_norm": 46.00929260253906, "kl": 0.0, "learning_rate": 4.984297304370583e-07, "logps/chosen": -206.80709838867188, "logps/rejected": -302.8717956542969, "loss": 0.3693, "rewards/chosen": -0.29487642645835876, "rewards/margins": 0.7638503313064575, "rewards/rejected": -1.0587267875671387, "step": 12 }, { "epoch": 0.0, "grad_norm": 66.953369140625, "kl": 0.0, "learning_rate": 4.982988746401465e-07, "logps/chosen": -281.1011047363281, "logps/rejected": -272.95831298828125, "loss": 0.4793, "rewards/chosen": -1.2275511026382446, "rewards/margins": 0.7767127752304077, "rewards/rejected": -2.0042638778686523, "step": 13 }, { "epoch": 0.0, "grad_norm": 42.68067932128906, "kl": 0.0, "learning_rate": 4.981680188432347e-07, "logps/chosen": -197.5496826171875, "logps/rejected": -233.1874542236328, "loss": 0.3359, "rewards/chosen": -0.7852997183799744, "rewards/margins": -0.11471927165985107, "rewards/rejected": -0.6705804467201233, "step": 14 }, { "epoch": 0.0, "grad_norm": 36.0564079284668, "kl": 0.0, "learning_rate": 4.980371630463229e-07, "logps/chosen": -159.7126922607422, "logps/rejected": -230.08297729492188, "loss": 0.5013, "rewards/chosen": -0.8807596564292908, "rewards/margins": 0.22696465253829956, "rewards/rejected": -1.1077243089675903, "step": 15 }, { "epoch": 0.0, "grad_norm": 42.2777214050293, "kl": 0.0, "learning_rate": 4.979063072494111e-07, "logps/chosen": -285.4176330566406, "logps/rejected": -212.86843872070312, "loss": 0.3413, "rewards/chosen": -0.02461055852472782, "rewards/margins": 1.1725351810455322, "rewards/rejected": -1.197145700454712, "step": 16 }, { "epoch": 0.0, "grad_norm": 37.475677490234375, "kl": 0.0, "learning_rate": 4.977754514524993e-07, "logps/chosen": -277.5311279296875, "logps/rejected": -273.7735290527344, "loss": 0.3365, "rewards/chosen": -0.8537225127220154, "rewards/margins": 0.5020245909690857, "rewards/rejected": -1.355747103691101, "step": 17 }, { "epoch": 0.0, "grad_norm": 50.058441162109375, "kl": 0.0, "learning_rate": 4.976445956555875e-07, "logps/chosen": -255.5559539794922, "logps/rejected": -251.7951202392578, "loss": 0.3898, "rewards/chosen": -1.131912112236023, "rewards/margins": -0.5449322462081909, "rewards/rejected": -0.586979866027832, "step": 18 }, { "epoch": 0.0, "grad_norm": 48.28940200805664, "kl": 0.0, "learning_rate": 4.975137398586757e-07, "logps/chosen": -169.98117065429688, "logps/rejected": -234.54136657714844, "loss": 0.4461, "rewards/chosen": -0.8914527893066406, "rewards/margins": 0.0006075501441955566, "rewards/rejected": -0.8920603394508362, "step": 19 }, { "epoch": 0.01, "grad_norm": 52.11145782470703, "kl": 0.0, "learning_rate": 4.973828840617639e-07, "logps/chosen": -265.4768371582031, "logps/rejected": -242.55845642089844, "loss": 0.4579, "rewards/chosen": -1.322069764137268, "rewards/margins": -0.0047348737716674805, "rewards/rejected": -1.3173348903656006, "step": 20 }, { "epoch": 0.01, "grad_norm": 37.193660736083984, "kl": 0.0, "learning_rate": 4.972520282648521e-07, "logps/chosen": -219.7395477294922, "logps/rejected": -177.62509155273438, "loss": 0.3403, "rewards/chosen": 0.3125017583370209, "rewards/margins": 0.7497882843017578, "rewards/rejected": -0.4372865557670593, "step": 21 }, { "epoch": 0.01, "grad_norm": 47.88676452636719, "kl": 0.0, "learning_rate": 4.971211724679403e-07, "logps/chosen": -342.21783447265625, "logps/rejected": -224.13197326660156, "loss": 0.4, "rewards/chosen": -0.6000944375991821, "rewards/margins": 1.1481363773345947, "rewards/rejected": -1.7482308149337769, "step": 22 }, { "epoch": 0.01, "grad_norm": 35.711021423339844, "kl": 0.0, "learning_rate": 4.969903166710285e-07, "logps/chosen": -149.5780487060547, "logps/rejected": -152.07003784179688, "loss": 0.3721, "rewards/chosen": -0.5182877779006958, "rewards/margins": 0.29111266136169434, "rewards/rejected": -0.8094004392623901, "step": 23 }, { "epoch": 0.01, "grad_norm": 40.70458221435547, "kl": 0.0, "learning_rate": 4.968594608741167e-07, "logps/chosen": -249.3180389404297, "logps/rejected": -186.05471801757812, "loss": 0.3619, "rewards/chosen": -0.8323618769645691, "rewards/margins": 0.7159193158149719, "rewards/rejected": -1.548281192779541, "step": 24 }, { "epoch": 0.01, "grad_norm": 42.917049407958984, "kl": 0.0, "learning_rate": 4.967286050772049e-07, "logps/chosen": -177.19561767578125, "logps/rejected": -248.12649536132812, "loss": 0.4387, "rewards/chosen": -0.6950159668922424, "rewards/margins": 0.9509089589118958, "rewards/rejected": -1.6459249258041382, "step": 25 }, { "epoch": 0.01, "grad_norm": 39.620574951171875, "kl": 0.0, "learning_rate": 4.965977492802931e-07, "logps/chosen": -315.2325439453125, "logps/rejected": -218.47903442382812, "loss": 0.3679, "rewards/chosen": -1.3541030883789062, "rewards/margins": 0.7431886196136475, "rewards/rejected": -2.0972917079925537, "step": 26 }, { "epoch": 0.01, "grad_norm": 37.08391571044922, "kl": 0.0, "learning_rate": 4.964668934833813e-07, "logps/chosen": -230.52078247070312, "logps/rejected": -221.7032470703125, "loss": 0.3973, "rewards/chosen": -1.261672019958496, "rewards/margins": 0.273656964302063, "rewards/rejected": -1.535328984260559, "step": 27 }, { "epoch": 0.01, "grad_norm": 41.10235595703125, "kl": 0.0, "learning_rate": 4.963360376864695e-07, "logps/chosen": -191.61048889160156, "logps/rejected": -193.62025451660156, "loss": 0.3261, "rewards/chosen": -1.396644949913025, "rewards/margins": -0.22787630558013916, "rewards/rejected": -1.1687686443328857, "step": 28 }, { "epoch": 0.01, "grad_norm": 42.94221115112305, "kl": 0.0, "learning_rate": 4.962051818895577e-07, "logps/chosen": -275.1850891113281, "logps/rejected": -223.00143432617188, "loss": 0.3074, "rewards/chosen": -0.8943480253219604, "rewards/margins": 0.9237412214279175, "rewards/rejected": -1.818089246749878, "step": 29 }, { "epoch": 0.01, "grad_norm": 55.69268035888672, "kl": 0.0, "learning_rate": 4.960743260926459e-07, "logps/chosen": -266.8174133300781, "logps/rejected": -200.5430908203125, "loss": 0.3224, "rewards/chosen": -0.08692857623100281, "rewards/margins": 1.410178542137146, "rewards/rejected": -1.4971071481704712, "step": 30 }, { "epoch": 0.01, "grad_norm": 36.69532012939453, "kl": 0.0, "learning_rate": 4.959434702957341e-07, "logps/chosen": -239.9783935546875, "logps/rejected": -285.19903564453125, "loss": 0.2192, "rewards/chosen": -0.2260018140077591, "rewards/margins": 1.6547338962554932, "rewards/rejected": -1.8807357549667358, "step": 31 }, { "epoch": 0.01, "grad_norm": 42.48405838012695, "kl": 0.0, "learning_rate": 4.958126144988223e-07, "logps/chosen": -277.462890625, "logps/rejected": -230.81143188476562, "loss": 0.4113, "rewards/chosen": -1.1050209999084473, "rewards/margins": 1.3367903232574463, "rewards/rejected": -2.4418113231658936, "step": 32 }, { "epoch": 0.01, "grad_norm": 40.719364166259766, "kl": 0.0, "learning_rate": 4.956817587019104e-07, "logps/chosen": -243.24012756347656, "logps/rejected": -265.06121826171875, "loss": 0.3527, "rewards/chosen": -1.113293170928955, "rewards/margins": 0.796079158782959, "rewards/rejected": -1.909372329711914, "step": 33 }, { "epoch": 0.01, "grad_norm": 40.56840133666992, "kl": 0.0, "learning_rate": 4.955509029049986e-07, "logps/chosen": -357.2673034667969, "logps/rejected": -271.1274108886719, "loss": 0.3782, "rewards/chosen": -1.7618191242218018, "rewards/margins": -0.4248075485229492, "rewards/rejected": -1.3370115756988525, "step": 34 }, { "epoch": 0.01, "grad_norm": 45.011009216308594, "kl": 0.0, "learning_rate": 4.954200471080868e-07, "logps/chosen": -202.9520721435547, "logps/rejected": -186.841796875, "loss": 0.426, "rewards/chosen": -0.8729759454727173, "rewards/margins": 0.19871211051940918, "rewards/rejected": -1.0716880559921265, "step": 35 }, { "epoch": 0.01, "grad_norm": 39.886924743652344, "kl": 0.0, "learning_rate": 4.95289191311175e-07, "logps/chosen": -165.83241271972656, "logps/rejected": -280.1267395019531, "loss": 0.2954, "rewards/chosen": -0.43662160634994507, "rewards/margins": 1.597978115081787, "rewards/rejected": -2.034599781036377, "step": 36 }, { "epoch": 0.01, "grad_norm": 26.526390075683594, "kl": 0.0, "learning_rate": 4.951583355142632e-07, "logps/chosen": -159.53414916992188, "logps/rejected": -185.03199768066406, "loss": 0.4893, "rewards/chosen": -1.8748741149902344, "rewards/margins": 0.013196945190429688, "rewards/rejected": -1.888071060180664, "step": 37 }, { "epoch": 0.01, "grad_norm": 25.534835815429688, "kl": 0.0, "learning_rate": 4.950274797173514e-07, "logps/chosen": -200.60745239257812, "logps/rejected": -214.85702514648438, "loss": 0.4536, "rewards/chosen": -2.5081725120544434, "rewards/margins": -0.036855220794677734, "rewards/rejected": -2.4713172912597656, "step": 38 }, { "epoch": 0.01, "grad_norm": 37.80105209350586, "kl": 0.0, "learning_rate": 4.948966239204396e-07, "logps/chosen": -280.5340576171875, "logps/rejected": -287.7911682128906, "loss": 0.4128, "rewards/chosen": -1.4261023998260498, "rewards/margins": 1.8715763092041016, "rewards/rejected": -3.2976787090301514, "step": 39 }, { "epoch": 0.01, "grad_norm": 43.570186614990234, "kl": 0.0, "learning_rate": 4.947657681235278e-07, "logps/chosen": -215.7322998046875, "logps/rejected": -245.70297241210938, "loss": 0.3551, "rewards/chosen": -0.5773531794548035, "rewards/margins": 1.1560332775115967, "rewards/rejected": -1.7333863973617554, "step": 40 }, { "epoch": 0.01, "grad_norm": 37.78433609008789, "kl": 0.0, "learning_rate": 4.94634912326616e-07, "logps/chosen": -233.8341522216797, "logps/rejected": -292.6675109863281, "loss": 0.2997, "rewards/chosen": -1.021287202835083, "rewards/margins": 0.9503535032272339, "rewards/rejected": -1.971640706062317, "step": 41 }, { "epoch": 0.01, "grad_norm": 36.56157302856445, "kl": 0.0, "learning_rate": 4.945040565297042e-07, "logps/chosen": -257.9923095703125, "logps/rejected": -228.11431884765625, "loss": 0.4875, "rewards/chosen": -1.3468284606933594, "rewards/margins": 0.05934798717498779, "rewards/rejected": -1.4061764478683472, "step": 42 }, { "epoch": 0.01, "grad_norm": 30.533437728881836, "kl": 0.0, "learning_rate": 4.943732007327924e-07, "logps/chosen": -215.95953369140625, "logps/rejected": -383.3542175292969, "loss": 0.3071, "rewards/chosen": -0.46231940388679504, "rewards/margins": 3.4177749156951904, "rewards/rejected": -3.880094289779663, "step": 43 }, { "epoch": 0.01, "grad_norm": 32.82918930053711, "kl": 0.0, "learning_rate": 4.942423449358806e-07, "logps/chosen": -203.03924560546875, "logps/rejected": -230.98194885253906, "loss": 0.2845, "rewards/chosen": 0.07298585772514343, "rewards/margins": 2.675164222717285, "rewards/rejected": -2.6021783351898193, "step": 44 }, { "epoch": 0.01, "grad_norm": 29.841644287109375, "kl": 0.0, "learning_rate": 4.941114891389688e-07, "logps/chosen": -197.1206817626953, "logps/rejected": -178.6546173095703, "loss": 0.3349, "rewards/chosen": -1.011289119720459, "rewards/margins": 1.0256037712097168, "rewards/rejected": -2.036892890930176, "step": 45 }, { "epoch": 0.01, "grad_norm": 39.413970947265625, "kl": 0.0, "learning_rate": 4.93980633342057e-07, "logps/chosen": -243.208984375, "logps/rejected": -170.91363525390625, "loss": 0.3565, "rewards/chosen": -1.0538300275802612, "rewards/margins": 0.08940017223358154, "rewards/rejected": -1.1432301998138428, "step": 46 }, { "epoch": 0.01, "grad_norm": 40.43966293334961, "kl": 0.0, "learning_rate": 4.938497775451452e-07, "logps/chosen": -243.11795043945312, "logps/rejected": -250.5272674560547, "loss": 0.3056, "rewards/chosen": -1.732240080833435, "rewards/margins": 1.5034791231155396, "rewards/rejected": -3.2357192039489746, "step": 47 }, { "epoch": 0.01, "grad_norm": 48.8133544921875, "kl": 0.0, "learning_rate": 4.937189217482334e-07, "logps/chosen": -225.61911010742188, "logps/rejected": -303.3346252441406, "loss": 0.4549, "rewards/chosen": -1.1798169612884521, "rewards/margins": 0.7035973072052002, "rewards/rejected": -1.8834142684936523, "step": 48 }, { "epoch": 0.01, "grad_norm": 28.380380630493164, "kl": 0.0, "learning_rate": 4.935880659513216e-07, "logps/chosen": -164.39273071289062, "logps/rejected": -314.02972412109375, "loss": 0.4319, "rewards/chosen": -1.009232521057129, "rewards/margins": 0.519694447517395, "rewards/rejected": -1.528926968574524, "step": 49 }, { "epoch": 0.01, "grad_norm": 37.58095169067383, "kl": 0.0, "learning_rate": 4.934572101544098e-07, "logps/chosen": -348.113525390625, "logps/rejected": -210.80386352539062, "loss": 0.4788, "rewards/chosen": -3.091629981994629, "rewards/margins": -2.1929235458374023, "rewards/rejected": -0.8987063765525818, "step": 50 }, { "epoch": 0.01, "grad_norm": 36.6107063293457, "kl": 0.0, "learning_rate": 4.933263543574981e-07, "logps/chosen": -212.70262145996094, "logps/rejected": -291.3959045410156, "loss": 0.4458, "rewards/chosen": -1.1933636665344238, "rewards/margins": 0.39327001571655273, "rewards/rejected": -1.5866336822509766, "step": 51 }, { "epoch": 0.01, "grad_norm": 40.384883880615234, "kl": 0.0, "learning_rate": 4.931954985605863e-07, "logps/chosen": -257.5487976074219, "logps/rejected": -211.3709259033203, "loss": 0.2835, "rewards/chosen": -1.0930713415145874, "rewards/margins": 0.26484453678131104, "rewards/rejected": -1.3579158782958984, "step": 52 }, { "epoch": 0.01, "grad_norm": 35.08112716674805, "kl": 0.0, "learning_rate": 4.930646427636745e-07, "logps/chosen": -231.74722290039062, "logps/rejected": -246.1368865966797, "loss": 0.4926, "rewards/chosen": -1.5169533491134644, "rewards/margins": 0.9005342721939087, "rewards/rejected": -2.417487621307373, "step": 53 }, { "epoch": 0.01, "grad_norm": 39.13615036010742, "kl": 0.0, "learning_rate": 4.929337869667627e-07, "logps/chosen": -233.39125061035156, "logps/rejected": -182.48666381835938, "loss": 0.4699, "rewards/chosen": -1.9513063430786133, "rewards/margins": -0.586869478225708, "rewards/rejected": -1.3644368648529053, "step": 54 }, { "epoch": 0.01, "grad_norm": 43.6704216003418, "kl": 0.0, "learning_rate": 4.928029311698508e-07, "logps/chosen": -302.2059020996094, "logps/rejected": -216.83807373046875, "loss": 0.4721, "rewards/chosen": -1.3140746355056763, "rewards/margins": -0.25487399101257324, "rewards/rejected": -1.059200644493103, "step": 55 }, { "epoch": 0.01, "grad_norm": 44.78774642944336, "kl": 0.0, "learning_rate": 4.92672075372939e-07, "logps/chosen": -310.4451599121094, "logps/rejected": -285.1246337890625, "loss": 0.2802, "rewards/chosen": 0.11061844974756241, "rewards/margins": 2.974390983581543, "rewards/rejected": -2.8637726306915283, "step": 56 }, { "epoch": 0.01, "grad_norm": 46.488983154296875, "kl": 0.0, "learning_rate": 4.925412195760272e-07, "logps/chosen": -257.123291015625, "logps/rejected": -258.96588134765625, "loss": 0.4438, "rewards/chosen": -1.130954384803772, "rewards/margins": 2.214172840118408, "rewards/rejected": -3.3451271057128906, "step": 57 }, { "epoch": 0.02, "grad_norm": 38.61354064941406, "kl": 0.0, "learning_rate": 4.924103637791154e-07, "logps/chosen": -175.5467071533203, "logps/rejected": -285.81341552734375, "loss": 0.4081, "rewards/chosen": -1.0343022346496582, "rewards/margins": 0.7621434926986694, "rewards/rejected": -1.7964457273483276, "step": 58 }, { "epoch": 0.02, "grad_norm": 32.77366256713867, "kl": 0.0, "learning_rate": 4.922795079822035e-07, "logps/chosen": -166.06712341308594, "logps/rejected": -210.75096130371094, "loss": 0.4244, "rewards/chosen": -0.37483546137809753, "rewards/margins": 1.1932029724121094, "rewards/rejected": -1.5680384635925293, "step": 59 }, { "epoch": 0.02, "grad_norm": 42.49043273925781, "kl": 0.0, "learning_rate": 4.921486521852917e-07, "logps/chosen": -252.02198791503906, "logps/rejected": -188.50100708007812, "loss": 0.3225, "rewards/chosen": 0.3132118582725525, "rewards/margins": 1.1008962392807007, "rewards/rejected": -0.7876843810081482, "step": 60 }, { "epoch": 0.02, "grad_norm": 32.550514221191406, "kl": 0.0, "learning_rate": 4.920177963883799e-07, "logps/chosen": -186.67495727539062, "logps/rejected": -221.72752380371094, "loss": 0.3053, "rewards/chosen": -0.44850480556488037, "rewards/margins": 2.5049924850463867, "rewards/rejected": -2.9534971714019775, "step": 61 }, { "epoch": 0.02, "grad_norm": 44.155418395996094, "kl": 0.0, "learning_rate": 4.918869405914681e-07, "logps/chosen": -225.32420349121094, "logps/rejected": -250.5755615234375, "loss": 0.4224, "rewards/chosen": -0.37531524896621704, "rewards/margins": 1.9962267875671387, "rewards/rejected": -2.371541976928711, "step": 62 }, { "epoch": 0.02, "grad_norm": 47.24485778808594, "kl": 0.0, "learning_rate": 4.917560847945563e-07, "logps/chosen": -195.40579223632812, "logps/rejected": -278.23345947265625, "loss": 0.3477, "rewards/chosen": -0.42761945724487305, "rewards/margins": 2.3172523975372314, "rewards/rejected": -2.7448718547821045, "step": 63 }, { "epoch": 0.02, "grad_norm": 38.27678680419922, "kl": 0.0, "learning_rate": 4.916252289976446e-07, "logps/chosen": -220.49884033203125, "logps/rejected": -188.00048828125, "loss": 0.4889, "rewards/chosen": -0.5155230760574341, "rewards/margins": 0.9867794513702393, "rewards/rejected": -1.5023025274276733, "step": 64 }, { "epoch": 0.02, "grad_norm": 33.94312286376953, "kl": 0.0, "learning_rate": 4.914943732007328e-07, "logps/chosen": -196.26255798339844, "logps/rejected": -173.22503662109375, "loss": 0.3842, "rewards/chosen": -1.0869052410125732, "rewards/margins": 0.193229079246521, "rewards/rejected": -1.2801343202590942, "step": 65 }, { "epoch": 0.02, "grad_norm": 41.270484924316406, "kl": 0.0, "learning_rate": 4.91363517403821e-07, "logps/chosen": -162.62060546875, "logps/rejected": -208.2407684326172, "loss": 0.4074, "rewards/chosen": -1.0006041526794434, "rewards/margins": 0.45082569122314453, "rewards/rejected": -1.451429843902588, "step": 66 }, { "epoch": 0.02, "grad_norm": 31.421226501464844, "kl": 0.0, "learning_rate": 4.912326616069092e-07, "logps/chosen": -266.94854736328125, "logps/rejected": -159.35922241210938, "loss": 0.4062, "rewards/chosen": -0.47417137026786804, "rewards/margins": 1.2088725566864014, "rewards/rejected": -1.6830439567565918, "step": 67 }, { "epoch": 0.02, "grad_norm": 40.290950775146484, "kl": 0.0, "learning_rate": 4.911018058099974e-07, "logps/chosen": -294.2968444824219, "logps/rejected": -238.9677276611328, "loss": 0.4053, "rewards/chosen": -0.8080188035964966, "rewards/margins": 0.8553787469863892, "rewards/rejected": -1.6633975505828857, "step": 68 }, { "epoch": 0.02, "grad_norm": 37.789405822753906, "kl": 0.0, "learning_rate": 4.909709500130856e-07, "logps/chosen": -287.0626220703125, "logps/rejected": -279.5071716308594, "loss": 0.2481, "rewards/chosen": -0.2926979064941406, "rewards/margins": 2.139275074005127, "rewards/rejected": -2.4319729804992676, "step": 69 }, { "epoch": 0.02, "grad_norm": 36.02606964111328, "kl": 0.0, "learning_rate": 4.908400942161737e-07, "logps/chosen": -175.38587951660156, "logps/rejected": -172.9152069091797, "loss": 0.3982, "rewards/chosen": -0.6662830710411072, "rewards/margins": 0.7709174752235413, "rewards/rejected": -1.4372005462646484, "step": 70 }, { "epoch": 0.02, "grad_norm": 27.810359954833984, "kl": 0.0, "learning_rate": 4.907092384192619e-07, "logps/chosen": -220.51766967773438, "logps/rejected": -250.37957763671875, "loss": 0.2556, "rewards/chosen": -0.9891327619552612, "rewards/margins": 1.5172969102859497, "rewards/rejected": -2.506429672241211, "step": 71 }, { "epoch": 0.02, "grad_norm": 36.54177474975586, "kl": 0.0, "learning_rate": 4.905783826223501e-07, "logps/chosen": -196.7530975341797, "logps/rejected": -243.111328125, "loss": 0.2657, "rewards/chosen": -0.8695046305656433, "rewards/margins": 2.0881636142730713, "rewards/rejected": -2.9576683044433594, "step": 72 }, { "epoch": 0.02, "grad_norm": 32.64535903930664, "kl": 0.0, "learning_rate": 4.904475268254383e-07, "logps/chosen": -227.05311584472656, "logps/rejected": -179.5013885498047, "loss": 0.3447, "rewards/chosen": -1.2679059505462646, "rewards/margins": -0.13299989700317383, "rewards/rejected": -1.1349060535430908, "step": 73 }, { "epoch": 0.02, "grad_norm": 34.087646484375, "kl": 0.0, "learning_rate": 4.903166710285265e-07, "logps/chosen": -245.4705810546875, "logps/rejected": -211.25796508789062, "loss": 0.1794, "rewards/chosen": 0.6509444713592529, "rewards/margins": 2.1154017448425293, "rewards/rejected": -1.4644572734832764, "step": 74 }, { "epoch": 0.02, "grad_norm": 38.3884391784668, "kl": 0.0, "learning_rate": 4.901858152316147e-07, "logps/chosen": -182.15093994140625, "logps/rejected": -184.2251739501953, "loss": 0.3708, "rewards/chosen": -0.33757108449935913, "rewards/margins": 1.606074571609497, "rewards/rejected": -1.943645715713501, "step": 75 }, { "epoch": 0.02, "grad_norm": 32.64462661743164, "kl": 0.0, "learning_rate": 4.900549594347029e-07, "logps/chosen": -212.72073364257812, "logps/rejected": -197.57626342773438, "loss": 0.2817, "rewards/chosen": 0.027129173278808594, "rewards/margins": 2.553429126739502, "rewards/rejected": -2.5262999534606934, "step": 76 }, { "epoch": 0.02, "grad_norm": 40.509376525878906, "kl": 0.0, "learning_rate": 4.899241036377912e-07, "logps/chosen": -260.6392822265625, "logps/rejected": -246.5421600341797, "loss": 0.3453, "rewards/chosen": -0.5853943228721619, "rewards/margins": 1.0417983531951904, "rewards/rejected": -1.6271926164627075, "step": 77 }, { "epoch": 0.02, "grad_norm": 37.12883758544922, "kl": 0.0, "learning_rate": 4.897932478408794e-07, "logps/chosen": -204.54745483398438, "logps/rejected": -235.77371215820312, "loss": 0.3932, "rewards/chosen": -1.1263680458068848, "rewards/margins": 1.0550079345703125, "rewards/rejected": -2.1813759803771973, "step": 78 }, { "epoch": 0.02, "grad_norm": 29.98268699645996, "kl": 0.0, "learning_rate": 4.896623920439676e-07, "logps/chosen": -274.41754150390625, "logps/rejected": -291.3458251953125, "loss": 0.3497, "rewards/chosen": -2.2485036849975586, "rewards/margins": 0.7142941951751709, "rewards/rejected": -2.9627978801727295, "step": 79 }, { "epoch": 0.02, "grad_norm": 35.18719482421875, "kl": 0.0, "learning_rate": 4.895315362470558e-07, "logps/chosen": -203.51724243164062, "logps/rejected": -268.3537292480469, "loss": 0.4434, "rewards/chosen": -1.129686951637268, "rewards/margins": 1.2693904638290405, "rewards/rejected": -2.3990774154663086, "step": 80 }, { "epoch": 0.02, "grad_norm": 34.704044342041016, "kl": 0.0, "learning_rate": 4.89400680450144e-07, "logps/chosen": -236.12542724609375, "logps/rejected": -261.73687744140625, "loss": 0.3943, "rewards/chosen": -1.494764804840088, "rewards/margins": 0.6487529277801514, "rewards/rejected": -2.1435177326202393, "step": 81 }, { "epoch": 0.02, "grad_norm": 38.02219772338867, "kl": 0.0, "learning_rate": 4.892698246532322e-07, "logps/chosen": -236.21002197265625, "logps/rejected": -219.7363739013672, "loss": 0.5544, "rewards/chosen": -2.262413501739502, "rewards/margins": -1.9126307964324951, "rewards/rejected": -0.34978270530700684, "step": 82 }, { "epoch": 0.02, "grad_norm": 32.88979721069336, "kl": 0.0, "learning_rate": 4.891389688563204e-07, "logps/chosen": -207.0186309814453, "logps/rejected": -227.69236755371094, "loss": 0.3109, "rewards/chosen": -0.8603959679603577, "rewards/margins": 1.2926530838012695, "rewards/rejected": -2.1530489921569824, "step": 83 }, { "epoch": 0.02, "grad_norm": 37.908660888671875, "kl": 0.0, "learning_rate": 4.890081130594086e-07, "logps/chosen": -295.50836181640625, "logps/rejected": -281.04058837890625, "loss": 0.354, "rewards/chosen": 0.09849908947944641, "rewards/margins": 3.176734685897827, "rewards/rejected": -3.078235626220703, "step": 84 }, { "epoch": 0.02, "grad_norm": 37.847747802734375, "kl": 0.0, "learning_rate": 4.888772572624968e-07, "logps/chosen": -174.21282958984375, "logps/rejected": -305.0511169433594, "loss": 0.2727, "rewards/chosen": -0.06367160379886627, "rewards/margins": 1.2645058631896973, "rewards/rejected": -1.3281774520874023, "step": 85 }, { "epoch": 0.02, "grad_norm": 41.107765197753906, "kl": 0.0, "learning_rate": 4.887464014655848e-07, "logps/chosen": -207.59597778320312, "logps/rejected": -291.0322265625, "loss": 0.3398, "rewards/chosen": -0.1212925836443901, "rewards/margins": 2.127479076385498, "rewards/rejected": -2.2487716674804688, "step": 86 }, { "epoch": 0.02, "grad_norm": 37.753013610839844, "kl": 0.0, "learning_rate": 4.88615545668673e-07, "logps/chosen": -243.60118103027344, "logps/rejected": -201.14027404785156, "loss": 0.4513, "rewards/chosen": -0.5243691205978394, "rewards/margins": 0.809841513633728, "rewards/rejected": -1.3342106342315674, "step": 87 }, { "epoch": 0.02, "grad_norm": 30.350231170654297, "kl": 0.0, "learning_rate": 4.884846898717612e-07, "logps/chosen": -210.97064208984375, "logps/rejected": -187.00070190429688, "loss": 0.4391, "rewards/chosen": -2.0821855068206787, "rewards/margins": -0.4192476272583008, "rewards/rejected": -1.662937879562378, "step": 88 }, { "epoch": 0.02, "grad_norm": 36.895530700683594, "kl": 0.0, "learning_rate": 4.883538340748494e-07, "logps/chosen": -213.4839630126953, "logps/rejected": -240.64381408691406, "loss": 0.3759, "rewards/chosen": -0.6569501757621765, "rewards/margins": 1.2435503005981445, "rewards/rejected": -1.9005005359649658, "step": 89 }, { "epoch": 0.02, "grad_norm": 35.79010009765625, "kl": 0.0, "learning_rate": 4.882229782779377e-07, "logps/chosen": -242.63958740234375, "logps/rejected": -259.929931640625, "loss": 0.4686, "rewards/chosen": -1.5431857109069824, "rewards/margins": 0.8138833045959473, "rewards/rejected": -2.3570690155029297, "step": 90 }, { "epoch": 0.02, "grad_norm": 35.99927520751953, "kl": 0.0, "learning_rate": 4.880921224810259e-07, "logps/chosen": -174.24330139160156, "logps/rejected": -301.5118713378906, "loss": 0.3863, "rewards/chosen": -0.5870503187179565, "rewards/margins": 1.726894736289978, "rewards/rejected": -2.3139450550079346, "step": 91 }, { "epoch": 0.02, "grad_norm": 36.06237030029297, "kl": 0.0, "learning_rate": 4.879612666841141e-07, "logps/chosen": -223.64593505859375, "logps/rejected": -176.1517333984375, "loss": 0.4347, "rewards/chosen": -1.5263793468475342, "rewards/margins": 0.5033280849456787, "rewards/rejected": -2.029707431793213, "step": 92 }, { "epoch": 0.02, "grad_norm": 40.38705825805664, "kl": 0.0, "learning_rate": 4.878304108872023e-07, "logps/chosen": -286.7668762207031, "logps/rejected": -294.2085876464844, "loss": 0.3554, "rewards/chosen": -1.9862689971923828, "rewards/margins": 0.1652989387512207, "rewards/rejected": -2.1515679359436035, "step": 93 }, { "epoch": 0.02, "grad_norm": 27.866796493530273, "kl": 0.0, "learning_rate": 4.876995550902905e-07, "logps/chosen": -206.51478576660156, "logps/rejected": -229.76593017578125, "loss": 0.2507, "rewards/chosen": 1.2367534637451172, "rewards/margins": 3.6880853176116943, "rewards/rejected": -2.451331853866577, "step": 94 }, { "epoch": 0.02, "grad_norm": 31.56693458557129, "kl": 0.0, "learning_rate": 4.875686992933787e-07, "logps/chosen": -239.28445434570312, "logps/rejected": -266.00836181640625, "loss": 0.3358, "rewards/chosen": -0.9398892521858215, "rewards/margins": 3.100093126296997, "rewards/rejected": -4.039982318878174, "step": 95 }, { "epoch": 0.03, "grad_norm": 33.1443977355957, "kl": 0.0, "learning_rate": 4.874378434964669e-07, "logps/chosen": -196.32676696777344, "logps/rejected": -155.44985961914062, "loss": 0.4324, "rewards/chosen": -0.7763940691947937, "rewards/margins": 1.0392913818359375, "rewards/rejected": -1.815685510635376, "step": 96 }, { "epoch": 0.03, "grad_norm": 36.72030258178711, "kl": 0.0, "learning_rate": 4.873069876995551e-07, "logps/chosen": -237.5760040283203, "logps/rejected": -381.8189392089844, "loss": 0.4122, "rewards/chosen": -1.7448315620422363, "rewards/margins": 0.012748241424560547, "rewards/rejected": -1.7575798034667969, "step": 97 }, { "epoch": 0.03, "grad_norm": 35.568607330322266, "kl": 0.0, "learning_rate": 4.871761319026433e-07, "logps/chosen": -261.92828369140625, "logps/rejected": -200.28616333007812, "loss": 0.3368, "rewards/chosen": -0.9904589653015137, "rewards/margins": 1.3062183856964111, "rewards/rejected": -2.296677350997925, "step": 98 }, { "epoch": 0.03, "grad_norm": 25.75632095336914, "kl": 0.0, "learning_rate": 4.870452761057315e-07, "logps/chosen": -201.7320556640625, "logps/rejected": -262.0142517089844, "loss": 0.2943, "rewards/chosen": -1.7546271085739136, "rewards/margins": 1.8151785135269165, "rewards/rejected": -3.56980562210083, "step": 99 }, { "epoch": 0.03, "grad_norm": 41.999332427978516, "kl": 0.0, "learning_rate": 4.869144203088197e-07, "logps/chosen": -218.90869140625, "logps/rejected": -193.1501007080078, "loss": 0.4521, "rewards/chosen": -1.5635461807250977, "rewards/margins": -0.5969691276550293, "rewards/rejected": -0.9665770530700684, "step": 100 }, { "epoch": 0.03, "grad_norm": 36.3663330078125, "kl": 0.0, "learning_rate": 4.867835645119078e-07, "logps/chosen": -225.2639923095703, "logps/rejected": -218.92816162109375, "loss": 0.4209, "rewards/chosen": -1.5182456970214844, "rewards/margins": 0.6523854732513428, "rewards/rejected": -2.170631170272827, "step": 101 }, { "epoch": 0.03, "grad_norm": 40.12117385864258, "kl": 0.0, "learning_rate": 4.86652708714996e-07, "logps/chosen": -232.11502075195312, "logps/rejected": -297.0523986816406, "loss": 0.3277, "rewards/chosen": -0.4440801739692688, "rewards/margins": 3.198974609375, "rewards/rejected": -3.643054723739624, "step": 102 }, { "epoch": 0.03, "grad_norm": 34.094383239746094, "kl": 0.0, "learning_rate": 4.865218529180843e-07, "logps/chosen": -227.10867309570312, "logps/rejected": -231.43902587890625, "loss": 0.3561, "rewards/chosen": -0.583474338054657, "rewards/margins": 1.4088008403778076, "rewards/rejected": -1.9922752380371094, "step": 103 }, { "epoch": 0.03, "grad_norm": 33.84486389160156, "kl": 0.0, "learning_rate": 4.863909971211725e-07, "logps/chosen": -245.8929443359375, "logps/rejected": -193.90614318847656, "loss": 0.3577, "rewards/chosen": -0.8680392503738403, "rewards/margins": 0.2119290828704834, "rewards/rejected": -1.0799683332443237, "step": 104 }, { "epoch": 0.03, "grad_norm": 90.90773010253906, "kl": 0.0, "learning_rate": 4.862601413242607e-07, "logps/chosen": -218.3358154296875, "logps/rejected": -267.4952697753906, "loss": 0.423, "rewards/chosen": 1.7071785926818848, "rewards/margins": 3.0971498489379883, "rewards/rejected": -1.389971137046814, "step": 105 }, { "epoch": 0.03, "grad_norm": 37.740966796875, "kl": 0.0, "learning_rate": 4.861292855273489e-07, "logps/chosen": -224.6427764892578, "logps/rejected": -267.43890380859375, "loss": 0.3854, "rewards/chosen": -0.49575966596603394, "rewards/margins": 1.3802204132080078, "rewards/rejected": -1.875980019569397, "step": 106 }, { "epoch": 0.03, "grad_norm": 26.653839111328125, "kl": 0.0, "learning_rate": 4.859984297304371e-07, "logps/chosen": -192.61354064941406, "logps/rejected": -188.12330627441406, "loss": 0.3971, "rewards/chosen": -1.3154840469360352, "rewards/margins": 1.1609439849853516, "rewards/rejected": -2.4764280319213867, "step": 107 }, { "epoch": 0.03, "grad_norm": 37.36748504638672, "kl": 0.0, "learning_rate": 4.858675739335253e-07, "logps/chosen": -288.2717590332031, "logps/rejected": -205.67002868652344, "loss": 0.364, "rewards/chosen": -0.9201427698135376, "rewards/margins": 0.7539663314819336, "rewards/rejected": -1.6741091012954712, "step": 108 }, { "epoch": 0.03, "grad_norm": 26.112871170043945, "kl": 0.0, "learning_rate": 4.857367181366135e-07, "logps/chosen": -173.88441467285156, "logps/rejected": -184.56675720214844, "loss": 0.3863, "rewards/chosen": -1.1070303916931152, "rewards/margins": 1.5630695819854736, "rewards/rejected": -2.670099973678589, "step": 109 }, { "epoch": 0.03, "grad_norm": 46.24738693237305, "kl": 0.0, "learning_rate": 4.856058623397017e-07, "logps/chosen": -290.48577880859375, "logps/rejected": -203.6895294189453, "loss": 0.3406, "rewards/chosen": 0.15263789892196655, "rewards/margins": 1.928048849105835, "rewards/rejected": -1.7754108905792236, "step": 110 }, { "epoch": 0.03, "grad_norm": 74.96126556396484, "kl": 0.0, "learning_rate": 4.854750065427898e-07, "logps/chosen": -198.45909118652344, "logps/rejected": -210.08143615722656, "loss": 0.3199, "rewards/chosen": -0.6983060240745544, "rewards/margins": 1.6797571182250977, "rewards/rejected": -2.378063201904297, "step": 111 }, { "epoch": 0.03, "grad_norm": 31.173887252807617, "kl": 0.0, "learning_rate": 4.85344150745878e-07, "logps/chosen": -202.14859008789062, "logps/rejected": -185.81361389160156, "loss": 0.3284, "rewards/chosen": -1.0504775047302246, "rewards/margins": 1.1194241046905518, "rewards/rejected": -2.1699016094207764, "step": 112 }, { "epoch": 0.03, "grad_norm": 37.457008361816406, "kl": 0.0, "learning_rate": 4.852132949489662e-07, "logps/chosen": -178.17190551757812, "logps/rejected": -198.142333984375, "loss": 0.2724, "rewards/chosen": -0.5825530886650085, "rewards/margins": 1.6814563274383545, "rewards/rejected": -2.264009475708008, "step": 113 }, { "epoch": 0.03, "grad_norm": 36.00678253173828, "kl": 0.0, "learning_rate": 4.850824391520544e-07, "logps/chosen": -226.97251892089844, "logps/rejected": -277.8453674316406, "loss": 0.4056, "rewards/chosen": -1.666909098625183, "rewards/margins": 0.5384429693222046, "rewards/rejected": -2.2053520679473877, "step": 114 }, { "epoch": 0.03, "grad_norm": 31.167097091674805, "kl": 0.0, "learning_rate": 4.849515833551426e-07, "logps/chosen": -221.36300659179688, "logps/rejected": -242.39280700683594, "loss": 0.4028, "rewards/chosen": -1.4840993881225586, "rewards/margins": 1.3504664897918701, "rewards/rejected": -2.8345658779144287, "step": 115 }, { "epoch": 0.03, "grad_norm": 31.748884201049805, "kl": 0.0, "learning_rate": 4.848207275582308e-07, "logps/chosen": -297.3919677734375, "logps/rejected": -213.3976287841797, "loss": 0.2864, "rewards/chosen": -2.3920040130615234, "rewards/margins": 0.7508974075317383, "rewards/rejected": -3.1429014205932617, "step": 116 }, { "epoch": 0.03, "grad_norm": 30.6126766204834, "kl": 0.0, "learning_rate": 4.84689871761319e-07, "logps/chosen": -236.26303100585938, "logps/rejected": -177.65513610839844, "loss": 0.3932, "rewards/chosen": -2.2624423503875732, "rewards/margins": 0.27295541763305664, "rewards/rejected": -2.53539776802063, "step": 117 }, { "epoch": 0.03, "grad_norm": 26.81728744506836, "kl": 0.0, "learning_rate": 4.845590159644072e-07, "logps/chosen": -297.1986389160156, "logps/rejected": -206.4488983154297, "loss": 0.4104, "rewards/chosen": -2.306591510772705, "rewards/margins": 0.1528792381286621, "rewards/rejected": -2.459470748901367, "step": 118 }, { "epoch": 0.03, "grad_norm": 32.799739837646484, "kl": 0.0, "learning_rate": 4.844281601674954e-07, "logps/chosen": -212.15191650390625, "logps/rejected": -264.2872314453125, "loss": 0.4941, "rewards/chosen": -1.7588011026382446, "rewards/margins": 1.0008176565170288, "rewards/rejected": -2.7596187591552734, "step": 119 }, { "epoch": 0.03, "grad_norm": 31.55664825439453, "kl": 0.0, "learning_rate": 4.842973043705836e-07, "logps/chosen": -289.9277648925781, "logps/rejected": -317.9549255371094, "loss": 0.4072, "rewards/chosen": -1.3789793252944946, "rewards/margins": 3.475088596343994, "rewards/rejected": -4.854067802429199, "step": 120 }, { "epoch": 0.03, "grad_norm": 31.860851287841797, "kl": 0.0, "learning_rate": 4.841664485736718e-07, "logps/chosen": -227.15924072265625, "logps/rejected": -233.9003143310547, "loss": 0.4206, "rewards/chosen": -0.4197777509689331, "rewards/margins": 2.2443408966064453, "rewards/rejected": -2.664118766784668, "step": 121 }, { "epoch": 0.03, "grad_norm": 35.52662658691406, "kl": 0.0, "learning_rate": 4.8403559277676e-07, "logps/chosen": -170.0311279296875, "logps/rejected": -187.07435607910156, "loss": 0.3639, "rewards/chosen": -0.22913819551467896, "rewards/margins": 2.3094356060028076, "rewards/rejected": -2.538573741912842, "step": 122 }, { "epoch": 0.03, "grad_norm": 38.98072814941406, "kl": 0.0, "learning_rate": 4.839047369798482e-07, "logps/chosen": -236.20376586914062, "logps/rejected": -213.81533813476562, "loss": 0.4007, "rewards/chosen": -0.15046189725399017, "rewards/margins": 1.8937103748321533, "rewards/rejected": -2.0441722869873047, "step": 123 }, { "epoch": 0.03, "grad_norm": 39.98687744140625, "kl": 0.0, "learning_rate": 4.837738811829364e-07, "logps/chosen": -276.67901611328125, "logps/rejected": -261.1310119628906, "loss": 0.4084, "rewards/chosen": -0.6370207071304321, "rewards/margins": 1.557566523551941, "rewards/rejected": -2.194587230682373, "step": 124 }, { "epoch": 0.03, "grad_norm": 31.81071662902832, "kl": 0.0, "learning_rate": 4.836430253860246e-07, "logps/chosen": -164.74241638183594, "logps/rejected": -164.97280883789062, "loss": 0.308, "rewards/chosen": -0.021863222122192383, "rewards/margins": 1.7506403923034668, "rewards/rejected": -1.7725036144256592, "step": 125 }, { "epoch": 0.03, "grad_norm": 40.468868255615234, "kl": 0.0, "learning_rate": 4.835121695891128e-07, "logps/chosen": -209.9675750732422, "logps/rejected": -272.98876953125, "loss": 0.3195, "rewards/chosen": -1.1054404973983765, "rewards/margins": 1.143508791923523, "rewards/rejected": -2.2489492893218994, "step": 126 }, { "epoch": 0.03, "grad_norm": 32.93410110473633, "kl": 0.0, "learning_rate": 4.83381313792201e-07, "logps/chosen": -191.24378967285156, "logps/rejected": -237.31460571289062, "loss": 0.419, "rewards/chosen": -1.1644923686981201, "rewards/margins": 1.5805509090423584, "rewards/rejected": -2.7450432777404785, "step": 127 }, { "epoch": 0.03, "grad_norm": 33.93423843383789, "kl": 0.0, "learning_rate": 4.832504579952892e-07, "logps/chosen": -238.41317749023438, "logps/rejected": -270.1029052734375, "loss": 0.3703, "rewards/chosen": -1.668627381324768, "rewards/margins": -0.14160168170928955, "rewards/rejected": -1.5270256996154785, "step": 128 }, { "epoch": 0.03, "grad_norm": 37.237674713134766, "kl": 0.0, "learning_rate": 4.831196021983774e-07, "logps/chosen": -253.71865844726562, "logps/rejected": -199.205078125, "loss": 0.4102, "rewards/chosen": -0.7242934703826904, "rewards/margins": 1.206939935684204, "rewards/rejected": -1.9312334060668945, "step": 129 }, { "epoch": 0.03, "grad_norm": 42.5329704284668, "kl": 0.0, "learning_rate": 4.829887464014656e-07, "logps/chosen": -178.42868041992188, "logps/rejected": -188.00540161132812, "loss": 0.3732, "rewards/chosen": -0.8551152348518372, "rewards/margins": 1.5641908645629883, "rewards/rejected": -2.4193060398101807, "step": 130 }, { "epoch": 0.03, "grad_norm": 27.833166122436523, "kl": 0.0, "learning_rate": 4.828578906045538e-07, "logps/chosen": -211.11993408203125, "logps/rejected": -161.3157501220703, "loss": 0.4244, "rewards/chosen": -1.8940626382827759, "rewards/margins": 0.8122283220291138, "rewards/rejected": -2.7062909603118896, "step": 131 }, { "epoch": 0.03, "grad_norm": 41.60429763793945, "kl": 0.0, "learning_rate": 4.82727034807642e-07, "logps/chosen": -271.83526611328125, "logps/rejected": -171.28790283203125, "loss": 0.3065, "rewards/chosen": -0.894410252571106, "rewards/margins": 0.685144305229187, "rewards/rejected": -1.579554557800293, "step": 132 }, { "epoch": 0.03, "grad_norm": 37.991764068603516, "kl": 0.0, "learning_rate": 4.825961790107302e-07, "logps/chosen": -101.1789321899414, "logps/rejected": -278.728759765625, "loss": 0.2365, "rewards/chosen": -0.014444398693740368, "rewards/margins": 2.06191086769104, "rewards/rejected": -2.076355218887329, "step": 133 }, { "epoch": 0.04, "grad_norm": 33.86128234863281, "kl": 0.0, "learning_rate": 4.824653232138184e-07, "logps/chosen": -250.19801330566406, "logps/rejected": -214.6094512939453, "loss": 0.2672, "rewards/chosen": -0.9694932103157043, "rewards/margins": 1.6486060619354248, "rewards/rejected": -2.6180992126464844, "step": 134 }, { "epoch": 0.04, "grad_norm": 40.54289245605469, "kl": 0.0, "learning_rate": 4.823344674169066e-07, "logps/chosen": -158.35687255859375, "logps/rejected": -197.9619598388672, "loss": 0.3677, "rewards/chosen": -1.2970025539398193, "rewards/margins": -0.352266788482666, "rewards/rejected": -0.9447357654571533, "step": 135 }, { "epoch": 0.04, "grad_norm": 32.23467254638672, "kl": 0.0, "learning_rate": 4.822036116199948e-07, "logps/chosen": -172.86297607421875, "logps/rejected": -270.7877502441406, "loss": 0.2857, "rewards/chosen": -0.7820056676864624, "rewards/margins": 1.7432712316513062, "rewards/rejected": -2.5252768993377686, "step": 136 }, { "epoch": 0.04, "grad_norm": 34.8538703918457, "kl": 0.0, "learning_rate": 4.82072755823083e-07, "logps/chosen": -308.17401123046875, "logps/rejected": -210.2477264404297, "loss": 0.4101, "rewards/chosen": -0.6375879049301147, "rewards/margins": 0.9788401126861572, "rewards/rejected": -1.616428017616272, "step": 137 }, { "epoch": 0.04, "grad_norm": 40.68290328979492, "kl": 0.0, "learning_rate": 4.819419000261711e-07, "logps/chosen": -210.66500854492188, "logps/rejected": -228.51309204101562, "loss": 0.4091, "rewards/chosen": -0.766243577003479, "rewards/margins": 1.044326663017273, "rewards/rejected": -1.810570240020752, "step": 138 }, { "epoch": 0.04, "grad_norm": 41.904876708984375, "kl": 0.0, "learning_rate": 4.818110442292593e-07, "logps/chosen": -203.61105346679688, "logps/rejected": -234.1663360595703, "loss": 0.3681, "rewards/chosen": 0.1758965253829956, "rewards/margins": 3.3261351585388184, "rewards/rejected": -3.150238513946533, "step": 139 }, { "epoch": 0.04, "grad_norm": 42.24943542480469, "kl": 0.0, "learning_rate": 4.816801884323475e-07, "logps/chosen": -285.647705078125, "logps/rejected": -288.2828369140625, "loss": 0.3483, "rewards/chosen": -1.2011711597442627, "rewards/margins": 1.2541842460632324, "rewards/rejected": -2.455355405807495, "step": 140 }, { "epoch": 0.04, "grad_norm": 34.05189514160156, "kl": 0.0, "learning_rate": 4.815493326354357e-07, "logps/chosen": -182.27590942382812, "logps/rejected": -290.2626037597656, "loss": 0.4624, "rewards/chosen": -1.4244545698165894, "rewards/margins": 0.7048934698104858, "rewards/rejected": -2.129348039627075, "step": 141 }, { "epoch": 0.04, "grad_norm": 33.5157356262207, "kl": 0.0, "learning_rate": 4.814184768385239e-07, "logps/chosen": -204.3314208984375, "logps/rejected": -326.6562805175781, "loss": 0.4179, "rewards/chosen": -1.0364093780517578, "rewards/margins": 1.5139243602752686, "rewards/rejected": -2.5503337383270264, "step": 142 }, { "epoch": 0.04, "grad_norm": 31.165124893188477, "kl": 0.0, "learning_rate": 4.812876210416121e-07, "logps/chosen": -251.93438720703125, "logps/rejected": -233.26759338378906, "loss": 0.3301, "rewards/chosen": -2.0439867973327637, "rewards/margins": 0.13914990425109863, "rewards/rejected": -2.1831367015838623, "step": 143 }, { "epoch": 0.04, "grad_norm": 33.466365814208984, "kl": 0.0, "learning_rate": 4.811567652447003e-07, "logps/chosen": -233.31072998046875, "logps/rejected": -202.12730407714844, "loss": 0.4027, "rewards/chosen": -2.06561541557312, "rewards/margins": -0.20110619068145752, "rewards/rejected": -1.8645092248916626, "step": 144 }, { "epoch": 0.04, "grad_norm": 33.38949203491211, "kl": 0.0, "learning_rate": 4.810259094477885e-07, "logps/chosen": -256.955322265625, "logps/rejected": -203.2244110107422, "loss": 0.3354, "rewards/chosen": 0.2599179148674011, "rewards/margins": 2.934673547744751, "rewards/rejected": -2.674755573272705, "step": 145 }, { "epoch": 0.04, "grad_norm": 36.793487548828125, "kl": 0.0, "learning_rate": 4.808950536508767e-07, "logps/chosen": -245.50906372070312, "logps/rejected": -276.3681640625, "loss": 0.3287, "rewards/chosen": -0.11979079246520996, "rewards/margins": 1.873924732208252, "rewards/rejected": -1.993715524673462, "step": 146 }, { "epoch": 0.04, "grad_norm": 31.725196838378906, "kl": 0.0, "learning_rate": 4.807641978539649e-07, "logps/chosen": -128.91494750976562, "logps/rejected": -231.997314453125, "loss": 0.2698, "rewards/chosen": -0.5471810698509216, "rewards/margins": 1.7721290588378906, "rewards/rejected": -2.319310188293457, "step": 147 }, { "epoch": 0.04, "grad_norm": 27.87786865234375, "kl": 0.0, "learning_rate": 4.806333420570531e-07, "logps/chosen": -202.31613159179688, "logps/rejected": -222.10958862304688, "loss": 0.3701, "rewards/chosen": -1.3587830066680908, "rewards/margins": 0.8089334964752197, "rewards/rejected": -2.1677165031433105, "step": 148 }, { "epoch": 0.04, "grad_norm": 33.04602813720703, "kl": 0.0, "learning_rate": 4.805024862601413e-07, "logps/chosen": -176.23416137695312, "logps/rejected": -229.03103637695312, "loss": 0.3393, "rewards/chosen": -0.5084764361381531, "rewards/margins": 1.4789369106292725, "rewards/rejected": -1.9874134063720703, "step": 149 }, { "epoch": 0.04, "grad_norm": 22.341838836669922, "kl": 0.0, "learning_rate": 4.803716304632295e-07, "logps/chosen": -167.64193725585938, "logps/rejected": -173.7758026123047, "loss": 0.269, "rewards/chosen": -1.1564908027648926, "rewards/margins": 1.4417181015014648, "rewards/rejected": -2.5982089042663574, "step": 150 }, { "epoch": 0.04, "grad_norm": 48.48427963256836, "kl": 0.0, "learning_rate": 4.802407746663177e-07, "logps/chosen": -279.48785400390625, "logps/rejected": -292.7326965332031, "loss": 0.4666, "rewards/chosen": -1.6676387786865234, "rewards/margins": -1.201262354850769, "rewards/rejected": -0.4663764536380768, "step": 151 }, { "epoch": 0.04, "grad_norm": 33.24284744262695, "kl": 0.0, "learning_rate": 4.801099188694059e-07, "logps/chosen": -244.73452758789062, "logps/rejected": -240.77969360351562, "loss": 0.4686, "rewards/chosen": -1.590959072113037, "rewards/margins": -0.13520264625549316, "rewards/rejected": -1.455756425857544, "step": 152 }, { "epoch": 0.04, "grad_norm": 32.1400146484375, "kl": 0.0, "learning_rate": 4.799790630724941e-07, "logps/chosen": -237.9163818359375, "logps/rejected": -194.08131408691406, "loss": 0.3316, "rewards/chosen": 2.124384641647339, "rewards/margins": 3.3624322414398193, "rewards/rejected": -1.2380475997924805, "step": 153 }, { "epoch": 0.04, "grad_norm": 34.19780731201172, "kl": 0.0, "learning_rate": 4.798482072755823e-07, "logps/chosen": -220.74940490722656, "logps/rejected": -224.6669921875, "loss": 0.3385, "rewards/chosen": -1.281145453453064, "rewards/margins": 1.0018588304519653, "rewards/rejected": -2.2830042839050293, "step": 154 }, { "epoch": 0.04, "grad_norm": 31.86383056640625, "kl": 0.0, "learning_rate": 4.797173514786705e-07, "logps/chosen": -186.74932861328125, "logps/rejected": -292.90008544921875, "loss": 0.456, "rewards/chosen": -1.296830415725708, "rewards/margins": 0.6673908233642578, "rewards/rejected": -1.9642212390899658, "step": 155 }, { "epoch": 0.04, "grad_norm": 32.114322662353516, "kl": 0.0, "learning_rate": 4.795864956817587e-07, "logps/chosen": -225.802490234375, "logps/rejected": -154.5421142578125, "loss": 0.3427, "rewards/chosen": -0.9391254186630249, "rewards/margins": 1.5108743906021118, "rewards/rejected": -2.4499998092651367, "step": 156 }, { "epoch": 0.04, "grad_norm": 34.9625358581543, "kl": 0.0, "learning_rate": 4.794556398848469e-07, "logps/chosen": -272.5867004394531, "logps/rejected": -266.7203063964844, "loss": 0.3097, "rewards/chosen": -0.2914942800998688, "rewards/margins": 4.458119869232178, "rewards/rejected": -4.749614238739014, "step": 157 }, { "epoch": 0.04, "grad_norm": 27.236101150512695, "kl": 0.0, "learning_rate": 4.793247840879351e-07, "logps/chosen": -207.18653869628906, "logps/rejected": -290.4095153808594, "loss": 0.3525, "rewards/chosen": -0.14065659046173096, "rewards/margins": 3.9441447257995605, "rewards/rejected": -4.084801197052002, "step": 158 }, { "epoch": 0.04, "grad_norm": 34.254356384277344, "kl": 0.0, "learning_rate": 4.791939282910233e-07, "logps/chosen": -271.6333312988281, "logps/rejected": -261.6518859863281, "loss": 0.4046, "rewards/chosen": -1.3647454977035522, "rewards/margins": 0.2699841260910034, "rewards/rejected": -1.6347296237945557, "step": 159 }, { "epoch": 0.04, "grad_norm": 36.57579040527344, "kl": 0.0, "learning_rate": 4.790630724941115e-07, "logps/chosen": -338.86138916015625, "logps/rejected": -216.30711364746094, "loss": 0.3796, "rewards/chosen": -1.2024385929107666, "rewards/margins": 0.8041627407073975, "rewards/rejected": -2.006601333618164, "step": 160 }, { "epoch": 0.04, "grad_norm": 33.14208221435547, "kl": 0.0, "learning_rate": 4.789322166971997e-07, "logps/chosen": -175.1085205078125, "logps/rejected": -367.12030029296875, "loss": 0.2882, "rewards/chosen": -1.5256532430648804, "rewards/margins": 1.4195586442947388, "rewards/rejected": -2.945211887359619, "step": 161 }, { "epoch": 0.04, "grad_norm": 33.05416488647461, "kl": 0.0, "learning_rate": 4.788013609002879e-07, "logps/chosen": -209.37460327148438, "logps/rejected": -218.49258422851562, "loss": 0.5348, "rewards/chosen": -1.7475395202636719, "rewards/margins": -0.1888653039932251, "rewards/rejected": -1.5586742162704468, "step": 162 }, { "epoch": 0.04, "grad_norm": 37.163150787353516, "kl": 0.0, "learning_rate": 4.78670505103376e-07, "logps/chosen": -293.8970642089844, "logps/rejected": -269.2586364746094, "loss": 0.4305, "rewards/chosen": -1.452363133430481, "rewards/margins": 1.5342754125595093, "rewards/rejected": -2.9866385459899902, "step": 163 }, { "epoch": 0.04, "grad_norm": 32.23062515258789, "kl": 0.0, "learning_rate": 4.785396493064642e-07, "logps/chosen": -221.77999877929688, "logps/rejected": -198.4766082763672, "loss": 0.3642, "rewards/chosen": -1.34794020652771, "rewards/margins": 1.2967596054077148, "rewards/rejected": -2.644699811935425, "step": 164 }, { "epoch": 0.04, "grad_norm": 36.29683303833008, "kl": 0.0, "learning_rate": 4.784087935095524e-07, "logps/chosen": -239.8020477294922, "logps/rejected": -234.60704040527344, "loss": 0.2911, "rewards/chosen": -0.17822468280792236, "rewards/margins": 2.4091439247131348, "rewards/rejected": -2.5873684883117676, "step": 165 }, { "epoch": 0.04, "grad_norm": 37.25835037231445, "kl": 0.0, "learning_rate": 4.782779377126406e-07, "logps/chosen": -226.92759704589844, "logps/rejected": -230.7248992919922, "loss": 0.4445, "rewards/chosen": -1.0927642583847046, "rewards/margins": 1.1659952402114868, "rewards/rejected": -2.2587594985961914, "step": 166 }, { "epoch": 0.04, "grad_norm": 34.106014251708984, "kl": 0.0, "learning_rate": 4.781470819157288e-07, "logps/chosen": -246.8311004638672, "logps/rejected": -246.91729736328125, "loss": 0.3259, "rewards/chosen": -0.07399371266365051, "rewards/margins": 3.7297139167785645, "rewards/rejected": -3.8037075996398926, "step": 167 }, { "epoch": 0.04, "grad_norm": 28.763517379760742, "kl": 0.0, "learning_rate": 4.78016226118817e-07, "logps/chosen": -212.7134552001953, "logps/rejected": -273.555419921875, "loss": 0.2403, "rewards/chosen": -0.3746832311153412, "rewards/margins": 2.443885564804077, "rewards/rejected": -2.818568706512451, "step": 168 }, { "epoch": 0.04, "grad_norm": 31.084125518798828, "kl": 0.0, "learning_rate": 4.778853703219052e-07, "logps/chosen": -267.5098571777344, "logps/rejected": -239.85763549804688, "loss": 0.1923, "rewards/chosen": -1.1324867010116577, "rewards/margins": 2.735407829284668, "rewards/rejected": -3.867894411087036, "step": 169 }, { "epoch": 0.04, "grad_norm": 30.411270141601562, "kl": 0.0, "learning_rate": 4.777545145249934e-07, "logps/chosen": -195.84251403808594, "logps/rejected": -127.29740905761719, "loss": 0.3347, "rewards/chosen": -1.0493810176849365, "rewards/margins": 0.6141175031661987, "rewards/rejected": -1.6634985208511353, "step": 170 }, { "epoch": 0.04, "grad_norm": 35.29057312011719, "kl": 0.0, "learning_rate": 4.776236587280816e-07, "logps/chosen": -181.64306640625, "logps/rejected": -311.97705078125, "loss": 0.3247, "rewards/chosen": -1.0975894927978516, "rewards/margins": 0.9621212482452393, "rewards/rejected": -2.059710741043091, "step": 171 }, { "epoch": 0.05, "grad_norm": 34.762691497802734, "kl": 0.0, "learning_rate": 4.774928029311698e-07, "logps/chosen": -233.34786987304688, "logps/rejected": -180.6575469970703, "loss": 0.3342, "rewards/chosen": -1.5650959014892578, "rewards/margins": 0.0028657913208007812, "rewards/rejected": -1.5679616928100586, "step": 172 }, { "epoch": 0.05, "grad_norm": 34.95657730102539, "kl": 0.0, "learning_rate": 4.77361947134258e-07, "logps/chosen": -233.89166259765625, "logps/rejected": -164.43466186523438, "loss": 0.3293, "rewards/chosen": 0.26836246252059937, "rewards/margins": 2.1331124305725098, "rewards/rejected": -1.8647499084472656, "step": 173 }, { "epoch": 0.05, "grad_norm": 35.36391830444336, "kl": 0.0, "learning_rate": 4.772310913373462e-07, "logps/chosen": -254.98135375976562, "logps/rejected": -245.09056091308594, "loss": 0.3722, "rewards/chosen": -0.3194230794906616, "rewards/margins": 1.2747198343276978, "rewards/rejected": -1.5941429138183594, "step": 174 }, { "epoch": 0.05, "grad_norm": 28.04030418395996, "kl": 0.0, "learning_rate": 4.771002355404344e-07, "logps/chosen": -191.9232177734375, "logps/rejected": -249.7639617919922, "loss": 0.1768, "rewards/chosen": -1.6041406393051147, "rewards/margins": 0.7407997846603394, "rewards/rejected": -2.344940423965454, "step": 175 }, { "epoch": 0.05, "grad_norm": 33.58053207397461, "kl": 0.0, "learning_rate": 4.769693797435226e-07, "logps/chosen": -258.4371032714844, "logps/rejected": -221.71328735351562, "loss": 0.3767, "rewards/chosen": 0.0411025732755661, "rewards/margins": 2.80287766456604, "rewards/rejected": -2.761775016784668, "step": 176 }, { "epoch": 0.05, "grad_norm": 44.45076370239258, "kl": 0.0, "learning_rate": 4.768385239466108e-07, "logps/chosen": -262.84375, "logps/rejected": -283.4814147949219, "loss": 0.284, "rewards/chosen": -0.07760030031204224, "rewards/margins": 2.31966495513916, "rewards/rejected": -2.3972651958465576, "step": 177 }, { "epoch": 0.05, "grad_norm": 32.66361999511719, "kl": 0.0, "learning_rate": 4.76707668149699e-07, "logps/chosen": -173.99244689941406, "logps/rejected": -231.17002868652344, "loss": 0.2292, "rewards/chosen": -0.2332194596529007, "rewards/margins": 2.0381579399108887, "rewards/rejected": -2.2713773250579834, "step": 178 }, { "epoch": 0.05, "grad_norm": 29.631681442260742, "kl": 0.0, "learning_rate": 4.765768123527872e-07, "logps/chosen": -215.5139923095703, "logps/rejected": -275.6757507324219, "loss": 0.3481, "rewards/chosen": -0.28280165791511536, "rewards/margins": 1.5615084171295166, "rewards/rejected": -1.8443100452423096, "step": 179 }, { "epoch": 0.05, "grad_norm": 33.25776672363281, "kl": 0.0, "learning_rate": 4.764459565558754e-07, "logps/chosen": -300.66571044921875, "logps/rejected": -226.95843505859375, "loss": 0.5674, "rewards/chosen": -2.056067705154419, "rewards/margins": -0.5937771797180176, "rewards/rejected": -1.4622905254364014, "step": 180 }, { "epoch": 0.05, "grad_norm": 32.5307502746582, "kl": 0.0, "learning_rate": 4.763151007589636e-07, "logps/chosen": -274.2823791503906, "logps/rejected": -278.12158203125, "loss": 0.364, "rewards/chosen": -1.056592345237732, "rewards/margins": 1.1514142751693726, "rewards/rejected": -2.2080066204071045, "step": 181 }, { "epoch": 0.05, "grad_norm": 31.034576416015625, "kl": 0.0, "learning_rate": 4.7618424496205177e-07, "logps/chosen": -320.25201416015625, "logps/rejected": -268.442138671875, "loss": 0.3914, "rewards/chosen": -1.3781906366348267, "rewards/margins": 1.2480562925338745, "rewards/rejected": -2.626246929168701, "step": 182 }, { "epoch": 0.05, "grad_norm": 27.58580207824707, "kl": 0.0, "learning_rate": 4.7605338916513997e-07, "logps/chosen": -178.425537109375, "logps/rejected": -249.5200958251953, "loss": 0.3172, "rewards/chosen": -0.6601690649986267, "rewards/margins": 2.0617916584014893, "rewards/rejected": -2.7219607830047607, "step": 183 }, { "epoch": 0.05, "grad_norm": 31.219865798950195, "kl": 0.0, "learning_rate": 4.7592253336822816e-07, "logps/chosen": -279.55352783203125, "logps/rejected": -152.67330932617188, "loss": 0.4063, "rewards/chosen": -1.6273868083953857, "rewards/margins": -0.3303135633468628, "rewards/rejected": -1.297073245048523, "step": 184 }, { "epoch": 0.05, "grad_norm": 39.95933532714844, "kl": 0.0, "learning_rate": 4.7579167757131636e-07, "logps/chosen": -217.0849151611328, "logps/rejected": -275.55670166015625, "loss": 0.4124, "rewards/chosen": -0.4421677887439728, "rewards/margins": 1.6669080257415771, "rewards/rejected": -2.1090757846832275, "step": 185 }, { "epoch": 0.05, "grad_norm": 29.07082748413086, "kl": 0.0, "learning_rate": 4.756608217744046e-07, "logps/chosen": -126.4207992553711, "logps/rejected": -215.87127685546875, "loss": 0.3561, "rewards/chosen": -1.5301021337509155, "rewards/margins": 0.5697237253189087, "rewards/rejected": -2.099825859069824, "step": 186 }, { "epoch": 0.05, "grad_norm": 36.05703353881836, "kl": 0.0, "learning_rate": 4.755299659774928e-07, "logps/chosen": -300.80914306640625, "logps/rejected": -214.6817626953125, "loss": 0.5, "rewards/chosen": -1.702012062072754, "rewards/margins": -0.06618618965148926, "rewards/rejected": -1.6358258724212646, "step": 187 }, { "epoch": 0.05, "grad_norm": 32.64337921142578, "kl": 0.0, "learning_rate": 4.75399110180581e-07, "logps/chosen": -314.01226806640625, "logps/rejected": -260.5047607421875, "loss": 0.2495, "rewards/chosen": -0.9918256402015686, "rewards/margins": 1.699979305267334, "rewards/rejected": -2.691804885864258, "step": 188 }, { "epoch": 0.05, "grad_norm": 28.98259735107422, "kl": 0.0, "learning_rate": 4.752682543836692e-07, "logps/chosen": -171.03106689453125, "logps/rejected": -305.4718933105469, "loss": 0.4056, "rewards/chosen": -1.0309761762619019, "rewards/margins": 2.5201992988586426, "rewards/rejected": -3.551175594329834, "step": 189 }, { "epoch": 0.05, "grad_norm": 43.78743362426758, "kl": 0.0, "learning_rate": 4.751373985867574e-07, "logps/chosen": -203.0801239013672, "logps/rejected": -247.2637939453125, "loss": 0.3496, "rewards/chosen": 0.1427454948425293, "rewards/margins": 2.5608232021331787, "rewards/rejected": -2.4180777072906494, "step": 190 }, { "epoch": 0.05, "grad_norm": 34.81067657470703, "kl": 0.0, "learning_rate": 4.750065427898456e-07, "logps/chosen": -177.9116973876953, "logps/rejected": -285.660400390625, "loss": 0.3558, "rewards/chosen": 0.284568727016449, "rewards/margins": 2.3395156860351562, "rewards/rejected": -2.0549468994140625, "step": 191 }, { "epoch": 0.05, "grad_norm": 36.1607780456543, "kl": 0.0, "learning_rate": 4.748756869929338e-07, "logps/chosen": -238.54434204101562, "logps/rejected": -200.06578063964844, "loss": 0.3894, "rewards/chosen": -0.9025675654411316, "rewards/margins": -0.14597797393798828, "rewards/rejected": -0.7565895915031433, "step": 192 }, { "epoch": 0.05, "grad_norm": 36.35572814941406, "kl": 0.0, "learning_rate": 4.74744831196022e-07, "logps/chosen": -176.56765747070312, "logps/rejected": -160.97991943359375, "loss": 0.4684, "rewards/chosen": -0.44666993618011475, "rewards/margins": 1.3998445272445679, "rewards/rejected": -1.8465144634246826, "step": 193 }, { "epoch": 0.05, "grad_norm": 38.28248977661133, "kl": 0.0, "learning_rate": 4.746139753991101e-07, "logps/chosen": -267.4242858886719, "logps/rejected": -227.41146850585938, "loss": 0.3664, "rewards/chosen": -1.034665822982788, "rewards/margins": 1.0908312797546387, "rewards/rejected": -2.1254971027374268, "step": 194 }, { "epoch": 0.05, "grad_norm": 41.40991973876953, "kl": 0.0, "learning_rate": 4.744831196021983e-07, "logps/chosen": -215.8043212890625, "logps/rejected": -260.5753173828125, "loss": 0.4136, "rewards/chosen": -0.8841571807861328, "rewards/margins": 2.375264883041382, "rewards/rejected": -3.2594220638275146, "step": 195 }, { "epoch": 0.05, "grad_norm": 30.04712677001953, "kl": 0.0, "learning_rate": 4.743522638052865e-07, "logps/chosen": -247.63121032714844, "logps/rejected": -191.33665466308594, "loss": 0.4148, "rewards/chosen": -2.485023021697998, "rewards/margins": 0.23484015464782715, "rewards/rejected": -2.719863176345825, "step": 196 }, { "epoch": 0.05, "grad_norm": 35.53774642944336, "kl": 0.0, "learning_rate": 4.742214080083747e-07, "logps/chosen": -199.6810302734375, "logps/rejected": -191.50827026367188, "loss": 0.295, "rewards/chosen": -1.0607842206954956, "rewards/margins": 1.159951090812683, "rewards/rejected": -2.2207353115081787, "step": 197 }, { "epoch": 0.05, "grad_norm": 38.35923767089844, "kl": 0.0, "learning_rate": 4.740905522114629e-07, "logps/chosen": -242.72299194335938, "logps/rejected": -247.96231079101562, "loss": 0.4228, "rewards/chosen": -1.3263717889785767, "rewards/margins": 1.3898276090621948, "rewards/rejected": -2.7161993980407715, "step": 198 }, { "epoch": 0.05, "grad_norm": 29.171445846557617, "kl": 0.0, "learning_rate": 4.7395969641455116e-07, "logps/chosen": -260.06341552734375, "logps/rejected": -262.1416931152344, "loss": 0.3828, "rewards/chosen": -2.1225051879882812, "rewards/margins": 1.466343641281128, "rewards/rejected": -3.588848829269409, "step": 199 }, { "epoch": 0.05, "grad_norm": 33.99834060668945, "kl": 0.0, "learning_rate": 4.7382884061763935e-07, "logps/chosen": -204.8150634765625, "logps/rejected": -220.98208618164062, "loss": 0.3487, "rewards/chosen": -0.6152539253234863, "rewards/margins": 1.9658851623535156, "rewards/rejected": -2.581139087677002, "step": 200 }, { "epoch": 0.05, "grad_norm": 23.109434127807617, "kl": 0.0, "learning_rate": 4.7369798482072755e-07, "logps/chosen": -189.38824462890625, "logps/rejected": -225.818115234375, "loss": 0.3498, "rewards/chosen": -1.0156517028808594, "rewards/margins": 2.376852035522461, "rewards/rejected": -3.3925037384033203, "step": 201 }, { "epoch": 0.05, "grad_norm": 31.507305145263672, "kl": 0.0, "learning_rate": 4.7356712902381575e-07, "logps/chosen": -225.10400390625, "logps/rejected": -249.22921752929688, "loss": 0.3674, "rewards/chosen": -0.7410069108009338, "rewards/margins": 1.9564170837402344, "rewards/rejected": -2.6974239349365234, "step": 202 }, { "epoch": 0.05, "grad_norm": 40.36268615722656, "kl": 0.0, "learning_rate": 4.7343627322690394e-07, "logps/chosen": -241.38609313964844, "logps/rejected": -219.32449340820312, "loss": 0.3755, "rewards/chosen": 1.6069724559783936, "rewards/margins": 2.9934029579162598, "rewards/rejected": -1.3864305019378662, "step": 203 }, { "epoch": 0.05, "grad_norm": 31.71283531188965, "kl": 0.0, "learning_rate": 4.7330541742999214e-07, "logps/chosen": -245.4090576171875, "logps/rejected": -264.3318786621094, "loss": 0.3452, "rewards/chosen": -1.086578130722046, "rewards/margins": 0.6778701543807983, "rewards/rejected": -1.7644482851028442, "step": 204 }, { "epoch": 0.05, "grad_norm": 34.97480010986328, "kl": 0.0, "learning_rate": 4.7317456163308034e-07, "logps/chosen": -284.29791259765625, "logps/rejected": -244.2501678466797, "loss": 0.2654, "rewards/chosen": -0.4497016370296478, "rewards/margins": 2.9477717876434326, "rewards/rejected": -3.3974733352661133, "step": 205 }, { "epoch": 0.05, "grad_norm": 42.56742858886719, "kl": 0.0, "learning_rate": 4.7304370583616853e-07, "logps/chosen": -275.45928955078125, "logps/rejected": -223.07778930664062, "loss": 0.3474, "rewards/chosen": -0.8359074592590332, "rewards/margins": 1.7483019828796387, "rewards/rejected": -2.584209442138672, "step": 206 }, { "epoch": 0.05, "grad_norm": 33.80686569213867, "kl": 0.0, "learning_rate": 4.7291285003925673e-07, "logps/chosen": -180.66078186035156, "logps/rejected": -202.80429077148438, "loss": 0.308, "rewards/chosen": -1.281614899635315, "rewards/margins": 0.863860011100769, "rewards/rejected": -2.145474910736084, "step": 207 }, { "epoch": 0.05, "grad_norm": 37.145477294921875, "kl": 0.0, "learning_rate": 4.727819942423449e-07, "logps/chosen": -290.77020263671875, "logps/rejected": -283.8406066894531, "loss": 0.4507, "rewards/chosen": -1.2572423219680786, "rewards/margins": 2.4235076904296875, "rewards/rejected": -3.6807498931884766, "step": 208 }, { "epoch": 0.05, "grad_norm": 35.7796745300293, "kl": 0.0, "learning_rate": 4.7265113844543307e-07, "logps/chosen": -219.49148559570312, "logps/rejected": -117.24064636230469, "loss": 0.444, "rewards/chosen": -0.850780189037323, "rewards/margins": 0.35081928968429565, "rewards/rejected": -1.2015994787216187, "step": 209 }, { "epoch": 0.05, "grad_norm": 26.550434112548828, "kl": 0.0, "learning_rate": 4.7252028264852126e-07, "logps/chosen": -162.2927703857422, "logps/rejected": -262.95635986328125, "loss": 0.3816, "rewards/chosen": -0.9790937304496765, "rewards/margins": 1.541642189025879, "rewards/rejected": -2.5207359790802, "step": 210 }, { "epoch": 0.06, "grad_norm": 38.59910583496094, "kl": 0.0, "learning_rate": 4.7238942685160946e-07, "logps/chosen": -220.10491943359375, "logps/rejected": -193.5897216796875, "loss": 0.2675, "rewards/chosen": 0.22388845682144165, "rewards/margins": 1.840318202972412, "rewards/rejected": -1.6164298057556152, "step": 211 }, { "epoch": 0.06, "grad_norm": 31.72176742553711, "kl": 0.0, "learning_rate": 4.722585710546977e-07, "logps/chosen": -158.26223754882812, "logps/rejected": -349.89923095703125, "loss": 0.4544, "rewards/chosen": -0.953671932220459, "rewards/margins": 5.621851444244385, "rewards/rejected": -6.575523376464844, "step": 212 }, { "epoch": 0.06, "grad_norm": 35.59261703491211, "kl": 0.0, "learning_rate": 4.721277152577859e-07, "logps/chosen": -188.9051513671875, "logps/rejected": -229.09368896484375, "loss": 0.3985, "rewards/chosen": -0.7103174924850464, "rewards/margins": 1.4962338209152222, "rewards/rejected": -2.2065513134002686, "step": 213 }, { "epoch": 0.06, "grad_norm": 29.88261604309082, "kl": 0.0, "learning_rate": 4.719968594608741e-07, "logps/chosen": -258.1292419433594, "logps/rejected": -285.9716491699219, "loss": 0.2742, "rewards/chosen": -0.8898869752883911, "rewards/margins": 4.2793169021606445, "rewards/rejected": -5.169203758239746, "step": 214 }, { "epoch": 0.06, "grad_norm": 31.082521438598633, "kl": 0.0, "learning_rate": 4.718660036639623e-07, "logps/chosen": -235.5315704345703, "logps/rejected": -142.43453979492188, "loss": 0.3042, "rewards/chosen": -1.472436785697937, "rewards/margins": 0.5655766725540161, "rewards/rejected": -2.038013458251953, "step": 215 }, { "epoch": 0.06, "grad_norm": 45.72218704223633, "kl": 0.0, "learning_rate": 4.717351478670505e-07, "logps/chosen": -277.6252136230469, "logps/rejected": -190.9403839111328, "loss": 0.3446, "rewards/chosen": 0.9356330037117004, "rewards/margins": 2.0501766204833984, "rewards/rejected": -1.1145436763763428, "step": 216 }, { "epoch": 0.06, "grad_norm": 36.96467971801758, "kl": 0.0, "learning_rate": 4.716042920701387e-07, "logps/chosen": -194.5904541015625, "logps/rejected": -251.4320068359375, "loss": 0.4222, "rewards/chosen": -0.46582919359207153, "rewards/margins": 1.6109564304351807, "rewards/rejected": -2.0767855644226074, "step": 217 }, { "epoch": 0.06, "grad_norm": 36.587127685546875, "kl": 0.0, "learning_rate": 4.714734362732269e-07, "logps/chosen": -250.0608673095703, "logps/rejected": -217.1116943359375, "loss": 0.2782, "rewards/chosen": -0.5743464231491089, "rewards/margins": 1.952079176902771, "rewards/rejected": -2.52642560005188, "step": 218 }, { "epoch": 0.06, "grad_norm": 27.2506046295166, "kl": 0.0, "learning_rate": 4.713425804763151e-07, "logps/chosen": -150.00576782226562, "logps/rejected": -322.8671875, "loss": 0.1698, "rewards/chosen": -0.5786823034286499, "rewards/margins": 3.880950450897217, "rewards/rejected": -4.459632873535156, "step": 219 }, { "epoch": 0.06, "grad_norm": 31.029760360717773, "kl": 0.0, "learning_rate": 4.712117246794033e-07, "logps/chosen": -250.29478454589844, "logps/rejected": -235.32687377929688, "loss": 0.3649, "rewards/chosen": -1.7568782567977905, "rewards/margins": 1.4462846517562866, "rewards/rejected": -3.203162908554077, "step": 220 }, { "epoch": 0.06, "grad_norm": 38.26043701171875, "kl": 0.0, "learning_rate": 4.7108086888249147e-07, "logps/chosen": -279.1617431640625, "logps/rejected": -174.76123046875, "loss": 0.4814, "rewards/chosen": -1.5011340379714966, "rewards/margins": 0.26141786575317383, "rewards/rejected": -1.7625519037246704, "step": 221 }, { "epoch": 0.06, "grad_norm": 35.2736701965332, "kl": 0.0, "learning_rate": 4.7095001308557967e-07, "logps/chosen": -215.34039306640625, "logps/rejected": -221.15383911132812, "loss": 0.3409, "rewards/chosen": -0.45132410526275635, "rewards/margins": 2.0330100059509277, "rewards/rejected": -2.4843342304229736, "step": 222 }, { "epoch": 0.06, "grad_norm": 36.65085983276367, "kl": 0.0, "learning_rate": 4.7081915728866786e-07, "logps/chosen": -194.25527954101562, "logps/rejected": -124.3009033203125, "loss": 0.4022, "rewards/chosen": -0.9982702732086182, "rewards/margins": 0.494891881942749, "rewards/rejected": -1.4931621551513672, "step": 223 }, { "epoch": 0.06, "grad_norm": 36.66006851196289, "kl": 0.0, "learning_rate": 4.706883014917561e-07, "logps/chosen": -213.27227783203125, "logps/rejected": -195.74505615234375, "loss": 0.3352, "rewards/chosen": -0.09169921278953552, "rewards/margins": 1.9126981496810913, "rewards/rejected": -2.004397392272949, "step": 224 }, { "epoch": 0.06, "grad_norm": 37.03841018676758, "kl": 0.0, "learning_rate": 4.7055744569484426e-07, "logps/chosen": -174.44583129882812, "logps/rejected": -203.36178588867188, "loss": 0.4792, "rewards/chosen": -0.9792169332504272, "rewards/margins": 0.39964139461517334, "rewards/rejected": -1.3788583278656006, "step": 225 }, { "epoch": 0.06, "grad_norm": 41.96936798095703, "kl": 0.0, "learning_rate": 4.7042658989793245e-07, "logps/chosen": -222.01852416992188, "logps/rejected": -235.48773193359375, "loss": 0.4073, "rewards/chosen": -1.3425058126449585, "rewards/margins": 0.3740893602371216, "rewards/rejected": -1.71659517288208, "step": 226 }, { "epoch": 0.06, "grad_norm": 40.41291427612305, "kl": 0.0, "learning_rate": 4.7029573410102065e-07, "logps/chosen": -274.5633544921875, "logps/rejected": -268.20489501953125, "loss": 0.4272, "rewards/chosen": -1.1389542818069458, "rewards/margins": 2.3568062782287598, "rewards/rejected": -3.495760440826416, "step": 227 }, { "epoch": 0.06, "grad_norm": 36.06272506713867, "kl": 0.0, "learning_rate": 4.7016487830410885e-07, "logps/chosen": -245.97525024414062, "logps/rejected": -242.12374877929688, "loss": 0.3408, "rewards/chosen": -1.1917027235031128, "rewards/margins": 1.347025752067566, "rewards/rejected": -2.5387284755706787, "step": 228 }, { "epoch": 0.06, "grad_norm": 42.17586898803711, "kl": 0.0, "learning_rate": 4.7003402250719704e-07, "logps/chosen": -197.28846740722656, "logps/rejected": -220.36756896972656, "loss": 0.4867, "rewards/chosen": -1.028113603591919, "rewards/margins": 1.651493787765503, "rewards/rejected": -2.679607391357422, "step": 229 }, { "epoch": 0.06, "grad_norm": 44.75341033935547, "kl": 0.0, "learning_rate": 4.6990316671028524e-07, "logps/chosen": -224.07839965820312, "logps/rejected": -217.51248168945312, "loss": 0.3321, "rewards/chosen": -0.5282418131828308, "rewards/margins": 2.796116590499878, "rewards/rejected": -3.3243584632873535, "step": 230 }, { "epoch": 0.06, "grad_norm": 37.11387634277344, "kl": 0.0, "learning_rate": 4.6977231091337343e-07, "logps/chosen": -137.24411010742188, "logps/rejected": -313.54193115234375, "loss": 0.359, "rewards/chosen": -0.643952488899231, "rewards/margins": 2.7075109481811523, "rewards/rejected": -3.3514633178710938, "step": 231 }, { "epoch": 0.06, "grad_norm": 34.26787185668945, "kl": 0.0, "learning_rate": 4.6964145511646163e-07, "logps/chosen": -167.1756134033203, "logps/rejected": -229.2905731201172, "loss": 0.2363, "rewards/chosen": -0.5329712629318237, "rewards/margins": 1.356623649597168, "rewards/rejected": -1.8895949125289917, "step": 232 }, { "epoch": 0.06, "grad_norm": 35.61827850341797, "kl": 0.0, "learning_rate": 4.695105993195498e-07, "logps/chosen": -197.7938690185547, "logps/rejected": -295.2776794433594, "loss": 0.3338, "rewards/chosen": -0.19512000679969788, "rewards/margins": 2.5928189754486084, "rewards/rejected": -2.7879390716552734, "step": 233 }, { "epoch": 0.06, "grad_norm": 49.267677307128906, "kl": 0.0, "learning_rate": 4.69379743522638e-07, "logps/chosen": -235.7493896484375, "logps/rejected": -297.271240234375, "loss": 0.4593, "rewards/chosen": -0.2626681625843048, "rewards/margins": 1.3399256467819214, "rewards/rejected": -1.6025937795639038, "step": 234 }, { "epoch": 0.06, "grad_norm": 31.048612594604492, "kl": 0.0, "learning_rate": 4.692488877257262e-07, "logps/chosen": -232.46897888183594, "logps/rejected": -274.14178466796875, "loss": 0.3967, "rewards/chosen": -1.576374888420105, "rewards/margins": 1.132400631904602, "rewards/rejected": -2.708775520324707, "step": 235 }, { "epoch": 0.06, "grad_norm": 34.81768035888672, "kl": 0.0, "learning_rate": 4.691180319288144e-07, "logps/chosen": -186.15216064453125, "logps/rejected": -307.45843505859375, "loss": 0.322, "rewards/chosen": -0.2905099391937256, "rewards/margins": 1.9975306987762451, "rewards/rejected": -2.2880406379699707, "step": 236 }, { "epoch": 0.06, "grad_norm": 44.079856872558594, "kl": 0.0, "learning_rate": 4.6898717613190266e-07, "logps/chosen": -294.75274658203125, "logps/rejected": -263.3562316894531, "loss": 0.3592, "rewards/chosen": -0.2211398184299469, "rewards/margins": 2.107311487197876, "rewards/rejected": -2.32845139503479, "step": 237 }, { "epoch": 0.06, "grad_norm": 32.26349639892578, "kl": 0.0, "learning_rate": 4.6885632033499086e-07, "logps/chosen": -204.82138061523438, "logps/rejected": -248.318603515625, "loss": 0.3495, "rewards/chosen": 0.03919875621795654, "rewards/margins": 2.928051471710205, "rewards/rejected": -2.888852834701538, "step": 238 }, { "epoch": 0.06, "grad_norm": 35.30009078979492, "kl": 0.0, "learning_rate": 4.6872546453807906e-07, "logps/chosen": -306.24169921875, "logps/rejected": -213.28111267089844, "loss": 0.3976, "rewards/chosen": -1.4108914136886597, "rewards/margins": 0.8999141454696655, "rewards/rejected": -2.310805559158325, "step": 239 }, { "epoch": 0.06, "grad_norm": 35.18433380126953, "kl": 0.0, "learning_rate": 4.685946087411672e-07, "logps/chosen": -258.08056640625, "logps/rejected": -275.6287841796875, "loss": 0.4349, "rewards/chosen": -0.9918340444564819, "rewards/margins": 2.3593192100524902, "rewards/rejected": -3.3511531352996826, "step": 240 }, { "epoch": 0.06, "grad_norm": 24.199462890625, "kl": 0.0, "learning_rate": 4.684637529442554e-07, "logps/chosen": -202.14227294921875, "logps/rejected": -251.01712036132812, "loss": 0.2988, "rewards/chosen": -2.8060295581817627, "rewards/margins": 1.014716625213623, "rewards/rejected": -3.8207461833953857, "step": 241 }, { "epoch": 0.06, "grad_norm": 27.472558975219727, "kl": 0.0, "learning_rate": 4.683328971473436e-07, "logps/chosen": -178.1534881591797, "logps/rejected": -194.3978729248047, "loss": 0.3999, "rewards/chosen": -1.4756015539169312, "rewards/margins": 1.0923720598220825, "rewards/rejected": -2.5679736137390137, "step": 242 }, { "epoch": 0.06, "grad_norm": 28.934558868408203, "kl": 0.0, "learning_rate": 4.682020413504318e-07, "logps/chosen": -232.8424835205078, "logps/rejected": -196.27610778808594, "loss": 0.25, "rewards/chosen": 0.7410671710968018, "rewards/margins": 3.822826862335205, "rewards/rejected": -3.0817596912384033, "step": 243 }, { "epoch": 0.06, "grad_norm": 38.57919692993164, "kl": 0.0, "learning_rate": 4.6807118555352e-07, "logps/chosen": -210.18600463867188, "logps/rejected": -185.69248962402344, "loss": 0.3098, "rewards/chosen": -1.65409255027771, "rewards/margins": 1.2091917991638184, "rewards/rejected": -2.8632843494415283, "step": 244 }, { "epoch": 0.06, "grad_norm": 40.655635833740234, "kl": 0.0, "learning_rate": 4.679403297566082e-07, "logps/chosen": -206.66592407226562, "logps/rejected": -244.49119567871094, "loss": 0.3466, "rewards/chosen": -0.9859999418258667, "rewards/margins": 1.3829981088638306, "rewards/rejected": -2.3689980506896973, "step": 245 }, { "epoch": 0.06, "grad_norm": 37.547794342041016, "kl": 0.0, "learning_rate": 4.678094739596964e-07, "logps/chosen": -256.795654296875, "logps/rejected": -254.77978515625, "loss": 0.3306, "rewards/chosen": -1.4593631029129028, "rewards/margins": 2.0132813453674316, "rewards/rejected": -3.472644567489624, "step": 246 }, { "epoch": 0.06, "grad_norm": 36.778751373291016, "kl": 0.0, "learning_rate": 4.6767861816278457e-07, "logps/chosen": -260.1071472167969, "logps/rejected": -200.48402404785156, "loss": 0.2881, "rewards/chosen": 1.2593364715576172, "rewards/margins": 3.518388271331787, "rewards/rejected": -2.25905179977417, "step": 247 }, { "epoch": 0.06, "grad_norm": 38.92163848876953, "kl": 0.0, "learning_rate": 4.6754776236587277e-07, "logps/chosen": -234.36021423339844, "logps/rejected": -215.75294494628906, "loss": 0.3333, "rewards/chosen": 0.5298810601234436, "rewards/margins": 2.223097324371338, "rewards/rejected": -1.693216323852539, "step": 248 }, { "epoch": 0.07, "grad_norm": 36.91664505004883, "kl": 0.0, "learning_rate": 4.6741690656896096e-07, "logps/chosen": -258.3697204589844, "logps/rejected": -216.38294982910156, "loss": 0.4925, "rewards/chosen": -0.6519094705581665, "rewards/margins": 1.3434323072433472, "rewards/rejected": -1.9953417778015137, "step": 249 }, { "epoch": 0.07, "grad_norm": 31.283388137817383, "kl": 0.0, "learning_rate": 4.672860507720492e-07, "logps/chosen": -216.1317596435547, "logps/rejected": -266.24176025390625, "loss": 0.361, "rewards/chosen": 0.45249366760253906, "rewards/margins": 3.7620913982391357, "rewards/rejected": -3.3095977306365967, "step": 250 }, { "epoch": 0.07, "grad_norm": 39.949859619140625, "kl": 0.0, "learning_rate": 4.671551949751374e-07, "logps/chosen": -291.876708984375, "logps/rejected": -256.897216796875, "loss": 0.4256, "rewards/chosen": -2.2693746089935303, "rewards/margins": 0.718336820602417, "rewards/rejected": -2.9877114295959473, "step": 251 }, { "epoch": 0.07, "grad_norm": 48.09956741333008, "kl": 0.0, "learning_rate": 4.670243391782256e-07, "logps/chosen": -307.2916564941406, "logps/rejected": -232.60179138183594, "loss": 0.3285, "rewards/chosen": -2.1419198513031006, "rewards/margins": 0.21775078773498535, "rewards/rejected": -2.359670639038086, "step": 252 }, { "epoch": 0.07, "grad_norm": 30.89328956604004, "kl": 0.0, "learning_rate": 4.668934833813138e-07, "logps/chosen": -253.2288055419922, "logps/rejected": -260.0640869140625, "loss": 0.3365, "rewards/chosen": -0.9447171092033386, "rewards/margins": 3.576873540878296, "rewards/rejected": -4.521590709686279, "step": 253 }, { "epoch": 0.07, "grad_norm": 47.67763137817383, "kl": 0.0, "learning_rate": 4.66762627584402e-07, "logps/chosen": -176.8143768310547, "logps/rejected": -242.53236389160156, "loss": 0.2547, "rewards/chosen": 1.01246976852417, "rewards/margins": 3.594667434692383, "rewards/rejected": -2.582197666168213, "step": 254 }, { "epoch": 0.07, "grad_norm": 35.259056091308594, "kl": 0.0, "learning_rate": 4.666317717874902e-07, "logps/chosen": -226.5983428955078, "logps/rejected": -223.51144409179688, "loss": 0.263, "rewards/chosen": -0.17412535846233368, "rewards/margins": 2.4813520908355713, "rewards/rejected": -2.655477523803711, "step": 255 }, { "epoch": 0.07, "grad_norm": 41.33015823364258, "kl": 0.0, "learning_rate": 4.6650091599057834e-07, "logps/chosen": -291.62744140625, "logps/rejected": -264.74493408203125, "loss": 0.366, "rewards/chosen": -0.48753511905670166, "rewards/margins": 3.4678664207458496, "rewards/rejected": -3.955401659011841, "step": 256 }, { "epoch": 0.07, "grad_norm": 39.72134017944336, "kl": 0.0, "learning_rate": 4.6637006019366653e-07, "logps/chosen": -302.4512634277344, "logps/rejected": -221.8700408935547, "loss": 0.305, "rewards/chosen": -0.7385875582695007, "rewards/margins": 1.6246614456176758, "rewards/rejected": -2.3632490634918213, "step": 257 }, { "epoch": 0.07, "grad_norm": 39.22993850708008, "kl": 0.0, "learning_rate": 4.6623920439675473e-07, "logps/chosen": -239.31459045410156, "logps/rejected": -185.0542449951172, "loss": 0.3941, "rewards/chosen": -0.697856068611145, "rewards/margins": 1.7438751459121704, "rewards/rejected": -2.4417312145233154, "step": 258 }, { "epoch": 0.07, "grad_norm": 39.030784606933594, "kl": 0.0, "learning_rate": 4.661083485998429e-07, "logps/chosen": -214.9169464111328, "logps/rejected": -231.33749389648438, "loss": 0.3734, "rewards/chosen": -0.9121836423873901, "rewards/margins": 2.3024215698242188, "rewards/rejected": -3.2146050930023193, "step": 259 }, { "epoch": 0.07, "grad_norm": 30.60447120666504, "kl": 0.0, "learning_rate": 4.659774928029311e-07, "logps/chosen": -237.2434539794922, "logps/rejected": -217.3273162841797, "loss": 0.3293, "rewards/chosen": -2.5721724033355713, "rewards/margins": 0.5294625759124756, "rewards/rejected": -3.101634979248047, "step": 260 }, { "epoch": 0.07, "grad_norm": 35.15093231201172, "kl": 0.0, "learning_rate": 4.658466370060193e-07, "logps/chosen": -251.8607635498047, "logps/rejected": -244.05323791503906, "loss": 0.2917, "rewards/chosen": -1.1427861452102661, "rewards/margins": 1.8218802213668823, "rewards/rejected": -2.9646663665771484, "step": 261 }, { "epoch": 0.07, "grad_norm": 42.44389724731445, "kl": 0.0, "learning_rate": 4.657157812091075e-07, "logps/chosen": -237.9396209716797, "logps/rejected": -263.0314025878906, "loss": 0.42, "rewards/chosen": -0.2850094735622406, "rewards/margins": 1.6426814794540405, "rewards/rejected": -1.9276909828186035, "step": 262 }, { "epoch": 0.07, "grad_norm": 37.14183044433594, "kl": 0.0, "learning_rate": 4.6558492541219576e-07, "logps/chosen": -185.787109375, "logps/rejected": -261.66461181640625, "loss": 0.446, "rewards/chosen": -1.9390316009521484, "rewards/margins": 0.6079616546630859, "rewards/rejected": -2.5469932556152344, "step": 263 }, { "epoch": 0.07, "grad_norm": 33.57796859741211, "kl": 0.0, "learning_rate": 4.6545406961528396e-07, "logps/chosen": -157.2796630859375, "logps/rejected": -283.6850891113281, "loss": 0.2372, "rewards/chosen": -0.8357820510864258, "rewards/margins": 2.492841958999634, "rewards/rejected": -3.3286240100860596, "step": 264 }, { "epoch": 0.07, "grad_norm": 40.712528228759766, "kl": 0.0, "learning_rate": 4.6532321381837215e-07, "logps/chosen": -220.23959350585938, "logps/rejected": -255.13534545898438, "loss": 0.4645, "rewards/chosen": -1.641434907913208, "rewards/margins": 0.34841763973236084, "rewards/rejected": -1.9898525476455688, "step": 265 }, { "epoch": 0.07, "grad_norm": 35.535221099853516, "kl": 0.0, "learning_rate": 4.6519235802146035e-07, "logps/chosen": -223.94854736328125, "logps/rejected": -246.826416015625, "loss": 0.4288, "rewards/chosen": -1.153721570968628, "rewards/margins": 0.5903757810592651, "rewards/rejected": -1.744097352027893, "step": 266 }, { "epoch": 0.07, "grad_norm": 42.14219665527344, "kl": 0.0, "learning_rate": 4.6506150222454855e-07, "logps/chosen": -211.50079345703125, "logps/rejected": -159.27609252929688, "loss": 0.3998, "rewards/chosen": -1.4032256603240967, "rewards/margins": 0.3597506284713745, "rewards/rejected": -1.7629762887954712, "step": 267 }, { "epoch": 0.07, "grad_norm": 38.34206771850586, "kl": 0.0, "learning_rate": 4.6493064642763674e-07, "logps/chosen": -269.44757080078125, "logps/rejected": -209.7416534423828, "loss": 0.4226, "rewards/chosen": -0.985856831073761, "rewards/margins": 1.1115570068359375, "rewards/rejected": -2.0974137783050537, "step": 268 }, { "epoch": 0.07, "grad_norm": 37.94346237182617, "kl": 0.0, "learning_rate": 4.6479979063072494e-07, "logps/chosen": -225.378173828125, "logps/rejected": -273.8226318359375, "loss": 0.3977, "rewards/chosen": -1.5155614614486694, "rewards/margins": 1.2213491201400757, "rewards/rejected": -2.736910581588745, "step": 269 }, { "epoch": 0.07, "grad_norm": 32.25834655761719, "kl": 0.0, "learning_rate": 4.6466893483381313e-07, "logps/chosen": -199.16827392578125, "logps/rejected": -248.81382751464844, "loss": 0.2876, "rewards/chosen": -0.5442147254943848, "rewards/margins": 2.8547651767730713, "rewards/rejected": -3.398979902267456, "step": 270 }, { "epoch": 0.07, "grad_norm": 31.128337860107422, "kl": 0.0, "learning_rate": 4.645380790369013e-07, "logps/chosen": -197.1934356689453, "logps/rejected": -247.61465454101562, "loss": 0.3763, "rewards/chosen": -1.405363917350769, "rewards/margins": 1.312684178352356, "rewards/rejected": -2.718048095703125, "step": 271 }, { "epoch": 0.07, "grad_norm": 29.893508911132812, "kl": 0.0, "learning_rate": 4.6440722323998947e-07, "logps/chosen": -136.73426818847656, "logps/rejected": -273.2953796386719, "loss": 0.2353, "rewards/chosen": -0.5470638871192932, "rewards/margins": 3.5090365409851074, "rewards/rejected": -4.056100368499756, "step": 272 }, { "epoch": 0.07, "grad_norm": 41.489437103271484, "kl": 0.0, "learning_rate": 4.6427636744307767e-07, "logps/chosen": -227.7633056640625, "logps/rejected": -265.1458740234375, "loss": 0.4804, "rewards/chosen": -1.4401181936264038, "rewards/margins": 1.0054680109024048, "rewards/rejected": -2.4455862045288086, "step": 273 }, { "epoch": 0.07, "grad_norm": 37.44647216796875, "kl": 0.0, "learning_rate": 4.6414551164616587e-07, "logps/chosen": -255.96173095703125, "logps/rejected": -239.48736572265625, "loss": 0.3189, "rewards/chosen": 0.37182945013046265, "rewards/margins": 3.4821300506591797, "rewards/rejected": -3.1103005409240723, "step": 274 }, { "epoch": 0.07, "grad_norm": 29.41668701171875, "kl": 0.0, "learning_rate": 4.640146558492541e-07, "logps/chosen": -201.92637634277344, "logps/rejected": -269.31353759765625, "loss": 0.3188, "rewards/chosen": -1.8693069219589233, "rewards/margins": 2.036485195159912, "rewards/rejected": -3.905791997909546, "step": 275 }, { "epoch": 0.07, "grad_norm": 38.13005447387695, "kl": 0.0, "learning_rate": 4.638838000523423e-07, "logps/chosen": -295.34039306640625, "logps/rejected": -117.98519134521484, "loss": 0.4628, "rewards/chosen": -1.9872322082519531, "rewards/margins": 0.26657724380493164, "rewards/rejected": -2.2538094520568848, "step": 276 }, { "epoch": 0.07, "grad_norm": 37.00981521606445, "kl": 0.0, "learning_rate": 4.637529442554305e-07, "logps/chosen": -220.0794219970703, "logps/rejected": -217.39553833007812, "loss": 0.3312, "rewards/chosen": -0.0731474757194519, "rewards/margins": 2.3764216899871826, "rewards/rejected": -2.4495692253112793, "step": 277 }, { "epoch": 0.07, "grad_norm": 33.918983459472656, "kl": 0.0, "learning_rate": 4.636220884585187e-07, "logps/chosen": -238.72340393066406, "logps/rejected": -212.756103515625, "loss": 0.3588, "rewards/chosen": -0.8355458378791809, "rewards/margins": 1.306795358657837, "rewards/rejected": -2.142341136932373, "step": 278 }, { "epoch": 0.07, "grad_norm": 30.741132736206055, "kl": 0.0, "learning_rate": 4.634912326616069e-07, "logps/chosen": -169.1579132080078, "logps/rejected": -195.23297119140625, "loss": 0.2557, "rewards/chosen": 0.54413241147995, "rewards/margins": 3.2936644554138184, "rewards/rejected": -2.7495319843292236, "step": 279 }, { "epoch": 0.07, "grad_norm": 27.65318489074707, "kl": 0.0, "learning_rate": 4.633603768646951e-07, "logps/chosen": -157.45840454101562, "logps/rejected": -249.44149780273438, "loss": 0.3092, "rewards/chosen": -1.2606077194213867, "rewards/margins": 1.9259283542633057, "rewards/rejected": -3.1865360736846924, "step": 280 }, { "epoch": 0.07, "grad_norm": 41.376121520996094, "kl": 0.0, "learning_rate": 4.632295210677833e-07, "logps/chosen": -186.80088806152344, "logps/rejected": -292.2984924316406, "loss": 0.3037, "rewards/chosen": -1.9069658517837524, "rewards/margins": 2.8274898529052734, "rewards/rejected": -4.734455585479736, "step": 281 }, { "epoch": 0.07, "grad_norm": 35.813560485839844, "kl": 0.0, "learning_rate": 4.630986652708715e-07, "logps/chosen": -184.72157287597656, "logps/rejected": -164.49346923828125, "loss": 0.4109, "rewards/chosen": -1.1023026704788208, "rewards/margins": 1.4965165853500366, "rewards/rejected": -2.5988192558288574, "step": 282 }, { "epoch": 0.07, "grad_norm": 23.877840042114258, "kl": 0.0, "learning_rate": 4.629678094739597e-07, "logps/chosen": -279.3133850097656, "logps/rejected": -282.68597412109375, "loss": 0.4464, "rewards/chosen": -3.0280158519744873, "rewards/margins": 0.8431928157806396, "rewards/rejected": -3.871208667755127, "step": 283 }, { "epoch": 0.07, "grad_norm": 28.17823028564453, "kl": 0.0, "learning_rate": 4.628369536770479e-07, "logps/chosen": -279.7229919433594, "logps/rejected": -268.95501708984375, "loss": 0.3018, "rewards/chosen": -1.9918068647384644, "rewards/margins": 0.9593816995620728, "rewards/rejected": -2.951188564300537, "step": 284 }, { "epoch": 0.07, "grad_norm": 26.56211280822754, "kl": 0.0, "learning_rate": 4.627060978801361e-07, "logps/chosen": -231.16656494140625, "logps/rejected": -253.2298126220703, "loss": 0.3365, "rewards/chosen": -1.716474175453186, "rewards/margins": 0.8960174322128296, "rewards/rejected": -2.6124916076660156, "step": 285 }, { "epoch": 0.07, "grad_norm": 34.96949005126953, "kl": 0.0, "learning_rate": 4.6257524208322427e-07, "logps/chosen": -228.77427673339844, "logps/rejected": -317.2232666015625, "loss": 0.3701, "rewards/chosen": -1.3939743041992188, "rewards/margins": 2.045186758041382, "rewards/rejected": -3.4391610622406006, "step": 286 }, { "epoch": 0.08, "grad_norm": 31.497352600097656, "kl": 0.0, "learning_rate": 4.624443862863124e-07, "logps/chosen": -260.1445007324219, "logps/rejected": -292.47064208984375, "loss": 0.394, "rewards/chosen": -1.3395678997039795, "rewards/margins": 2.3608195781707764, "rewards/rejected": -3.700387477874756, "step": 287 }, { "epoch": 0.08, "grad_norm": 32.95969772338867, "kl": 0.0, "learning_rate": 4.6231353048940066e-07, "logps/chosen": -177.44320678710938, "logps/rejected": -211.9121551513672, "loss": 0.3771, "rewards/chosen": -1.091148853302002, "rewards/margins": 1.9192008972167969, "rewards/rejected": -3.010349750518799, "step": 288 }, { "epoch": 0.08, "grad_norm": 37.73873519897461, "kl": 0.0, "learning_rate": 4.6218267469248886e-07, "logps/chosen": -193.40924072265625, "logps/rejected": -239.34669494628906, "loss": 0.3655, "rewards/chosen": 0.05758616328239441, "rewards/margins": 3.134945869445801, "rewards/rejected": -3.077359676361084, "step": 289 }, { "epoch": 0.08, "grad_norm": 36.732391357421875, "kl": 0.0, "learning_rate": 4.6205181889557706e-07, "logps/chosen": -208.61558532714844, "logps/rejected": -299.9091796875, "loss": 0.395, "rewards/chosen": -1.2155559062957764, "rewards/margins": 3.3333466053009033, "rewards/rejected": -4.54890251159668, "step": 290 }, { "epoch": 0.08, "grad_norm": 37.14192199707031, "kl": 0.0, "learning_rate": 4.6192096309866525e-07, "logps/chosen": -298.52789306640625, "logps/rejected": -256.9224548339844, "loss": 0.3217, "rewards/chosen": -1.5316705703735352, "rewards/margins": 1.1650197505950928, "rewards/rejected": -2.696690320968628, "step": 291 }, { "epoch": 0.08, "grad_norm": 32.1880989074707, "kl": 0.0, "learning_rate": 4.6179010730175345e-07, "logps/chosen": -230.33580017089844, "logps/rejected": -185.30084228515625, "loss": 0.5325, "rewards/chosen": -2.6992344856262207, "rewards/margins": -1.3056819438934326, "rewards/rejected": -1.393552541732788, "step": 292 }, { "epoch": 0.08, "grad_norm": 29.584102630615234, "kl": 0.0, "learning_rate": 4.6165925150484164e-07, "logps/chosen": -213.70803833007812, "logps/rejected": -351.8007507324219, "loss": 0.2521, "rewards/chosen": 0.7046425938606262, "rewards/margins": 5.104078769683838, "rewards/rejected": -4.399435997009277, "step": 293 }, { "epoch": 0.08, "grad_norm": 38.5718879699707, "kl": 0.0, "learning_rate": 4.6152839570792984e-07, "logps/chosen": -226.90501403808594, "logps/rejected": -238.16050720214844, "loss": 0.3634, "rewards/chosen": -1.4450457096099854, "rewards/margins": 1.0846538543701172, "rewards/rejected": -2.5296995639801025, "step": 294 }, { "epoch": 0.08, "grad_norm": 32.4056396484375, "kl": 0.0, "learning_rate": 4.6139753991101804e-07, "logps/chosen": -232.12557983398438, "logps/rejected": -208.5637969970703, "loss": 0.4, "rewards/chosen": -0.6349735260009766, "rewards/margins": 1.985107421875, "rewards/rejected": -2.6200809478759766, "step": 295 }, { "epoch": 0.08, "grad_norm": 33.10838317871094, "kl": 0.0, "learning_rate": 4.6126668411410623e-07, "logps/chosen": -267.5959777832031, "logps/rejected": -214.76614379882812, "loss": 0.3235, "rewards/chosen": 0.1534360647201538, "rewards/margins": 2.617750644683838, "rewards/rejected": -2.4643144607543945, "step": 296 }, { "epoch": 0.08, "grad_norm": 38.25209426879883, "kl": 0.0, "learning_rate": 4.6113582831719443e-07, "logps/chosen": -195.14288330078125, "logps/rejected": -314.3270263671875, "loss": 0.4319, "rewards/chosen": -1.1439883708953857, "rewards/margins": 0.742764949798584, "rewards/rejected": -1.8867533206939697, "step": 297 }, { "epoch": 0.08, "grad_norm": 36.58635711669922, "kl": 0.0, "learning_rate": 4.610049725202826e-07, "logps/chosen": -229.46063232421875, "logps/rejected": -270.51690673828125, "loss": 0.4123, "rewards/chosen": -1.3162682056427002, "rewards/margins": 2.033482313156128, "rewards/rejected": -3.349750518798828, "step": 298 }, { "epoch": 0.08, "grad_norm": 42.640995025634766, "kl": 0.0, "learning_rate": 4.608741167233708e-07, "logps/chosen": -247.8602294921875, "logps/rejected": -213.57020568847656, "loss": 0.3484, "rewards/chosen": -0.9282722473144531, "rewards/margins": 2.3152124881744385, "rewards/rejected": -3.2434847354888916, "step": 299 }, { "epoch": 0.08, "grad_norm": 30.121410369873047, "kl": 0.0, "learning_rate": 4.60743260926459e-07, "logps/chosen": -322.234375, "logps/rejected": -201.81800842285156, "loss": 0.3658, "rewards/chosen": -4.029051303863525, "rewards/margins": -1.586775779724121, "rewards/rejected": -2.4422755241394043, "step": 300 }, { "epoch": 0.08, "grad_norm": 34.066890716552734, "kl": 0.0, "learning_rate": 4.6061240512954727e-07, "logps/chosen": -241.13233947753906, "logps/rejected": -256.83050537109375, "loss": 0.4248, "rewards/chosen": -0.6289612650871277, "rewards/margins": 2.3376858234405518, "rewards/rejected": -2.966647148132324, "step": 301 }, { "epoch": 0.08, "grad_norm": 40.31227493286133, "kl": 0.0, "learning_rate": 4.604815493326354e-07, "logps/chosen": -160.533447265625, "logps/rejected": -278.9567565917969, "loss": 0.2845, "rewards/chosen": -0.09239254146814346, "rewards/margins": 2.371143341064453, "rewards/rejected": -2.463535785675049, "step": 302 }, { "epoch": 0.08, "grad_norm": 35.84306716918945, "kl": 0.0, "learning_rate": 4.603506935357236e-07, "logps/chosen": -315.9750061035156, "logps/rejected": -267.2980041503906, "loss": 0.3654, "rewards/chosen": -2.6014981269836426, "rewards/margins": 0.05855679512023926, "rewards/rejected": -2.660054922103882, "step": 303 }, { "epoch": 0.08, "grad_norm": 33.047218322753906, "kl": 0.0, "learning_rate": 4.602198377388118e-07, "logps/chosen": -217.36009216308594, "logps/rejected": -280.88397216796875, "loss": 0.3229, "rewards/chosen": -0.9439008235931396, "rewards/margins": 1.590653896331787, "rewards/rejected": -2.5345547199249268, "step": 304 }, { "epoch": 0.08, "grad_norm": 28.75385093688965, "kl": 0.0, "learning_rate": 4.600889819419e-07, "logps/chosen": -239.74850463867188, "logps/rejected": -273.96075439453125, "loss": 0.4472, "rewards/chosen": -1.6880508661270142, "rewards/margins": 0.40789759159088135, "rewards/rejected": -2.0959484577178955, "step": 305 }, { "epoch": 0.08, "grad_norm": 32.19318389892578, "kl": 0.0, "learning_rate": 4.599581261449882e-07, "logps/chosen": -210.97512817382812, "logps/rejected": -190.31158447265625, "loss": 0.3105, "rewards/chosen": -0.14496548473834991, "rewards/margins": 3.4035897254943848, "rewards/rejected": -3.5485551357269287, "step": 306 }, { "epoch": 0.08, "grad_norm": 35.92880630493164, "kl": 0.0, "learning_rate": 4.598272703480764e-07, "logps/chosen": -287.72381591796875, "logps/rejected": -233.5065155029297, "loss": 0.3544, "rewards/chosen": -0.6701926589012146, "rewards/margins": 2.354236602783203, "rewards/rejected": -3.0244293212890625, "step": 307 }, { "epoch": 0.08, "grad_norm": 40.42152404785156, "kl": 0.0, "learning_rate": 4.596964145511646e-07, "logps/chosen": -209.75469970703125, "logps/rejected": -257.677978515625, "loss": 0.3458, "rewards/chosen": -1.3090839385986328, "rewards/margins": 0.7361466884613037, "rewards/rejected": -2.0452306270599365, "step": 308 }, { "epoch": 0.08, "grad_norm": 31.875043869018555, "kl": 0.0, "learning_rate": 4.595655587542528e-07, "logps/chosen": -283.4571228027344, "logps/rejected": -289.9070739746094, "loss": 0.2973, "rewards/chosen": -0.22630636394023895, "rewards/margins": 2.636915445327759, "rewards/rejected": -2.8632218837738037, "step": 309 }, { "epoch": 0.08, "grad_norm": 28.033388137817383, "kl": 0.0, "learning_rate": 4.59434702957341e-07, "logps/chosen": -218.15414428710938, "logps/rejected": -284.31768798828125, "loss": 0.3171, "rewards/chosen": -0.10873293876647949, "rewards/margins": 3.379168748855591, "rewards/rejected": -3.4879016876220703, "step": 310 }, { "epoch": 0.08, "grad_norm": 26.282222747802734, "kl": 0.0, "learning_rate": 4.593038471604292e-07, "logps/chosen": -297.50421142578125, "logps/rejected": -234.680908203125, "loss": 0.3721, "rewards/chosen": -1.7684048414230347, "rewards/margins": 2.316326141357422, "rewards/rejected": -4.084731101989746, "step": 311 }, { "epoch": 0.08, "grad_norm": 37.98878479003906, "kl": 0.0, "learning_rate": 4.5917299136351737e-07, "logps/chosen": -224.66392517089844, "logps/rejected": -205.10140991210938, "loss": 0.4488, "rewards/chosen": 0.4431179463863373, "rewards/margins": 4.447665214538574, "rewards/rejected": -4.004547119140625, "step": 312 }, { "epoch": 0.08, "grad_norm": 31.1519775390625, "kl": 0.0, "learning_rate": 4.590421355666056e-07, "logps/chosen": -162.36737060546875, "logps/rejected": -209.4821319580078, "loss": 0.328, "rewards/chosen": -1.0744552612304688, "rewards/margins": 0.31392955780029297, "rewards/rejected": -1.3883848190307617, "step": 313 }, { "epoch": 0.08, "grad_norm": 35.20518112182617, "kl": 0.0, "learning_rate": 4.589112797696938e-07, "logps/chosen": -234.55455017089844, "logps/rejected": -276.0071716308594, "loss": 0.2797, "rewards/chosen": -1.737378716468811, "rewards/margins": 0.6882723569869995, "rewards/rejected": -2.4256510734558105, "step": 314 }, { "epoch": 0.08, "grad_norm": 32.201969146728516, "kl": 0.0, "learning_rate": 4.58780423972782e-07, "logps/chosen": -250.39517211914062, "logps/rejected": -225.61228942871094, "loss": 0.3912, "rewards/chosen": -0.4257909655570984, "rewards/margins": 2.58058500289917, "rewards/rejected": -3.006376028060913, "step": 315 }, { "epoch": 0.08, "grad_norm": 34.16560363769531, "kl": 0.0, "learning_rate": 4.586495681758702e-07, "logps/chosen": -188.47059631347656, "logps/rejected": -191.3616943359375, "loss": 0.3849, "rewards/chosen": -1.0985850095748901, "rewards/margins": 0.9038892984390259, "rewards/rejected": -2.002474308013916, "step": 316 }, { "epoch": 0.08, "grad_norm": 29.20052146911621, "kl": 0.0, "learning_rate": 4.585187123789584e-07, "logps/chosen": -232.6088104248047, "logps/rejected": -240.9322509765625, "loss": 0.3795, "rewards/chosen": -1.5760377645492554, "rewards/margins": 1.283504605293274, "rewards/rejected": -2.8595423698425293, "step": 317 }, { "epoch": 0.08, "grad_norm": 30.147212982177734, "kl": 0.0, "learning_rate": 4.5838785658204655e-07, "logps/chosen": -205.91464233398438, "logps/rejected": -161.905029296875, "loss": 0.457, "rewards/chosen": -2.334472417831421, "rewards/margins": 0.6229822635650635, "rewards/rejected": -2.9574546813964844, "step": 318 }, { "epoch": 0.08, "grad_norm": 30.38490104675293, "kl": 0.0, "learning_rate": 4.5825700078513474e-07, "logps/chosen": -263.05523681640625, "logps/rejected": -242.82974243164062, "loss": 0.2415, "rewards/chosen": -0.4325067400932312, "rewards/margins": 2.2001450061798096, "rewards/rejected": -2.6326518058776855, "step": 319 }, { "epoch": 0.08, "grad_norm": 30.640369415283203, "kl": 0.0, "learning_rate": 4.5812614498822294e-07, "logps/chosen": -220.70742797851562, "logps/rejected": -234.42156982421875, "loss": 0.328, "rewards/chosen": -0.45821380615234375, "rewards/margins": 2.211210250854492, "rewards/rejected": -2.669424057006836, "step": 320 }, { "epoch": 0.08, "grad_norm": 33.323116302490234, "kl": 0.0, "learning_rate": 4.5799528919131113e-07, "logps/chosen": -186.67019653320312, "logps/rejected": -245.32626342773438, "loss": 0.3918, "rewards/chosen": -1.0099728107452393, "rewards/margins": 2.810298442840576, "rewards/rejected": -3.8202712535858154, "step": 321 }, { "epoch": 0.08, "grad_norm": 33.337738037109375, "kl": 0.0, "learning_rate": 4.5786443339439933e-07, "logps/chosen": -204.4862518310547, "logps/rejected": -188.58395385742188, "loss": 0.3712, "rewards/chosen": -0.8825057148933411, "rewards/margins": 2.2278504371643066, "rewards/rejected": -3.110356092453003, "step": 322 }, { "epoch": 0.08, "grad_norm": 28.831174850463867, "kl": 0.0, "learning_rate": 4.5773357759748753e-07, "logps/chosen": -147.2012481689453, "logps/rejected": -315.11572265625, "loss": 0.1919, "rewards/chosen": -1.441099762916565, "rewards/margins": 1.3434785604476929, "rewards/rejected": -2.784578323364258, "step": 323 }, { "epoch": 0.08, "grad_norm": 36.66320037841797, "kl": 0.0, "learning_rate": 4.576027218005757e-07, "logps/chosen": -251.1447296142578, "logps/rejected": -287.647705078125, "loss": 0.3665, "rewards/chosen": -0.43537652492523193, "rewards/margins": 3.960326671600342, "rewards/rejected": -4.395703315734863, "step": 324 }, { "epoch": 0.09, "grad_norm": 32.31867599487305, "kl": 0.0, "learning_rate": 4.574718660036639e-07, "logps/chosen": -271.9960021972656, "logps/rejected": -210.555908203125, "loss": 0.4788, "rewards/chosen": -2.4970388412475586, "rewards/margins": 0.04955768585205078, "rewards/rejected": -2.5465965270996094, "step": 325 }, { "epoch": 0.09, "grad_norm": 24.782032012939453, "kl": 0.0, "learning_rate": 4.5734101020675217e-07, "logps/chosen": -270.2687683105469, "logps/rejected": -261.0841369628906, "loss": 0.2655, "rewards/chosen": -0.7793827056884766, "rewards/margins": 4.47045373916626, "rewards/rejected": -5.249836444854736, "step": 326 }, { "epoch": 0.09, "grad_norm": 34.565975189208984, "kl": 0.0, "learning_rate": 4.5721015440984036e-07, "logps/chosen": -173.59933471679688, "logps/rejected": -167.2440185546875, "loss": 0.2976, "rewards/chosen": 0.5397694110870361, "rewards/margins": 2.3671255111694336, "rewards/rejected": -1.827355980873108, "step": 327 }, { "epoch": 0.09, "grad_norm": 28.94892692565918, "kl": 0.0, "learning_rate": 4.5707929861292856e-07, "logps/chosen": -195.4495849609375, "logps/rejected": -331.1904296875, "loss": 0.2857, "rewards/chosen": -1.0347931385040283, "rewards/margins": 0.9696147441864014, "rewards/rejected": -2.0044078826904297, "step": 328 }, { "epoch": 0.09, "grad_norm": 32.52096939086914, "kl": 0.0, "learning_rate": 4.5694844281601676e-07, "logps/chosen": -258.057373046875, "logps/rejected": -178.46829223632812, "loss": 0.3334, "rewards/chosen": -0.9900558590888977, "rewards/margins": 1.796595811843872, "rewards/rejected": -2.786651611328125, "step": 329 }, { "epoch": 0.09, "grad_norm": 27.296085357666016, "kl": 0.0, "learning_rate": 4.5681758701910495e-07, "logps/chosen": -184.41143798828125, "logps/rejected": -201.610107421875, "loss": 0.2267, "rewards/chosen": 0.0977037250995636, "rewards/margins": 4.80125617980957, "rewards/rejected": -4.70355224609375, "step": 330 }, { "epoch": 0.09, "grad_norm": 41.09619140625, "kl": 0.0, "learning_rate": 4.5668673122219315e-07, "logps/chosen": -250.09512329101562, "logps/rejected": -241.91793823242188, "loss": 0.235, "rewards/chosen": -0.5410808324813843, "rewards/margins": 3.400998115539551, "rewards/rejected": -3.9420788288116455, "step": 331 }, { "epoch": 0.09, "grad_norm": 42.17990493774414, "kl": 0.0, "learning_rate": 4.5655587542528134e-07, "logps/chosen": -156.83604431152344, "logps/rejected": -210.15084838867188, "loss": 0.3969, "rewards/chosen": -0.7945634126663208, "rewards/margins": 0.6295374631881714, "rewards/rejected": -1.4241008758544922, "step": 332 }, { "epoch": 0.09, "grad_norm": 27.20249366760254, "kl": 0.0, "learning_rate": 4.564250196283695e-07, "logps/chosen": -219.873291015625, "logps/rejected": -245.4140167236328, "loss": 0.417, "rewards/chosen": -1.2123730182647705, "rewards/margins": 1.475210428237915, "rewards/rejected": -2.6875834465026855, "step": 333 }, { "epoch": 0.09, "grad_norm": 36.41046142578125, "kl": 0.0, "learning_rate": 4.562941638314577e-07, "logps/chosen": -280.1678771972656, "logps/rejected": -204.20095825195312, "loss": 0.3973, "rewards/chosen": -0.5638540983200073, "rewards/margins": 1.0525778532028198, "rewards/rejected": -1.6164319515228271, "step": 334 }, { "epoch": 0.09, "grad_norm": 35.49837875366211, "kl": 0.0, "learning_rate": 4.561633080345459e-07, "logps/chosen": -146.32089233398438, "logps/rejected": -191.40664672851562, "loss": 0.3823, "rewards/chosen": -0.7487538456916809, "rewards/margins": 1.607696294784546, "rewards/rejected": -2.356450080871582, "step": 335 }, { "epoch": 0.09, "grad_norm": 30.863536834716797, "kl": 0.0, "learning_rate": 4.560324522376341e-07, "logps/chosen": -187.48995971679688, "logps/rejected": -209.26446533203125, "loss": 0.3971, "rewards/chosen": 0.22723525762557983, "rewards/margins": 2.7782607078552246, "rewards/rejected": -2.551025390625, "step": 336 }, { "epoch": 0.09, "grad_norm": 37.7666015625, "kl": 0.0, "learning_rate": 4.5590159644072227e-07, "logps/chosen": -248.80059814453125, "logps/rejected": -278.6592712402344, "loss": 0.4711, "rewards/chosen": -1.5165441036224365, "rewards/margins": 1.5899741649627686, "rewards/rejected": -3.106518268585205, "step": 337 }, { "epoch": 0.09, "grad_norm": 37.20514678955078, "kl": 0.0, "learning_rate": 4.5577074064381047e-07, "logps/chosen": -194.8486328125, "logps/rejected": -197.070068359375, "loss": 0.2708, "rewards/chosen": -0.3378767967224121, "rewards/margins": 4.038641452789307, "rewards/rejected": -4.376518249511719, "step": 338 }, { "epoch": 0.09, "grad_norm": 32.2724723815918, "kl": 0.0, "learning_rate": 4.556398848468987e-07, "logps/chosen": -183.97116088867188, "logps/rejected": -131.2269287109375, "loss": 0.4721, "rewards/chosen": -0.4851449131965637, "rewards/margins": 1.3216569423675537, "rewards/rejected": -1.8068017959594727, "step": 339 }, { "epoch": 0.09, "grad_norm": 41.788490295410156, "kl": 0.0, "learning_rate": 4.555090290499869e-07, "logps/chosen": -183.49282836914062, "logps/rejected": -221.72186279296875, "loss": 0.442, "rewards/chosen": -0.2105513960123062, "rewards/margins": 1.0875836610794067, "rewards/rejected": -1.2981350421905518, "step": 340 }, { "epoch": 0.09, "grad_norm": 45.68136978149414, "kl": 0.0, "learning_rate": 4.553781732530751e-07, "logps/chosen": -244.565185546875, "logps/rejected": -161.9296417236328, "loss": 0.3436, "rewards/chosen": -1.4009125232696533, "rewards/margins": 0.33695781230926514, "rewards/rejected": -1.7378703355789185, "step": 341 }, { "epoch": 0.09, "grad_norm": 32.6483039855957, "kl": 0.0, "learning_rate": 4.552473174561633e-07, "logps/chosen": -231.45835876464844, "logps/rejected": -231.79014587402344, "loss": 0.3056, "rewards/chosen": -0.8326898813247681, "rewards/margins": 2.3973751068115234, "rewards/rejected": -3.230065107345581, "step": 342 }, { "epoch": 0.09, "grad_norm": 35.41828536987305, "kl": 0.0, "learning_rate": 4.551164616592515e-07, "logps/chosen": -257.51177978515625, "logps/rejected": -246.73941040039062, "loss": 0.3756, "rewards/chosen": -0.0694907158613205, "rewards/margins": 2.133653402328491, "rewards/rejected": -2.203144073486328, "step": 343 }, { "epoch": 0.09, "grad_norm": 34.879451751708984, "kl": 0.0, "learning_rate": 4.549856058623397e-07, "logps/chosen": -234.83639526367188, "logps/rejected": -155.150146484375, "loss": 0.3385, "rewards/chosen": -0.7099578380584717, "rewards/margins": 1.7405083179473877, "rewards/rejected": -2.4504661560058594, "step": 344 }, { "epoch": 0.09, "grad_norm": 31.389076232910156, "kl": 0.0, "learning_rate": 4.548547500654279e-07, "logps/chosen": -258.70452880859375, "logps/rejected": -311.24713134765625, "loss": 0.4328, "rewards/chosen": -2.0910329818725586, "rewards/margins": 0.573084831237793, "rewards/rejected": -2.6641178131103516, "step": 345 }, { "epoch": 0.09, "grad_norm": 27.579402923583984, "kl": 0.0, "learning_rate": 4.547238942685161e-07, "logps/chosen": -117.16097259521484, "logps/rejected": -220.65586853027344, "loss": 0.4051, "rewards/chosen": -1.0259122848510742, "rewards/margins": 2.0292129516601562, "rewards/rejected": -3.0551252365112305, "step": 346 }, { "epoch": 0.09, "grad_norm": 27.12200164794922, "kl": 0.0, "learning_rate": 4.545930384716043e-07, "logps/chosen": -138.2091827392578, "logps/rejected": -207.37220764160156, "loss": 0.2588, "rewards/chosen": -0.047653257846832275, "rewards/margins": 2.87380313873291, "rewards/rejected": -2.9214563369750977, "step": 347 }, { "epoch": 0.09, "grad_norm": 39.55778121948242, "kl": 0.0, "learning_rate": 4.544621826746925e-07, "logps/chosen": -200.1582489013672, "logps/rejected": -231.9014892578125, "loss": 0.4636, "rewards/chosen": -0.7325445413589478, "rewards/margins": 1.465537667274475, "rewards/rejected": -2.198082208633423, "step": 348 }, { "epoch": 0.09, "grad_norm": 42.7136344909668, "kl": 0.0, "learning_rate": 4.543313268777806e-07, "logps/chosen": -276.27520751953125, "logps/rejected": -225.70761108398438, "loss": 0.4484, "rewards/chosen": -0.7546127438545227, "rewards/margins": 2.266116142272949, "rewards/rejected": -3.020728826522827, "step": 349 }, { "epoch": 0.09, "grad_norm": 29.12485694885254, "kl": 0.0, "learning_rate": 4.542004710808688e-07, "logps/chosen": -203.95103454589844, "logps/rejected": -192.92665100097656, "loss": 0.257, "rewards/chosen": -2.052464008331299, "rewards/margins": 0.40848374366760254, "rewards/rejected": -2.4609477519989014, "step": 350 }, { "epoch": 0.09, "grad_norm": 36.106475830078125, "kl": 0.0, "learning_rate": 4.54069615283957e-07, "logps/chosen": -229.77401733398438, "logps/rejected": -179.5250244140625, "loss": 0.3838, "rewards/chosen": -0.8526365756988525, "rewards/margins": 0.7249114513397217, "rewards/rejected": -1.5775480270385742, "step": 351 }, { "epoch": 0.09, "grad_norm": 28.871374130249023, "kl": 0.0, "learning_rate": 4.5393875948704527e-07, "logps/chosen": -155.2418975830078, "logps/rejected": -218.70916748046875, "loss": 0.3559, "rewards/chosen": -0.5292015075683594, "rewards/margins": 1.4907526969909668, "rewards/rejected": -2.019954204559326, "step": 352 }, { "epoch": 0.09, "grad_norm": 31.84480857849121, "kl": 0.0, "learning_rate": 4.5380790369013346e-07, "logps/chosen": -153.22557067871094, "logps/rejected": -258.18243408203125, "loss": 0.2501, "rewards/chosen": -1.4030628204345703, "rewards/margins": 1.9705705642700195, "rewards/rejected": -3.37363338470459, "step": 353 }, { "epoch": 0.09, "grad_norm": 31.34514617919922, "kl": 0.0, "learning_rate": 4.5367704789322166e-07, "logps/chosen": -146.6463165283203, "logps/rejected": -246.03086853027344, "loss": 0.3873, "rewards/chosen": 0.3020341992378235, "rewards/margins": 2.9109833240509033, "rewards/rejected": -2.6089491844177246, "step": 354 }, { "epoch": 0.09, "grad_norm": 33.196434020996094, "kl": 0.0, "learning_rate": 4.5354619209630985e-07, "logps/chosen": -186.69134521484375, "logps/rejected": -210.71072387695312, "loss": 0.3469, "rewards/chosen": 0.3410475254058838, "rewards/margins": 2.4019737243652344, "rewards/rejected": -2.0609261989593506, "step": 355 }, { "epoch": 0.09, "grad_norm": 27.802734375, "kl": 0.0, "learning_rate": 4.5341533629939805e-07, "logps/chosen": -199.94622802734375, "logps/rejected": -146.0865020751953, "loss": 0.3936, "rewards/chosen": -0.6471868753433228, "rewards/margins": 1.7640784978866577, "rewards/rejected": -2.4112653732299805, "step": 356 }, { "epoch": 0.09, "grad_norm": 38.54668045043945, "kl": 0.0, "learning_rate": 4.5328448050248625e-07, "logps/chosen": -223.62625122070312, "logps/rejected": -254.55821228027344, "loss": 0.3434, "rewards/chosen": -1.1160787343978882, "rewards/margins": 1.0182207822799683, "rewards/rejected": -2.1342995166778564, "step": 357 }, { "epoch": 0.09, "grad_norm": 28.735496520996094, "kl": 0.0, "learning_rate": 4.5315362470557444e-07, "logps/chosen": -211.4855194091797, "logps/rejected": -257.02008056640625, "loss": 0.2874, "rewards/chosen": -0.2564331591129303, "rewards/margins": 3.4900126457214355, "rewards/rejected": -3.746445894241333, "step": 358 }, { "epoch": 0.09, "grad_norm": 35.81536102294922, "kl": 0.0, "learning_rate": 4.5302276890866264e-07, "logps/chosen": -304.2633361816406, "logps/rejected": -209.7054901123047, "loss": 0.2967, "rewards/chosen": -0.46264463663101196, "rewards/margins": 3.166010856628418, "rewards/rejected": -3.628655433654785, "step": 359 }, { "epoch": 0.09, "grad_norm": 40.20705795288086, "kl": 0.0, "learning_rate": 4.5289191311175084e-07, "logps/chosen": -266.1392822265625, "logps/rejected": -251.1559600830078, "loss": 0.4054, "rewards/chosen": -0.13369740545749664, "rewards/margins": 1.7442787885665894, "rewards/rejected": -1.8779761791229248, "step": 360 }, { "epoch": 0.09, "grad_norm": 31.861595153808594, "kl": 0.0, "learning_rate": 4.5276105731483903e-07, "logps/chosen": -224.10833740234375, "logps/rejected": -223.89828491210938, "loss": 0.2936, "rewards/chosen": -0.20376244187355042, "rewards/margins": 2.0179710388183594, "rewards/rejected": -2.221733570098877, "step": 361 }, { "epoch": 0.09, "grad_norm": 34.194580078125, "kl": 0.0, "learning_rate": 4.5263020151792723e-07, "logps/chosen": -229.5283203125, "logps/rejected": -227.322998046875, "loss": 0.3522, "rewards/chosen": 0.7541962265968323, "rewards/margins": 2.9511911869049072, "rewards/rejected": -2.1969950199127197, "step": 362 }, { "epoch": 0.1, "grad_norm": 33.832305908203125, "kl": 0.0, "learning_rate": 4.524993457210154e-07, "logps/chosen": -225.90982055664062, "logps/rejected": -228.3668975830078, "loss": 0.3117, "rewards/chosen": -0.7936959266662598, "rewards/margins": 1.0352134704589844, "rewards/rejected": -1.8289093971252441, "step": 363 }, { "epoch": 0.1, "grad_norm": 29.428434371948242, "kl": 0.0, "learning_rate": 4.5236848992410357e-07, "logps/chosen": -185.18519592285156, "logps/rejected": -264.0128173828125, "loss": 0.2548, "rewards/chosen": -1.0546776056289673, "rewards/margins": 2.7360973358154297, "rewards/rejected": -3.7907750606536865, "step": 364 }, { "epoch": 0.1, "grad_norm": 32.661277770996094, "kl": 0.0, "learning_rate": 4.522376341271918e-07, "logps/chosen": -208.7366180419922, "logps/rejected": -296.23974609375, "loss": 0.3074, "rewards/chosen": 0.3005746304988861, "rewards/margins": 2.755589246749878, "rewards/rejected": -2.455014705657959, "step": 365 }, { "epoch": 0.1, "grad_norm": 30.65373420715332, "kl": 0.0, "learning_rate": 4.5210677833028e-07, "logps/chosen": -259.0675354003906, "logps/rejected": -204.466064453125, "loss": 0.2904, "rewards/chosen": -1.7415310144424438, "rewards/margins": 0.5091778039932251, "rewards/rejected": -2.250708818435669, "step": 366 }, { "epoch": 0.1, "grad_norm": 35.81827926635742, "kl": 0.0, "learning_rate": 4.519759225333682e-07, "logps/chosen": -244.27406311035156, "logps/rejected": -152.07162475585938, "loss": 0.3159, "rewards/chosen": -1.1547815799713135, "rewards/margins": 1.677286148071289, "rewards/rejected": -2.8320677280426025, "step": 367 }, { "epoch": 0.1, "grad_norm": 40.60730743408203, "kl": 0.0, "learning_rate": 4.518450667364564e-07, "logps/chosen": -300.86279296875, "logps/rejected": -291.1861572265625, "loss": 0.4233, "rewards/chosen": 0.04416877031326294, "rewards/margins": 3.0914156436920166, "rewards/rejected": -3.0472469329833984, "step": 368 }, { "epoch": 0.1, "grad_norm": 42.675716400146484, "kl": 0.0, "learning_rate": 4.517142109395446e-07, "logps/chosen": -193.25894165039062, "logps/rejected": -163.64535522460938, "loss": 0.4682, "rewards/chosen": -1.3016144037246704, "rewards/margins": 0.8137959241867065, "rewards/rejected": -2.115410327911377, "step": 369 }, { "epoch": 0.1, "grad_norm": 33.09608840942383, "kl": 0.0, "learning_rate": 4.515833551426328e-07, "logps/chosen": -160.6686553955078, "logps/rejected": -247.52890014648438, "loss": 0.3394, "rewards/chosen": -0.44542351365089417, "rewards/margins": 2.8359451293945312, "rewards/rejected": -3.2813687324523926, "step": 370 }, { "epoch": 0.1, "grad_norm": 49.479881286621094, "kl": 0.0, "learning_rate": 4.51452499345721e-07, "logps/chosen": -273.73419189453125, "logps/rejected": -243.27877807617188, "loss": 0.4571, "rewards/chosen": -1.3881046772003174, "rewards/margins": -0.02787947654724121, "rewards/rejected": -1.3602252006530762, "step": 371 }, { "epoch": 0.1, "grad_norm": 41.906150817871094, "kl": 0.0, "learning_rate": 4.513216435488092e-07, "logps/chosen": -220.07510375976562, "logps/rejected": -236.5010528564453, "loss": 0.331, "rewards/chosen": -0.32473501563072205, "rewards/margins": 2.5255327224731445, "rewards/rejected": -2.8502676486968994, "step": 372 }, { "epoch": 0.1, "grad_norm": 42.59035110473633, "kl": 0.0, "learning_rate": 4.511907877518974e-07, "logps/chosen": -282.63482666015625, "logps/rejected": -258.4151306152344, "loss": 0.2893, "rewards/chosen": -0.18127818405628204, "rewards/margins": 1.0610069036483765, "rewards/rejected": -1.242285132408142, "step": 373 }, { "epoch": 0.1, "grad_norm": 37.68351364135742, "kl": 0.0, "learning_rate": 4.510599319549856e-07, "logps/chosen": -210.8810272216797, "logps/rejected": -224.22665405273438, "loss": 0.3508, "rewards/chosen": -1.3154093027114868, "rewards/margins": 2.27657413482666, "rewards/rejected": -3.5919833183288574, "step": 374 }, { "epoch": 0.1, "grad_norm": 31.09569549560547, "kl": 0.0, "learning_rate": 4.509290761580738e-07, "logps/chosen": -195.7852783203125, "logps/rejected": -257.9025573730469, "loss": 0.2977, "rewards/chosen": -0.7623598575592041, "rewards/margins": 3.2512733936309814, "rewards/rejected": -4.0136332511901855, "step": 375 }, { "epoch": 0.1, "grad_norm": 28.333711624145508, "kl": 0.0, "learning_rate": 4.5079822036116197e-07, "logps/chosen": -151.38967895507812, "logps/rejected": -297.0498046875, "loss": 0.3272, "rewards/chosen": -1.5544800758361816, "rewards/margins": 0.2516918182373047, "rewards/rejected": -1.8061718940734863, "step": 376 }, { "epoch": 0.1, "grad_norm": 36.108795166015625, "kl": 0.0, "learning_rate": 4.506673645642502e-07, "logps/chosen": -227.50051879882812, "logps/rejected": -211.9762420654297, "loss": 0.4023, "rewards/chosen": -0.5598442554473877, "rewards/margins": 2.065431833267212, "rewards/rejected": -2.6252760887145996, "step": 377 }, { "epoch": 0.1, "grad_norm": 33.0745849609375, "kl": 0.0, "learning_rate": 4.505365087673384e-07, "logps/chosen": -213.019287109375, "logps/rejected": -207.67184448242188, "loss": 0.3544, "rewards/chosen": -1.3546139001846313, "rewards/margins": 0.8756808042526245, "rewards/rejected": -2.230294704437256, "step": 378 }, { "epoch": 0.1, "grad_norm": 34.654964447021484, "kl": 0.0, "learning_rate": 4.504056529704266e-07, "logps/chosen": -182.68475341796875, "logps/rejected": -255.54612731933594, "loss": 0.3742, "rewards/chosen": -1.2387803792953491, "rewards/margins": 1.8842393159866333, "rewards/rejected": -3.1230196952819824, "step": 379 }, { "epoch": 0.1, "grad_norm": 32.7558479309082, "kl": 0.0, "learning_rate": 4.5027479717351476e-07, "logps/chosen": -192.14639282226562, "logps/rejected": -229.38015747070312, "loss": 0.3754, "rewards/chosen": -0.07900139689445496, "rewards/margins": 2.6180408000946045, "rewards/rejected": -2.697042226791382, "step": 380 }, { "epoch": 0.1, "grad_norm": 36.6190071105957, "kl": 0.0, "learning_rate": 4.5014394137660295e-07, "logps/chosen": -268.7249755859375, "logps/rejected": -251.80455017089844, "loss": 0.4801, "rewards/chosen": -1.5141268968582153, "rewards/margins": 1.2382701635360718, "rewards/rejected": -2.752397060394287, "step": 381 }, { "epoch": 0.1, "grad_norm": 40.666839599609375, "kl": 0.0, "learning_rate": 4.5001308557969115e-07, "logps/chosen": -243.43234252929688, "logps/rejected": -258.41448974609375, "loss": 0.3511, "rewards/chosen": -0.3709592819213867, "rewards/margins": 1.2282294034957886, "rewards/rejected": -1.5991886854171753, "step": 382 }, { "epoch": 0.1, "grad_norm": 28.4544620513916, "kl": 0.0, "learning_rate": 4.4988222978277935e-07, "logps/chosen": -198.5283660888672, "logps/rejected": -217.16868591308594, "loss": 0.3838, "rewards/chosen": -2.058994770050049, "rewards/margins": 0.5409789085388184, "rewards/rejected": -2.599973678588867, "step": 383 }, { "epoch": 0.1, "grad_norm": 39.92369079589844, "kl": 0.0, "learning_rate": 4.4975137398586754e-07, "logps/chosen": -201.83322143554688, "logps/rejected": -245.54714965820312, "loss": 0.3817, "rewards/chosen": -0.4379090666770935, "rewards/margins": 2.6852946281433105, "rewards/rejected": -3.123203754425049, "step": 384 }, { "epoch": 0.1, "grad_norm": 42.2255744934082, "kl": 0.0, "learning_rate": 4.4962051818895574e-07, "logps/chosen": -183.98609924316406, "logps/rejected": -319.1829833984375, "loss": 0.4237, "rewards/chosen": -0.3203003704547882, "rewards/margins": 2.765152931213379, "rewards/rejected": -3.0854532718658447, "step": 385 }, { "epoch": 0.1, "grad_norm": 29.236879348754883, "kl": 0.0, "learning_rate": 4.4948966239204393e-07, "logps/chosen": -93.41141510009766, "logps/rejected": -202.96383666992188, "loss": 0.327, "rewards/chosen": -0.3800518810749054, "rewards/margins": 2.651289701461792, "rewards/rejected": -3.031341552734375, "step": 386 }, { "epoch": 0.1, "grad_norm": 37.4599609375, "kl": 0.0, "learning_rate": 4.4935880659513213e-07, "logps/chosen": -271.9604797363281, "logps/rejected": -222.61170959472656, "loss": 0.4275, "rewards/chosen": 0.17430379986763, "rewards/margins": 2.8483176231384277, "rewards/rejected": -2.67401385307312, "step": 387 }, { "epoch": 0.1, "grad_norm": 32.062278747558594, "kl": 0.0, "learning_rate": 4.492279507982203e-07, "logps/chosen": -190.10693359375, "logps/rejected": -176.67543029785156, "loss": 0.2995, "rewards/chosen": -0.9995455145835876, "rewards/margins": 1.7313144207000732, "rewards/rejected": -2.7308599948883057, "step": 388 }, { "epoch": 0.1, "grad_norm": 38.55339813232422, "kl": 0.0, "learning_rate": 4.490970950013085e-07, "logps/chosen": -220.9341583251953, "logps/rejected": -299.83245849609375, "loss": 0.3378, "rewards/chosen": -0.39133620262145996, "rewards/margins": 2.5305755138397217, "rewards/rejected": -2.9219117164611816, "step": 389 }, { "epoch": 0.1, "grad_norm": 39.30725860595703, "kl": 0.0, "learning_rate": 4.4896623920439677e-07, "logps/chosen": -182.44326782226562, "logps/rejected": -174.75648498535156, "loss": 0.3955, "rewards/chosen": -0.47413867712020874, "rewards/margins": 1.2516155242919922, "rewards/rejected": -1.7257542610168457, "step": 390 }, { "epoch": 0.1, "grad_norm": 33.8344612121582, "kl": 0.0, "learning_rate": 4.4883538340748497e-07, "logps/chosen": -236.1179656982422, "logps/rejected": -250.09539794921875, "loss": 0.4352, "rewards/chosen": -1.4198930263519287, "rewards/margins": 1.684675931930542, "rewards/rejected": -3.1045689582824707, "step": 391 }, { "epoch": 0.1, "grad_norm": 30.08695411682129, "kl": 0.0, "learning_rate": 4.4870452761057316e-07, "logps/chosen": -186.74351501464844, "logps/rejected": -253.93115234375, "loss": 0.2261, "rewards/chosen": -0.4091317355632782, "rewards/margins": 3.6403825283050537, "rewards/rejected": -4.049514293670654, "step": 392 }, { "epoch": 0.1, "grad_norm": 34.06538772583008, "kl": 0.0, "learning_rate": 4.4857367181366136e-07, "logps/chosen": -193.60354614257812, "logps/rejected": -134.43006896972656, "loss": 0.3712, "rewards/chosen": -0.271693617105484, "rewards/margins": 1.1191461086273193, "rewards/rejected": -1.390839695930481, "step": 393 }, { "epoch": 0.1, "grad_norm": 34.09449768066406, "kl": 0.0, "learning_rate": 4.4844281601674956e-07, "logps/chosen": -218.30441284179688, "logps/rejected": -234.31195068359375, "loss": 0.2001, "rewards/chosen": 0.4233044981956482, "rewards/margins": 5.021885871887207, "rewards/rejected": -4.598581314086914, "step": 394 }, { "epoch": 0.1, "grad_norm": 42.07844924926758, "kl": 0.0, "learning_rate": 4.483119602198377e-07, "logps/chosen": -231.95108032226562, "logps/rejected": -268.6423645019531, "loss": 0.4383, "rewards/chosen": -0.824382483959198, "rewards/margins": 2.927910089492798, "rewards/rejected": -3.7522926330566406, "step": 395 }, { "epoch": 0.1, "grad_norm": 38.50265121459961, "kl": 0.0, "learning_rate": 4.481811044229259e-07, "logps/chosen": -270.509033203125, "logps/rejected": -226.2156219482422, "loss": 0.2796, "rewards/chosen": -0.18926948308944702, "rewards/margins": 1.739548921585083, "rewards/rejected": -1.9288183450698853, "step": 396 }, { "epoch": 0.1, "grad_norm": 31.2486515045166, "kl": 0.0, "learning_rate": 4.480502486260141e-07, "logps/chosen": -109.7884521484375, "logps/rejected": -203.29953002929688, "loss": 0.4896, "rewards/chosen": -0.8157114386558533, "rewards/margins": 0.568125307559967, "rewards/rejected": -1.3838367462158203, "step": 397 }, { "epoch": 0.1, "grad_norm": 31.697792053222656, "kl": 0.0, "learning_rate": 4.479193928291023e-07, "logps/chosen": -174.076171875, "logps/rejected": -253.45223999023438, "loss": 0.3, "rewards/chosen": -0.8394641876220703, "rewards/margins": 2.1134212017059326, "rewards/rejected": -2.952885389328003, "step": 398 }, { "epoch": 0.1, "grad_norm": 28.867341995239258, "kl": 0.0, "learning_rate": 4.477885370321905e-07, "logps/chosen": -168.5074005126953, "logps/rejected": -125.79701232910156, "loss": 0.3656, "rewards/chosen": -0.439168244600296, "rewards/margins": 0.6221072673797607, "rewards/rejected": -1.0612754821777344, "step": 399 }, { "epoch": 0.1, "grad_norm": 35.56822967529297, "kl": 0.0, "learning_rate": 4.476576812352787e-07, "logps/chosen": -166.24661254882812, "logps/rejected": -263.20733642578125, "loss": 0.3092, "rewards/chosen": -0.6019458770751953, "rewards/margins": 2.142655611038208, "rewards/rejected": -2.7446014881134033, "step": 400 }, { "epoch": 0.1, "grad_norm": 34.35905838012695, "kl": 0.0, "learning_rate": 4.475268254383669e-07, "logps/chosen": -185.26710510253906, "logps/rejected": -250.02255249023438, "loss": 0.2589, "rewards/chosen": 0.7835355401039124, "rewards/margins": 4.608429431915283, "rewards/rejected": -3.8248939514160156, "step": 401 }, { "epoch": 0.11, "grad_norm": 31.946916580200195, "kl": 0.0, "learning_rate": 4.4739596964145507e-07, "logps/chosen": -147.8795623779297, "logps/rejected": -273.70904541015625, "loss": 0.4864, "rewards/chosen": -0.9677847027778625, "rewards/margins": 1.813450813293457, "rewards/rejected": -2.781235456466675, "step": 402 }, { "epoch": 0.11, "grad_norm": 34.859928131103516, "kl": 0.0, "learning_rate": 4.472651138445433e-07, "logps/chosen": -235.14657592773438, "logps/rejected": -225.0830078125, "loss": 0.3405, "rewards/chosen": -1.2147836685180664, "rewards/margins": 1.1082990169525146, "rewards/rejected": -2.323082685470581, "step": 403 }, { "epoch": 0.11, "grad_norm": 34.983360290527344, "kl": 0.0, "learning_rate": 4.471342580476315e-07, "logps/chosen": -163.03114318847656, "logps/rejected": -290.476318359375, "loss": 0.2981, "rewards/chosen": -1.457593321800232, "rewards/margins": 2.553722381591797, "rewards/rejected": -4.011315822601318, "step": 404 }, { "epoch": 0.11, "grad_norm": 33.62318801879883, "kl": 0.0, "learning_rate": 4.470034022507197e-07, "logps/chosen": -233.85498046875, "logps/rejected": -183.947509765625, "loss": 0.4204, "rewards/chosen": -0.9742795825004578, "rewards/margins": 2.231663227081299, "rewards/rejected": -3.2059428691864014, "step": 405 }, { "epoch": 0.11, "grad_norm": 45.01471710205078, "kl": 0.0, "learning_rate": 4.468725464538079e-07, "logps/chosen": -193.53396606445312, "logps/rejected": -337.14697265625, "loss": 0.4105, "rewards/chosen": -0.45279431343078613, "rewards/margins": 1.2500965595245361, "rewards/rejected": -1.7028908729553223, "step": 406 }, { "epoch": 0.11, "grad_norm": 28.890682220458984, "kl": 0.0, "learning_rate": 4.467416906568961e-07, "logps/chosen": -208.4068145751953, "logps/rejected": -235.36312866210938, "loss": 0.3001, "rewards/chosen": 0.7755467295646667, "rewards/margins": 3.6093943119049072, "rewards/rejected": -2.8338475227355957, "step": 407 }, { "epoch": 0.11, "grad_norm": 31.09847068786621, "kl": 0.0, "learning_rate": 4.466108348599843e-07, "logps/chosen": -238.82534790039062, "logps/rejected": -218.0955352783203, "loss": 0.321, "rewards/chosen": 0.5067283511161804, "rewards/margins": 3.1289937496185303, "rewards/rejected": -2.622265338897705, "step": 408 }, { "epoch": 0.11, "grad_norm": 30.704862594604492, "kl": 0.0, "learning_rate": 4.464799790630725e-07, "logps/chosen": -318.6136169433594, "logps/rejected": -350.36749267578125, "loss": 0.3123, "rewards/chosen": -1.7736846208572388, "rewards/margins": 1.854911208152771, "rewards/rejected": -3.6285958290100098, "step": 409 }, { "epoch": 0.11, "grad_norm": 38.331790924072266, "kl": 0.0, "learning_rate": 4.4634912326616064e-07, "logps/chosen": -206.4320068359375, "logps/rejected": -287.9844970703125, "loss": 0.3798, "rewards/chosen": -0.3898891806602478, "rewards/margins": 3.1153624057769775, "rewards/rejected": -3.50525164604187, "step": 410 }, { "epoch": 0.11, "grad_norm": 25.918659210205078, "kl": 0.0, "learning_rate": 4.4621826746924884e-07, "logps/chosen": -194.8382568359375, "logps/rejected": -218.07589721679688, "loss": 0.2852, "rewards/chosen": -0.7628296613693237, "rewards/margins": 3.037680149078369, "rewards/rejected": -3.8005099296569824, "step": 411 }, { "epoch": 0.11, "grad_norm": 39.002342224121094, "kl": 0.0, "learning_rate": 4.4608741167233703e-07, "logps/chosen": -172.56983947753906, "logps/rejected": -279.04034423828125, "loss": 0.4511, "rewards/chosen": -0.5036152601242065, "rewards/margins": 1.6097949743270874, "rewards/rejected": -2.113410234451294, "step": 412 }, { "epoch": 0.11, "grad_norm": 26.85070037841797, "kl": 0.0, "learning_rate": 4.4595655587542523e-07, "logps/chosen": -221.30715942382812, "logps/rejected": -273.9478759765625, "loss": 0.3823, "rewards/chosen": 0.18906378746032715, "rewards/margins": 3.575185537338257, "rewards/rejected": -3.3861217498779297, "step": 413 }, { "epoch": 0.11, "grad_norm": 32.324066162109375, "kl": 0.0, "learning_rate": 4.458257000785134e-07, "logps/chosen": -226.74598693847656, "logps/rejected": -250.83737182617188, "loss": 0.2715, "rewards/chosen": -0.3493311405181885, "rewards/margins": 3.514986753463745, "rewards/rejected": -3.8643178939819336, "step": 414 }, { "epoch": 0.11, "grad_norm": 33.729129791259766, "kl": 0.0, "learning_rate": 4.456948442816016e-07, "logps/chosen": -257.4822998046875, "logps/rejected": -214.71878051757812, "loss": 0.3478, "rewards/chosen": -1.7488336563110352, "rewards/margins": 2.3019628524780273, "rewards/rejected": -4.0507965087890625, "step": 415 }, { "epoch": 0.11, "grad_norm": 26.120983123779297, "kl": 0.0, "learning_rate": 4.4556398848468987e-07, "logps/chosen": -184.21572875976562, "logps/rejected": -255.57809448242188, "loss": 0.321, "rewards/chosen": -1.7084475755691528, "rewards/margins": 1.36984121799469, "rewards/rejected": -3.0782887935638428, "step": 416 }, { "epoch": 0.11, "grad_norm": 29.163272857666016, "kl": 0.0, "learning_rate": 4.4543313268777807e-07, "logps/chosen": -147.19149780273438, "logps/rejected": -168.89613342285156, "loss": 0.3638, "rewards/chosen": -0.3925864100456238, "rewards/margins": 1.8936991691589355, "rewards/rejected": -2.286285638809204, "step": 417 }, { "epoch": 0.11, "grad_norm": 33.93145751953125, "kl": 0.0, "learning_rate": 4.4530227689086626e-07, "logps/chosen": -214.961181640625, "logps/rejected": -205.99252319335938, "loss": 0.3564, "rewards/chosen": -0.7181621193885803, "rewards/margins": 1.9614551067352295, "rewards/rejected": -2.679617166519165, "step": 418 }, { "epoch": 0.11, "grad_norm": 30.19272232055664, "kl": 0.0, "learning_rate": 4.4517142109395446e-07, "logps/chosen": -200.70773315429688, "logps/rejected": -210.8472442626953, "loss": 0.3503, "rewards/chosen": -1.6026099920272827, "rewards/margins": 0.9804550409317017, "rewards/rejected": -2.5830650329589844, "step": 419 }, { "epoch": 0.11, "grad_norm": 30.529420852661133, "kl": 0.0, "learning_rate": 4.4504056529704265e-07, "logps/chosen": -209.10699462890625, "logps/rejected": -269.783447265625, "loss": 0.2915, "rewards/chosen": -1.2561055421829224, "rewards/margins": 2.0115232467651367, "rewards/rejected": -3.2676289081573486, "step": 420 }, { "epoch": 0.11, "grad_norm": 29.09016990661621, "kl": 0.0, "learning_rate": 4.4490970950013085e-07, "logps/chosen": -326.3652038574219, "logps/rejected": -254.44314575195312, "loss": 0.3048, "rewards/chosen": -1.2969313859939575, "rewards/margins": 2.0072875022888184, "rewards/rejected": -3.3042187690734863, "step": 421 }, { "epoch": 0.11, "grad_norm": 26.089576721191406, "kl": 0.0, "learning_rate": 4.4477885370321905e-07, "logps/chosen": -287.49560546875, "logps/rejected": -273.09234619140625, "loss": 0.2429, "rewards/chosen": -0.9192136526107788, "rewards/margins": 3.4407401084899902, "rewards/rejected": -4.359953880310059, "step": 422 }, { "epoch": 0.11, "grad_norm": 24.35614013671875, "kl": 0.0, "learning_rate": 4.4464799790630724e-07, "logps/chosen": -302.5399475097656, "logps/rejected": -250.1197967529297, "loss": 0.1487, "rewards/chosen": 0.8381339907646179, "rewards/margins": 4.958958625793457, "rewards/rejected": -4.120824813842773, "step": 423 }, { "epoch": 0.11, "grad_norm": 43.92964553833008, "kl": 0.0, "learning_rate": 4.4451714210939544e-07, "logps/chosen": -217.13169860839844, "logps/rejected": -206.9733428955078, "loss": 0.374, "rewards/chosen": -1.333292007446289, "rewards/margins": 4.744856834411621, "rewards/rejected": -6.07814884185791, "step": 424 }, { "epoch": 0.11, "grad_norm": 41.32809829711914, "kl": 0.0, "learning_rate": 4.4438628631248363e-07, "logps/chosen": -259.3089904785156, "logps/rejected": -276.80120849609375, "loss": 0.2415, "rewards/chosen": -0.16916459798812866, "rewards/margins": 3.5735318660736084, "rewards/rejected": -3.742696523666382, "step": 425 }, { "epoch": 0.11, "grad_norm": 39.2205810546875, "kl": 0.0, "learning_rate": 4.442554305155718e-07, "logps/chosen": -261.5947265625, "logps/rejected": -271.5604248046875, "loss": 0.426, "rewards/chosen": -0.7233175039291382, "rewards/margins": 3.2420592308044434, "rewards/rejected": -3.965376615524292, "step": 426 }, { "epoch": 0.11, "grad_norm": 41.977691650390625, "kl": 0.0, "learning_rate": 4.4412457471866e-07, "logps/chosen": -246.94467163085938, "logps/rejected": -245.75967407226562, "loss": 0.3938, "rewards/chosen": -1.275907039642334, "rewards/margins": 2.263812780380249, "rewards/rejected": -3.539719820022583, "step": 427 }, { "epoch": 0.11, "grad_norm": 29.9937744140625, "kl": 0.0, "learning_rate": 4.4399371892174817e-07, "logps/chosen": -205.16627502441406, "logps/rejected": -245.968505859375, "loss": 0.3472, "rewards/chosen": -1.6494184732437134, "rewards/margins": 3.15864896774292, "rewards/rejected": -4.808067321777344, "step": 428 }, { "epoch": 0.11, "grad_norm": 35.75617980957031, "kl": 0.0, "learning_rate": 4.438628631248364e-07, "logps/chosen": -187.45712280273438, "logps/rejected": -228.374755859375, "loss": 0.3824, "rewards/chosen": -0.853541374206543, "rewards/margins": 1.3908743858337402, "rewards/rejected": -2.244415760040283, "step": 429 }, { "epoch": 0.11, "grad_norm": 30.78190040588379, "kl": 0.0, "learning_rate": 4.437320073279246e-07, "logps/chosen": -180.026123046875, "logps/rejected": -260.7657165527344, "loss": 0.4076, "rewards/chosen": -0.43349024653434753, "rewards/margins": 2.4749932289123535, "rewards/rejected": -2.9084835052490234, "step": 430 }, { "epoch": 0.11, "grad_norm": 39.93263626098633, "kl": 0.0, "learning_rate": 4.436011515310128e-07, "logps/chosen": -221.72950744628906, "logps/rejected": -139.82058715820312, "loss": 0.3167, "rewards/chosen": 0.9334081411361694, "rewards/margins": 2.3411035537719727, "rewards/rejected": -1.4076952934265137, "step": 431 }, { "epoch": 0.11, "grad_norm": 36.58866500854492, "kl": 0.0, "learning_rate": 4.43470295734101e-07, "logps/chosen": -187.53683471679688, "logps/rejected": -194.1912841796875, "loss": 0.4121, "rewards/chosen": -0.8121582269668579, "rewards/margins": 1.4370888471603394, "rewards/rejected": -2.2492470741271973, "step": 432 }, { "epoch": 0.11, "grad_norm": 37.1215705871582, "kl": 0.0, "learning_rate": 4.433394399371892e-07, "logps/chosen": -194.21035766601562, "logps/rejected": -261.7844543457031, "loss": 0.2948, "rewards/chosen": -0.80370032787323, "rewards/margins": 2.7978358268737793, "rewards/rejected": -3.6015360355377197, "step": 433 }, { "epoch": 0.11, "grad_norm": 23.91290283203125, "kl": 0.0, "learning_rate": 4.432085841402774e-07, "logps/chosen": -223.78013610839844, "logps/rejected": -208.88961791992188, "loss": 0.3526, "rewards/chosen": -0.15191972255706787, "rewards/margins": 2.919671058654785, "rewards/rejected": -3.0715909004211426, "step": 434 }, { "epoch": 0.11, "grad_norm": 31.860248565673828, "kl": 0.0, "learning_rate": 4.430777283433656e-07, "logps/chosen": -178.88320922851562, "logps/rejected": -286.60174560546875, "loss": 0.4119, "rewards/chosen": -1.61677086353302, "rewards/margins": 2.9354071617126465, "rewards/rejected": -4.552177906036377, "step": 435 }, { "epoch": 0.11, "grad_norm": 32.55792236328125, "kl": 0.0, "learning_rate": 4.429468725464538e-07, "logps/chosen": -168.29994201660156, "logps/rejected": -196.41845703125, "loss": 0.3352, "rewards/chosen": 1.1356920003890991, "rewards/margins": 5.061020851135254, "rewards/rejected": -3.9253289699554443, "step": 436 }, { "epoch": 0.11, "grad_norm": 35.387332916259766, "kl": 0.0, "learning_rate": 4.42816016749542e-07, "logps/chosen": -148.4381103515625, "logps/rejected": -210.69398498535156, "loss": 0.382, "rewards/chosen": -1.410331130027771, "rewards/margins": 0.021602511405944824, "rewards/rejected": -1.4319336414337158, "step": 437 }, { "epoch": 0.11, "grad_norm": 32.027957916259766, "kl": 0.0, "learning_rate": 4.426851609526302e-07, "logps/chosen": -270.435791015625, "logps/rejected": -170.29534912109375, "loss": 0.2659, "rewards/chosen": -0.9582297801971436, "rewards/margins": 1.9723150730133057, "rewards/rejected": -2.930544853210449, "step": 438 }, { "epoch": 0.11, "grad_norm": 28.259552001953125, "kl": 0.0, "learning_rate": 4.425543051557184e-07, "logps/chosen": -171.87156677246094, "logps/rejected": -164.96092224121094, "loss": 0.3884, "rewards/chosen": -0.9051600098609924, "rewards/margins": 1.720262050628662, "rewards/rejected": -2.6254220008850098, "step": 439 }, { "epoch": 0.12, "grad_norm": 30.440942764282227, "kl": 0.0, "learning_rate": 4.424234493588066e-07, "logps/chosen": -263.3205261230469, "logps/rejected": -270.30535888671875, "loss": 0.3815, "rewards/chosen": -1.8120743036270142, "rewards/margins": 1.4591060876846313, "rewards/rejected": -3.2711803913116455, "step": 440 }, { "epoch": 0.12, "grad_norm": 35.62495040893555, "kl": 0.0, "learning_rate": 4.422925935618947e-07, "logps/chosen": -183.18359375, "logps/rejected": -287.8877258300781, "loss": 0.401, "rewards/chosen": -1.1367744207382202, "rewards/margins": 2.48806095123291, "rewards/rejected": -3.62483549118042, "step": 441 }, { "epoch": 0.12, "grad_norm": 26.652666091918945, "kl": 0.0, "learning_rate": 4.4216173776498297e-07, "logps/chosen": -238.51235961914062, "logps/rejected": -200.37075805664062, "loss": 0.3568, "rewards/chosen": -0.887931227684021, "rewards/margins": 1.5551592111587524, "rewards/rejected": -2.4430904388427734, "step": 442 }, { "epoch": 0.12, "grad_norm": 38.41551208496094, "kl": 0.0, "learning_rate": 4.4203088196807116e-07, "logps/chosen": -159.75192260742188, "logps/rejected": -239.3623046875, "loss": 0.2291, "rewards/chosen": 0.6571463346481323, "rewards/margins": 3.578679084777832, "rewards/rejected": -2.92153263092041, "step": 443 }, { "epoch": 0.12, "grad_norm": 27.979955673217773, "kl": 0.0, "learning_rate": 4.4190002617115936e-07, "logps/chosen": -234.58811950683594, "logps/rejected": -179.65280151367188, "loss": 0.2937, "rewards/chosen": -0.8682149052619934, "rewards/margins": 3.7219350337982178, "rewards/rejected": -4.590149879455566, "step": 444 }, { "epoch": 0.12, "grad_norm": 33.59569549560547, "kl": 0.0, "learning_rate": 4.4176917037424756e-07, "logps/chosen": -220.4178009033203, "logps/rejected": -181.2563934326172, "loss": 0.5664, "rewards/chosen": -1.7921407222747803, "rewards/margins": -0.3066593408584595, "rewards/rejected": -1.4854813814163208, "step": 445 }, { "epoch": 0.12, "grad_norm": 33.9988899230957, "kl": 0.0, "learning_rate": 4.4163831457733575e-07, "logps/chosen": -203.83201599121094, "logps/rejected": -372.3682861328125, "loss": 0.2888, "rewards/chosen": 0.25906336307525635, "rewards/margins": 4.292977809906006, "rewards/rejected": -4.033914566040039, "step": 446 }, { "epoch": 0.12, "grad_norm": 40.23044204711914, "kl": 0.0, "learning_rate": 4.4150745878042395e-07, "logps/chosen": -247.44821166992188, "logps/rejected": -221.32139587402344, "loss": 0.4277, "rewards/chosen": -0.29965513944625854, "rewards/margins": 2.7966206073760986, "rewards/rejected": -3.096275806427002, "step": 447 }, { "epoch": 0.12, "grad_norm": 32.306182861328125, "kl": 0.0, "learning_rate": 4.4137660298351214e-07, "logps/chosen": -174.62615966796875, "logps/rejected": -183.30177307128906, "loss": 0.4944, "rewards/chosen": -0.57502681016922, "rewards/margins": 2.9689502716064453, "rewards/rejected": -3.5439770221710205, "step": 448 }, { "epoch": 0.12, "grad_norm": 29.693349838256836, "kl": 0.0, "learning_rate": 4.4124574718660034e-07, "logps/chosen": -230.96018981933594, "logps/rejected": -289.1417236328125, "loss": 0.4293, "rewards/chosen": -3.256486177444458, "rewards/margins": -0.15216612815856934, "rewards/rejected": -3.1043200492858887, "step": 449 }, { "epoch": 0.12, "grad_norm": 33.37322998046875, "kl": 0.0, "learning_rate": 4.4111489138968854e-07, "logps/chosen": -268.78509521484375, "logps/rejected": -179.531982421875, "loss": 0.3369, "rewards/chosen": -0.8727652430534363, "rewards/margins": 2.1034626960754395, "rewards/rejected": -2.9762279987335205, "step": 450 }, { "epoch": 0.12, "grad_norm": 34.335166931152344, "kl": 0.0, "learning_rate": 4.4098403559277673e-07, "logps/chosen": -316.95068359375, "logps/rejected": -154.5581512451172, "loss": 0.4674, "rewards/chosen": -2.679093360900879, "rewards/margins": -1.4148513078689575, "rewards/rejected": -1.2642420530319214, "step": 451 }, { "epoch": 0.12, "grad_norm": 39.86390686035156, "kl": 0.0, "learning_rate": 4.4085317979586493e-07, "logps/chosen": -219.30465698242188, "logps/rejected": -263.12744140625, "loss": 0.5171, "rewards/chosen": -0.31301993131637573, "rewards/margins": 0.6610985398292542, "rewards/rejected": -0.9741184711456299, "step": 452 }, { "epoch": 0.12, "grad_norm": 30.844038009643555, "kl": 0.0, "learning_rate": 4.407223239989531e-07, "logps/chosen": -225.07394409179688, "logps/rejected": -217.85385131835938, "loss": 0.3538, "rewards/chosen": -0.46413740515708923, "rewards/margins": 2.2845165729522705, "rewards/rejected": -2.7486538887023926, "step": 453 }, { "epoch": 0.12, "grad_norm": 41.72404098510742, "kl": 0.0, "learning_rate": 4.405914682020414e-07, "logps/chosen": -229.8594207763672, "logps/rejected": -241.03822326660156, "loss": 0.4037, "rewards/chosen": -0.8823609948158264, "rewards/margins": 0.31540435552597046, "rewards/rejected": -1.1977653503417969, "step": 454 }, { "epoch": 0.12, "grad_norm": 42.6037483215332, "kl": 0.0, "learning_rate": 4.4046061240512957e-07, "logps/chosen": -300.2820739746094, "logps/rejected": -227.77395629882812, "loss": 0.3454, "rewards/chosen": 0.1626671850681305, "rewards/margins": 2.2905962467193604, "rewards/rejected": -2.1279289722442627, "step": 455 }, { "epoch": 0.12, "grad_norm": 33.86833190917969, "kl": 0.0, "learning_rate": 4.4032975660821777e-07, "logps/chosen": -318.31591796875, "logps/rejected": -182.9334716796875, "loss": 0.3524, "rewards/chosen": 0.35922086238861084, "rewards/margins": 3.2377238273620605, "rewards/rejected": -2.87850284576416, "step": 456 }, { "epoch": 0.12, "grad_norm": 37.31455612182617, "kl": 0.0, "learning_rate": 4.401989008113059e-07, "logps/chosen": -269.36700439453125, "logps/rejected": -237.45333862304688, "loss": 0.3255, "rewards/chosen": -0.11557312309741974, "rewards/margins": 1.8929672241210938, "rewards/rejected": -2.008540391921997, "step": 457 }, { "epoch": 0.12, "grad_norm": 38.61105728149414, "kl": 0.0, "learning_rate": 4.400680450143941e-07, "logps/chosen": -219.1864776611328, "logps/rejected": -252.305908203125, "loss": 0.3503, "rewards/chosen": -1.6341336965560913, "rewards/margins": 2.3688693046569824, "rewards/rejected": -4.003003120422363, "step": 458 }, { "epoch": 0.12, "grad_norm": 37.14291763305664, "kl": 0.0, "learning_rate": 4.399371892174823e-07, "logps/chosen": -265.4627380371094, "logps/rejected": -212.84605407714844, "loss": 0.2379, "rewards/chosen": 0.17642521858215332, "rewards/margins": 2.7223758697509766, "rewards/rejected": -2.5459506511688232, "step": 459 }, { "epoch": 0.12, "grad_norm": 22.45039939880371, "kl": 0.0, "learning_rate": 4.398063334205705e-07, "logps/chosen": -206.451171875, "logps/rejected": -218.98048400878906, "loss": 0.2924, "rewards/chosen": -1.1858196258544922, "rewards/margins": 1.884453296661377, "rewards/rejected": -3.070272922515869, "step": 460 }, { "epoch": 0.12, "grad_norm": 33.07036590576172, "kl": 0.0, "learning_rate": 4.396754776236587e-07, "logps/chosen": -241.53549194335938, "logps/rejected": -200.14956665039062, "loss": 0.431, "rewards/chosen": -0.8274599313735962, "rewards/margins": 1.791495442390442, "rewards/rejected": -2.618955373764038, "step": 461 }, { "epoch": 0.12, "grad_norm": 39.4976806640625, "kl": 0.0, "learning_rate": 4.395446218267469e-07, "logps/chosen": -127.9756088256836, "logps/rejected": -189.22503662109375, "loss": 0.4124, "rewards/chosen": -0.7792689800262451, "rewards/margins": 0.9542557001113892, "rewards/rejected": -1.7335246801376343, "step": 462 }, { "epoch": 0.12, "grad_norm": 37.367706298828125, "kl": 0.0, "learning_rate": 4.394137660298351e-07, "logps/chosen": -269.013916015625, "logps/rejected": -239.1348876953125, "loss": 0.463, "rewards/chosen": -1.267537236213684, "rewards/margins": 0.260334849357605, "rewards/rejected": -1.527872085571289, "step": 463 }, { "epoch": 0.12, "grad_norm": 33.60445022583008, "kl": 0.0, "learning_rate": 4.392829102329233e-07, "logps/chosen": -214.6671142578125, "logps/rejected": -296.5412902832031, "loss": 0.3431, "rewards/chosen": 0.05106091499328613, "rewards/margins": 4.666691780090332, "rewards/rejected": -4.615631103515625, "step": 464 }, { "epoch": 0.12, "grad_norm": 43.4371337890625, "kl": 0.0, "learning_rate": 4.391520544360115e-07, "logps/chosen": -294.7562561035156, "logps/rejected": -311.7882995605469, "loss": 0.4404, "rewards/chosen": -1.535415768623352, "rewards/margins": 3.4017748832702637, "rewards/rejected": -4.937190532684326, "step": 465 }, { "epoch": 0.12, "grad_norm": 31.70909881591797, "kl": 0.0, "learning_rate": 4.390211986390997e-07, "logps/chosen": -289.6376953125, "logps/rejected": -218.08839416503906, "loss": 0.3983, "rewards/chosen": -0.7291394472122192, "rewards/margins": 1.2570115327835083, "rewards/rejected": -1.9861509799957275, "step": 466 }, { "epoch": 0.12, "grad_norm": 36.29698181152344, "kl": 0.0, "learning_rate": 4.388903428421879e-07, "logps/chosen": -302.9260559082031, "logps/rejected": -270.5092468261719, "loss": 0.3438, "rewards/chosen": -1.1891206502914429, "rewards/margins": 1.5823391675949097, "rewards/rejected": -2.7714598178863525, "step": 467 }, { "epoch": 0.12, "grad_norm": 41.817020416259766, "kl": 0.0, "learning_rate": 4.387594870452761e-07, "logps/chosen": -211.2220458984375, "logps/rejected": -263.3524169921875, "loss": 0.2837, "rewards/chosen": 0.021623237058520317, "rewards/margins": 3.064910411834717, "rewards/rejected": -3.0432872772216797, "step": 468 }, { "epoch": 0.12, "grad_norm": 32.100032806396484, "kl": 0.0, "learning_rate": 4.386286312483643e-07, "logps/chosen": -197.6712646484375, "logps/rejected": -304.293212890625, "loss": 0.3716, "rewards/chosen": -0.31620892882347107, "rewards/margins": 4.1429762840271, "rewards/rejected": -4.4591851234436035, "step": 469 }, { "epoch": 0.12, "grad_norm": 35.59612274169922, "kl": 0.0, "learning_rate": 4.384977754514525e-07, "logps/chosen": -198.55593872070312, "logps/rejected": -264.3667297363281, "loss": 0.2592, "rewards/chosen": -0.7703940272331238, "rewards/margins": 1.892730474472046, "rewards/rejected": -2.6631245613098145, "step": 470 }, { "epoch": 0.12, "grad_norm": 39.834415435791016, "kl": 0.0, "learning_rate": 4.383669196545407e-07, "logps/chosen": -242.6627655029297, "logps/rejected": -194.5601806640625, "loss": 0.2882, "rewards/chosen": -0.9662243127822876, "rewards/margins": 0.030240893363952637, "rewards/rejected": -0.9964652061462402, "step": 471 }, { "epoch": 0.12, "grad_norm": 38.55018615722656, "kl": 0.0, "learning_rate": 4.3823606385762885e-07, "logps/chosen": -211.4266357421875, "logps/rejected": -157.74371337890625, "loss": 0.3696, "rewards/chosen": -0.15886731445789337, "rewards/margins": 2.9636335372924805, "rewards/rejected": -3.1225008964538574, "step": 472 }, { "epoch": 0.12, "grad_norm": 36.88655090332031, "kl": 0.0, "learning_rate": 4.3810520806071705e-07, "logps/chosen": -195.3677215576172, "logps/rejected": -293.7377624511719, "loss": 0.2387, "rewards/chosen": 0.6904792189598083, "rewards/margins": 3.3669517040252686, "rewards/rejected": -2.6764724254608154, "step": 473 }, { "epoch": 0.12, "grad_norm": 38.9611701965332, "kl": 0.0, "learning_rate": 4.3797435226380524e-07, "logps/chosen": -243.78555297851562, "logps/rejected": -197.21170043945312, "loss": 0.4396, "rewards/chosen": -0.7475346326828003, "rewards/margins": 0.7941581010818481, "rewards/rejected": -1.5416927337646484, "step": 474 }, { "epoch": 0.12, "grad_norm": 33.91997528076172, "kl": 0.0, "learning_rate": 4.3784349646689344e-07, "logps/chosen": -268.0740661621094, "logps/rejected": -150.24636840820312, "loss": 0.3428, "rewards/chosen": -2.255317211151123, "rewards/margins": 0.03300309181213379, "rewards/rejected": -2.288320302963257, "step": 475 }, { "epoch": 0.12, "grad_norm": 35.875980377197266, "kl": 0.0, "learning_rate": 4.3771264066998164e-07, "logps/chosen": -296.677001953125, "logps/rejected": -214.2599334716797, "loss": 0.3321, "rewards/chosen": -1.3506510257720947, "rewards/margins": 1.1312048435211182, "rewards/rejected": -2.481855869293213, "step": 476 }, { "epoch": 0.12, "grad_norm": 34.1489372253418, "kl": 0.0, "learning_rate": 4.3758178487306983e-07, "logps/chosen": -250.9139404296875, "logps/rejected": -231.31741333007812, "loss": 0.3651, "rewards/chosen": -1.9945584535598755, "rewards/margins": 1.617838978767395, "rewards/rejected": -3.6123974323272705, "step": 477 }, { "epoch": 0.13, "grad_norm": 35.04484939575195, "kl": 0.0, "learning_rate": 4.3745092907615803e-07, "logps/chosen": -236.66690063476562, "logps/rejected": -208.755615234375, "loss": 0.3611, "rewards/chosen": -0.8417435884475708, "rewards/margins": 1.9930511713027954, "rewards/rejected": -2.834794759750366, "step": 478 }, { "epoch": 0.13, "grad_norm": 29.14078140258789, "kl": 0.0, "learning_rate": 4.373200732792462e-07, "logps/chosen": -193.83944702148438, "logps/rejected": -175.72816467285156, "loss": 0.3968, "rewards/chosen": -1.354570746421814, "rewards/margins": 1.389928936958313, "rewards/rejected": -2.744499683380127, "step": 479 }, { "epoch": 0.13, "grad_norm": 37.82932662963867, "kl": 0.0, "learning_rate": 4.3718921748233447e-07, "logps/chosen": -293.6022644042969, "logps/rejected": -255.2928009033203, "loss": 0.4888, "rewards/chosen": -0.7668954133987427, "rewards/margins": -0.5312289595603943, "rewards/rejected": -0.23566646873950958, "step": 480 }, { "epoch": 0.13, "grad_norm": 34.05666732788086, "kl": 0.0, "learning_rate": 4.3705836168542267e-07, "logps/chosen": -210.43983459472656, "logps/rejected": -252.8617706298828, "loss": 0.3394, "rewards/chosen": -0.34639981389045715, "rewards/margins": 1.4680942296981812, "rewards/rejected": -1.814494013786316, "step": 481 }, { "epoch": 0.13, "grad_norm": 26.431236267089844, "kl": 0.0, "learning_rate": 4.3692750588851086e-07, "logps/chosen": -230.23416137695312, "logps/rejected": -259.32794189453125, "loss": 0.3312, "rewards/chosen": 1.6871323585510254, "rewards/margins": 4.462740898132324, "rewards/rejected": -2.7756083011627197, "step": 482 }, { "epoch": 0.13, "grad_norm": 34.93710708618164, "kl": 0.0, "learning_rate": 4.3679665009159906e-07, "logps/chosen": -192.50726318359375, "logps/rejected": -204.75120544433594, "loss": 0.4178, "rewards/chosen": -0.37875622510910034, "rewards/margins": 2.6469016075134277, "rewards/rejected": -3.025657892227173, "step": 483 }, { "epoch": 0.13, "grad_norm": 29.03684425354004, "kl": 0.0, "learning_rate": 4.3666579429468726e-07, "logps/chosen": -230.1165771484375, "logps/rejected": -233.3046875, "loss": 0.2737, "rewards/chosen": -0.02157449722290039, "rewards/margins": 3.5609054565429688, "rewards/rejected": -3.582479953765869, "step": 484 }, { "epoch": 0.13, "grad_norm": 34.31352233886719, "kl": 0.0, "learning_rate": 4.3653493849777545e-07, "logps/chosen": -213.8225555419922, "logps/rejected": -216.7666015625, "loss": 0.4475, "rewards/chosen": -0.675076961517334, "rewards/margins": 1.7822785377502441, "rewards/rejected": -2.457355499267578, "step": 485 }, { "epoch": 0.13, "grad_norm": 32.48976516723633, "kl": 0.0, "learning_rate": 4.3640408270086365e-07, "logps/chosen": -186.95602416992188, "logps/rejected": -334.0401306152344, "loss": 0.237, "rewards/chosen": -1.392081379890442, "rewards/margins": 2.3323307037353516, "rewards/rejected": -3.724411964416504, "step": 486 }, { "epoch": 0.13, "grad_norm": 26.30244255065918, "kl": 0.0, "learning_rate": 4.3627322690395185e-07, "logps/chosen": -195.7552947998047, "logps/rejected": -242.45310974121094, "loss": 0.2757, "rewards/chosen": -0.8110968470573425, "rewards/margins": 3.5064804553985596, "rewards/rejected": -4.317577362060547, "step": 487 }, { "epoch": 0.13, "grad_norm": 43.99164962768555, "kl": 0.0, "learning_rate": 4.3614237110704e-07, "logps/chosen": -155.76412963867188, "logps/rejected": -216.58558654785156, "loss": 0.316, "rewards/chosen": -0.40736696124076843, "rewards/margins": 2.6981070041656494, "rewards/rejected": -3.1054739952087402, "step": 488 }, { "epoch": 0.13, "grad_norm": 32.1126594543457, "kl": 0.0, "learning_rate": 4.360115153101282e-07, "logps/chosen": -205.777099609375, "logps/rejected": -174.24252319335938, "loss": 0.3616, "rewards/chosen": -0.022801468148827553, "rewards/margins": 2.2425994873046875, "rewards/rejected": -2.2654008865356445, "step": 489 }, { "epoch": 0.13, "grad_norm": 33.053733825683594, "kl": 0.0, "learning_rate": 4.358806595132164e-07, "logps/chosen": -283.0188293457031, "logps/rejected": -314.4449157714844, "loss": 0.2158, "rewards/chosen": -0.4068226218223572, "rewards/margins": 2.8746418952941895, "rewards/rejected": -3.2814645767211914, "step": 490 }, { "epoch": 0.13, "grad_norm": 35.346317291259766, "kl": 0.0, "learning_rate": 4.357498037163046e-07, "logps/chosen": -168.42393493652344, "logps/rejected": -167.74624633789062, "loss": 0.3801, "rewards/chosen": -0.9554387331008911, "rewards/margins": 1.6682840585708618, "rewards/rejected": -2.623722791671753, "step": 491 }, { "epoch": 0.13, "grad_norm": 34.33647155761719, "kl": 0.0, "learning_rate": 4.3561894791939277e-07, "logps/chosen": -324.6409606933594, "logps/rejected": -205.80564880371094, "loss": 0.4075, "rewards/chosen": -0.8289295434951782, "rewards/margins": 3.134256362915039, "rewards/rejected": -3.9631857872009277, "step": 492 }, { "epoch": 0.13, "grad_norm": 32.74213409423828, "kl": 0.0, "learning_rate": 4.35488092122481e-07, "logps/chosen": -186.9199981689453, "logps/rejected": -213.62106323242188, "loss": 0.288, "rewards/chosen": -0.1531243622303009, "rewards/margins": 2.8898119926452637, "rewards/rejected": -3.042936325073242, "step": 493 }, { "epoch": 0.13, "grad_norm": 39.94125747680664, "kl": 0.0, "learning_rate": 4.353572363255692e-07, "logps/chosen": -150.59657287597656, "logps/rejected": -246.30706787109375, "loss": 0.3572, "rewards/chosen": -0.5869843363761902, "rewards/margins": 2.4806878566741943, "rewards/rejected": -3.0676722526550293, "step": 494 }, { "epoch": 0.13, "grad_norm": 35.68771743774414, "kl": 0.0, "learning_rate": 4.352263805286574e-07, "logps/chosen": -313.64434814453125, "logps/rejected": -268.2545471191406, "loss": 0.3196, "rewards/chosen": -1.9877912998199463, "rewards/margins": 2.831235647201538, "rewards/rejected": -4.819026947021484, "step": 495 }, { "epoch": 0.13, "grad_norm": 28.748931884765625, "kl": 0.0, "learning_rate": 4.350955247317456e-07, "logps/chosen": -221.89503479003906, "logps/rejected": -178.8087158203125, "loss": 0.4381, "rewards/chosen": -1.9237765073776245, "rewards/margins": 0.5343056917190552, "rewards/rejected": -2.4580821990966797, "step": 496 }, { "epoch": 0.13, "grad_norm": 34.48188400268555, "kl": 0.0, "learning_rate": 4.349646689348338e-07, "logps/chosen": -232.4418182373047, "logps/rejected": -245.66510009765625, "loss": 0.301, "rewards/chosen": -0.1335909068584442, "rewards/margins": 3.0244925022125244, "rewards/rejected": -3.158083438873291, "step": 497 }, { "epoch": 0.13, "grad_norm": 30.280105590820312, "kl": 0.0, "learning_rate": 4.34833813137922e-07, "logps/chosen": -176.8372344970703, "logps/rejected": -306.17205810546875, "loss": 0.3286, "rewards/chosen": -0.7078924179077148, "rewards/margins": 5.860451698303223, "rewards/rejected": -6.5683441162109375, "step": 498 }, { "epoch": 0.13, "grad_norm": 28.61834144592285, "kl": 0.0, "learning_rate": 4.347029573410102e-07, "logps/chosen": -205.4990234375, "logps/rejected": -324.9005432128906, "loss": 0.2317, "rewards/chosen": -0.580607533454895, "rewards/margins": 4.204123020172119, "rewards/rejected": -4.784730434417725, "step": 499 }, { "epoch": 0.13, "grad_norm": 31.07453727722168, "kl": 0.0, "learning_rate": 4.345721015440984e-07, "logps/chosen": -212.39117431640625, "logps/rejected": -107.82008361816406, "loss": 0.3148, "rewards/chosen": 0.3389224112033844, "rewards/margins": 2.8236329555511475, "rewards/rejected": -2.484710454940796, "step": 500 }, { "epoch": 0.13, "grad_norm": 30.91390609741211, "kl": 0.0, "learning_rate": 4.344412457471866e-07, "logps/chosen": -230.955078125, "logps/rejected": -310.66424560546875, "loss": 0.3263, "rewards/chosen": -0.5058167576789856, "rewards/margins": 1.7073900699615479, "rewards/rejected": -2.2132067680358887, "step": 501 }, { "epoch": 0.13, "grad_norm": 33.28854751586914, "kl": 0.0, "learning_rate": 4.343103899502748e-07, "logps/chosen": -167.78994750976562, "logps/rejected": -217.27552795410156, "loss": 0.2653, "rewards/chosen": -0.5831389427185059, "rewards/margins": 2.3856923580169678, "rewards/rejected": -2.9688313007354736, "step": 502 }, { "epoch": 0.13, "grad_norm": 32.2841682434082, "kl": 0.0, "learning_rate": 4.3417953415336293e-07, "logps/chosen": -182.3533935546875, "logps/rejected": -282.0680847167969, "loss": 0.3076, "rewards/chosen": -1.6923186779022217, "rewards/margins": 2.6697137355804443, "rewards/rejected": -4.362032413482666, "step": 503 }, { "epoch": 0.13, "grad_norm": 35.49862289428711, "kl": 0.0, "learning_rate": 4.340486783564511e-07, "logps/chosen": -207.2657470703125, "logps/rejected": -210.40371704101562, "loss": 0.3002, "rewards/chosen": 1.06424880027771, "rewards/margins": 3.628180742263794, "rewards/rejected": -2.563931941986084, "step": 504 }, { "epoch": 0.13, "grad_norm": 25.61597442626953, "kl": 0.0, "learning_rate": 4.339178225595393e-07, "logps/chosen": -242.67938232421875, "logps/rejected": -326.7820129394531, "loss": 0.2194, "rewards/chosen": -1.2337604761123657, "rewards/margins": 4.276817798614502, "rewards/rejected": -5.510578155517578, "step": 505 }, { "epoch": 0.13, "grad_norm": 35.94837951660156, "kl": 0.0, "learning_rate": 4.3378696676262757e-07, "logps/chosen": -195.439697265625, "logps/rejected": -228.76486206054688, "loss": 0.4639, "rewards/chosen": -0.31265154480934143, "rewards/margins": 1.3135621547698975, "rewards/rejected": -1.6262136697769165, "step": 506 }, { "epoch": 0.13, "grad_norm": 28.738576889038086, "kl": 0.0, "learning_rate": 4.3365611096571577e-07, "logps/chosen": -307.04888916015625, "logps/rejected": -234.9107208251953, "loss": 0.2035, "rewards/chosen": -0.4320366680622101, "rewards/margins": 3.774815797805786, "rewards/rejected": -4.206852436065674, "step": 507 }, { "epoch": 0.13, "grad_norm": 38.1010856628418, "kl": 0.0, "learning_rate": 4.3352525516880396e-07, "logps/chosen": -271.90325927734375, "logps/rejected": -230.39442443847656, "loss": 0.251, "rewards/chosen": 0.012623111717402935, "rewards/margins": 2.0225515365600586, "rewards/rejected": -2.0099284648895264, "step": 508 }, { "epoch": 0.13, "grad_norm": 36.99441146850586, "kl": 0.0, "learning_rate": 4.3339439937189216e-07, "logps/chosen": -268.9024353027344, "logps/rejected": -292.2628479003906, "loss": 0.1964, "rewards/chosen": 0.6849376559257507, "rewards/margins": 4.367363929748535, "rewards/rejected": -3.6824264526367188, "step": 509 }, { "epoch": 0.13, "grad_norm": 28.90747833251953, "kl": 0.0, "learning_rate": 4.3326354357498036e-07, "logps/chosen": -326.12493896484375, "logps/rejected": -215.9102020263672, "loss": 0.1947, "rewards/chosen": -0.9350789785385132, "rewards/margins": 2.6486573219299316, "rewards/rejected": -3.5837364196777344, "step": 510 }, { "epoch": 0.13, "grad_norm": 33.20688247680664, "kl": 0.0, "learning_rate": 4.3313268777806855e-07, "logps/chosen": -221.3362579345703, "logps/rejected": -250.4815673828125, "loss": 0.3717, "rewards/chosen": 0.06487315148115158, "rewards/margins": 3.2546393871307373, "rewards/rejected": -3.1897661685943604, "step": 511 }, { "epoch": 0.13, "grad_norm": 41.531349182128906, "kl": 0.0, "learning_rate": 4.3300183198115675e-07, "logps/chosen": -205.99996948242188, "logps/rejected": -335.3144226074219, "loss": 0.3992, "rewards/chosen": 0.4203885793685913, "rewards/margins": 3.3626184463500977, "rewards/rejected": -2.942229747772217, "step": 512 }, { "epoch": 0.13, "grad_norm": 38.602996826171875, "kl": 0.0, "learning_rate": 4.3287097618424494e-07, "logps/chosen": -310.7867126464844, "logps/rejected": -257.95892333984375, "loss": 0.2275, "rewards/chosen": -0.5336145758628845, "rewards/margins": 2.8806161880493164, "rewards/rejected": -3.4142308235168457, "step": 513 }, { "epoch": 0.13, "grad_norm": 27.161235809326172, "kl": 0.0, "learning_rate": 4.3274012038733314e-07, "logps/chosen": -326.2843017578125, "logps/rejected": -185.2740936279297, "loss": 0.3319, "rewards/chosen": -0.76652991771698, "rewards/margins": 1.6281522512435913, "rewards/rejected": -2.3946821689605713, "step": 514 }, { "epoch": 0.13, "grad_norm": 31.2752685546875, "kl": 0.0, "learning_rate": 4.3260926459042134e-07, "logps/chosen": -263.0508728027344, "logps/rejected": -212.15830993652344, "loss": 0.3423, "rewards/chosen": -2.017266035079956, "rewards/margins": 0.27599525451660156, "rewards/rejected": -2.2932612895965576, "step": 515 }, { "epoch": 0.14, "grad_norm": 30.69701385498047, "kl": 0.0, "learning_rate": 4.3247840879350953e-07, "logps/chosen": -248.09197998046875, "logps/rejected": -316.41058349609375, "loss": 0.1471, "rewards/chosen": -0.44200658798217773, "rewards/margins": 5.960631847381592, "rewards/rejected": -6.4026384353637695, "step": 516 }, { "epoch": 0.14, "grad_norm": 32.12960433959961, "kl": 0.0, "learning_rate": 4.3234755299659773e-07, "logps/chosen": -255.32371520996094, "logps/rejected": -278.0555419921875, "loss": 0.2701, "rewards/chosen": -0.7212596535682678, "rewards/margins": 2.7007358074188232, "rewards/rejected": -3.4219954013824463, "step": 517 }, { "epoch": 0.14, "grad_norm": 32.733726501464844, "kl": 0.0, "learning_rate": 4.32216697199686e-07, "logps/chosen": -276.49493408203125, "logps/rejected": -223.2489013671875, "loss": 0.3113, "rewards/chosen": -0.1942574679851532, "rewards/margins": 1.6792775392532349, "rewards/rejected": -1.8735350370407104, "step": 518 }, { "epoch": 0.14, "grad_norm": 38.33927536010742, "kl": 0.0, "learning_rate": 4.320858414027741e-07, "logps/chosen": -331.3040466308594, "logps/rejected": -308.75128173828125, "loss": 0.3721, "rewards/chosen": 0.15606045722961426, "rewards/margins": 3.0476534366607666, "rewards/rejected": -2.8915929794311523, "step": 519 }, { "epoch": 0.14, "grad_norm": 35.864749908447266, "kl": 0.0, "learning_rate": 4.319549856058623e-07, "logps/chosen": -233.13507080078125, "logps/rejected": -319.062255859375, "loss": 0.2592, "rewards/chosen": -1.1843230724334717, "rewards/margins": 2.3068020343780518, "rewards/rejected": -3.4911251068115234, "step": 520 }, { "epoch": 0.14, "grad_norm": 35.09690475463867, "kl": 0.0, "learning_rate": 4.318241298089505e-07, "logps/chosen": -216.5358123779297, "logps/rejected": -185.92214965820312, "loss": 0.3433, "rewards/chosen": -0.31711503863334656, "rewards/margins": 2.2386279106140137, "rewards/rejected": -2.5557429790496826, "step": 521 }, { "epoch": 0.14, "grad_norm": 31.03679656982422, "kl": 0.0, "learning_rate": 4.316932740120387e-07, "logps/chosen": -218.7315673828125, "logps/rejected": -237.93017578125, "loss": 0.2502, "rewards/chosen": 0.30230313539505005, "rewards/margins": 3.891742467880249, "rewards/rejected": -3.5894393920898438, "step": 522 }, { "epoch": 0.14, "grad_norm": 35.12614059448242, "kl": 0.0, "learning_rate": 4.315624182151269e-07, "logps/chosen": -248.89341735839844, "logps/rejected": -218.71405029296875, "loss": 0.4314, "rewards/chosen": -0.9112810492515564, "rewards/margins": 1.5321125984191895, "rewards/rejected": -2.4433937072753906, "step": 523 }, { "epoch": 0.14, "grad_norm": 34.43649673461914, "kl": 0.0, "learning_rate": 4.314315624182151e-07, "logps/chosen": -174.38783264160156, "logps/rejected": -224.63009643554688, "loss": 0.1099, "rewards/chosen": 1.0011378526687622, "rewards/margins": 4.093344688415527, "rewards/rejected": -3.0922069549560547, "step": 524 }, { "epoch": 0.14, "grad_norm": 34.96295166015625, "kl": 0.0, "learning_rate": 4.313007066213033e-07, "logps/chosen": -292.97552490234375, "logps/rejected": -306.924072265625, "loss": 0.3233, "rewards/chosen": -2.473991870880127, "rewards/margins": 2.015058994293213, "rewards/rejected": -4.48905086517334, "step": 525 }, { "epoch": 0.14, "grad_norm": 42.44701385498047, "kl": 0.0, "learning_rate": 4.311698508243915e-07, "logps/chosen": -219.43673706054688, "logps/rejected": -317.16217041015625, "loss": 0.3468, "rewards/chosen": -0.6824592351913452, "rewards/margins": 1.6270591020584106, "rewards/rejected": -2.309518337249756, "step": 526 }, { "epoch": 0.14, "grad_norm": 33.045780181884766, "kl": 0.0, "learning_rate": 4.310389950274797e-07, "logps/chosen": -237.01206970214844, "logps/rejected": -284.2718505859375, "loss": 0.3741, "rewards/chosen": -1.5432114601135254, "rewards/margins": 1.7887234687805176, "rewards/rejected": -3.331934928894043, "step": 527 }, { "epoch": 0.14, "grad_norm": 64.60256958007812, "kl": 0.0, "learning_rate": 4.309081392305679e-07, "logps/chosen": -213.1400909423828, "logps/rejected": -363.4927673339844, "loss": 0.2004, "rewards/chosen": -0.3685699701309204, "rewards/margins": 3.1882615089416504, "rewards/rejected": -3.5568313598632812, "step": 528 }, { "epoch": 0.14, "grad_norm": 45.85017776489258, "kl": 0.0, "learning_rate": 4.307772834336561e-07, "logps/chosen": -292.92279052734375, "logps/rejected": -285.4364013671875, "loss": 0.3617, "rewards/chosen": 0.07168924808502197, "rewards/margins": 2.3396801948547363, "rewards/rejected": -2.267990827560425, "step": 529 }, { "epoch": 0.14, "grad_norm": 29.200170516967773, "kl": 0.0, "learning_rate": 4.306464276367443e-07, "logps/chosen": -171.74261474609375, "logps/rejected": -179.36610412597656, "loss": 0.3663, "rewards/chosen": -0.33816060423851013, "rewards/margins": 2.3799824714660645, "rewards/rejected": -2.7181429862976074, "step": 530 }, { "epoch": 0.14, "grad_norm": 27.944580078125, "kl": 0.0, "learning_rate": 4.305155718398325e-07, "logps/chosen": -166.678466796875, "logps/rejected": -235.33338928222656, "loss": 0.3208, "rewards/chosen": -0.9628866910934448, "rewards/margins": 1.1621040105819702, "rewards/rejected": -2.124990701675415, "step": 531 }, { "epoch": 0.14, "grad_norm": 27.248119354248047, "kl": 0.0, "learning_rate": 4.303847160429207e-07, "logps/chosen": -226.89649963378906, "logps/rejected": -243.8878173828125, "loss": 0.3707, "rewards/chosen": -0.1505812406539917, "rewards/margins": 2.931156635284424, "rewards/rejected": -3.081737756729126, "step": 532 }, { "epoch": 0.14, "grad_norm": 30.732263565063477, "kl": 0.0, "learning_rate": 4.302538602460089e-07, "logps/chosen": -165.86468505859375, "logps/rejected": -238.6931915283203, "loss": 0.4329, "rewards/chosen": -0.7833044528961182, "rewards/margins": 1.562227725982666, "rewards/rejected": -2.345532178878784, "step": 533 }, { "epoch": 0.14, "grad_norm": 24.79857063293457, "kl": 0.0, "learning_rate": 4.3012300444909706e-07, "logps/chosen": -260.51019287109375, "logps/rejected": -267.1363830566406, "loss": 0.2559, "rewards/chosen": 0.522705078125, "rewards/margins": 4.974919319152832, "rewards/rejected": -4.452214241027832, "step": 534 }, { "epoch": 0.14, "grad_norm": 34.72758483886719, "kl": 0.0, "learning_rate": 4.2999214865218526e-07, "logps/chosen": -218.3809814453125, "logps/rejected": -179.02288818359375, "loss": 0.3901, "rewards/chosen": -0.4237063527107239, "rewards/margins": 2.7379310131073, "rewards/rejected": -3.161637306213379, "step": 535 }, { "epoch": 0.14, "grad_norm": 32.278099060058594, "kl": 0.0, "learning_rate": 4.2986129285527345e-07, "logps/chosen": -238.82688903808594, "logps/rejected": -274.1971130371094, "loss": 0.3413, "rewards/chosen": 0.003850996494293213, "rewards/margins": 2.9562597274780273, "rewards/rejected": -2.952408790588379, "step": 536 }, { "epoch": 0.14, "grad_norm": 38.88985824584961, "kl": 0.0, "learning_rate": 4.2973043705836165e-07, "logps/chosen": -348.93212890625, "logps/rejected": -186.86294555664062, "loss": 0.3678, "rewards/chosen": -0.6001288294792175, "rewards/margins": 1.5558688640594482, "rewards/rejected": -2.1559977531433105, "step": 537 }, { "epoch": 0.14, "grad_norm": 43.40470504760742, "kl": 0.0, "learning_rate": 4.2959958126144985e-07, "logps/chosen": -260.8814392089844, "logps/rejected": -192.5634307861328, "loss": 0.3342, "rewards/chosen": -0.5589590072631836, "rewards/margins": 3.7600107192993164, "rewards/rejected": -4.3189697265625, "step": 538 }, { "epoch": 0.14, "grad_norm": 43.13268280029297, "kl": 0.0, "learning_rate": 4.2946872546453804e-07, "logps/chosen": -282.29571533203125, "logps/rejected": -256.6288757324219, "loss": 0.383, "rewards/chosen": 0.4491698741912842, "rewards/margins": 2.131098508834839, "rewards/rejected": -1.6819286346435547, "step": 539 }, { "epoch": 0.14, "grad_norm": 34.14078140258789, "kl": 0.0, "learning_rate": 4.2933786966762624e-07, "logps/chosen": -225.33982849121094, "logps/rejected": -292.35760498046875, "loss": 0.2406, "rewards/chosen": 0.6705055236816406, "rewards/margins": 3.4733963012695312, "rewards/rejected": -2.8028907775878906, "step": 540 }, { "epoch": 0.14, "grad_norm": 32.075401306152344, "kl": 0.0, "learning_rate": 4.2920701387071443e-07, "logps/chosen": -250.06369018554688, "logps/rejected": -244.3541717529297, "loss": 0.2555, "rewards/chosen": 0.8597911596298218, "rewards/margins": 3.334747791290283, "rewards/rejected": -2.474956512451172, "step": 541 }, { "epoch": 0.14, "grad_norm": 26.739641189575195, "kl": 0.0, "learning_rate": 4.2907615807380263e-07, "logps/chosen": -330.94866943359375, "logps/rejected": -247.71241760253906, "loss": 0.2611, "rewards/chosen": 0.851098895072937, "rewards/margins": 4.732474327087402, "rewards/rejected": -3.881375551223755, "step": 542 }, { "epoch": 0.14, "grad_norm": 28.64678955078125, "kl": 0.0, "learning_rate": 4.289453022768908e-07, "logps/chosen": -200.18264770507812, "logps/rejected": -195.34963989257812, "loss": 0.3216, "rewards/chosen": -1.2550278902053833, "rewards/margins": 0.44221389293670654, "rewards/rejected": -1.6972417831420898, "step": 543 }, { "epoch": 0.14, "grad_norm": 33.95494842529297, "kl": 0.0, "learning_rate": 4.288144464799791e-07, "logps/chosen": -177.20236206054688, "logps/rejected": -241.26626586914062, "loss": 0.37, "rewards/chosen": -0.8912516832351685, "rewards/margins": 1.3814114332199097, "rewards/rejected": -2.272663116455078, "step": 544 }, { "epoch": 0.14, "grad_norm": 29.539703369140625, "kl": 0.0, "learning_rate": 4.2868359068306727e-07, "logps/chosen": -168.141845703125, "logps/rejected": -123.10795593261719, "loss": 0.2149, "rewards/chosen": -1.0771909952163696, "rewards/margins": 1.4760149717330933, "rewards/rejected": -2.553205966949463, "step": 545 }, { "epoch": 0.14, "grad_norm": 30.06838607788086, "kl": 0.0, "learning_rate": 4.2855273488615547e-07, "logps/chosen": -226.75885009765625, "logps/rejected": -265.8707275390625, "loss": 0.3497, "rewards/chosen": 0.03218793869018555, "rewards/margins": 2.9179019927978516, "rewards/rejected": -2.885714054107666, "step": 546 }, { "epoch": 0.14, "grad_norm": 24.598175048828125, "kl": 0.0, "learning_rate": 4.2842187908924366e-07, "logps/chosen": -201.21563720703125, "logps/rejected": -197.22674560546875, "loss": 0.1921, "rewards/chosen": -0.34368598461151123, "rewards/margins": 2.0713300704956055, "rewards/rejected": -2.415015935897827, "step": 547 }, { "epoch": 0.14, "grad_norm": 35.25346374511719, "kl": 0.0, "learning_rate": 4.2829102329233186e-07, "logps/chosen": -301.93560791015625, "logps/rejected": -147.30148315429688, "loss": 0.4078, "rewards/chosen": -2.1355183124542236, "rewards/margins": -0.27744102478027344, "rewards/rejected": -1.8580772876739502, "step": 548 }, { "epoch": 0.14, "grad_norm": 33.94302749633789, "kl": 0.0, "learning_rate": 4.2816016749542006e-07, "logps/chosen": -248.69700622558594, "logps/rejected": -231.81719970703125, "loss": 0.3855, "rewards/chosen": 0.03941810131072998, "rewards/margins": 2.8565940856933594, "rewards/rejected": -2.81717586517334, "step": 549 }, { "epoch": 0.14, "grad_norm": 32.44513702392578, "kl": 0.0, "learning_rate": 4.280293116985082e-07, "logps/chosen": -176.3740997314453, "logps/rejected": -261.5088195800781, "loss": 0.319, "rewards/chosen": 0.6193097829818726, "rewards/margins": 3.7904415130615234, "rewards/rejected": -3.1711316108703613, "step": 550 }, { "epoch": 0.14, "grad_norm": 32.98072052001953, "kl": 0.0, "learning_rate": 4.278984559015964e-07, "logps/chosen": -182.24502563476562, "logps/rejected": -238.89453125, "loss": 0.3299, "rewards/chosen": -0.6983445882797241, "rewards/margins": 2.0171093940734863, "rewards/rejected": -2.7154541015625, "step": 551 }, { "epoch": 0.14, "grad_norm": 26.78199005126953, "kl": 0.0, "learning_rate": 4.277676001046846e-07, "logps/chosen": -162.42233276367188, "logps/rejected": -246.67506408691406, "loss": 0.3525, "rewards/chosen": -1.2917745113372803, "rewards/margins": 2.2916183471679688, "rewards/rejected": -3.583392858505249, "step": 552 }, { "epoch": 0.14, "grad_norm": 29.860422134399414, "kl": 0.0, "learning_rate": 4.276367443077728e-07, "logps/chosen": -237.3893280029297, "logps/rejected": -250.49899291992188, "loss": 0.2725, "rewards/chosen": -0.23593808710575104, "rewards/margins": 2.402143716812134, "rewards/rejected": -2.6380817890167236, "step": 553 }, { "epoch": 0.14, "grad_norm": 33.161163330078125, "kl": 0.0, "learning_rate": 4.27505888510861e-07, "logps/chosen": -237.7818145751953, "logps/rejected": -227.72335815429688, "loss": 0.3539, "rewards/chosen": 0.2122727930545807, "rewards/margins": 2.9937832355499268, "rewards/rejected": -2.781510353088379, "step": 554 }, { "epoch": 0.15, "grad_norm": 38.98054122924805, "kl": 0.0, "learning_rate": 4.273750327139492e-07, "logps/chosen": -214.34078979492188, "logps/rejected": -263.860595703125, "loss": 0.3239, "rewards/chosen": -0.39826700091362, "rewards/margins": 2.020453691482544, "rewards/rejected": -2.4187207221984863, "step": 555 }, { "epoch": 0.15, "grad_norm": 33.922691345214844, "kl": 0.0, "learning_rate": 4.2724417691703743e-07, "logps/chosen": -206.01144409179688, "logps/rejected": -166.6465606689453, "loss": 0.3363, "rewards/chosen": -0.8341464996337891, "rewards/margins": 1.285893201828003, "rewards/rejected": -2.120039701461792, "step": 556 }, { "epoch": 0.15, "grad_norm": 30.631797790527344, "kl": 0.0, "learning_rate": 4.271133211201256e-07, "logps/chosen": -158.25732421875, "logps/rejected": -213.3656768798828, "loss": 0.2741, "rewards/chosen": -0.12498453259468079, "rewards/margins": 2.9033560752868652, "rewards/rejected": -3.0283405780792236, "step": 557 }, { "epoch": 0.15, "grad_norm": 24.324567794799805, "kl": 0.0, "learning_rate": 4.269824653232138e-07, "logps/chosen": -217.47898864746094, "logps/rejected": -165.88308715820312, "loss": 0.3939, "rewards/chosen": -1.5370630025863647, "rewards/margins": 0.32974278926849365, "rewards/rejected": -1.8668057918548584, "step": 558 }, { "epoch": 0.15, "grad_norm": 34.045230865478516, "kl": 0.0, "learning_rate": 4.26851609526302e-07, "logps/chosen": -294.7262268066406, "logps/rejected": -205.035400390625, "loss": 0.4029, "rewards/chosen": -1.259482502937317, "rewards/margins": 0.6094952821731567, "rewards/rejected": -1.8689777851104736, "step": 559 }, { "epoch": 0.15, "grad_norm": 31.53570556640625, "kl": 0.0, "learning_rate": 4.267207537293902e-07, "logps/chosen": -179.20379638671875, "logps/rejected": -265.7896728515625, "loss": 0.4417, "rewards/chosen": -0.9781392812728882, "rewards/margins": 3.028449535369873, "rewards/rejected": -4.006588935852051, "step": 560 }, { "epoch": 0.15, "grad_norm": 28.38507652282715, "kl": 0.0, "learning_rate": 4.265898979324784e-07, "logps/chosen": -105.09141540527344, "logps/rejected": -330.2618103027344, "loss": 0.3743, "rewards/chosen": -0.9884614944458008, "rewards/margins": 1.17330002784729, "rewards/rejected": -2.161761522293091, "step": 561 }, { "epoch": 0.15, "grad_norm": 34.71090316772461, "kl": 0.0, "learning_rate": 4.264590421355666e-07, "logps/chosen": -200.91526794433594, "logps/rejected": -304.50482177734375, "loss": 0.2676, "rewards/chosen": 1.216893196105957, "rewards/margins": 4.012282371520996, "rewards/rejected": -2.795389413833618, "step": 562 }, { "epoch": 0.15, "grad_norm": 37.67556381225586, "kl": 0.0, "learning_rate": 4.263281863386548e-07, "logps/chosen": -233.8171844482422, "logps/rejected": -209.13409423828125, "loss": 0.4123, "rewards/chosen": -0.7608001232147217, "rewards/margins": 1.143924593925476, "rewards/rejected": -1.9047247171401978, "step": 563 }, { "epoch": 0.15, "grad_norm": 37.23823547363281, "kl": 0.0, "learning_rate": 4.26197330541743e-07, "logps/chosen": -214.50765991210938, "logps/rejected": -472.725830078125, "loss": 0.3872, "rewards/chosen": -0.8464738726615906, "rewards/margins": 4.16305685043335, "rewards/rejected": -5.009530544281006, "step": 564 }, { "epoch": 0.15, "grad_norm": 35.8170166015625, "kl": 0.0, "learning_rate": 4.2606647474483114e-07, "logps/chosen": -287.5885314941406, "logps/rejected": -199.86541748046875, "loss": 0.3366, "rewards/chosen": -0.5958128571510315, "rewards/margins": 1.9663183689117432, "rewards/rejected": -2.56213116645813, "step": 565 }, { "epoch": 0.15, "grad_norm": 40.15536880493164, "kl": 0.0, "learning_rate": 4.2593561894791934e-07, "logps/chosen": -253.20193481445312, "logps/rejected": -249.92214965820312, "loss": 0.3344, "rewards/chosen": -0.08916880190372467, "rewards/margins": 2.356729745864868, "rewards/rejected": -2.4458985328674316, "step": 566 }, { "epoch": 0.15, "grad_norm": 31.04861068725586, "kl": 0.0, "learning_rate": 4.2580476315100753e-07, "logps/chosen": -196.5449981689453, "logps/rejected": -212.40931701660156, "loss": 0.33, "rewards/chosen": -0.20059321820735931, "rewards/margins": 2.8560163974761963, "rewards/rejected": -3.056609630584717, "step": 567 }, { "epoch": 0.15, "grad_norm": 35.633323669433594, "kl": 0.0, "learning_rate": 4.2567390735409573e-07, "logps/chosen": -246.609619140625, "logps/rejected": -201.65283203125, "loss": 0.3916, "rewards/chosen": -0.4009683132171631, "rewards/margins": 2.062058448791504, "rewards/rejected": -2.463026762008667, "step": 568 }, { "epoch": 0.15, "grad_norm": 33.654693603515625, "kl": 0.0, "learning_rate": 4.25543051557184e-07, "logps/chosen": -264.0102844238281, "logps/rejected": -169.67617797851562, "loss": 0.314, "rewards/chosen": -0.4244236350059509, "rewards/margins": 3.7280030250549316, "rewards/rejected": -4.152426719665527, "step": 569 }, { "epoch": 0.15, "grad_norm": 38.11521911621094, "kl": 0.0, "learning_rate": 4.254121957602722e-07, "logps/chosen": -198.6380157470703, "logps/rejected": -192.98634338378906, "loss": 0.3554, "rewards/chosen": -0.11442930996417999, "rewards/margins": 2.008514165878296, "rewards/rejected": -2.12294340133667, "step": 570 }, { "epoch": 0.15, "grad_norm": 35.716495513916016, "kl": 0.0, "learning_rate": 4.2528133996336037e-07, "logps/chosen": -156.13453674316406, "logps/rejected": -272.1989440917969, "loss": 0.275, "rewards/chosen": -0.06980940699577332, "rewards/margins": 2.9614274501800537, "rewards/rejected": -3.0312368869781494, "step": 571 }, { "epoch": 0.15, "grad_norm": 38.65645980834961, "kl": 0.0, "learning_rate": 4.2515048416644857e-07, "logps/chosen": -178.9793243408203, "logps/rejected": -289.0226135253906, "loss": 0.3303, "rewards/chosen": -0.03785140812397003, "rewards/margins": 3.243406295776367, "rewards/rejected": -3.2812576293945312, "step": 572 }, { "epoch": 0.15, "grad_norm": 36.98851776123047, "kl": 0.0, "learning_rate": 4.2501962836953676e-07, "logps/chosen": -240.0013885498047, "logps/rejected": -235.85000610351562, "loss": 0.3211, "rewards/chosen": 1.605520248413086, "rewards/margins": 4.609771251678467, "rewards/rejected": -3.004251003265381, "step": 573 }, { "epoch": 0.15, "grad_norm": 52.9211311340332, "kl": 0.0, "learning_rate": 4.2488877257262496e-07, "logps/chosen": -170.80899047851562, "logps/rejected": -318.06646728515625, "loss": 0.3078, "rewards/chosen": -0.9244940876960754, "rewards/margins": 1.4279897212982178, "rewards/rejected": -2.3524837493896484, "step": 574 }, { "epoch": 0.15, "grad_norm": 36.183860778808594, "kl": 0.0, "learning_rate": 4.2475791677571315e-07, "logps/chosen": -203.8833770751953, "logps/rejected": -260.57208251953125, "loss": 0.2326, "rewards/chosen": -0.2753835618495941, "rewards/margins": 3.029111623764038, "rewards/rejected": -3.304495096206665, "step": 575 }, { "epoch": 0.15, "grad_norm": 33.7669792175293, "kl": 0.0, "learning_rate": 4.2462706097880135e-07, "logps/chosen": -213.69007873535156, "logps/rejected": -252.35267639160156, "loss": 0.3478, "rewards/chosen": 0.12792816758155823, "rewards/margins": 3.7786970138549805, "rewards/rejected": -3.650768756866455, "step": 576 }, { "epoch": 0.15, "grad_norm": 40.28141784667969, "kl": 0.0, "learning_rate": 4.2449620518188955e-07, "logps/chosen": -275.3423156738281, "logps/rejected": -183.16616821289062, "loss": 0.3975, "rewards/chosen": -0.9379222393035889, "rewards/margins": 1.8455045223236084, "rewards/rejected": -2.7834267616271973, "step": 577 }, { "epoch": 0.15, "grad_norm": 41.42319869995117, "kl": 0.0, "learning_rate": 4.2436534938497774e-07, "logps/chosen": -192.60484313964844, "logps/rejected": -239.24285888671875, "loss": 0.3576, "rewards/chosen": 0.22006237506866455, "rewards/margins": 1.8599191904067993, "rewards/rejected": -1.6398568153381348, "step": 578 }, { "epoch": 0.15, "grad_norm": 22.584609985351562, "kl": 0.0, "learning_rate": 4.2423449358806594e-07, "logps/chosen": -168.68499755859375, "logps/rejected": -194.3863983154297, "loss": 0.3093, "rewards/chosen": 1.0403271913528442, "rewards/margins": 3.6925063133239746, "rewards/rejected": -2.65217924118042, "step": 579 }, { "epoch": 0.15, "grad_norm": 28.20046043395996, "kl": 0.0, "learning_rate": 4.2410363779115413e-07, "logps/chosen": -246.39317321777344, "logps/rejected": -256.670654296875, "loss": 0.3676, "rewards/chosen": 0.45120349526405334, "rewards/margins": 2.9600284099578857, "rewards/rejected": -2.5088248252868652, "step": 580 }, { "epoch": 0.15, "grad_norm": 34.99135971069336, "kl": 0.0, "learning_rate": 4.239727819942423e-07, "logps/chosen": -195.71902465820312, "logps/rejected": -272.074951171875, "loss": 0.3224, "rewards/chosen": -0.10005563497543335, "rewards/margins": 3.6908347606658936, "rewards/rejected": -3.7908904552459717, "step": 581 }, { "epoch": 0.15, "grad_norm": 37.57238006591797, "kl": 0.0, "learning_rate": 4.2384192619733053e-07, "logps/chosen": -322.5052185058594, "logps/rejected": -237.87149047851562, "loss": 0.4824, "rewards/chosen": -0.9541627168655396, "rewards/margins": 1.8996022939682007, "rewards/rejected": -2.8537650108337402, "step": 582 }, { "epoch": 0.15, "grad_norm": 32.19499969482422, "kl": 0.0, "learning_rate": 4.237110704004187e-07, "logps/chosen": -165.03366088867188, "logps/rejected": -246.01821899414062, "loss": 0.38, "rewards/chosen": -0.257803738117218, "rewards/margins": 2.8370163440704346, "rewards/rejected": -3.094820022583008, "step": 583 }, { "epoch": 0.15, "grad_norm": 37.46137237548828, "kl": 0.0, "learning_rate": 4.235802146035069e-07, "logps/chosen": -261.37066650390625, "logps/rejected": -227.03553771972656, "loss": 0.2971, "rewards/chosen": -0.23255617916584015, "rewards/margins": 1.408286213874817, "rewards/rejected": -1.6408424377441406, "step": 584 }, { "epoch": 0.15, "grad_norm": 22.948627471923828, "kl": 0.0, "learning_rate": 4.234493588065951e-07, "logps/chosen": -147.1380615234375, "logps/rejected": -239.35177612304688, "loss": 0.3552, "rewards/chosen": -0.27524644136428833, "rewards/margins": 1.9764175415039062, "rewards/rejected": -2.25166392326355, "step": 585 }, { "epoch": 0.15, "grad_norm": 33.70252990722656, "kl": 0.0, "learning_rate": 4.233185030096833e-07, "logps/chosen": -183.9422149658203, "logps/rejected": -268.509033203125, "loss": 0.1464, "rewards/chosen": -0.05040533095598221, "rewards/margins": 6.646665573120117, "rewards/rejected": -6.697071075439453, "step": 586 }, { "epoch": 0.15, "grad_norm": 33.52576446533203, "kl": 0.0, "learning_rate": 4.231876472127715e-07, "logps/chosen": -228.71929931640625, "logps/rejected": -263.7425231933594, "loss": 0.3134, "rewards/chosen": -1.5203044414520264, "rewards/margins": 1.3902077674865723, "rewards/rejected": -2.9105122089385986, "step": 587 }, { "epoch": 0.15, "grad_norm": 34.12139129638672, "kl": 0.0, "learning_rate": 4.230567914158597e-07, "logps/chosen": -153.0979766845703, "logps/rejected": -236.69737243652344, "loss": 0.2226, "rewards/chosen": 0.7525811791419983, "rewards/margins": 3.2662181854248047, "rewards/rejected": -2.513637065887451, "step": 588 }, { "epoch": 0.15, "grad_norm": 32.09413528442383, "kl": 0.0, "learning_rate": 4.229259356189479e-07, "logps/chosen": -185.1908416748047, "logps/rejected": -255.58477783203125, "loss": 0.3693, "rewards/chosen": -0.6914921402931213, "rewards/margins": 1.7158796787261963, "rewards/rejected": -2.407371759414673, "step": 589 }, { "epoch": 0.15, "grad_norm": 36.44728469848633, "kl": 0.0, "learning_rate": 4.227950798220361e-07, "logps/chosen": -188.67189025878906, "logps/rejected": -242.02938842773438, "loss": 0.3205, "rewards/chosen": 0.6397703886032104, "rewards/margins": 5.1770548820495605, "rewards/rejected": -4.5372843742370605, "step": 590 }, { "epoch": 0.15, "grad_norm": 33.02339172363281, "kl": 0.0, "learning_rate": 4.226642240251243e-07, "logps/chosen": -218.4967803955078, "logps/rejected": -230.99813842773438, "loss": 0.3533, "rewards/chosen": -1.1348559856414795, "rewards/margins": 1.3938672542572021, "rewards/rejected": -2.5287232398986816, "step": 591 }, { "epoch": 0.15, "grad_norm": 38.133026123046875, "kl": 0.0, "learning_rate": 4.225333682282125e-07, "logps/chosen": -347.8610534667969, "logps/rejected": -249.60537719726562, "loss": 0.2811, "rewards/chosen": 1.11420476436615, "rewards/margins": 3.734504222869873, "rewards/rejected": -2.6202993392944336, "step": 592 }, { "epoch": 0.16, "grad_norm": 32.02317428588867, "kl": 0.0, "learning_rate": 4.224025124313007e-07, "logps/chosen": -224.7441864013672, "logps/rejected": -204.56478881835938, "loss": 0.3995, "rewards/chosen": -1.2623833417892456, "rewards/margins": 2.1233153343200684, "rewards/rejected": -3.3856987953186035, "step": 593 }, { "epoch": 0.16, "grad_norm": 37.762245178222656, "kl": 0.0, "learning_rate": 4.2227165663438893e-07, "logps/chosen": -230.5582275390625, "logps/rejected": -199.21585083007812, "loss": 0.3758, "rewards/chosen": -0.6932766437530518, "rewards/margins": 0.7945023775100708, "rewards/rejected": -1.4877790212631226, "step": 594 }, { "epoch": 0.16, "grad_norm": 35.36671829223633, "kl": 0.0, "learning_rate": 4.2214080083747713e-07, "logps/chosen": -280.342041015625, "logps/rejected": -314.884765625, "loss": 0.3282, "rewards/chosen": 0.6207032203674316, "rewards/margins": 3.5934977531433105, "rewards/rejected": -2.972794532775879, "step": 595 }, { "epoch": 0.16, "grad_norm": 33.32785415649414, "kl": 0.0, "learning_rate": 4.2200994504056527e-07, "logps/chosen": -183.12124633789062, "logps/rejected": -246.1369171142578, "loss": 0.3199, "rewards/chosen": -0.36766961216926575, "rewards/margins": 2.0127646923065186, "rewards/rejected": -2.380434274673462, "step": 596 }, { "epoch": 0.16, "grad_norm": 38.22690200805664, "kl": 0.0, "learning_rate": 4.2187908924365347e-07, "logps/chosen": -195.23191833496094, "logps/rejected": -224.2406768798828, "loss": 0.3949, "rewards/chosen": -0.9440802931785583, "rewards/margins": 0.899925172328949, "rewards/rejected": -1.8440054655075073, "step": 597 }, { "epoch": 0.16, "grad_norm": 34.1783332824707, "kl": 0.0, "learning_rate": 4.2174823344674166e-07, "logps/chosen": -123.10539245605469, "logps/rejected": -198.38302612304688, "loss": 0.2468, "rewards/chosen": 0.9094799757003784, "rewards/margins": 3.0081400871276855, "rewards/rejected": -2.0986599922180176, "step": 598 }, { "epoch": 0.16, "grad_norm": 40.53157043457031, "kl": 0.0, "learning_rate": 4.2161737764982986e-07, "logps/chosen": -218.34637451171875, "logps/rejected": -308.47027587890625, "loss": 0.3564, "rewards/chosen": 0.3253116011619568, "rewards/margins": 4.2715067863464355, "rewards/rejected": -3.946195363998413, "step": 599 }, { "epoch": 0.16, "grad_norm": 31.91612434387207, "kl": 0.0, "learning_rate": 4.2148652185291806e-07, "logps/chosen": -251.1618194580078, "logps/rejected": -243.45339965820312, "loss": 0.3264, "rewards/chosen": -1.2941466569900513, "rewards/margins": 1.6014741659164429, "rewards/rejected": -2.895620822906494, "step": 600 }, { "epoch": 0.16, "grad_norm": 37.07390213012695, "kl": 0.0, "learning_rate": 4.2135566605600625e-07, "logps/chosen": -293.2223815917969, "logps/rejected": -259.8509826660156, "loss": 0.3254, "rewards/chosen": -0.37220340967178345, "rewards/margins": 4.210824966430664, "rewards/rejected": -4.583028316497803, "step": 601 }, { "epoch": 0.16, "grad_norm": 33.21635818481445, "kl": 0.0, "learning_rate": 4.2122481025909445e-07, "logps/chosen": -229.32579040527344, "logps/rejected": -207.85536193847656, "loss": 0.3338, "rewards/chosen": -0.4946441054344177, "rewards/margins": 1.616485357284546, "rewards/rejected": -2.1111295223236084, "step": 602 }, { "epoch": 0.16, "grad_norm": 35.14589309692383, "kl": 0.0, "learning_rate": 4.2109395446218264e-07, "logps/chosen": -218.96194458007812, "logps/rejected": -198.21871948242188, "loss": 0.3888, "rewards/chosen": -1.2031952142715454, "rewards/margins": 0.5610319375991821, "rewards/rejected": -1.7642271518707275, "step": 603 }, { "epoch": 0.16, "grad_norm": 32.1645393371582, "kl": 0.0, "learning_rate": 4.2096309866527084e-07, "logps/chosen": -224.2860870361328, "logps/rejected": -265.3037109375, "loss": 0.2591, "rewards/chosen": 0.464304119348526, "rewards/margins": 3.8499107360839844, "rewards/rejected": -3.385606527328491, "step": 604 }, { "epoch": 0.16, "grad_norm": 36.952308654785156, "kl": 0.0, "learning_rate": 4.2083224286835904e-07, "logps/chosen": -258.63189697265625, "logps/rejected": -289.3390808105469, "loss": 0.3015, "rewards/chosen": -1.1682672500610352, "rewards/margins": 2.1730809211730957, "rewards/rejected": -3.341348171234131, "step": 605 }, { "epoch": 0.16, "grad_norm": 27.418455123901367, "kl": 0.0, "learning_rate": 4.2070138707144723e-07, "logps/chosen": -214.0426025390625, "logps/rejected": -213.5802764892578, "loss": 0.3561, "rewards/chosen": -0.10803361982107162, "rewards/margins": 1.9988174438476562, "rewards/rejected": -2.106851100921631, "step": 606 }, { "epoch": 0.16, "grad_norm": 34.8771858215332, "kl": 0.0, "learning_rate": 4.205705312745355e-07, "logps/chosen": -207.3766326904297, "logps/rejected": -240.21054077148438, "loss": 0.3563, "rewards/chosen": -0.24712109565734863, "rewards/margins": 3.139512300491333, "rewards/rejected": -3.3866333961486816, "step": 607 }, { "epoch": 0.16, "grad_norm": 31.268268585205078, "kl": 0.0, "learning_rate": 4.204396754776237e-07, "logps/chosen": -240.3983154296875, "logps/rejected": -267.9146728515625, "loss": 0.3182, "rewards/chosen": -0.6161116361618042, "rewards/margins": 3.3313660621643066, "rewards/rejected": -3.9474778175354004, "step": 608 }, { "epoch": 0.16, "grad_norm": 38.52226257324219, "kl": 0.0, "learning_rate": 4.203088196807119e-07, "logps/chosen": -225.6261749267578, "logps/rejected": -227.69583129882812, "loss": 0.2613, "rewards/chosen": -0.07037395238876343, "rewards/margins": 2.7204816341400146, "rewards/rejected": -2.790855646133423, "step": 609 }, { "epoch": 0.16, "grad_norm": 36.78439712524414, "kl": 0.0, "learning_rate": 4.2017796388380007e-07, "logps/chosen": -194.47763061523438, "logps/rejected": -309.6812438964844, "loss": 0.3235, "rewards/chosen": -1.9775137901306152, "rewards/margins": 8.399917602539062, "rewards/rejected": -10.377431869506836, "step": 610 }, { "epoch": 0.16, "grad_norm": 36.382999420166016, "kl": 0.0, "learning_rate": 4.2004710808688827e-07, "logps/chosen": -271.60491943359375, "logps/rejected": -241.863037109375, "loss": 0.392, "rewards/chosen": -0.5924327373504639, "rewards/margins": 2.0697991847991943, "rewards/rejected": -2.662231922149658, "step": 611 }, { "epoch": 0.16, "grad_norm": 34.29161071777344, "kl": 0.0, "learning_rate": 4.199162522899764e-07, "logps/chosen": -264.40716552734375, "logps/rejected": -227.8290252685547, "loss": 0.3088, "rewards/chosen": -0.9955476522445679, "rewards/margins": 1.4795089960098267, "rewards/rejected": -2.4750566482543945, "step": 612 }, { "epoch": 0.16, "grad_norm": 32.622154235839844, "kl": 0.0, "learning_rate": 4.197853964930646e-07, "logps/chosen": -312.36907958984375, "logps/rejected": -288.0682678222656, "loss": 0.2823, "rewards/chosen": -2.0785109996795654, "rewards/margins": 3.3038604259490967, "rewards/rejected": -5.382371425628662, "step": 613 }, { "epoch": 0.16, "grad_norm": 37.70346450805664, "kl": 0.0, "learning_rate": 4.196545406961528e-07, "logps/chosen": -221.8252716064453, "logps/rejected": -356.6756591796875, "loss": 0.4246, "rewards/chosen": -0.5658776164054871, "rewards/margins": 2.9037880897521973, "rewards/rejected": -3.469665765762329, "step": 614 }, { "epoch": 0.16, "grad_norm": 30.7971248626709, "kl": 0.0, "learning_rate": 4.19523684899241e-07, "logps/chosen": -177.7958221435547, "logps/rejected": -268.3973693847656, "loss": 0.274, "rewards/chosen": 0.7021116018295288, "rewards/margins": 3.8640646934509277, "rewards/rejected": -3.1619529724121094, "step": 615 }, { "epoch": 0.16, "grad_norm": 26.313661575317383, "kl": 0.0, "learning_rate": 4.193928291023292e-07, "logps/chosen": -137.11647033691406, "logps/rejected": -263.5867919921875, "loss": 0.2909, "rewards/chosen": -1.9836171865463257, "rewards/margins": 0.7680078744888306, "rewards/rejected": -2.7516250610351562, "step": 616 }, { "epoch": 0.16, "grad_norm": 31.26655387878418, "kl": 0.0, "learning_rate": 4.192619733054174e-07, "logps/chosen": -233.62945556640625, "logps/rejected": -212.066162109375, "loss": 0.2317, "rewards/chosen": 2.0348241329193115, "rewards/margins": 4.331893444061279, "rewards/rejected": -2.2970693111419678, "step": 617 }, { "epoch": 0.16, "grad_norm": 32.78192138671875, "kl": 0.0, "learning_rate": 4.191311175085056e-07, "logps/chosen": -142.2562713623047, "logps/rejected": -229.1559600830078, "loss": 0.3904, "rewards/chosen": -0.5529773235321045, "rewards/margins": 2.3756163120269775, "rewards/rejected": -2.928593635559082, "step": 618 }, { "epoch": 0.16, "grad_norm": 37.07200241088867, "kl": 0.0, "learning_rate": 4.190002617115938e-07, "logps/chosen": -250.66610717773438, "logps/rejected": -307.2555236816406, "loss": 0.3443, "rewards/chosen": 0.45598897337913513, "rewards/margins": 3.6246819496154785, "rewards/rejected": -3.1686930656433105, "step": 619 }, { "epoch": 0.16, "grad_norm": 34.067134857177734, "kl": 0.0, "learning_rate": 4.1886940591468203e-07, "logps/chosen": -231.55824279785156, "logps/rejected": -225.37667846679688, "loss": 0.2838, "rewards/chosen": -1.5656440258026123, "rewards/margins": 1.4117012023925781, "rewards/rejected": -2.9773452281951904, "step": 620 }, { "epoch": 0.16, "grad_norm": 33.20025634765625, "kl": 0.0, "learning_rate": 4.1873855011777023e-07, "logps/chosen": -286.56365966796875, "logps/rejected": -232.8338623046875, "loss": 0.3906, "rewards/chosen": -1.1976393461227417, "rewards/margins": 1.7728246450424194, "rewards/rejected": -2.970463991165161, "step": 621 }, { "epoch": 0.16, "grad_norm": 31.43844223022461, "kl": 0.0, "learning_rate": 4.186076943208584e-07, "logps/chosen": -161.1515350341797, "logps/rejected": -221.69117736816406, "loss": 0.232, "rewards/chosen": -1.0699456930160522, "rewards/margins": 1.7152012586593628, "rewards/rejected": -2.785146951675415, "step": 622 }, { "epoch": 0.16, "grad_norm": 35.48064041137695, "kl": 0.0, "learning_rate": 4.184768385239466e-07, "logps/chosen": -266.043212890625, "logps/rejected": -332.0105895996094, "loss": 0.3535, "rewards/chosen": -0.6172822117805481, "rewards/margins": 2.453559398651123, "rewards/rejected": -3.0708415508270264, "step": 623 }, { "epoch": 0.16, "grad_norm": 30.414508819580078, "kl": 0.0, "learning_rate": 4.183459827270348e-07, "logps/chosen": -195.5116729736328, "logps/rejected": -175.37014770507812, "loss": 0.2711, "rewards/chosen": -0.0021307978313416243, "rewards/margins": 2.2265679836273193, "rewards/rejected": -2.22869873046875, "step": 624 }, { "epoch": 0.16, "grad_norm": 40.96977996826172, "kl": 0.0, "learning_rate": 4.18215126930123e-07, "logps/chosen": -211.9053497314453, "logps/rejected": -200.890869140625, "loss": 0.3433, "rewards/chosen": -0.9147431254386902, "rewards/margins": 1.683835744857788, "rewards/rejected": -2.598578929901123, "step": 625 }, { "epoch": 0.16, "grad_norm": 32.06422424316406, "kl": 0.0, "learning_rate": 4.180842711332112e-07, "logps/chosen": -168.45120239257812, "logps/rejected": -262.6234130859375, "loss": 0.4266, "rewards/chosen": -1.8037545680999756, "rewards/margins": 0.47751498222351074, "rewards/rejected": -2.2812695503234863, "step": 626 }, { "epoch": 0.16, "grad_norm": 29.26646614074707, "kl": 0.0, "learning_rate": 4.1795341533629935e-07, "logps/chosen": -184.18740844726562, "logps/rejected": -222.1627655029297, "loss": 0.2588, "rewards/chosen": 0.5016359686851501, "rewards/margins": 3.441005229949951, "rewards/rejected": -2.9393692016601562, "step": 627 }, { "epoch": 0.16, "grad_norm": 31.68562126159668, "kl": 0.0, "learning_rate": 4.1782255953938755e-07, "logps/chosen": -177.28375244140625, "logps/rejected": -206.3604736328125, "loss": 0.442, "rewards/chosen": -0.5679258704185486, "rewards/margins": 2.414395809173584, "rewards/rejected": -2.9823217391967773, "step": 628 }, { "epoch": 0.16, "grad_norm": 35.57976150512695, "kl": 0.0, "learning_rate": 4.1769170374247574e-07, "logps/chosen": -192.6014404296875, "logps/rejected": -193.0386505126953, "loss": 0.2338, "rewards/chosen": -0.7253746390342712, "rewards/margins": 1.539360761642456, "rewards/rejected": -2.264735460281372, "step": 629 }, { "epoch": 0.16, "grad_norm": 26.9422607421875, "kl": 0.0, "learning_rate": 4.1756084794556394e-07, "logps/chosen": -192.63441467285156, "logps/rejected": -205.19033813476562, "loss": 0.3647, "rewards/chosen": -1.236472487449646, "rewards/margins": 1.0552409887313843, "rewards/rejected": -2.2917134761810303, "step": 630 }, { "epoch": 0.17, "grad_norm": 34.09742736816406, "kl": 0.0, "learning_rate": 4.1742999214865214e-07, "logps/chosen": -277.0921936035156, "logps/rejected": -268.65423583984375, "loss": 0.3759, "rewards/chosen": -1.4564865827560425, "rewards/margins": 1.5203653573989868, "rewards/rejected": -2.9768519401550293, "step": 631 }, { "epoch": 0.17, "grad_norm": 35.76420211791992, "kl": 0.0, "learning_rate": 4.1729913635174033e-07, "logps/chosen": -220.2989044189453, "logps/rejected": -355.736083984375, "loss": 0.235, "rewards/chosen": 0.18057142198085785, "rewards/margins": 4.921010494232178, "rewards/rejected": -4.740438938140869, "step": 632 }, { "epoch": 0.17, "grad_norm": 24.0476131439209, "kl": 0.0, "learning_rate": 4.171682805548286e-07, "logps/chosen": -152.52206420898438, "logps/rejected": -300.75396728515625, "loss": 0.2029, "rewards/chosen": -0.4941352307796478, "rewards/margins": 4.297951698303223, "rewards/rejected": -4.792087078094482, "step": 633 }, { "epoch": 0.17, "grad_norm": 32.8790397644043, "kl": 0.0, "learning_rate": 4.170374247579168e-07, "logps/chosen": -149.47360229492188, "logps/rejected": -233.9814910888672, "loss": 0.3435, "rewards/chosen": -0.6808183789253235, "rewards/margins": 3.682387590408325, "rewards/rejected": -4.363205909729004, "step": 634 }, { "epoch": 0.17, "grad_norm": 31.50946617126465, "kl": 0.0, "learning_rate": 4.1690656896100497e-07, "logps/chosen": -199.74887084960938, "logps/rejected": -254.82545471191406, "loss": 0.2709, "rewards/chosen": 0.24805386364459991, "rewards/margins": 4.510081768035889, "rewards/rejected": -4.262027740478516, "step": 635 }, { "epoch": 0.17, "grad_norm": 40.11531066894531, "kl": 0.0, "learning_rate": 4.1677571316409317e-07, "logps/chosen": -203.65505981445312, "logps/rejected": -272.3819580078125, "loss": 0.3712, "rewards/chosen": -0.36780452728271484, "rewards/margins": 2.217893123626709, "rewards/rejected": -2.585697650909424, "step": 636 }, { "epoch": 0.17, "grad_norm": 35.829952239990234, "kl": 0.0, "learning_rate": 4.1664485736718136e-07, "logps/chosen": -211.70443725585938, "logps/rejected": -145.94801330566406, "loss": 0.4441, "rewards/chosen": -0.9229274988174438, "rewards/margins": 0.6648210287094116, "rewards/rejected": -1.5877485275268555, "step": 637 }, { "epoch": 0.17, "grad_norm": 33.948787689208984, "kl": 0.0, "learning_rate": 4.1651400157026956e-07, "logps/chosen": -205.24441528320312, "logps/rejected": -201.89073181152344, "loss": 0.3173, "rewards/chosen": 0.8895024061203003, "rewards/margins": 3.2825989723205566, "rewards/rejected": -2.393096446990967, "step": 638 }, { "epoch": 0.17, "grad_norm": 31.641845703125, "kl": 0.0, "learning_rate": 4.1638314577335776e-07, "logps/chosen": -215.01095581054688, "logps/rejected": -254.07305908203125, "loss": 0.3493, "rewards/chosen": -0.3878760039806366, "rewards/margins": 2.593442916870117, "rewards/rejected": -2.981318950653076, "step": 639 }, { "epoch": 0.17, "grad_norm": 27.793249130249023, "kl": 0.0, "learning_rate": 4.1625228997644595e-07, "logps/chosen": -200.00442504882812, "logps/rejected": -264.08099365234375, "loss": 0.3061, "rewards/chosen": -0.18434447050094604, "rewards/margins": 3.4469001293182373, "rewards/rejected": -3.631244659423828, "step": 640 }, { "epoch": 0.17, "grad_norm": 36.78108596801758, "kl": 0.0, "learning_rate": 4.1612143417953415e-07, "logps/chosen": -289.6894226074219, "logps/rejected": -291.83001708984375, "loss": 0.2781, "rewards/chosen": -1.6234275102615356, "rewards/margins": 1.1950386762619019, "rewards/rejected": -2.8184661865234375, "step": 641 }, { "epoch": 0.17, "grad_norm": 34.74748229980469, "kl": 0.0, "learning_rate": 4.159905783826223e-07, "logps/chosen": -244.11862182617188, "logps/rejected": -220.2618865966797, "loss": 0.4239, "rewards/chosen": -1.8187682628631592, "rewards/margins": 0.42972373962402344, "rewards/rejected": -2.2484920024871826, "step": 642 }, { "epoch": 0.17, "grad_norm": 36.53879928588867, "kl": 0.0, "learning_rate": 4.158597225857105e-07, "logps/chosen": -297.8824768066406, "logps/rejected": -242.40858459472656, "loss": 0.3813, "rewards/chosen": -0.17054195702075958, "rewards/margins": 3.0910396575927734, "rewards/rejected": -3.2615816593170166, "step": 643 }, { "epoch": 0.17, "grad_norm": 35.24003219604492, "kl": 0.0, "learning_rate": 4.157288667887987e-07, "logps/chosen": -203.4264373779297, "logps/rejected": -251.9635009765625, "loss": 0.2558, "rewards/chosen": -0.09030131250619888, "rewards/margins": 2.920102119445801, "rewards/rejected": -3.0104033946990967, "step": 644 }, { "epoch": 0.17, "grad_norm": 39.00501251220703, "kl": 0.0, "learning_rate": 4.155980109918869e-07, "logps/chosen": -320.9664306640625, "logps/rejected": -198.15609741210938, "loss": 0.4067, "rewards/chosen": -1.1451497077941895, "rewards/margins": 1.429276704788208, "rewards/rejected": -2.5744264125823975, "step": 645 }, { "epoch": 0.17, "grad_norm": 34.85394287109375, "kl": 0.0, "learning_rate": 4.1546715519497513e-07, "logps/chosen": -257.5217590332031, "logps/rejected": -276.9884033203125, "loss": 0.2091, "rewards/chosen": -0.7587192058563232, "rewards/margins": 2.568091630935669, "rewards/rejected": -3.326810836791992, "step": 646 }, { "epoch": 0.17, "grad_norm": 34.492549896240234, "kl": 0.0, "learning_rate": 4.153362993980633e-07, "logps/chosen": -238.52377319335938, "logps/rejected": -237.36590576171875, "loss": 0.4542, "rewards/chosen": -0.41595375537872314, "rewards/margins": 1.1079202890396118, "rewards/rejected": -1.523874044418335, "step": 647 }, { "epoch": 0.17, "grad_norm": 27.814579010009766, "kl": 0.0, "learning_rate": 4.152054436011515e-07, "logps/chosen": -177.15321350097656, "logps/rejected": -198.52593994140625, "loss": 0.2217, "rewards/chosen": 0.3595758378505707, "rewards/margins": 3.194960117340088, "rewards/rejected": -2.8353843688964844, "step": 648 }, { "epoch": 0.17, "grad_norm": 28.996912002563477, "kl": 0.0, "learning_rate": 4.150745878042397e-07, "logps/chosen": -227.43548583984375, "logps/rejected": -220.3048095703125, "loss": 0.3507, "rewards/chosen": -1.0934927463531494, "rewards/margins": 1.8975601196289062, "rewards/rejected": -2.9910528659820557, "step": 649 }, { "epoch": 0.17, "grad_norm": 36.756919860839844, "kl": 0.0, "learning_rate": 4.149437320073279e-07, "logps/chosen": -231.71620178222656, "logps/rejected": -262.20648193359375, "loss": 0.3661, "rewards/chosen": -0.6306547522544861, "rewards/margins": 2.2297322750091553, "rewards/rejected": -2.860387086868286, "step": 650 }, { "epoch": 0.17, "grad_norm": 36.70586013793945, "kl": 0.0, "learning_rate": 4.148128762104161e-07, "logps/chosen": -317.0033264160156, "logps/rejected": -279.13165283203125, "loss": 0.2521, "rewards/chosen": -0.37969204783439636, "rewards/margins": 3.3592135906219482, "rewards/rejected": -3.738905668258667, "step": 651 }, { "epoch": 0.17, "grad_norm": 46.720123291015625, "kl": 0.0, "learning_rate": 4.146820204135043e-07, "logps/chosen": -292.99957275390625, "logps/rejected": -220.20150756835938, "loss": 0.3879, "rewards/chosen": -0.10919928550720215, "rewards/margins": 2.287581205368042, "rewards/rejected": -2.396780490875244, "step": 652 }, { "epoch": 0.17, "grad_norm": 33.120811462402344, "kl": 0.0, "learning_rate": 4.145511646165925e-07, "logps/chosen": -309.48529052734375, "logps/rejected": -180.5233154296875, "loss": 0.3888, "rewards/chosen": -1.0322825908660889, "rewards/margins": 1.6642029285430908, "rewards/rejected": -2.6964855194091797, "step": 653 }, { "epoch": 0.17, "grad_norm": 30.720664978027344, "kl": 0.0, "learning_rate": 4.144203088196807e-07, "logps/chosen": -141.5933380126953, "logps/rejected": -177.95643615722656, "loss": 0.322, "rewards/chosen": -0.5805988311767578, "rewards/margins": 1.3864021301269531, "rewards/rejected": -1.967000961303711, "step": 654 }, { "epoch": 0.17, "grad_norm": 28.06136703491211, "kl": 0.0, "learning_rate": 4.142894530227689e-07, "logps/chosen": -245.1380615234375, "logps/rejected": -213.3980712890625, "loss": 0.2355, "rewards/chosen": -0.2578246295452118, "rewards/margins": 3.7815134525299072, "rewards/rejected": -4.039338111877441, "step": 655 }, { "epoch": 0.17, "grad_norm": 38.5828971862793, "kl": 0.0, "learning_rate": 4.141585972258571e-07, "logps/chosen": -239.96658325195312, "logps/rejected": -230.351318359375, "loss": 0.3052, "rewards/chosen": -0.9050352573394775, "rewards/margins": 1.6403262615203857, "rewards/rejected": -2.5453615188598633, "step": 656 }, { "epoch": 0.17, "grad_norm": 27.131685256958008, "kl": 0.0, "learning_rate": 4.140277414289453e-07, "logps/chosen": -194.69229125976562, "logps/rejected": -242.11758422851562, "loss": 0.4452, "rewards/chosen": -2.399684429168701, "rewards/margins": 1.4530136585235596, "rewards/rejected": -3.8526980876922607, "step": 657 }, { "epoch": 0.17, "grad_norm": 38.746849060058594, "kl": 0.0, "learning_rate": 4.1389688563203343e-07, "logps/chosen": -147.9233856201172, "logps/rejected": -222.4149627685547, "loss": 0.3078, "rewards/chosen": 1.145562767982483, "rewards/margins": 3.6172871589660645, "rewards/rejected": -2.471724271774292, "step": 658 }, { "epoch": 0.17, "grad_norm": 30.03120994567871, "kl": 0.0, "learning_rate": 4.137660298351217e-07, "logps/chosen": -189.79733276367188, "logps/rejected": -279.6260070800781, "loss": 0.2722, "rewards/chosen": -0.8100014925003052, "rewards/margins": 3.497687816619873, "rewards/rejected": -4.307689189910889, "step": 659 }, { "epoch": 0.17, "grad_norm": 35.37248229980469, "kl": 0.0, "learning_rate": 4.136351740382099e-07, "logps/chosen": -191.43023681640625, "logps/rejected": -192.27333068847656, "loss": 0.3035, "rewards/chosen": -0.273165225982666, "rewards/margins": 2.452721118927002, "rewards/rejected": -2.725886344909668, "step": 660 }, { "epoch": 0.17, "grad_norm": 34.3448600769043, "kl": 0.0, "learning_rate": 4.1350431824129807e-07, "logps/chosen": -280.6360778808594, "logps/rejected": -250.7725372314453, "loss": 0.3402, "rewards/chosen": -1.419081449508667, "rewards/margins": 1.5376768112182617, "rewards/rejected": -2.9567582607269287, "step": 661 }, { "epoch": 0.17, "grad_norm": 29.6294002532959, "kl": 0.0, "learning_rate": 4.1337346244438627e-07, "logps/chosen": -216.47866821289062, "logps/rejected": -193.3513946533203, "loss": 0.2504, "rewards/chosen": 1.5271568298339844, "rewards/margins": 4.2868781089782715, "rewards/rejected": -2.759721279144287, "step": 662 }, { "epoch": 0.17, "grad_norm": 31.874507904052734, "kl": 0.0, "learning_rate": 4.1324260664747446e-07, "logps/chosen": -242.6487274169922, "logps/rejected": -256.5516662597656, "loss": 0.308, "rewards/chosen": -0.4256035387516022, "rewards/margins": 2.785071611404419, "rewards/rejected": -3.2106752395629883, "step": 663 }, { "epoch": 0.17, "grad_norm": 33.38178634643555, "kl": 0.0, "learning_rate": 4.1311175085056266e-07, "logps/chosen": -278.8080139160156, "logps/rejected": -300.10455322265625, "loss": 0.2865, "rewards/chosen": -0.9544926881790161, "rewards/margins": 2.5183773040771484, "rewards/rejected": -3.472870111465454, "step": 664 }, { "epoch": 0.17, "grad_norm": 29.179576873779297, "kl": 0.0, "learning_rate": 4.1298089505365086e-07, "logps/chosen": -169.78851318359375, "logps/rejected": -210.986572265625, "loss": 0.2921, "rewards/chosen": -0.48213818669319153, "rewards/margins": 2.5531556606292725, "rewards/rejected": -3.0352938175201416, "step": 665 }, { "epoch": 0.17, "grad_norm": 29.703609466552734, "kl": 0.0, "learning_rate": 4.1285003925673905e-07, "logps/chosen": -247.66671752929688, "logps/rejected": -162.64111328125, "loss": 0.3197, "rewards/chosen": -0.4724392890930176, "rewards/margins": 1.8395106792449951, "rewards/rejected": -2.3119499683380127, "step": 666 }, { "epoch": 0.17, "grad_norm": 28.3862247467041, "kl": 0.0, "learning_rate": 4.1271918345982725e-07, "logps/chosen": -225.37664794921875, "logps/rejected": -327.6197204589844, "loss": 0.3564, "rewards/chosen": -1.468646764755249, "rewards/margins": 1.1545610427856445, "rewards/rejected": -2.6232078075408936, "step": 667 }, { "epoch": 0.17, "grad_norm": 38.491947174072266, "kl": 0.0, "learning_rate": 4.1258832766291544e-07, "logps/chosen": -182.65261840820312, "logps/rejected": -207.8057861328125, "loss": 0.4306, "rewards/chosen": -0.28253644704818726, "rewards/margins": 1.8097026348114014, "rewards/rejected": -2.0922391414642334, "step": 668 }, { "epoch": 0.18, "grad_norm": 40.138816833496094, "kl": 0.0, "learning_rate": 4.1245747186600364e-07, "logps/chosen": -298.6606140136719, "logps/rejected": -283.8145751953125, "loss": 0.2849, "rewards/chosen": -0.5111945271492004, "rewards/margins": 2.231806993484497, "rewards/rejected": -2.7430014610290527, "step": 669 }, { "epoch": 0.18, "grad_norm": 43.718994140625, "kl": 0.0, "learning_rate": 4.1232661606909184e-07, "logps/chosen": -194.55783081054688, "logps/rejected": -236.71018981933594, "loss": 0.3596, "rewards/chosen": -0.527443528175354, "rewards/margins": 2.3515138626098633, "rewards/rejected": -2.8789572715759277, "step": 670 }, { "epoch": 0.18, "grad_norm": 35.09980392456055, "kl": 0.0, "learning_rate": 4.121957602721801e-07, "logps/chosen": -190.4247589111328, "logps/rejected": -208.2495574951172, "loss": 0.2523, "rewards/chosen": 0.05250234156847, "rewards/margins": 1.295493483543396, "rewards/rejected": -1.2429910898208618, "step": 671 }, { "epoch": 0.18, "grad_norm": 27.459993362426758, "kl": 0.0, "learning_rate": 4.120649044752683e-07, "logps/chosen": -271.6382751464844, "logps/rejected": -225.0758514404297, "loss": 0.2745, "rewards/chosen": -1.4889888763427734, "rewards/margins": 1.235353708267212, "rewards/rejected": -2.7243425846099854, "step": 672 }, { "epoch": 0.18, "grad_norm": 28.141847610473633, "kl": 0.0, "learning_rate": 4.119340486783564e-07, "logps/chosen": -279.026123046875, "logps/rejected": -240.21389770507812, "loss": 0.2851, "rewards/chosen": 0.7028965950012207, "rewards/margins": 3.4597043991088867, "rewards/rejected": -2.756807804107666, "step": 673 }, { "epoch": 0.18, "grad_norm": 28.72968101501465, "kl": 0.0, "learning_rate": 4.118031928814446e-07, "logps/chosen": -163.95077514648438, "logps/rejected": -258.7970275878906, "loss": 0.4225, "rewards/chosen": -0.9373546242713928, "rewards/margins": 1.647026538848877, "rewards/rejected": -2.584381103515625, "step": 674 }, { "epoch": 0.18, "grad_norm": 38.473487854003906, "kl": 0.0, "learning_rate": 4.116723370845328e-07, "logps/chosen": -280.1283264160156, "logps/rejected": -214.69662475585938, "loss": 0.2826, "rewards/chosen": 0.9605988264083862, "rewards/margins": 2.3107542991638184, "rewards/rejected": -1.3501554727554321, "step": 675 }, { "epoch": 0.18, "grad_norm": 35.380428314208984, "kl": 0.0, "learning_rate": 4.11541481287621e-07, "logps/chosen": -258.97076416015625, "logps/rejected": -240.42025756835938, "loss": 0.3657, "rewards/chosen": 0.23240822553634644, "rewards/margins": 2.945939540863037, "rewards/rejected": -2.713531255722046, "step": 676 }, { "epoch": 0.18, "grad_norm": 32.557743072509766, "kl": 0.0, "learning_rate": 4.114106254907092e-07, "logps/chosen": -272.61553955078125, "logps/rejected": -235.87657165527344, "loss": 0.406, "rewards/chosen": -1.2601265907287598, "rewards/margins": 1.2022619247436523, "rewards/rejected": -2.462388515472412, "step": 677 }, { "epoch": 0.18, "grad_norm": 39.19186019897461, "kl": 0.0, "learning_rate": 4.112797696937974e-07, "logps/chosen": -255.88790893554688, "logps/rejected": -164.4857940673828, "loss": 0.401, "rewards/chosen": -0.7437317967414856, "rewards/margins": 0.8636577725410461, "rewards/rejected": -1.6073895692825317, "step": 678 }, { "epoch": 0.18, "grad_norm": 39.593902587890625, "kl": 0.0, "learning_rate": 4.111489138968856e-07, "logps/chosen": -185.75872802734375, "logps/rejected": -262.96502685546875, "loss": 0.2612, "rewards/chosen": 1.2888895273208618, "rewards/margins": 3.809917449951172, "rewards/rejected": -2.5210278034210205, "step": 679 }, { "epoch": 0.18, "grad_norm": 34.13742446899414, "kl": 0.0, "learning_rate": 4.110180580999738e-07, "logps/chosen": -247.64129638671875, "logps/rejected": -184.8863067626953, "loss": 0.3106, "rewards/chosen": 0.7847617864608765, "rewards/margins": 3.7181215286254883, "rewards/rejected": -2.9333596229553223, "step": 680 }, { "epoch": 0.18, "grad_norm": 31.16827392578125, "kl": 0.0, "learning_rate": 4.10887202303062e-07, "logps/chosen": -312.4378967285156, "logps/rejected": -152.6389923095703, "loss": 0.3165, "rewards/chosen": 0.8620835542678833, "rewards/margins": 2.3805339336395264, "rewards/rejected": -1.518450379371643, "step": 681 }, { "epoch": 0.18, "grad_norm": 34.24968338012695, "kl": 0.0, "learning_rate": 4.107563465061502e-07, "logps/chosen": -227.52528381347656, "logps/rejected": -248.9477996826172, "loss": 0.3509, "rewards/chosen": -0.8197565078735352, "rewards/margins": 0.4830266237258911, "rewards/rejected": -1.3027831315994263, "step": 682 }, { "epoch": 0.18, "grad_norm": 33.73033142089844, "kl": 0.0, "learning_rate": 4.106254907092384e-07, "logps/chosen": -190.738037109375, "logps/rejected": -196.4342498779297, "loss": 0.2827, "rewards/chosen": -0.5406025052070618, "rewards/margins": 2.1505613327026367, "rewards/rejected": -2.6911637783050537, "step": 683 }, { "epoch": 0.18, "grad_norm": 27.662273406982422, "kl": 0.0, "learning_rate": 4.1049463491232663e-07, "logps/chosen": -239.35816955566406, "logps/rejected": -242.4320068359375, "loss": 0.3093, "rewards/chosen": -1.608490228652954, "rewards/margins": 1.3844609260559082, "rewards/rejected": -2.9929511547088623, "step": 684 }, { "epoch": 0.18, "grad_norm": 42.05573654174805, "kl": 0.0, "learning_rate": 4.1036377911541483e-07, "logps/chosen": -277.0669860839844, "logps/rejected": -207.38720703125, "loss": 0.4063, "rewards/chosen": -0.9567424654960632, "rewards/margins": 0.5377658009529114, "rewards/rejected": -1.4945082664489746, "step": 685 }, { "epoch": 0.18, "grad_norm": 38.324893951416016, "kl": 0.0, "learning_rate": 4.10232923318503e-07, "logps/chosen": -287.05841064453125, "logps/rejected": -240.85494995117188, "loss": 0.2541, "rewards/chosen": 0.4614275097846985, "rewards/margins": 3.325218915939331, "rewards/rejected": -2.8637914657592773, "step": 686 }, { "epoch": 0.18, "grad_norm": 43.04709243774414, "kl": 0.0, "learning_rate": 4.101020675215912e-07, "logps/chosen": -195.44039916992188, "logps/rejected": -273.97906494140625, "loss": 0.2414, "rewards/chosen": 0.7585437297821045, "rewards/margins": 3.664416551589966, "rewards/rejected": -2.9058728218078613, "step": 687 }, { "epoch": 0.18, "grad_norm": 34.49761199951172, "kl": 0.0, "learning_rate": 4.099712117246794e-07, "logps/chosen": -261.4673767089844, "logps/rejected": -224.2903594970703, "loss": 0.2617, "rewards/chosen": 0.09347956627607346, "rewards/margins": 2.2225935459136963, "rewards/rejected": -2.1291139125823975, "step": 688 }, { "epoch": 0.18, "grad_norm": 32.107749938964844, "kl": 0.0, "learning_rate": 4.0984035592776756e-07, "logps/chosen": -219.35438537597656, "logps/rejected": -246.00823974609375, "loss": 0.2946, "rewards/chosen": -0.08469846844673157, "rewards/margins": 3.9561054706573486, "rewards/rejected": -4.040803909301758, "step": 689 }, { "epoch": 0.18, "grad_norm": 27.49036979675293, "kl": 0.0, "learning_rate": 4.0970950013085576e-07, "logps/chosen": -270.11932373046875, "logps/rejected": -237.36947631835938, "loss": 0.3557, "rewards/chosen": -0.5459706783294678, "rewards/margins": 2.900925397872925, "rewards/rejected": -3.4468960762023926, "step": 690 }, { "epoch": 0.18, "grad_norm": 30.500295639038086, "kl": 0.0, "learning_rate": 4.0957864433394395e-07, "logps/chosen": -253.16883850097656, "logps/rejected": -164.48680114746094, "loss": 0.3479, "rewards/chosen": -0.3456551730632782, "rewards/margins": 2.1069955825805664, "rewards/rejected": -2.452650785446167, "step": 691 }, { "epoch": 0.18, "grad_norm": 43.131839752197266, "kl": 0.0, "learning_rate": 4.0944778853703215e-07, "logps/chosen": -270.4461975097656, "logps/rejected": -240.70751953125, "loss": 0.337, "rewards/chosen": 0.804510235786438, "rewards/margins": 2.1393561363220215, "rewards/rejected": -1.334845781326294, "step": 692 }, { "epoch": 0.18, "grad_norm": 32.07833480834961, "kl": 0.0, "learning_rate": 4.0931693274012035e-07, "logps/chosen": -249.09523010253906, "logps/rejected": -230.19061279296875, "loss": 0.406, "rewards/chosen": -1.9368900060653687, "rewards/margins": 1.1023679971694946, "rewards/rejected": -3.0392580032348633, "step": 693 }, { "epoch": 0.18, "grad_norm": 33.6291618347168, "kl": 0.0, "learning_rate": 4.0918607694320854e-07, "logps/chosen": -245.869140625, "logps/rejected": -304.74322509765625, "loss": 0.3806, "rewards/chosen": -3.02115797996521, "rewards/margins": 0.01259756088256836, "rewards/rejected": -3.0337555408477783, "step": 694 }, { "epoch": 0.18, "grad_norm": 33.45709991455078, "kl": 0.0, "learning_rate": 4.0905522114629674e-07, "logps/chosen": -254.72018432617188, "logps/rejected": -244.65679931640625, "loss": 0.3315, "rewards/chosen": -1.7159273624420166, "rewards/margins": 0.9991257190704346, "rewards/rejected": -2.715053081512451, "step": 695 }, { "epoch": 0.18, "grad_norm": 40.40908432006836, "kl": 0.0, "learning_rate": 4.0892436534938493e-07, "logps/chosen": -275.8647155761719, "logps/rejected": -266.954345703125, "loss": 0.4751, "rewards/chosen": -1.3798705339431763, "rewards/margins": 0.4689023494720459, "rewards/rejected": -1.8487728834152222, "step": 696 }, { "epoch": 0.18, "grad_norm": 30.23027992248535, "kl": 0.0, "learning_rate": 4.087935095524732e-07, "logps/chosen": -178.09228515625, "logps/rejected": -278.37213134765625, "loss": 0.3044, "rewards/chosen": -0.40018245577812195, "rewards/margins": 4.596784591674805, "rewards/rejected": -4.99696683883667, "step": 697 }, { "epoch": 0.18, "grad_norm": 34.83966064453125, "kl": 0.0, "learning_rate": 4.086626537555614e-07, "logps/chosen": -211.6435546875, "logps/rejected": -201.00558471679688, "loss": 0.1905, "rewards/chosen": 0.7989832758903503, "rewards/margins": 4.710301876068115, "rewards/rejected": -3.91131854057312, "step": 698 }, { "epoch": 0.18, "grad_norm": 38.152313232421875, "kl": 0.0, "learning_rate": 4.085317979586496e-07, "logps/chosen": -196.98355102539062, "logps/rejected": -302.0910339355469, "loss": 0.279, "rewards/chosen": -1.0276482105255127, "rewards/margins": 2.165983200073242, "rewards/rejected": -3.193631410598755, "step": 699 }, { "epoch": 0.18, "grad_norm": 29.363718032836914, "kl": 0.0, "learning_rate": 4.0840094216173777e-07, "logps/chosen": -239.1320343017578, "logps/rejected": -264.94970703125, "loss": 0.2588, "rewards/chosen": 0.14816318452358246, "rewards/margins": 4.279280662536621, "rewards/rejected": -4.131117343902588, "step": 700 }, { "epoch": 0.18, "grad_norm": 44.35689926147461, "kl": 0.0, "learning_rate": 4.0827008636482597e-07, "logps/chosen": -221.08709716796875, "logps/rejected": -205.1977996826172, "loss": 0.3414, "rewards/chosen": 0.24511413276195526, "rewards/margins": 3.009829521179199, "rewards/rejected": -2.7647154331207275, "step": 701 }, { "epoch": 0.18, "grad_norm": 38.40766906738281, "kl": 0.0, "learning_rate": 4.0813923056791416e-07, "logps/chosen": -210.72152709960938, "logps/rejected": -245.8160858154297, "loss": 0.3501, "rewards/chosen": 0.41612035036087036, "rewards/margins": 3.429255485534668, "rewards/rejected": -3.0131351947784424, "step": 702 }, { "epoch": 0.18, "grad_norm": 26.7491512298584, "kl": 0.0, "learning_rate": 4.0800837477100236e-07, "logps/chosen": -259.2345275878906, "logps/rejected": -299.02435302734375, "loss": 0.4981, "rewards/chosen": -2.508211374282837, "rewards/margins": -1.5818876028060913, "rewards/rejected": -0.9263237714767456, "step": 703 }, { "epoch": 0.18, "grad_norm": 37.0341911315918, "kl": 0.0, "learning_rate": 4.078775189740905e-07, "logps/chosen": -234.02003479003906, "logps/rejected": -213.5331573486328, "loss": 0.2716, "rewards/chosen": 0.10059448331594467, "rewards/margins": 1.6921581029891968, "rewards/rejected": -1.5915635824203491, "step": 704 }, { "epoch": 0.18, "grad_norm": 35.68254470825195, "kl": 0.0, "learning_rate": 4.077466631771787e-07, "logps/chosen": -214.12158203125, "logps/rejected": -266.066162109375, "loss": 0.3309, "rewards/chosen": -0.1318395882844925, "rewards/margins": 2.9635441303253174, "rewards/rejected": -3.095383644104004, "step": 705 }, { "epoch": 0.18, "grad_norm": 31.081863403320312, "kl": 0.0, "learning_rate": 4.076158073802669e-07, "logps/chosen": -182.43540954589844, "logps/rejected": -272.5527038574219, "loss": 0.3125, "rewards/chosen": 0.043891992419958115, "rewards/margins": 2.875239372253418, "rewards/rejected": -2.8313474655151367, "step": 706 }, { "epoch": 0.19, "grad_norm": 38.6630973815918, "kl": 0.0, "learning_rate": 4.074849515833551e-07, "logps/chosen": -417.9666748046875, "logps/rejected": -233.2468719482422, "loss": 0.374, "rewards/chosen": -0.5666962265968323, "rewards/margins": 2.2654359340667725, "rewards/rejected": -2.83213210105896, "step": 707 }, { "epoch": 0.19, "grad_norm": 27.11510467529297, "kl": 0.0, "learning_rate": 4.073540957864433e-07, "logps/chosen": -112.95711517333984, "logps/rejected": -215.8408966064453, "loss": 0.2604, "rewards/chosen": 1.1336884498596191, "rewards/margins": 4.025924205780029, "rewards/rejected": -2.89223575592041, "step": 708 }, { "epoch": 0.19, "grad_norm": 28.093170166015625, "kl": 0.0, "learning_rate": 4.072232399895315e-07, "logps/chosen": -271.0452575683594, "logps/rejected": -252.80313110351562, "loss": 0.3726, "rewards/chosen": 0.4887799322605133, "rewards/margins": 4.524040222167969, "rewards/rejected": -4.035260200500488, "step": 709 }, { "epoch": 0.19, "grad_norm": 31.50494384765625, "kl": 0.0, "learning_rate": 4.0709238419261973e-07, "logps/chosen": -262.0550537109375, "logps/rejected": -349.9918518066406, "loss": 0.2159, "rewards/chosen": -0.8234438896179199, "rewards/margins": 5.1398701667785645, "rewards/rejected": -5.963314056396484, "step": 710 }, { "epoch": 0.19, "grad_norm": 30.006282806396484, "kl": 0.0, "learning_rate": 4.0696152839570793e-07, "logps/chosen": -198.72296142578125, "logps/rejected": -301.7229919433594, "loss": 0.2859, "rewards/chosen": 1.8006759881973267, "rewards/margins": 5.980476379394531, "rewards/rejected": -4.179800510406494, "step": 711 }, { "epoch": 0.19, "grad_norm": 32.877410888671875, "kl": 0.0, "learning_rate": 4.068306725987961e-07, "logps/chosen": -285.0302429199219, "logps/rejected": -271.06414794921875, "loss": 0.2594, "rewards/chosen": -0.30313801765441895, "rewards/margins": 2.021474838256836, "rewards/rejected": -2.324612855911255, "step": 712 }, { "epoch": 0.19, "grad_norm": 29.744258880615234, "kl": 0.0, "learning_rate": 4.066998168018843e-07, "logps/chosen": -179.3466796875, "logps/rejected": -232.30099487304688, "loss": 0.2743, "rewards/chosen": 1.125133752822876, "rewards/margins": 5.002973556518555, "rewards/rejected": -3.877840042114258, "step": 713 }, { "epoch": 0.19, "grad_norm": 37.29856872558594, "kl": 0.0, "learning_rate": 4.065689610049725e-07, "logps/chosen": -248.66502380371094, "logps/rejected": -288.7413330078125, "loss": 0.3168, "rewards/chosen": -0.6923273205757141, "rewards/margins": 1.4932734966278076, "rewards/rejected": -2.185600757598877, "step": 714 }, { "epoch": 0.19, "grad_norm": 33.92723083496094, "kl": 0.0, "learning_rate": 4.064381052080607e-07, "logps/chosen": -213.46885681152344, "logps/rejected": -257.55938720703125, "loss": 0.2876, "rewards/chosen": -0.8362697958946228, "rewards/margins": 3.334930896759033, "rewards/rejected": -4.171200752258301, "step": 715 }, { "epoch": 0.19, "grad_norm": 30.284574508666992, "kl": 0.0, "learning_rate": 4.063072494111489e-07, "logps/chosen": -181.6216278076172, "logps/rejected": -147.3481903076172, "loss": 0.2717, "rewards/chosen": -0.6255307793617249, "rewards/margins": 2.0915329456329346, "rewards/rejected": -2.7170636653900146, "step": 716 }, { "epoch": 0.19, "grad_norm": 38.27483367919922, "kl": 0.0, "learning_rate": 4.061763936142371e-07, "logps/chosen": -180.31236267089844, "logps/rejected": -254.57325744628906, "loss": 0.3388, "rewards/chosen": -0.985579788684845, "rewards/margins": 3.0883657932281494, "rewards/rejected": -4.07394552230835, "step": 717 }, { "epoch": 0.19, "grad_norm": 25.0219669342041, "kl": 0.0, "learning_rate": 4.060455378173253e-07, "logps/chosen": -241.9582977294922, "logps/rejected": -283.1595458984375, "loss": 0.2424, "rewards/chosen": 0.6642072200775146, "rewards/margins": 5.217754364013672, "rewards/rejected": -4.553547382354736, "step": 718 }, { "epoch": 0.19, "grad_norm": 46.34675216674805, "kl": 0.0, "learning_rate": 4.059146820204135e-07, "logps/chosen": -250.87704467773438, "logps/rejected": -292.99713134765625, "loss": 0.4718, "rewards/chosen": -1.2733533382415771, "rewards/margins": 1.5549085140228271, "rewards/rejected": -2.8282618522644043, "step": 719 }, { "epoch": 0.19, "grad_norm": 32.27806091308594, "kl": 0.0, "learning_rate": 4.0578382622350164e-07, "logps/chosen": -293.7688903808594, "logps/rejected": -239.4044952392578, "loss": 0.2566, "rewards/chosen": -0.23062366247177124, "rewards/margins": 2.491293430328369, "rewards/rejected": -2.721917152404785, "step": 720 }, { "epoch": 0.19, "grad_norm": 34.6161003112793, "kl": 0.0, "learning_rate": 4.0565297042658984e-07, "logps/chosen": -192.505615234375, "logps/rejected": -220.963134765625, "loss": 0.3112, "rewards/chosen": -0.9563778638839722, "rewards/margins": 1.9204825162887573, "rewards/rejected": -2.8768603801727295, "step": 721 }, { "epoch": 0.19, "grad_norm": 31.44890594482422, "kl": 0.0, "learning_rate": 4.0552211462967803e-07, "logps/chosen": -160.75257873535156, "logps/rejected": -272.4718933105469, "loss": 0.203, "rewards/chosen": 0.37976592779159546, "rewards/margins": 5.139260292053223, "rewards/rejected": -4.759494304656982, "step": 722 }, { "epoch": 0.19, "grad_norm": 42.7325439453125, "kl": 0.0, "learning_rate": 4.053912588327663e-07, "logps/chosen": -181.53472900390625, "logps/rejected": -288.3123779296875, "loss": 0.3532, "rewards/chosen": 0.9533110857009888, "rewards/margins": 3.2428879737854004, "rewards/rejected": -2.289577007293701, "step": 723 }, { "epoch": 0.19, "grad_norm": 38.448360443115234, "kl": 0.0, "learning_rate": 4.052604030358545e-07, "logps/chosen": -236.40652465820312, "logps/rejected": -234.30282592773438, "loss": 0.2709, "rewards/chosen": 0.140406996011734, "rewards/margins": 5.7353973388671875, "rewards/rejected": -5.594990253448486, "step": 724 }, { "epoch": 0.19, "grad_norm": 35.80790710449219, "kl": 0.0, "learning_rate": 4.051295472389427e-07, "logps/chosen": -330.27020263671875, "logps/rejected": -213.01524353027344, "loss": 0.2631, "rewards/chosen": -0.5340307950973511, "rewards/margins": 1.4044686555862427, "rewards/rejected": -1.9384994506835938, "step": 725 }, { "epoch": 0.19, "grad_norm": 36.39134216308594, "kl": 0.0, "learning_rate": 4.0499869144203087e-07, "logps/chosen": -249.36798095703125, "logps/rejected": -221.98782348632812, "loss": 0.3286, "rewards/chosen": -0.7803311347961426, "rewards/margins": 2.9748716354370117, "rewards/rejected": -3.7552027702331543, "step": 726 }, { "epoch": 0.19, "grad_norm": 29.210561752319336, "kl": 0.0, "learning_rate": 4.0486783564511907e-07, "logps/chosen": -224.4700164794922, "logps/rejected": -207.71670532226562, "loss": 0.2853, "rewards/chosen": 1.7469308376312256, "rewards/margins": 4.717103004455566, "rewards/rejected": -2.9701719284057617, "step": 727 }, { "epoch": 0.19, "grad_norm": 31.407520294189453, "kl": 0.0, "learning_rate": 4.0473697984820726e-07, "logps/chosen": -224.3354034423828, "logps/rejected": -240.8251953125, "loss": 0.1928, "rewards/chosen": 1.139357089996338, "rewards/margins": 3.9633731842041016, "rewards/rejected": -2.8240160942077637, "step": 728 }, { "epoch": 0.19, "grad_norm": 29.426000595092773, "kl": 0.0, "learning_rate": 4.0460612405129546e-07, "logps/chosen": -196.70066833496094, "logps/rejected": -204.2018280029297, "loss": 0.3489, "rewards/chosen": 0.37072229385375977, "rewards/margins": 3.5842480659484863, "rewards/rejected": -3.2135257720947266, "step": 729 }, { "epoch": 0.19, "grad_norm": 31.12305450439453, "kl": 0.0, "learning_rate": 4.0447526825438365e-07, "logps/chosen": -161.67770385742188, "logps/rejected": -298.96490478515625, "loss": 0.2341, "rewards/chosen": -0.16333018243312836, "rewards/margins": 3.9531426429748535, "rewards/rejected": -4.1164727210998535, "step": 730 }, { "epoch": 0.19, "grad_norm": 30.204750061035156, "kl": 0.0, "learning_rate": 4.0434441245747185e-07, "logps/chosen": -208.2093505859375, "logps/rejected": -243.5538787841797, "loss": 0.3252, "rewards/chosen": -0.614426851272583, "rewards/margins": 4.058218002319336, "rewards/rejected": -4.67264461517334, "step": 731 }, { "epoch": 0.19, "grad_norm": 40.43870544433594, "kl": 0.0, "learning_rate": 4.0421355666056005e-07, "logps/chosen": -240.05763244628906, "logps/rejected": -222.07400512695312, "loss": 0.4016, "rewards/chosen": -1.9779905080795288, "rewards/margins": 0.4816056489944458, "rewards/rejected": -2.4595961570739746, "step": 732 }, { "epoch": 0.19, "grad_norm": 34.84939956665039, "kl": 0.0, "learning_rate": 4.0408270086364824e-07, "logps/chosen": -223.86077880859375, "logps/rejected": -283.632568359375, "loss": 0.2978, "rewards/chosen": -0.9752624034881592, "rewards/margins": 3.3159701824188232, "rewards/rejected": -4.291232585906982, "step": 733 }, { "epoch": 0.19, "grad_norm": 27.223562240600586, "kl": 0.0, "learning_rate": 4.0395184506673644e-07, "logps/chosen": -153.81460571289062, "logps/rejected": -231.26126098632812, "loss": 0.3124, "rewards/chosen": -1.2617738246917725, "rewards/margins": 2.4990763664245605, "rewards/rejected": -3.760850191116333, "step": 734 }, { "epoch": 0.19, "grad_norm": 37.23539352416992, "kl": 0.0, "learning_rate": 4.038209892698246e-07, "logps/chosen": -207.6356964111328, "logps/rejected": -197.76123046875, "loss": 0.2752, "rewards/chosen": 0.7991582751274109, "rewards/margins": 4.932627201080322, "rewards/rejected": -4.133469104766846, "step": 735 }, { "epoch": 0.19, "grad_norm": 32.925113677978516, "kl": 0.0, "learning_rate": 4.0369013347291283e-07, "logps/chosen": -155.8492889404297, "logps/rejected": -218.74008178710938, "loss": 0.2874, "rewards/chosen": 0.39519715309143066, "rewards/margins": 3.102106809616089, "rewards/rejected": -2.706909656524658, "step": 736 }, { "epoch": 0.19, "grad_norm": 29.543956756591797, "kl": 0.0, "learning_rate": 4.0355927767600103e-07, "logps/chosen": -244.0968017578125, "logps/rejected": -110.96414184570312, "loss": 0.4021, "rewards/chosen": -0.7129060626029968, "rewards/margins": 1.0993714332580566, "rewards/rejected": -1.8122774362564087, "step": 737 }, { "epoch": 0.19, "grad_norm": 31.560298919677734, "kl": 0.0, "learning_rate": 4.034284218790892e-07, "logps/chosen": -276.5502624511719, "logps/rejected": -237.48460388183594, "loss": 0.176, "rewards/chosen": -0.5135056376457214, "rewards/margins": 2.5795295238494873, "rewards/rejected": -3.0930352210998535, "step": 738 }, { "epoch": 0.19, "grad_norm": 41.65924835205078, "kl": 0.0, "learning_rate": 4.032975660821774e-07, "logps/chosen": -245.82003784179688, "logps/rejected": -316.614013671875, "loss": 0.3456, "rewards/chosen": -0.2647044062614441, "rewards/margins": 3.4045932292938232, "rewards/rejected": -3.669297695159912, "step": 739 }, { "epoch": 0.19, "grad_norm": 32.747581481933594, "kl": 0.0, "learning_rate": 4.031667102852656e-07, "logps/chosen": -249.56219482421875, "logps/rejected": -236.9765625, "loss": 0.197, "rewards/chosen": 0.49999967217445374, "rewards/margins": 4.147944450378418, "rewards/rejected": -3.647944688796997, "step": 740 }, { "epoch": 0.19, "grad_norm": 19.987966537475586, "kl": 0.0, "learning_rate": 4.030358544883538e-07, "logps/chosen": -179.17042541503906, "logps/rejected": -272.2667541503906, "loss": 0.3512, "rewards/chosen": -0.9662373065948486, "rewards/margins": 2.9937665462493896, "rewards/rejected": -3.9600038528442383, "step": 741 }, { "epoch": 0.19, "grad_norm": 29.590360641479492, "kl": 0.0, "learning_rate": 4.02904998691442e-07, "logps/chosen": -203.13223266601562, "logps/rejected": -251.3955078125, "loss": 0.2955, "rewards/chosen": 0.5274904370307922, "rewards/margins": 4.236715793609619, "rewards/rejected": -3.7092251777648926, "step": 742 }, { "epoch": 0.19, "grad_norm": 35.01331329345703, "kl": 0.0, "learning_rate": 4.027741428945302e-07, "logps/chosen": -230.51190185546875, "logps/rejected": -259.7726745605469, "loss": 0.3293, "rewards/chosen": 1.5364973545074463, "rewards/margins": 5.86799430847168, "rewards/rejected": -4.3314971923828125, "step": 743 }, { "epoch": 0.19, "grad_norm": 31.845294952392578, "kl": 0.0, "learning_rate": 4.026432870976184e-07, "logps/chosen": -213.4208984375, "logps/rejected": -289.29815673828125, "loss": 0.3252, "rewards/chosen": -0.8649560213088989, "rewards/margins": 2.6383767127990723, "rewards/rejected": -3.5033326148986816, "step": 744 }, { "epoch": 0.19, "grad_norm": 36.557167053222656, "kl": 0.0, "learning_rate": 4.025124313007066e-07, "logps/chosen": -256.5347900390625, "logps/rejected": -235.42843627929688, "loss": 0.3793, "rewards/chosen": -1.1741714477539062, "rewards/margins": 0.9963676929473877, "rewards/rejected": -2.170539140701294, "step": 745 }, { "epoch": 0.2, "grad_norm": 28.335918426513672, "kl": 0.0, "learning_rate": 4.023815755037948e-07, "logps/chosen": -231.30001831054688, "logps/rejected": -236.99227905273438, "loss": 0.2908, "rewards/chosen": -1.4739785194396973, "rewards/margins": 1.9531621932983398, "rewards/rejected": -3.427140712738037, "step": 746 }, { "epoch": 0.2, "grad_norm": 31.791410446166992, "kl": 0.0, "learning_rate": 4.02250719706883e-07, "logps/chosen": -297.44525146484375, "logps/rejected": -186.52516174316406, "loss": 0.3306, "rewards/chosen": -1.067051887512207, "rewards/margins": 1.990215539932251, "rewards/rejected": -3.057267427444458, "step": 747 }, { "epoch": 0.2, "grad_norm": 32.735816955566406, "kl": 0.0, "learning_rate": 4.0211986390997124e-07, "logps/chosen": -227.0963897705078, "logps/rejected": -272.2472839355469, "loss": 0.2703, "rewards/chosen": -0.6910209655761719, "rewards/margins": 2.193660259246826, "rewards/rejected": -2.884681224822998, "step": 748 }, { "epoch": 0.2, "grad_norm": 29.596969604492188, "kl": 0.0, "learning_rate": 4.0198900811305943e-07, "logps/chosen": -227.31190490722656, "logps/rejected": -416.7991638183594, "loss": 0.3962, "rewards/chosen": -1.4798189401626587, "rewards/margins": 1.5923632383346558, "rewards/rejected": -3.0721821784973145, "step": 749 }, { "epoch": 0.2, "grad_norm": 27.834747314453125, "kl": 0.0, "learning_rate": 4.0185815231614763e-07, "logps/chosen": -251.62158203125, "logps/rejected": -175.81283569335938, "loss": 0.3261, "rewards/chosen": 0.20242267847061157, "rewards/margins": 3.325491189956665, "rewards/rejected": -3.1230685710906982, "step": 750 }, { "epoch": 0.2, "grad_norm": 25.614028930664062, "kl": 0.0, "learning_rate": 4.0172729651923577e-07, "logps/chosen": -137.84701538085938, "logps/rejected": -316.3259582519531, "loss": 0.3197, "rewards/chosen": -1.730677604675293, "rewards/margins": 1.8563814163208008, "rewards/rejected": -3.5870590209960938, "step": 751 }, { "epoch": 0.2, "grad_norm": 21.44118881225586, "kl": 0.0, "learning_rate": 4.0159644072232397e-07, "logps/chosen": -244.3109893798828, "logps/rejected": -223.697021484375, "loss": 0.2882, "rewards/chosen": 3.215583324432373, "rewards/margins": 6.77751350402832, "rewards/rejected": -3.5619304180145264, "step": 752 }, { "epoch": 0.2, "grad_norm": 35.268070220947266, "kl": 0.0, "learning_rate": 4.0146558492541216e-07, "logps/chosen": -205.3878631591797, "logps/rejected": -285.7146301269531, "loss": 0.2874, "rewards/chosen": 0.2579948902130127, "rewards/margins": 6.138503074645996, "rewards/rejected": -5.880507946014404, "step": 753 }, { "epoch": 0.2, "grad_norm": 36.49318313598633, "kl": 0.0, "learning_rate": 4.0133472912850036e-07, "logps/chosen": -299.867919921875, "logps/rejected": -236.50704956054688, "loss": 0.3799, "rewards/chosen": 0.2085830122232437, "rewards/margins": 3.1150100231170654, "rewards/rejected": -2.9064269065856934, "step": 754 }, { "epoch": 0.2, "grad_norm": 40.88338851928711, "kl": 0.0, "learning_rate": 4.0120387333158856e-07, "logps/chosen": -255.27439880371094, "logps/rejected": -266.44586181640625, "loss": 0.4113, "rewards/chosen": -1.1876412630081177, "rewards/margins": 2.202840805053711, "rewards/rejected": -3.390482187271118, "step": 755 }, { "epoch": 0.2, "grad_norm": 29.45452880859375, "kl": 0.0, "learning_rate": 4.0107301753467675e-07, "logps/chosen": -275.1477966308594, "logps/rejected": -321.3170166015625, "loss": 0.3062, "rewards/chosen": -1.2740792036056519, "rewards/margins": 2.2294044494628906, "rewards/rejected": -3.503483772277832, "step": 756 }, { "epoch": 0.2, "grad_norm": 30.438161849975586, "kl": 0.0, "learning_rate": 4.0094216173776495e-07, "logps/chosen": -149.62611389160156, "logps/rejected": -329.66607666015625, "loss": 0.2647, "rewards/chosen": 0.9496825337409973, "rewards/margins": 3.765160083770752, "rewards/rejected": -2.8154776096343994, "step": 757 }, { "epoch": 0.2, "grad_norm": 36.988014221191406, "kl": 0.0, "learning_rate": 4.0081130594085314e-07, "logps/chosen": -206.1259765625, "logps/rejected": -309.4862976074219, "loss": 0.3293, "rewards/chosen": -0.4844217896461487, "rewards/margins": 1.4037492275238037, "rewards/rejected": -1.8881710767745972, "step": 758 }, { "epoch": 0.2, "grad_norm": 35.78725814819336, "kl": 0.0, "learning_rate": 4.0068045014394134e-07, "logps/chosen": -182.4107208251953, "logps/rejected": -163.76229858398438, "loss": 0.3308, "rewards/chosen": -0.25232523679733276, "rewards/margins": 2.208416700363159, "rewards/rejected": -2.4607419967651367, "step": 759 }, { "epoch": 0.2, "grad_norm": 24.877836227416992, "kl": 0.0, "learning_rate": 4.0054959434702954e-07, "logps/chosen": -205.92320251464844, "logps/rejected": -265.5527648925781, "loss": 0.3524, "rewards/chosen": 0.09859514236450195, "rewards/margins": 2.4194495677948, "rewards/rejected": -2.320854425430298, "step": 760 }, { "epoch": 0.2, "grad_norm": 34.51003646850586, "kl": 0.0, "learning_rate": 4.004187385501178e-07, "logps/chosen": -168.7631378173828, "logps/rejected": -230.7778778076172, "loss": 0.4157, "rewards/chosen": -1.2124521732330322, "rewards/margins": 0.986685037612915, "rewards/rejected": -2.1991372108459473, "step": 761 }, { "epoch": 0.2, "grad_norm": 41.73928451538086, "kl": 0.0, "learning_rate": 4.00287882753206e-07, "logps/chosen": -195.952880859375, "logps/rejected": -289.32244873046875, "loss": 0.3697, "rewards/chosen": -0.5307539105415344, "rewards/margins": 2.295314311981201, "rewards/rejected": -2.826068162918091, "step": 762 }, { "epoch": 0.2, "grad_norm": 41.18114471435547, "kl": 0.0, "learning_rate": 4.001570269562942e-07, "logps/chosen": -235.7907257080078, "logps/rejected": -300.2009582519531, "loss": 0.3011, "rewards/chosen": -0.7677313685417175, "rewards/margins": 2.5477404594421387, "rewards/rejected": -3.315471887588501, "step": 763 }, { "epoch": 0.2, "grad_norm": 31.07491683959961, "kl": 0.0, "learning_rate": 4.000261711593824e-07, "logps/chosen": -199.22885131835938, "logps/rejected": -272.57720947265625, "loss": 0.3352, "rewards/chosen": -1.1168694496154785, "rewards/margins": 3.9204258918762207, "rewards/rejected": -5.037295341491699, "step": 764 }, { "epoch": 0.2, "grad_norm": 31.78119659423828, "kl": 0.0, "learning_rate": 3.9989531536247057e-07, "logps/chosen": -228.1612091064453, "logps/rejected": -297.5334167480469, "loss": 0.2313, "rewards/chosen": 0.9916869401931763, "rewards/margins": 4.4241437911987305, "rewards/rejected": -3.4324569702148438, "step": 765 }, { "epoch": 0.2, "grad_norm": 29.278806686401367, "kl": 0.0, "learning_rate": 3.997644595655587e-07, "logps/chosen": -241.4243621826172, "logps/rejected": -190.7371826171875, "loss": 0.3067, "rewards/chosen": -1.7682384252548218, "rewards/margins": 1.0480161905288696, "rewards/rejected": -2.8162546157836914, "step": 766 }, { "epoch": 0.2, "grad_norm": 35.81673049926758, "kl": 0.0, "learning_rate": 3.996336037686469e-07, "logps/chosen": -196.4593963623047, "logps/rejected": -266.684814453125, "loss": 0.2646, "rewards/chosen": -1.6129564046859741, "rewards/margins": 1.3678327798843384, "rewards/rejected": -2.9807891845703125, "step": 767 }, { "epoch": 0.2, "grad_norm": 41.0791130065918, "kl": 0.0, "learning_rate": 3.995027479717351e-07, "logps/chosen": -232.69644165039062, "logps/rejected": -138.3663787841797, "loss": 0.3468, "rewards/chosen": -0.28881752490997314, "rewards/margins": 1.794512152671814, "rewards/rejected": -2.083329677581787, "step": 768 }, { "epoch": 0.2, "grad_norm": 42.20341110229492, "kl": 0.0, "learning_rate": 3.993718921748233e-07, "logps/chosen": -226.63327026367188, "logps/rejected": -290.8764343261719, "loss": 0.3823, "rewards/chosen": -0.42782607674598694, "rewards/margins": 5.769502639770508, "rewards/rejected": -6.197328567504883, "step": 769 }, { "epoch": 0.2, "grad_norm": 25.241504669189453, "kl": 0.0, "learning_rate": 3.992410363779115e-07, "logps/chosen": -147.48744201660156, "logps/rejected": -215.68814086914062, "loss": 0.2609, "rewards/chosen": -0.21872329711914062, "rewards/margins": 3.802682399749756, "rewards/rejected": -4.0214056968688965, "step": 770 }, { "epoch": 0.2, "grad_norm": 34.434165954589844, "kl": 0.0, "learning_rate": 3.991101805809997e-07, "logps/chosen": -243.2288818359375, "logps/rejected": -340.7574462890625, "loss": 0.3147, "rewards/chosen": 0.6935755014419556, "rewards/margins": 5.347521781921387, "rewards/rejected": -4.653946399688721, "step": 771 }, { "epoch": 0.2, "grad_norm": 30.723644256591797, "kl": 0.0, "learning_rate": 3.989793247840879e-07, "logps/chosen": -243.35061645507812, "logps/rejected": -166.70977783203125, "loss": 0.4454, "rewards/chosen": -0.8095527291297913, "rewards/margins": 2.356269359588623, "rewards/rejected": -3.1658220291137695, "step": 772 }, { "epoch": 0.2, "grad_norm": 31.855859756469727, "kl": 0.0, "learning_rate": 3.988484689871761e-07, "logps/chosen": -212.469970703125, "logps/rejected": -177.2041473388672, "loss": 0.2554, "rewards/chosen": 0.17340558767318726, "rewards/margins": 3.40055251121521, "rewards/rejected": -3.227146863937378, "step": 773 }, { "epoch": 0.2, "grad_norm": 31.05317497253418, "kl": 0.0, "learning_rate": 3.9871761319026434e-07, "logps/chosen": -134.61614990234375, "logps/rejected": -240.55783081054688, "loss": 0.2107, "rewards/chosen": 0.7359917163848877, "rewards/margins": 3.681432008743286, "rewards/rejected": -2.9454402923583984, "step": 774 }, { "epoch": 0.2, "grad_norm": 27.210365295410156, "kl": 0.0, "learning_rate": 3.9858675739335253e-07, "logps/chosen": -221.8593292236328, "logps/rejected": -191.375, "loss": 0.2359, "rewards/chosen": 0.355133056640625, "rewards/margins": 3.7575221061706543, "rewards/rejected": -3.4023890495300293, "step": 775 }, { "epoch": 0.2, "grad_norm": 25.456256866455078, "kl": 0.0, "learning_rate": 3.9845590159644073e-07, "logps/chosen": -272.27392578125, "logps/rejected": -279.5438232421875, "loss": 0.2631, "rewards/chosen": -1.3574875593185425, "rewards/margins": 2.907285690307617, "rewards/rejected": -4.264773368835449, "step": 776 }, { "epoch": 0.2, "grad_norm": 42.95475387573242, "kl": 0.0, "learning_rate": 3.983250457995289e-07, "logps/chosen": -223.12966918945312, "logps/rejected": -245.69009399414062, "loss": 0.4357, "rewards/chosen": -1.1708242893218994, "rewards/margins": 0.8256139755249023, "rewards/rejected": -1.9964382648468018, "step": 777 }, { "epoch": 0.2, "grad_norm": 41.72541427612305, "kl": 0.0, "learning_rate": 3.981941900026171e-07, "logps/chosen": -184.670654296875, "logps/rejected": -201.3939208984375, "loss": 0.2308, "rewards/chosen": -0.840877890586853, "rewards/margins": 1.4940797090530396, "rewards/rejected": -2.3349575996398926, "step": 778 }, { "epoch": 0.2, "grad_norm": 27.049043655395508, "kl": 0.0, "learning_rate": 3.980633342057053e-07, "logps/chosen": -215.769287109375, "logps/rejected": -183.2026824951172, "loss": 0.2251, "rewards/chosen": 1.3883291482925415, "rewards/margins": 4.350786209106445, "rewards/rejected": -2.9624569416046143, "step": 779 }, { "epoch": 0.2, "grad_norm": 42.32488250732422, "kl": 0.0, "learning_rate": 3.979324784087935e-07, "logps/chosen": -271.8233947753906, "logps/rejected": -286.89044189453125, "loss": 0.3337, "rewards/chosen": -2.232872724533081, "rewards/margins": 2.1625797748565674, "rewards/rejected": -4.395452499389648, "step": 780 }, { "epoch": 0.2, "grad_norm": 34.89527130126953, "kl": 0.0, "learning_rate": 3.978016226118817e-07, "logps/chosen": -201.700439453125, "logps/rejected": -280.5649719238281, "loss": 0.2726, "rewards/chosen": 1.1064777374267578, "rewards/margins": 5.503815174102783, "rewards/rejected": -4.397337436676025, "step": 781 }, { "epoch": 0.2, "grad_norm": 38.49657440185547, "kl": 0.0, "learning_rate": 3.9767076681496985e-07, "logps/chosen": -210.4178924560547, "logps/rejected": -293.0738525390625, "loss": 0.3281, "rewards/chosen": -1.2712124586105347, "rewards/margins": 3.368025302886963, "rewards/rejected": -4.639237880706787, "step": 782 }, { "epoch": 0.2, "grad_norm": 30.322834014892578, "kl": 0.0, "learning_rate": 3.9753991101805805e-07, "logps/chosen": -178.96539306640625, "logps/rejected": -257.24505615234375, "loss": 0.3139, "rewards/chosen": -0.5862289667129517, "rewards/margins": 6.326912879943848, "rewards/rejected": -6.91314172744751, "step": 783 }, { "epoch": 0.21, "grad_norm": 36.877342224121094, "kl": 0.0, "learning_rate": 3.9740905522114624e-07, "logps/chosen": -254.62535095214844, "logps/rejected": -236.10403442382812, "loss": 0.2649, "rewards/chosen": -0.45170319080352783, "rewards/margins": 2.7409448623657227, "rewards/rejected": -3.192647933959961, "step": 784 }, { "epoch": 0.21, "grad_norm": 35.72412109375, "kl": 0.0, "learning_rate": 3.9727819942423444e-07, "logps/chosen": -272.28033447265625, "logps/rejected": -329.8775939941406, "loss": 0.3099, "rewards/chosen": -2.322260856628418, "rewards/margins": 2.1673550605773926, "rewards/rejected": -4.4896159172058105, "step": 785 }, { "epoch": 0.21, "grad_norm": 35.31153869628906, "kl": 0.0, "learning_rate": 3.9714734362732264e-07, "logps/chosen": -208.0709228515625, "logps/rejected": -185.36216735839844, "loss": 0.3253, "rewards/chosen": -1.4396045207977295, "rewards/margins": 1.5532078742980957, "rewards/rejected": -2.992812395095825, "step": 786 }, { "epoch": 0.21, "grad_norm": 30.253881454467773, "kl": 0.0, "learning_rate": 3.970164878304109e-07, "logps/chosen": -256.6632080078125, "logps/rejected": -263.5473937988281, "loss": 0.243, "rewards/chosen": 3.115372657775879, "rewards/margins": 5.554112434387207, "rewards/rejected": -2.438739776611328, "step": 787 }, { "epoch": 0.21, "grad_norm": 32.833534240722656, "kl": 0.0, "learning_rate": 3.968856320334991e-07, "logps/chosen": -235.54193115234375, "logps/rejected": -272.4754333496094, "loss": 0.3234, "rewards/chosen": -1.1567193269729614, "rewards/margins": 1.9879382848739624, "rewards/rejected": -3.144657611846924, "step": 788 }, { "epoch": 0.21, "grad_norm": 43.316383361816406, "kl": 0.0, "learning_rate": 3.967547762365873e-07, "logps/chosen": -204.2913818359375, "logps/rejected": -260.9977111816406, "loss": 0.3539, "rewards/chosen": -0.3340170979499817, "rewards/margins": 2.9788312911987305, "rewards/rejected": -3.3128483295440674, "step": 789 }, { "epoch": 0.21, "grad_norm": 32.47597122192383, "kl": 0.0, "learning_rate": 3.9662392043967547e-07, "logps/chosen": -313.443603515625, "logps/rejected": -299.80126953125, "loss": 0.3158, "rewards/chosen": -2.1112234592437744, "rewards/margins": 4.413530349731445, "rewards/rejected": -6.524753570556641, "step": 790 }, { "epoch": 0.21, "grad_norm": 34.65275573730469, "kl": 0.0, "learning_rate": 3.9649306464276367e-07, "logps/chosen": -205.21121215820312, "logps/rejected": -269.4504089355469, "loss": 0.5045, "rewards/chosen": -1.286717414855957, "rewards/margins": 0.5504343509674072, "rewards/rejected": -1.8371517658233643, "step": 791 }, { "epoch": 0.21, "grad_norm": 29.15659523010254, "kl": 0.0, "learning_rate": 3.9636220884585187e-07, "logps/chosen": -244.59078979492188, "logps/rejected": -208.80099487304688, "loss": 0.4093, "rewards/chosen": -2.4560344219207764, "rewards/margins": 0.5167515277862549, "rewards/rejected": -2.9727859497070312, "step": 792 }, { "epoch": 0.21, "grad_norm": 35.65248107910156, "kl": 0.0, "learning_rate": 3.9623135304894006e-07, "logps/chosen": -271.3067932128906, "logps/rejected": -251.15574645996094, "loss": 0.3733, "rewards/chosen": -0.8604080677032471, "rewards/margins": 1.2890377044677734, "rewards/rejected": -2.1494457721710205, "step": 793 }, { "epoch": 0.21, "grad_norm": 37.173133850097656, "kl": 0.0, "learning_rate": 3.9610049725202826e-07, "logps/chosen": -294.5159606933594, "logps/rejected": -212.5648956298828, "loss": 0.3855, "rewards/chosen": -1.1113216876983643, "rewards/margins": 0.6509411334991455, "rewards/rejected": -1.7622628211975098, "step": 794 }, { "epoch": 0.21, "grad_norm": 47.86101531982422, "kl": 0.0, "learning_rate": 3.9596964145511645e-07, "logps/chosen": -238.9080047607422, "logps/rejected": -328.6092224121094, "loss": 0.3277, "rewards/chosen": -2.1608142852783203, "rewards/margins": 2.6770687103271484, "rewards/rejected": -4.837882995605469, "step": 795 }, { "epoch": 0.21, "grad_norm": 34.385520935058594, "kl": 0.0, "learning_rate": 3.9583878565820465e-07, "logps/chosen": -236.8154754638672, "logps/rejected": -218.8819122314453, "loss": 0.1998, "rewards/chosen": 0.4100772738456726, "rewards/margins": 3.6112098693847656, "rewards/rejected": -3.2011325359344482, "step": 796 }, { "epoch": 0.21, "grad_norm": 32.56373596191406, "kl": 0.0, "learning_rate": 3.957079298612928e-07, "logps/chosen": -166.31546020507812, "logps/rejected": -275.31103515625, "loss": 0.2913, "rewards/chosen": 0.30415087938308716, "rewards/margins": 4.374905586242676, "rewards/rejected": -4.070754528045654, "step": 797 }, { "epoch": 0.21, "grad_norm": 40.8636474609375, "kl": 0.0, "learning_rate": 3.95577074064381e-07, "logps/chosen": -213.78359985351562, "logps/rejected": -246.65036010742188, "loss": 0.3221, "rewards/chosen": -0.7096244096755981, "rewards/margins": 1.3897720575332642, "rewards/rejected": -2.0993964672088623, "step": 798 }, { "epoch": 0.21, "grad_norm": 43.956844329833984, "kl": 0.0, "learning_rate": 3.9544621826746924e-07, "logps/chosen": -202.52725219726562, "logps/rejected": -294.99359130859375, "loss": 0.3391, "rewards/chosen": -0.27378353476524353, "rewards/margins": 3.027306318283081, "rewards/rejected": -3.3010897636413574, "step": 799 }, { "epoch": 0.21, "grad_norm": 29.847885131835938, "kl": 0.0, "learning_rate": 3.9531536247055743e-07, "logps/chosen": -252.54580688476562, "logps/rejected": -257.87762451171875, "loss": 0.2948, "rewards/chosen": -0.18890556693077087, "rewards/margins": 3.8304316997528076, "rewards/rejected": -4.019337177276611, "step": 800 }, { "epoch": 0.21, "grad_norm": 31.92094612121582, "kl": 0.0, "learning_rate": 3.9518450667364563e-07, "logps/chosen": -227.11886596679688, "logps/rejected": -212.2671356201172, "loss": 0.3175, "rewards/chosen": -0.3426421582698822, "rewards/margins": 1.967758297920227, "rewards/rejected": -2.3104004859924316, "step": 801 }, { "epoch": 0.21, "grad_norm": 30.653305053710938, "kl": 0.0, "learning_rate": 3.950536508767338e-07, "logps/chosen": -308.0470886230469, "logps/rejected": -205.42645263671875, "loss": 0.4064, "rewards/chosen": -1.1722359657287598, "rewards/margins": 1.6847832202911377, "rewards/rejected": -2.8570191860198975, "step": 802 }, { "epoch": 0.21, "grad_norm": 27.230607986450195, "kl": 0.0, "learning_rate": 3.94922795079822e-07, "logps/chosen": -236.7538299560547, "logps/rejected": -220.7267303466797, "loss": 0.134, "rewards/chosen": -1.016723394393921, "rewards/margins": 2.2433347702026367, "rewards/rejected": -3.2600581645965576, "step": 803 }, { "epoch": 0.21, "grad_norm": 31.964181900024414, "kl": 0.0, "learning_rate": 3.947919392829102e-07, "logps/chosen": -150.38731384277344, "logps/rejected": -342.32757568359375, "loss": 0.2802, "rewards/chosen": 0.08924896270036697, "rewards/margins": 3.7942516803741455, "rewards/rejected": -3.705002784729004, "step": 804 }, { "epoch": 0.21, "grad_norm": 42.96985626220703, "kl": 0.0, "learning_rate": 3.946610834859984e-07, "logps/chosen": -248.42263793945312, "logps/rejected": -238.973876953125, "loss": 0.3615, "rewards/chosen": -0.34363865852355957, "rewards/margins": 2.0021135807037354, "rewards/rejected": -2.345752239227295, "step": 805 }, { "epoch": 0.21, "grad_norm": 45.3122673034668, "kl": 0.0, "learning_rate": 3.945302276890866e-07, "logps/chosen": -219.62779235839844, "logps/rejected": -255.35231018066406, "loss": 0.3963, "rewards/chosen": -0.4257380962371826, "rewards/margins": 2.4570958614349365, "rewards/rejected": -2.882833957672119, "step": 806 }, { "epoch": 0.21, "grad_norm": 49.268367767333984, "kl": 0.0, "learning_rate": 3.943993718921748e-07, "logps/chosen": -139.48182678222656, "logps/rejected": -227.3538055419922, "loss": 0.305, "rewards/chosen": -0.5484585165977478, "rewards/margins": 1.365720510482788, "rewards/rejected": -1.9141790866851807, "step": 807 }, { "epoch": 0.21, "grad_norm": 37.057613372802734, "kl": 0.0, "learning_rate": 3.94268516095263e-07, "logps/chosen": -265.90673828125, "logps/rejected": -205.68800354003906, "loss": 0.3106, "rewards/chosen": -1.8103305101394653, "rewards/margins": 1.3182934522628784, "rewards/rejected": -3.1286239624023438, "step": 808 }, { "epoch": 0.21, "grad_norm": 37.79048538208008, "kl": 0.0, "learning_rate": 3.941376602983512e-07, "logps/chosen": -219.385986328125, "logps/rejected": -269.8956298828125, "loss": 0.4439, "rewards/chosen": -1.3443603515625, "rewards/margins": 3.233832836151123, "rewards/rejected": -4.578193187713623, "step": 809 }, { "epoch": 0.21, "grad_norm": 31.44384002685547, "kl": 0.0, "learning_rate": 3.940068045014394e-07, "logps/chosen": -180.49928283691406, "logps/rejected": -192.64857482910156, "loss": 0.3292, "rewards/chosen": -0.6538400650024414, "rewards/margins": 2.0370359420776367, "rewards/rejected": -2.690876007080078, "step": 810 }, { "epoch": 0.21, "grad_norm": 37.697723388671875, "kl": 0.0, "learning_rate": 3.938759487045276e-07, "logps/chosen": -219.0982666015625, "logps/rejected": -220.82827758789062, "loss": 0.2949, "rewards/chosen": -0.7485886216163635, "rewards/margins": 2.1607255935668945, "rewards/rejected": -2.9093141555786133, "step": 811 }, { "epoch": 0.21, "grad_norm": 38.43693923950195, "kl": 0.0, "learning_rate": 3.9374509290761584e-07, "logps/chosen": -150.76431274414062, "logps/rejected": -232.7530059814453, "loss": 0.4337, "rewards/chosen": -0.958981990814209, "rewards/margins": 2.0869970321655273, "rewards/rejected": -3.0459790229797363, "step": 812 }, { "epoch": 0.21, "grad_norm": 32.30561447143555, "kl": 0.0, "learning_rate": 3.93614237110704e-07, "logps/chosen": -216.4032745361328, "logps/rejected": -408.9929504394531, "loss": 0.359, "rewards/chosen": 0.07472788542509079, "rewards/margins": 3.6807384490966797, "rewards/rejected": -3.606010675430298, "step": 813 }, { "epoch": 0.21, "grad_norm": 28.573406219482422, "kl": 0.0, "learning_rate": 3.934833813137922e-07, "logps/chosen": -224.9008026123047, "logps/rejected": -248.16273498535156, "loss": 0.3129, "rewards/chosen": -2.0450844764709473, "rewards/margins": 2.191556930541992, "rewards/rejected": -4.2366414070129395, "step": 814 }, { "epoch": 0.21, "grad_norm": 36.37868118286133, "kl": 0.0, "learning_rate": 3.933525255168804e-07, "logps/chosen": -284.7607116699219, "logps/rejected": -218.2286834716797, "loss": 0.2874, "rewards/chosen": -1.0571905374526978, "rewards/margins": 1.2404452562332153, "rewards/rejected": -2.297635793685913, "step": 815 }, { "epoch": 0.21, "grad_norm": 36.04764938354492, "kl": 0.0, "learning_rate": 3.9322166971996857e-07, "logps/chosen": -190.5691680908203, "logps/rejected": -253.7047576904297, "loss": 0.3125, "rewards/chosen": -0.7160643935203552, "rewards/margins": 1.988569974899292, "rewards/rejected": -2.704634428024292, "step": 816 }, { "epoch": 0.21, "grad_norm": 38.212379455566406, "kl": 0.0, "learning_rate": 3.9309081392305677e-07, "logps/chosen": -264.66156005859375, "logps/rejected": -331.8368225097656, "loss": 0.3209, "rewards/chosen": -1.0136640071868896, "rewards/margins": 3.238999605178833, "rewards/rejected": -4.252663612365723, "step": 817 }, { "epoch": 0.21, "grad_norm": 31.041854858398438, "kl": 0.0, "learning_rate": 3.9295995812614496e-07, "logps/chosen": -195.3871307373047, "logps/rejected": -211.3892059326172, "loss": 0.3107, "rewards/chosen": -0.8108863830566406, "rewards/margins": 2.6762940883636475, "rewards/rejected": -3.487180471420288, "step": 818 }, { "epoch": 0.21, "grad_norm": 37.01015853881836, "kl": 0.0, "learning_rate": 3.9282910232923316e-07, "logps/chosen": -186.79855346679688, "logps/rejected": -205.82369995117188, "loss": 0.3073, "rewards/chosen": -0.5886488556861877, "rewards/margins": 1.8132927417755127, "rewards/rejected": -2.4019415378570557, "step": 819 }, { "epoch": 0.21, "grad_norm": 35.67362976074219, "kl": 0.0, "learning_rate": 3.9269824653232136e-07, "logps/chosen": -237.49017333984375, "logps/rejected": -328.6663513183594, "loss": 0.3152, "rewards/chosen": 0.14792077243328094, "rewards/margins": 3.6670782566070557, "rewards/rejected": -3.5191574096679688, "step": 820 }, { "epoch": 0.21, "grad_norm": 39.48514175415039, "kl": 0.0, "learning_rate": 3.9256739073540955e-07, "logps/chosen": -218.6199493408203, "logps/rejected": -374.603271484375, "loss": 0.2194, "rewards/chosen": 0.45418599247932434, "rewards/margins": 3.1380836963653564, "rewards/rejected": -2.6838977336883545, "step": 821 }, { "epoch": 0.22, "grad_norm": 32.8681755065918, "kl": 0.0, "learning_rate": 3.9243653493849775e-07, "logps/chosen": -204.17843627929688, "logps/rejected": -323.87896728515625, "loss": 0.3093, "rewards/chosen": 0.1345633566379547, "rewards/margins": 3.1268160343170166, "rewards/rejected": -2.9922525882720947, "step": 822 }, { "epoch": 0.22, "grad_norm": 34.83428192138672, "kl": 0.0, "learning_rate": 3.9230567914158594e-07, "logps/chosen": -280.830078125, "logps/rejected": -185.23431396484375, "loss": 0.3816, "rewards/chosen": -0.6364680528640747, "rewards/margins": 2.2966771125793457, "rewards/rejected": -2.933145046234131, "step": 823 }, { "epoch": 0.22, "grad_norm": 32.58478546142578, "kl": 0.0, "learning_rate": 3.9217482334467414e-07, "logps/chosen": -229.3389434814453, "logps/rejected": -239.52867126464844, "loss": 0.41, "rewards/chosen": -0.861241340637207, "rewards/margins": 1.8840315341949463, "rewards/rejected": -2.7452728748321533, "step": 824 }, { "epoch": 0.22, "grad_norm": 36.9166374206543, "kl": 0.0, "learning_rate": 3.920439675477624e-07, "logps/chosen": -230.52386474609375, "logps/rejected": -196.36956787109375, "loss": 0.4067, "rewards/chosen": -0.9328143000602722, "rewards/margins": 1.2270958423614502, "rewards/rejected": -2.159910202026367, "step": 825 }, { "epoch": 0.22, "grad_norm": 30.93659210205078, "kl": 0.0, "learning_rate": 3.919131117508506e-07, "logps/chosen": -269.2838439941406, "logps/rejected": -263.96917724609375, "loss": 0.3151, "rewards/chosen": -0.5156112909317017, "rewards/margins": 3.1444249153137207, "rewards/rejected": -3.660036325454712, "step": 826 }, { "epoch": 0.22, "grad_norm": 38.69263458251953, "kl": 0.0, "learning_rate": 3.917822559539388e-07, "logps/chosen": -322.1570739746094, "logps/rejected": -225.15899658203125, "loss": 0.3515, "rewards/chosen": -0.8002814650535583, "rewards/margins": 2.331637382507324, "rewards/rejected": -3.1319189071655273, "step": 827 }, { "epoch": 0.22, "grad_norm": 30.892852783203125, "kl": 0.0, "learning_rate": 3.916514001570269e-07, "logps/chosen": -158.5875701904297, "logps/rejected": -316.0892028808594, "loss": 0.2362, "rewards/chosen": -0.21511435508728027, "rewards/margins": 5.075641632080078, "rewards/rejected": -5.2907562255859375, "step": 828 }, { "epoch": 0.22, "grad_norm": 24.3665771484375, "kl": 0.0, "learning_rate": 3.915205443601151e-07, "logps/chosen": -233.41485595703125, "logps/rejected": -254.3380584716797, "loss": 0.3087, "rewards/chosen": 0.7938384413719177, "rewards/margins": 4.078500747680664, "rewards/rejected": -3.2846624851226807, "step": 829 }, { "epoch": 0.22, "grad_norm": 26.198406219482422, "kl": 0.0, "learning_rate": 3.913896885632033e-07, "logps/chosen": -134.5040283203125, "logps/rejected": -127.63214111328125, "loss": 0.2277, "rewards/chosen": -0.1338224709033966, "rewards/margins": 1.969205379486084, "rewards/rejected": -2.103027820587158, "step": 830 }, { "epoch": 0.22, "grad_norm": 38.63385772705078, "kl": 0.0, "learning_rate": 3.912588327662915e-07, "logps/chosen": -223.22711181640625, "logps/rejected": -281.6058654785156, "loss": 0.3643, "rewards/chosen": -0.4442996084690094, "rewards/margins": 1.7670749425888062, "rewards/rejected": -2.211374521255493, "step": 831 }, { "epoch": 0.22, "grad_norm": 40.71464538574219, "kl": 0.0, "learning_rate": 3.911279769693797e-07, "logps/chosen": -272.858154296875, "logps/rejected": -323.8492126464844, "loss": 0.3892, "rewards/chosen": -1.7337590456008911, "rewards/margins": 2.563958168029785, "rewards/rejected": -4.297717094421387, "step": 832 }, { "epoch": 0.22, "grad_norm": 41.5645751953125, "kl": 0.0, "learning_rate": 3.909971211724679e-07, "logps/chosen": -202.53445434570312, "logps/rejected": -205.5897979736328, "loss": 0.3593, "rewards/chosen": -0.435829758644104, "rewards/margins": 3.275907039642334, "rewards/rejected": -3.7117366790771484, "step": 833 }, { "epoch": 0.22, "grad_norm": 36.07457733154297, "kl": 0.0, "learning_rate": 3.908662653755561e-07, "logps/chosen": -190.55044555664062, "logps/rejected": -263.7154846191406, "loss": 0.2823, "rewards/chosen": 1.6647402048110962, "rewards/margins": 6.3530097007751465, "rewards/rejected": -4.68826961517334, "step": 834 }, { "epoch": 0.22, "grad_norm": 37.38973617553711, "kl": 0.0, "learning_rate": 3.907354095786443e-07, "logps/chosen": -190.0474090576172, "logps/rejected": -329.23077392578125, "loss": 0.3735, "rewards/chosen": 0.38552647829055786, "rewards/margins": 5.102424621582031, "rewards/rejected": -4.716897964477539, "step": 835 }, { "epoch": 0.22, "grad_norm": 34.40928268432617, "kl": 0.0, "learning_rate": 3.906045537817325e-07, "logps/chosen": -252.2081298828125, "logps/rejected": -211.90760803222656, "loss": 0.2441, "rewards/chosen": 0.08200878649950027, "rewards/margins": 3.04998517036438, "rewards/rejected": -2.9679763317108154, "step": 836 }, { "epoch": 0.22, "grad_norm": 29.55794906616211, "kl": 0.0, "learning_rate": 3.9047369798482074e-07, "logps/chosen": -313.3184814453125, "logps/rejected": -272.58660888671875, "loss": 0.275, "rewards/chosen": 0.34192368388175964, "rewards/margins": 4.394167900085449, "rewards/rejected": -4.052244186401367, "step": 837 }, { "epoch": 0.22, "grad_norm": 41.28226089477539, "kl": 0.0, "learning_rate": 3.9034284218790894e-07, "logps/chosen": -200.53805541992188, "logps/rejected": -207.01441955566406, "loss": 0.4246, "rewards/chosen": -0.12353704869747162, "rewards/margins": 2.117542028427124, "rewards/rejected": -2.241079092025757, "step": 838 }, { "epoch": 0.22, "grad_norm": 35.18424606323242, "kl": 0.0, "learning_rate": 3.9021198639099713e-07, "logps/chosen": -211.63534545898438, "logps/rejected": -174.66592407226562, "loss": 0.453, "rewards/chosen": -1.374969244003296, "rewards/margins": 1.2978932857513428, "rewards/rejected": -2.6728625297546387, "step": 839 }, { "epoch": 0.22, "grad_norm": 25.139108657836914, "kl": 0.0, "learning_rate": 3.9008113059408533e-07, "logps/chosen": -101.32666778564453, "logps/rejected": -270.806640625, "loss": 0.3132, "rewards/chosen": -0.28592658042907715, "rewards/margins": 2.875089168548584, "rewards/rejected": -3.161015748977661, "step": 840 }, { "epoch": 0.22, "grad_norm": 27.314390182495117, "kl": 0.0, "learning_rate": 3.8995027479717353e-07, "logps/chosen": -173.18409729003906, "logps/rejected": -310.06292724609375, "loss": 0.3556, "rewards/chosen": 0.756372332572937, "rewards/margins": 4.136096477508545, "rewards/rejected": -3.3797240257263184, "step": 841 }, { "epoch": 0.22, "grad_norm": 28.0109806060791, "kl": 0.0, "learning_rate": 3.898194190002617e-07, "logps/chosen": -255.11325073242188, "logps/rejected": -244.13511657714844, "loss": 0.3061, "rewards/chosen": -1.0839815139770508, "rewards/margins": 2.8800461292266846, "rewards/rejected": -3.9640276432037354, "step": 842 }, { "epoch": 0.22, "grad_norm": 30.117618560791016, "kl": 0.0, "learning_rate": 3.896885632033499e-07, "logps/chosen": -222.35382080078125, "logps/rejected": -156.29025268554688, "loss": 0.3124, "rewards/chosen": 0.9357742667198181, "rewards/margins": 3.4082491397857666, "rewards/rejected": -2.4724748134613037, "step": 843 }, { "epoch": 0.22, "grad_norm": 32.259674072265625, "kl": 0.0, "learning_rate": 3.8955770740643806e-07, "logps/chosen": -228.333984375, "logps/rejected": -230.0655059814453, "loss": 0.3317, "rewards/chosen": -1.7928178310394287, "rewards/margins": 0.8958656787872314, "rewards/rejected": -2.68868350982666, "step": 844 }, { "epoch": 0.22, "grad_norm": 33.866641998291016, "kl": 0.0, "learning_rate": 3.8942685160952626e-07, "logps/chosen": -162.68394470214844, "logps/rejected": -260.8937072753906, "loss": 0.2789, "rewards/chosen": -0.48359790444374084, "rewards/margins": 1.7450591325759888, "rewards/rejected": -2.2286570072174072, "step": 845 }, { "epoch": 0.22, "grad_norm": 41.97733688354492, "kl": 0.0, "learning_rate": 3.8929599581261445e-07, "logps/chosen": -211.76185607910156, "logps/rejected": -203.70687866210938, "loss": 0.353, "rewards/chosen": -0.6491997838020325, "rewards/margins": 1.6363976001739502, "rewards/rejected": -2.285597324371338, "step": 846 }, { "epoch": 0.22, "grad_norm": 40.185462951660156, "kl": 0.0, "learning_rate": 3.8916514001570265e-07, "logps/chosen": -247.6580047607422, "logps/rejected": -251.53753662109375, "loss": 0.3429, "rewards/chosen": -0.5201565623283386, "rewards/margins": 2.708573579788208, "rewards/rejected": -3.2287302017211914, "step": 847 }, { "epoch": 0.22, "grad_norm": 35.77328872680664, "kl": 0.0, "learning_rate": 3.8903428421879085e-07, "logps/chosen": -221.64239501953125, "logps/rejected": -269.6127014160156, "loss": 0.4148, "rewards/chosen": -1.399546504020691, "rewards/margins": 1.6255131959915161, "rewards/rejected": -3.025059700012207, "step": 848 }, { "epoch": 0.22, "grad_norm": 34.25099182128906, "kl": 0.0, "learning_rate": 3.8890342842187904e-07, "logps/chosen": -205.93341064453125, "logps/rejected": -214.56007385253906, "loss": 0.3326, "rewards/chosen": 0.8295797109603882, "rewards/margins": 3.103825569152832, "rewards/rejected": -2.2742457389831543, "step": 849 }, { "epoch": 0.22, "grad_norm": 34.43983840942383, "kl": 0.0, "learning_rate": 3.887725726249673e-07, "logps/chosen": -381.5731201171875, "logps/rejected": -172.75927734375, "loss": 0.299, "rewards/chosen": -0.3489433228969574, "rewards/margins": 4.253884315490723, "rewards/rejected": -4.602827548980713, "step": 850 }, { "epoch": 0.22, "grad_norm": 34.45042419433594, "kl": 0.0, "learning_rate": 3.886417168280555e-07, "logps/chosen": -308.0503845214844, "logps/rejected": -223.3968963623047, "loss": 0.2272, "rewards/chosen": -1.4904717206954956, "rewards/margins": 0.9229556322097778, "rewards/rejected": -2.4134273529052734, "step": 851 }, { "epoch": 0.22, "grad_norm": 24.188337326049805, "kl": 0.0, "learning_rate": 3.885108610311437e-07, "logps/chosen": -248.4654541015625, "logps/rejected": -141.71022033691406, "loss": 0.4756, "rewards/chosen": -1.6014578342437744, "rewards/margins": 1.601494550704956, "rewards/rejected": -3.2029523849487305, "step": 852 }, { "epoch": 0.22, "grad_norm": 35.598716735839844, "kl": 0.0, "learning_rate": 3.883800052342319e-07, "logps/chosen": -248.89134216308594, "logps/rejected": -201.86526489257812, "loss": 0.2738, "rewards/chosen": -1.4978164434432983, "rewards/margins": 1.9217320680618286, "rewards/rejected": -3.419548511505127, "step": 853 }, { "epoch": 0.22, "grad_norm": 33.07661056518555, "kl": 0.0, "learning_rate": 3.882491494373201e-07, "logps/chosen": -201.11595153808594, "logps/rejected": -292.6474914550781, "loss": 0.3333, "rewards/chosen": -0.014609907753765583, "rewards/margins": 3.7823586463928223, "rewards/rejected": -3.796968460083008, "step": 854 }, { "epoch": 0.22, "grad_norm": 33.12773132324219, "kl": 0.0, "learning_rate": 3.8811829364040827e-07, "logps/chosen": -280.09844970703125, "logps/rejected": -193.63916015625, "loss": 0.3676, "rewards/chosen": -1.4199799299240112, "rewards/margins": 0.9965299367904663, "rewards/rejected": -2.4165098667144775, "step": 855 }, { "epoch": 0.22, "grad_norm": 39.756919860839844, "kl": 0.0, "learning_rate": 3.8798743784349647e-07, "logps/chosen": -260.8897705078125, "logps/rejected": -261.7392578125, "loss": 0.37, "rewards/chosen": 0.2770477533340454, "rewards/margins": 3.7935256958007812, "rewards/rejected": -3.5164780616760254, "step": 856 }, { "epoch": 0.22, "grad_norm": 37.37472152709961, "kl": 0.0, "learning_rate": 3.8785658204658466e-07, "logps/chosen": -322.6257629394531, "logps/rejected": -273.3817443847656, "loss": 0.2932, "rewards/chosen": 0.63135826587677, "rewards/margins": 3.3113021850585938, "rewards/rejected": -2.6799440383911133, "step": 857 }, { "epoch": 0.22, "grad_norm": 28.249853134155273, "kl": 0.0, "learning_rate": 3.8772572624967286e-07, "logps/chosen": -176.0353546142578, "logps/rejected": -240.82408142089844, "loss": 0.2774, "rewards/chosen": -0.18540804088115692, "rewards/margins": 3.2752466201782227, "rewards/rejected": -3.4606547355651855, "step": 858 }, { "epoch": 0.22, "grad_norm": 30.973310470581055, "kl": 0.0, "learning_rate": 3.87594870452761e-07, "logps/chosen": -225.46209716796875, "logps/rejected": -178.56396484375, "loss": 0.368, "rewards/chosen": -1.4540555477142334, "rewards/margins": 2.279228925704956, "rewards/rejected": -3.7332844734191895, "step": 859 }, { "epoch": 0.23, "grad_norm": 33.65345001220703, "kl": 0.0, "learning_rate": 3.874640146558492e-07, "logps/chosen": -363.8309020996094, "logps/rejected": -218.1306915283203, "loss": 0.261, "rewards/chosen": 0.6604343056678772, "rewards/margins": 3.1387128829956055, "rewards/rejected": -2.478278636932373, "step": 860 }, { "epoch": 0.23, "grad_norm": 33.49645233154297, "kl": 0.0, "learning_rate": 3.873331588589374e-07, "logps/chosen": -191.21910095214844, "logps/rejected": -216.03738403320312, "loss": 0.3289, "rewards/chosen": 0.7137885689735413, "rewards/margins": 3.4422836303710938, "rewards/rejected": -2.7284951210021973, "step": 861 }, { "epoch": 0.23, "grad_norm": 37.977264404296875, "kl": 0.0, "learning_rate": 3.872023030620256e-07, "logps/chosen": -245.6107940673828, "logps/rejected": -322.30938720703125, "loss": 0.4959, "rewards/chosen": -0.9423640966415405, "rewards/margins": 2.3004255294799805, "rewards/rejected": -3.2427897453308105, "step": 862 }, { "epoch": 0.23, "grad_norm": 29.341732025146484, "kl": 0.0, "learning_rate": 3.8707144726511384e-07, "logps/chosen": -230.37039184570312, "logps/rejected": -185.85226440429688, "loss": 0.3602, "rewards/chosen": 0.03912970423698425, "rewards/margins": 3.828538417816162, "rewards/rejected": -3.7894086837768555, "step": 863 }, { "epoch": 0.23, "grad_norm": 35.85225296020508, "kl": 0.0, "learning_rate": 3.8694059146820204e-07, "logps/chosen": -275.0382995605469, "logps/rejected": -252.705078125, "loss": 0.2673, "rewards/chosen": -0.8635746836662292, "rewards/margins": 2.1875228881835938, "rewards/rejected": -3.0510976314544678, "step": 864 }, { "epoch": 0.23, "grad_norm": 33.60776901245117, "kl": 0.0, "learning_rate": 3.8680973567129023e-07, "logps/chosen": -262.7815246582031, "logps/rejected": -232.62852478027344, "loss": 0.2906, "rewards/chosen": 1.0779635906219482, "rewards/margins": 4.774511337280273, "rewards/rejected": -3.696547508239746, "step": 865 }, { "epoch": 0.23, "grad_norm": 30.603410720825195, "kl": 0.0, "learning_rate": 3.8667887987437843e-07, "logps/chosen": -282.4066162109375, "logps/rejected": -290.36651611328125, "loss": 0.3335, "rewards/chosen": -0.6011098623275757, "rewards/margins": 3.6880950927734375, "rewards/rejected": -4.289205074310303, "step": 866 }, { "epoch": 0.23, "grad_norm": 26.779672622680664, "kl": 0.0, "learning_rate": 3.865480240774666e-07, "logps/chosen": -200.43505859375, "logps/rejected": -276.28826904296875, "loss": 0.2913, "rewards/chosen": 0.06558636575937271, "rewards/margins": 3.628774881362915, "rewards/rejected": -3.5631885528564453, "step": 867 }, { "epoch": 0.23, "grad_norm": 32.30617141723633, "kl": 0.0, "learning_rate": 3.864171682805548e-07, "logps/chosen": -196.51950073242188, "logps/rejected": -198.25823974609375, "loss": 0.4009, "rewards/chosen": -0.655818521976471, "rewards/margins": 1.8044543266296387, "rewards/rejected": -2.460272789001465, "step": 868 }, { "epoch": 0.23, "grad_norm": 34.12016677856445, "kl": 0.0, "learning_rate": 3.86286312483643e-07, "logps/chosen": -168.55343627929688, "logps/rejected": -235.79209899902344, "loss": 0.2177, "rewards/chosen": 0.27080053091049194, "rewards/margins": 2.634535789489746, "rewards/rejected": -2.3637351989746094, "step": 869 }, { "epoch": 0.23, "grad_norm": 36.46745681762695, "kl": 0.0, "learning_rate": 3.861554566867312e-07, "logps/chosen": -256.4330139160156, "logps/rejected": -251.66062927246094, "loss": 0.3366, "rewards/chosen": 0.3164384663105011, "rewards/margins": 3.171847105026245, "rewards/rejected": -2.8554086685180664, "step": 870 }, { "epoch": 0.23, "grad_norm": 35.32799530029297, "kl": 0.0, "learning_rate": 3.860246008898194e-07, "logps/chosen": -299.51617431640625, "logps/rejected": -287.8265075683594, "loss": 0.3167, "rewards/chosen": 0.08460255712270737, "rewards/margins": 3.289461135864258, "rewards/rejected": -3.2048585414886475, "step": 871 }, { "epoch": 0.23, "grad_norm": 29.19240379333496, "kl": 0.0, "learning_rate": 3.858937450929076e-07, "logps/chosen": -231.63706970214844, "logps/rejected": -258.1219482421875, "loss": 0.3226, "rewards/chosen": -1.3912551403045654, "rewards/margins": 1.5225615501403809, "rewards/rejected": -2.9138166904449463, "step": 872 }, { "epoch": 0.23, "grad_norm": 32.26476287841797, "kl": 0.0, "learning_rate": 3.857628892959958e-07, "logps/chosen": -148.3664093017578, "logps/rejected": -233.32034301757812, "loss": 0.1882, "rewards/chosen": -0.08406653255224228, "rewards/margins": 3.803556203842163, "rewards/rejected": -3.887622833251953, "step": 873 }, { "epoch": 0.23, "grad_norm": 29.396602630615234, "kl": 0.0, "learning_rate": 3.85632033499084e-07, "logps/chosen": -265.3435974121094, "logps/rejected": -235.33168029785156, "loss": 0.2295, "rewards/chosen": -0.5107182860374451, "rewards/margins": 3.317000389099121, "rewards/rejected": -3.827718734741211, "step": 874 }, { "epoch": 0.23, "grad_norm": 35.100589752197266, "kl": 0.0, "learning_rate": 3.8550117770217214e-07, "logps/chosen": -235.42428588867188, "logps/rejected": -248.19354248046875, "loss": 0.31, "rewards/chosen": -2.912658452987671, "rewards/margins": 0.17116618156433105, "rewards/rejected": -3.083824634552002, "step": 875 }, { "epoch": 0.23, "grad_norm": 34.2608528137207, "kl": 0.0, "learning_rate": 3.853703219052604e-07, "logps/chosen": -310.35650634765625, "logps/rejected": -265.16485595703125, "loss": 0.4798, "rewards/chosen": -2.0445914268493652, "rewards/margins": 1.4393863677978516, "rewards/rejected": -3.483977794647217, "step": 876 }, { "epoch": 0.23, "grad_norm": 33.69778060913086, "kl": 0.0, "learning_rate": 3.852394661083486e-07, "logps/chosen": -184.93099975585938, "logps/rejected": -218.10235595703125, "loss": 0.42, "rewards/chosen": -1.306616187095642, "rewards/margins": 1.0505958795547485, "rewards/rejected": -2.3572120666503906, "step": 877 }, { "epoch": 0.23, "grad_norm": 28.923768997192383, "kl": 0.0, "learning_rate": 3.851086103114368e-07, "logps/chosen": -179.19200134277344, "logps/rejected": -285.06695556640625, "loss": 0.3786, "rewards/chosen": -1.5913536548614502, "rewards/margins": 3.530456304550171, "rewards/rejected": -5.121809959411621, "step": 878 }, { "epoch": 0.23, "grad_norm": 30.29254913330078, "kl": 0.0, "learning_rate": 3.84977754514525e-07, "logps/chosen": -225.9005126953125, "logps/rejected": -261.67474365234375, "loss": 0.2837, "rewards/chosen": -0.349773108959198, "rewards/margins": 2.802173376083374, "rewards/rejected": -3.151946544647217, "step": 879 }, { "epoch": 0.23, "grad_norm": 28.67768096923828, "kl": 0.0, "learning_rate": 3.848468987176132e-07, "logps/chosen": -226.13919067382812, "logps/rejected": -247.85031127929688, "loss": 0.3464, "rewards/chosen": -0.07159680128097534, "rewards/margins": 3.374845504760742, "rewards/rejected": -3.4464423656463623, "step": 880 }, { "epoch": 0.23, "grad_norm": 34.244720458984375, "kl": 0.0, "learning_rate": 3.8471604292070137e-07, "logps/chosen": -279.54034423828125, "logps/rejected": -245.26930236816406, "loss": 0.3628, "rewards/chosen": -1.0659271478652954, "rewards/margins": 1.1372548341751099, "rewards/rejected": -2.2031819820404053, "step": 881 }, { "epoch": 0.23, "grad_norm": 26.053386688232422, "kl": 0.0, "learning_rate": 3.8458518712378957e-07, "logps/chosen": -226.8448944091797, "logps/rejected": -236.7382049560547, "loss": 0.2382, "rewards/chosen": -0.6372510194778442, "rewards/margins": 4.326333045959473, "rewards/rejected": -4.963583946228027, "step": 882 }, { "epoch": 0.23, "grad_norm": 33.15485763549805, "kl": 0.0, "learning_rate": 3.8445433132687776e-07, "logps/chosen": -248.4652099609375, "logps/rejected": -286.5483703613281, "loss": 0.4762, "rewards/chosen": -0.453130841255188, "rewards/margins": 1.2334181070327759, "rewards/rejected": -1.6865489482879639, "step": 883 }, { "epoch": 0.23, "grad_norm": 25.893075942993164, "kl": 0.0, "learning_rate": 3.8432347552996596e-07, "logps/chosen": -133.47567749023438, "logps/rejected": -277.131103515625, "loss": 0.3321, "rewards/chosen": -0.7266333699226379, "rewards/margins": 3.487426519393921, "rewards/rejected": -4.214059829711914, "step": 884 }, { "epoch": 0.23, "grad_norm": 36.91793441772461, "kl": 0.0, "learning_rate": 3.8419261973305415e-07, "logps/chosen": -308.4381408691406, "logps/rejected": -243.12611389160156, "loss": 0.3189, "rewards/chosen": -1.1646596193313599, "rewards/margins": 2.519777774810791, "rewards/rejected": -3.6844375133514404, "step": 885 }, { "epoch": 0.23, "grad_norm": 32.67780685424805, "kl": 0.0, "learning_rate": 3.8406176393614235e-07, "logps/chosen": -372.8730163574219, "logps/rejected": -238.40402221679688, "loss": 0.3381, "rewards/chosen": -1.422531247138977, "rewards/margins": 0.20666325092315674, "rewards/rejected": -1.6291944980621338, "step": 886 }, { "epoch": 0.23, "grad_norm": 30.643545150756836, "kl": 0.0, "learning_rate": 3.8393090813923055e-07, "logps/chosen": -182.9138946533203, "logps/rejected": -255.97509765625, "loss": 0.3896, "rewards/chosen": 0.642753005027771, "rewards/margins": 5.055312156677246, "rewards/rejected": -4.4125590324401855, "step": 887 }, { "epoch": 0.23, "grad_norm": 31.73491668701172, "kl": 0.0, "learning_rate": 3.838000523423188e-07, "logps/chosen": -246.7891845703125, "logps/rejected": -216.09461975097656, "loss": 0.3923, "rewards/chosen": 0.15031063556671143, "rewards/margins": 3.669569969177246, "rewards/rejected": -3.519259452819824, "step": 888 }, { "epoch": 0.23, "grad_norm": 33.767276763916016, "kl": 0.0, "learning_rate": 3.83669196545407e-07, "logps/chosen": -181.1678466796875, "logps/rejected": -218.5722198486328, "loss": 0.3537, "rewards/chosen": -0.03673672676086426, "rewards/margins": 3.3610239028930664, "rewards/rejected": -3.3977606296539307, "step": 889 }, { "epoch": 0.23, "grad_norm": 34.10417175292969, "kl": 0.0, "learning_rate": 3.8353834074849514e-07, "logps/chosen": -225.9097900390625, "logps/rejected": -223.67532348632812, "loss": 0.2948, "rewards/chosen": 1.104257583618164, "rewards/margins": 2.821953535079956, "rewards/rejected": -1.717695951461792, "step": 890 }, { "epoch": 0.23, "grad_norm": 31.37761688232422, "kl": 0.0, "learning_rate": 3.8340748495158333e-07, "logps/chosen": -196.47811889648438, "logps/rejected": -288.2806396484375, "loss": 0.3518, "rewards/chosen": -1.7377431392669678, "rewards/margins": 4.380807876586914, "rewards/rejected": -6.118551254272461, "step": 891 }, { "epoch": 0.23, "grad_norm": 29.576194763183594, "kl": 0.0, "learning_rate": 3.8327662915467153e-07, "logps/chosen": -229.60830688476562, "logps/rejected": -204.51947021484375, "loss": 0.3093, "rewards/chosen": -0.2672429084777832, "rewards/margins": 3.7184066772460938, "rewards/rejected": -3.985649585723877, "step": 892 }, { "epoch": 0.23, "grad_norm": 41.917518615722656, "kl": 0.0, "learning_rate": 3.831457733577597e-07, "logps/chosen": -236.6829071044922, "logps/rejected": -225.75125122070312, "loss": 0.2346, "rewards/chosen": -0.9902133941650391, "rewards/margins": 2.6195900440216064, "rewards/rejected": -3.6098034381866455, "step": 893 }, { "epoch": 0.23, "grad_norm": 36.87592697143555, "kl": 0.0, "learning_rate": 3.830149175608479e-07, "logps/chosen": -267.770263671875, "logps/rejected": -300.6760559082031, "loss": 0.2915, "rewards/chosen": 0.8566461801528931, "rewards/margins": 3.4000487327575684, "rewards/rejected": -2.5434024333953857, "step": 894 }, { "epoch": 0.23, "grad_norm": 32.469627380371094, "kl": 0.0, "learning_rate": 3.828840617639361e-07, "logps/chosen": -283.3698425292969, "logps/rejected": -317.1371765136719, "loss": 0.2433, "rewards/chosen": 0.33935195207595825, "rewards/margins": 4.569455623626709, "rewards/rejected": -4.230103492736816, "step": 895 }, { "epoch": 0.23, "grad_norm": 29.65595817565918, "kl": 0.0, "learning_rate": 3.827532059670243e-07, "logps/chosen": -210.16015625, "logps/rejected": -274.7388000488281, "loss": 0.3122, "rewards/chosen": -0.0729527473449707, "rewards/margins": 3.0755321979522705, "rewards/rejected": -3.148484945297241, "step": 896 }, { "epoch": 0.23, "grad_norm": 32.243221282958984, "kl": 0.0, "learning_rate": 3.826223501701125e-07, "logps/chosen": -187.37014770507812, "logps/rejected": -288.8951110839844, "loss": 0.2861, "rewards/chosen": -0.23226681351661682, "rewards/margins": 4.278383731842041, "rewards/rejected": -4.510650634765625, "step": 897 }, { "epoch": 0.24, "grad_norm": 25.51363182067871, "kl": 0.0, "learning_rate": 3.824914943732007e-07, "logps/chosen": -116.35990142822266, "logps/rejected": -182.46832275390625, "loss": 0.2709, "rewards/chosen": 0.5371560454368591, "rewards/margins": 2.7382872104644775, "rewards/rejected": -2.2011311054229736, "step": 898 }, { "epoch": 0.24, "grad_norm": 28.72956085205078, "kl": 0.0, "learning_rate": 3.823606385762889e-07, "logps/chosen": -159.8656768798828, "logps/rejected": -194.38632202148438, "loss": 0.3667, "rewards/chosen": -0.5046699047088623, "rewards/margins": 3.2915899753570557, "rewards/rejected": -3.796259880065918, "step": 899 }, { "epoch": 0.24, "grad_norm": 36.71482467651367, "kl": 0.0, "learning_rate": 3.822297827793771e-07, "logps/chosen": -159.0552978515625, "logps/rejected": -281.41558837890625, "loss": 0.3563, "rewards/chosen": -0.36698049306869507, "rewards/margins": 2.405285358428955, "rewards/rejected": -2.772265911102295, "step": 900 }, { "epoch": 0.24, "grad_norm": 37.9292106628418, "kl": 0.0, "learning_rate": 3.8209892698246535e-07, "logps/chosen": -272.8384094238281, "logps/rejected": -299.947021484375, "loss": 0.3332, "rewards/chosen": -0.057388510555028915, "rewards/margins": 3.6654114723205566, "rewards/rejected": -3.7228000164031982, "step": 901 }, { "epoch": 0.24, "grad_norm": 35.52886199951172, "kl": 0.0, "learning_rate": 3.8196807118555354e-07, "logps/chosen": -202.97857666015625, "logps/rejected": -237.06822204589844, "loss": 0.2748, "rewards/chosen": 0.6133555769920349, "rewards/margins": 4.030307292938232, "rewards/rejected": -3.4169516563415527, "step": 902 }, { "epoch": 0.24, "grad_norm": 24.192476272583008, "kl": 0.0, "learning_rate": 3.8183721538864174e-07, "logps/chosen": -108.77693939208984, "logps/rejected": -283.0856628417969, "loss": 0.1872, "rewards/chosen": -0.7498208284378052, "rewards/margins": 0.35967063903808594, "rewards/rejected": -1.1094914674758911, "step": 903 }, { "epoch": 0.24, "grad_norm": 42.334014892578125, "kl": 0.0, "learning_rate": 3.8170635959172993e-07, "logps/chosen": -282.7518310546875, "logps/rejected": -201.28744506835938, "loss": 0.3745, "rewards/chosen": 0.6112672090530396, "rewards/margins": 3.277454376220703, "rewards/rejected": -2.666187286376953, "step": 904 }, { "epoch": 0.24, "grad_norm": 34.252220153808594, "kl": 0.0, "learning_rate": 3.815755037948181e-07, "logps/chosen": -229.54544067382812, "logps/rejected": -201.85923767089844, "loss": 0.3428, "rewards/chosen": -1.0642545223236084, "rewards/margins": 2.2222585678100586, "rewards/rejected": -3.286513090133667, "step": 905 }, { "epoch": 0.24, "grad_norm": 42.46119689941406, "kl": 0.0, "learning_rate": 3.8144464799790627e-07, "logps/chosen": -321.1701354980469, "logps/rejected": -230.4801025390625, "loss": 0.3239, "rewards/chosen": 0.8055683374404907, "rewards/margins": 3.0229482650756836, "rewards/rejected": -2.2173800468444824, "step": 906 }, { "epoch": 0.24, "grad_norm": 25.848482131958008, "kl": 0.0, "learning_rate": 3.8131379220099447e-07, "logps/chosen": -149.85707092285156, "logps/rejected": -245.2413787841797, "loss": 0.3329, "rewards/chosen": 0.381633460521698, "rewards/margins": 5.1404709815979, "rewards/rejected": -4.758837699890137, "step": 907 }, { "epoch": 0.24, "grad_norm": 42.254573822021484, "kl": 0.0, "learning_rate": 3.8118293640408266e-07, "logps/chosen": -219.65554809570312, "logps/rejected": -204.49111938476562, "loss": 0.405, "rewards/chosen": 0.17839092016220093, "rewards/margins": 3.0522470474243164, "rewards/rejected": -2.8738560676574707, "step": 908 }, { "epoch": 0.24, "grad_norm": 23.2181339263916, "kl": 0.0, "learning_rate": 3.8105208060717086e-07, "logps/chosen": -205.587646484375, "logps/rejected": -261.8460998535156, "loss": 0.355, "rewards/chosen": -0.022454485297203064, "rewards/margins": 4.495090007781982, "rewards/rejected": -4.517544269561768, "step": 909 }, { "epoch": 0.24, "grad_norm": 30.790456771850586, "kl": 0.0, "learning_rate": 3.8092122481025906e-07, "logps/chosen": -284.98150634765625, "logps/rejected": -311.32025146484375, "loss": 0.3364, "rewards/chosen": -0.573701024055481, "rewards/margins": 3.3590402603149414, "rewards/rejected": -3.932741403579712, "step": 910 }, { "epoch": 0.24, "grad_norm": 32.44673538208008, "kl": 0.0, "learning_rate": 3.8079036901334725e-07, "logps/chosen": -270.0657653808594, "logps/rejected": -197.69857788085938, "loss": 0.3679, "rewards/chosen": -0.34433862566947937, "rewards/margins": 2.41924786567688, "rewards/rejected": -2.7635865211486816, "step": 911 }, { "epoch": 0.24, "grad_norm": 33.279083251953125, "kl": 0.0, "learning_rate": 3.8065951321643545e-07, "logps/chosen": -202.7400360107422, "logps/rejected": -321.2475891113281, "loss": 0.2201, "rewards/chosen": 2.0015366077423096, "rewards/margins": 5.378347873687744, "rewards/rejected": -3.3768112659454346, "step": 912 }, { "epoch": 0.24, "grad_norm": 41.26420593261719, "kl": 0.0, "learning_rate": 3.8052865741952365e-07, "logps/chosen": -196.24246215820312, "logps/rejected": -219.8006591796875, "loss": 0.3636, "rewards/chosen": 0.6023075580596924, "rewards/margins": 2.8048958778381348, "rewards/rejected": -2.2025883197784424, "step": 913 }, { "epoch": 0.24, "grad_norm": 24.39764404296875, "kl": 0.0, "learning_rate": 3.803978016226119e-07, "logps/chosen": -209.90945434570312, "logps/rejected": -319.9163513183594, "loss": 0.3177, "rewards/chosen": 0.4894677400588989, "rewards/margins": 4.936036109924316, "rewards/rejected": -4.446568489074707, "step": 914 }, { "epoch": 0.24, "grad_norm": 34.143978118896484, "kl": 0.0, "learning_rate": 3.802669458257001e-07, "logps/chosen": -214.00375366210938, "logps/rejected": -250.49264526367188, "loss": 0.393, "rewards/chosen": 0.20385503768920898, "rewards/margins": 3.571147918701172, "rewards/rejected": -3.367292881011963, "step": 915 }, { "epoch": 0.24, "grad_norm": 36.70888137817383, "kl": 0.0, "learning_rate": 3.801360900287883e-07, "logps/chosen": -231.2817840576172, "logps/rejected": -239.27769470214844, "loss": 0.3282, "rewards/chosen": -0.3268897831439972, "rewards/margins": 1.634190320968628, "rewards/rejected": -1.9610800743103027, "step": 916 }, { "epoch": 0.24, "grad_norm": 27.67314338684082, "kl": 0.0, "learning_rate": 3.800052342318765e-07, "logps/chosen": -184.77256774902344, "logps/rejected": -347.36932373046875, "loss": 0.257, "rewards/chosen": 0.6008602380752563, "rewards/margins": 4.103627681732178, "rewards/rejected": -3.502767324447632, "step": 917 }, { "epoch": 0.24, "grad_norm": 36.47007369995117, "kl": 0.0, "learning_rate": 3.798743784349647e-07, "logps/chosen": -250.0063934326172, "logps/rejected": -304.2515869140625, "loss": 0.2986, "rewards/chosen": -0.7417954802513123, "rewards/margins": 4.9903411865234375, "rewards/rejected": -5.7321367263793945, "step": 918 }, { "epoch": 0.24, "grad_norm": 39.0871467590332, "kl": 0.0, "learning_rate": 3.797435226380529e-07, "logps/chosen": -236.37786865234375, "logps/rejected": -192.05194091796875, "loss": 0.2918, "rewards/chosen": -0.055040180683135986, "rewards/margins": 2.505793571472168, "rewards/rejected": -2.560833692550659, "step": 919 }, { "epoch": 0.24, "grad_norm": 29.87541389465332, "kl": 0.0, "learning_rate": 3.7961266684114107e-07, "logps/chosen": -159.6341094970703, "logps/rejected": -225.9227294921875, "loss": 0.3817, "rewards/chosen": 0.7967673540115356, "rewards/margins": 3.12998628616333, "rewards/rejected": -2.333218812942505, "step": 920 }, { "epoch": 0.24, "grad_norm": 39.75651550292969, "kl": 0.0, "learning_rate": 3.794818110442292e-07, "logps/chosen": -192.595703125, "logps/rejected": -275.8404235839844, "loss": 0.3581, "rewards/chosen": -2.0372021198272705, "rewards/margins": -0.12220478057861328, "rewards/rejected": -1.9149973392486572, "step": 921 }, { "epoch": 0.24, "grad_norm": 26.79347038269043, "kl": 0.0, "learning_rate": 3.793509552473174e-07, "logps/chosen": -189.87782287597656, "logps/rejected": -210.1076202392578, "loss": 0.1836, "rewards/chosen": 1.3419767618179321, "rewards/margins": 5.476966381072998, "rewards/rejected": -4.1349897384643555, "step": 922 }, { "epoch": 0.24, "grad_norm": 43.24937438964844, "kl": 0.0, "learning_rate": 3.792200994504056e-07, "logps/chosen": -194.89622497558594, "logps/rejected": -409.5931701660156, "loss": 0.4562, "rewards/chosen": -1.0916842222213745, "rewards/margins": 0.15808093547821045, "rewards/rejected": -1.249765157699585, "step": 923 }, { "epoch": 0.24, "grad_norm": 36.0235595703125, "kl": 0.0, "learning_rate": 3.790892436534938e-07, "logps/chosen": -212.34266662597656, "logps/rejected": -264.5822448730469, "loss": 0.3522, "rewards/chosen": -0.9759610295295715, "rewards/margins": 2.166531562805176, "rewards/rejected": -3.1424925327301025, "step": 924 }, { "epoch": 0.24, "grad_norm": 39.438472747802734, "kl": 0.0, "learning_rate": 3.78958387856582e-07, "logps/chosen": -199.68655395507812, "logps/rejected": -297.06494140625, "loss": 0.3211, "rewards/chosen": 0.956540584564209, "rewards/margins": 2.8180646896362305, "rewards/rejected": -1.861523985862732, "step": 925 }, { "epoch": 0.24, "grad_norm": 33.72963333129883, "kl": 0.0, "learning_rate": 3.788275320596702e-07, "logps/chosen": -216.25003051757812, "logps/rejected": -326.2884826660156, "loss": 0.2645, "rewards/chosen": -0.2656521499156952, "rewards/margins": 4.0716471672058105, "rewards/rejected": -4.337299346923828, "step": 926 }, { "epoch": 0.24, "grad_norm": 28.301097869873047, "kl": 0.0, "learning_rate": 3.7869667626275844e-07, "logps/chosen": -217.94644165039062, "logps/rejected": -200.2283935546875, "loss": 0.3988, "rewards/chosen": -0.7481070160865784, "rewards/margins": 1.4763429164886475, "rewards/rejected": -2.224449872970581, "step": 927 }, { "epoch": 0.24, "grad_norm": 37.24142837524414, "kl": 0.0, "learning_rate": 3.7856582046584664e-07, "logps/chosen": -178.01388549804688, "logps/rejected": -253.77645874023438, "loss": 0.42, "rewards/chosen": -0.5237429141998291, "rewards/margins": 1.6036875247955322, "rewards/rejected": -2.1274304389953613, "step": 928 }, { "epoch": 0.24, "grad_norm": 38.92591094970703, "kl": 0.0, "learning_rate": 3.7843496466893484e-07, "logps/chosen": -250.7688446044922, "logps/rejected": -279.5671691894531, "loss": 0.2998, "rewards/chosen": 0.08170495927333832, "rewards/margins": 2.645064353942871, "rewards/rejected": -2.563359498977661, "step": 929 }, { "epoch": 0.24, "grad_norm": 41.20185852050781, "kl": 0.0, "learning_rate": 3.7830410887202303e-07, "logps/chosen": -176.0547637939453, "logps/rejected": -232.66940307617188, "loss": 0.2198, "rewards/chosen": 0.13103172183036804, "rewards/margins": 2.8757386207580566, "rewards/rejected": -2.744706869125366, "step": 930 }, { "epoch": 0.24, "grad_norm": 31.94757652282715, "kl": 0.0, "learning_rate": 3.7817325307511123e-07, "logps/chosen": -205.85032653808594, "logps/rejected": -168.09828186035156, "loss": 0.2974, "rewards/chosen": 0.5673466920852661, "rewards/margins": 3.6650705337524414, "rewards/rejected": -3.097723960876465, "step": 931 }, { "epoch": 0.24, "grad_norm": 28.481760025024414, "kl": 0.0, "learning_rate": 3.780423972781994e-07, "logps/chosen": -194.0419464111328, "logps/rejected": -204.07566833496094, "loss": 0.3024, "rewards/chosen": -1.1814078092575073, "rewards/margins": 2.4324450492858887, "rewards/rejected": -3.6138527393341064, "step": 932 }, { "epoch": 0.24, "grad_norm": 36.05358123779297, "kl": 0.0, "learning_rate": 3.779115414812876e-07, "logps/chosen": -160.29495239257812, "logps/rejected": -250.9978790283203, "loss": 0.3588, "rewards/chosen": -0.2665542662143707, "rewards/margins": 2.5105628967285156, "rewards/rejected": -2.7771172523498535, "step": 933 }, { "epoch": 0.24, "grad_norm": 37.9229736328125, "kl": 0.0, "learning_rate": 3.777806856843758e-07, "logps/chosen": -296.37762451171875, "logps/rejected": -274.84759521484375, "loss": 0.3803, "rewards/chosen": 0.46562808752059937, "rewards/margins": 2.7332608699798584, "rewards/rejected": -2.2676327228546143, "step": 934 }, { "epoch": 0.24, "grad_norm": 32.8103141784668, "kl": 0.0, "learning_rate": 3.77649829887464e-07, "logps/chosen": -194.97434997558594, "logps/rejected": -187.0049285888672, "loss": 0.3435, "rewards/chosen": -0.5105401873588562, "rewards/margins": 2.990797281265259, "rewards/rejected": -3.5013375282287598, "step": 935 }, { "epoch": 0.24, "grad_norm": 38.560508728027344, "kl": 0.0, "learning_rate": 3.7751897409055216e-07, "logps/chosen": -317.18994140625, "logps/rejected": -281.34332275390625, "loss": 0.3472, "rewards/chosen": -1.0646255016326904, "rewards/margins": 0.3802691698074341, "rewards/rejected": -1.4448946714401245, "step": 936 }, { "epoch": 0.25, "grad_norm": 38.34934997558594, "kl": 0.0, "learning_rate": 3.7738811829364035e-07, "logps/chosen": -191.30926513671875, "logps/rejected": -268.2271728515625, "loss": 0.3716, "rewards/chosen": -0.6462159752845764, "rewards/margins": 1.863279104232788, "rewards/rejected": -2.5094950199127197, "step": 937 }, { "epoch": 0.25, "grad_norm": 46.346153259277344, "kl": 0.0, "learning_rate": 3.7725726249672855e-07, "logps/chosen": -325.8685302734375, "logps/rejected": -253.2593231201172, "loss": 0.4326, "rewards/chosen": 0.49079614877700806, "rewards/margins": 1.7913362979888916, "rewards/rejected": -1.3005400896072388, "step": 938 }, { "epoch": 0.25, "grad_norm": 36.08719253540039, "kl": 0.0, "learning_rate": 3.7712640669981674e-07, "logps/chosen": -213.1573486328125, "logps/rejected": -184.4655303955078, "loss": 0.3137, "rewards/chosen": -0.3603510856628418, "rewards/margins": 3.5041239261627197, "rewards/rejected": -3.8644750118255615, "step": 939 }, { "epoch": 0.25, "grad_norm": 37.11168670654297, "kl": 0.0, "learning_rate": 3.76995550902905e-07, "logps/chosen": -281.20904541015625, "logps/rejected": -166.6403045654297, "loss": 0.3994, "rewards/chosen": -1.179037094116211, "rewards/margins": 0.9344363212585449, "rewards/rejected": -2.113473415374756, "step": 940 }, { "epoch": 0.25, "grad_norm": 40.791709899902344, "kl": 0.0, "learning_rate": 3.768646951059932e-07, "logps/chosen": -173.112548828125, "logps/rejected": -214.59646606445312, "loss": 0.3807, "rewards/chosen": 0.023116042837500572, "rewards/margins": 2.422356128692627, "rewards/rejected": -2.399240016937256, "step": 941 }, { "epoch": 0.25, "grad_norm": 33.149959564208984, "kl": 0.0, "learning_rate": 3.767338393090814e-07, "logps/chosen": -169.68991088867188, "logps/rejected": -309.6609191894531, "loss": 0.2299, "rewards/chosen": 0.19015148282051086, "rewards/margins": 3.0547852516174316, "rewards/rejected": -2.864633798599243, "step": 942 }, { "epoch": 0.25, "grad_norm": 39.015960693359375, "kl": 0.0, "learning_rate": 3.766029835121696e-07, "logps/chosen": -251.8404998779297, "logps/rejected": -208.2914581298828, "loss": 0.332, "rewards/chosen": 0.2635638117790222, "rewards/margins": 2.563786745071411, "rewards/rejected": -2.300222873687744, "step": 943 }, { "epoch": 0.25, "grad_norm": 38.60781478881836, "kl": 0.0, "learning_rate": 3.764721277152578e-07, "logps/chosen": -215.9370574951172, "logps/rejected": -249.64276123046875, "loss": 0.3206, "rewards/chosen": -0.013658404350280762, "rewards/margins": 3.1306090354919434, "rewards/rejected": -3.1442675590515137, "step": 944 }, { "epoch": 0.25, "grad_norm": 28.41880989074707, "kl": 0.0, "learning_rate": 3.7634127191834597e-07, "logps/chosen": -146.95925903320312, "logps/rejected": -206.71322631835938, "loss": 0.3208, "rewards/chosen": -0.11132961511611938, "rewards/margins": 3.1574666500091553, "rewards/rejected": -3.26879620552063, "step": 945 }, { "epoch": 0.25, "grad_norm": 38.186038970947266, "kl": 0.0, "learning_rate": 3.7621041612143417e-07, "logps/chosen": -236.7716827392578, "logps/rejected": -213.30288696289062, "loss": 0.3769, "rewards/chosen": -0.7647838592529297, "rewards/margins": 4.22573184967041, "rewards/rejected": -4.99051570892334, "step": 946 }, { "epoch": 0.25, "grad_norm": 38.70585250854492, "kl": 0.0, "learning_rate": 3.7607956032452237e-07, "logps/chosen": -191.19248962402344, "logps/rejected": -237.6758575439453, "loss": 0.3382, "rewards/chosen": -0.5101767778396606, "rewards/margins": 1.7929340600967407, "rewards/rejected": -2.3031108379364014, "step": 947 }, { "epoch": 0.25, "grad_norm": 36.55864334106445, "kl": 0.0, "learning_rate": 3.7594870452761056e-07, "logps/chosen": -240.2667236328125, "logps/rejected": -323.38397216796875, "loss": 0.3667, "rewards/chosen": -0.042002975940704346, "rewards/margins": 2.5760934352874756, "rewards/rejected": -2.618096351623535, "step": 948 }, { "epoch": 0.25, "grad_norm": 32.7099494934082, "kl": 0.0, "learning_rate": 3.7581784873069876e-07, "logps/chosen": -211.2158203125, "logps/rejected": -200.59913635253906, "loss": 0.2249, "rewards/chosen": 0.7871925234794617, "rewards/margins": 3.0763463973999023, "rewards/rejected": -2.289153814315796, "step": 949 }, { "epoch": 0.25, "grad_norm": 28.505664825439453, "kl": 0.0, "learning_rate": 3.7568699293378695e-07, "logps/chosen": -158.95338439941406, "logps/rejected": -240.470703125, "loss": 0.269, "rewards/chosen": -0.5699848532676697, "rewards/margins": 2.3539459705352783, "rewards/rejected": -2.9239308834075928, "step": 950 }, { "epoch": 0.25, "grad_norm": 28.49224281311035, "kl": 0.0, "learning_rate": 3.7555613713687515e-07, "logps/chosen": -180.0927734375, "logps/rejected": -222.00131225585938, "loss": 0.2562, "rewards/chosen": 1.0241466760635376, "rewards/margins": 4.006009578704834, "rewards/rejected": -2.981863021850586, "step": 951 }, { "epoch": 0.25, "grad_norm": 43.41466522216797, "kl": 0.0, "learning_rate": 3.754252813399633e-07, "logps/chosen": -233.43089294433594, "logps/rejected": -260.57025146484375, "loss": 0.2668, "rewards/chosen": -0.16725710034370422, "rewards/margins": 3.9502956867218018, "rewards/rejected": -4.117552757263184, "step": 952 }, { "epoch": 0.25, "grad_norm": 27.8471736907959, "kl": 0.0, "learning_rate": 3.7529442554305154e-07, "logps/chosen": -264.3834228515625, "logps/rejected": -317.200927734375, "loss": 0.2867, "rewards/chosen": -0.396359920501709, "rewards/margins": 2.800766944885254, "rewards/rejected": -3.197126865386963, "step": 953 }, { "epoch": 0.25, "grad_norm": 34.129188537597656, "kl": 0.0, "learning_rate": 3.7516356974613974e-07, "logps/chosen": -183.26345825195312, "logps/rejected": -237.68011474609375, "loss": 0.3368, "rewards/chosen": 0.26516619324684143, "rewards/margins": 3.0184478759765625, "rewards/rejected": -2.753281593322754, "step": 954 }, { "epoch": 0.25, "grad_norm": 34.6745491027832, "kl": 0.0, "learning_rate": 3.7503271394922793e-07, "logps/chosen": -284.04638671875, "logps/rejected": -179.2754669189453, "loss": 0.348, "rewards/chosen": 0.24357259273529053, "rewards/margins": 2.473604679107666, "rewards/rejected": -2.230031967163086, "step": 955 }, { "epoch": 0.25, "grad_norm": 31.912418365478516, "kl": 0.0, "learning_rate": 3.7490185815231613e-07, "logps/chosen": -135.66848754882812, "logps/rejected": -251.6403045654297, "loss": 0.357, "rewards/chosen": -0.8683047890663147, "rewards/margins": 2.402848482131958, "rewards/rejected": -3.271153211593628, "step": 956 }, { "epoch": 0.25, "grad_norm": 38.589805603027344, "kl": 0.0, "learning_rate": 3.747710023554043e-07, "logps/chosen": -295.62432861328125, "logps/rejected": -193.68881225585938, "loss": 0.3627, "rewards/chosen": -0.835448145866394, "rewards/margins": 1.8972326517105103, "rewards/rejected": -2.7326807975769043, "step": 957 }, { "epoch": 0.25, "grad_norm": 33.903377532958984, "kl": 0.0, "learning_rate": 3.746401465584925e-07, "logps/chosen": -220.63259887695312, "logps/rejected": -307.62664794921875, "loss": 0.2633, "rewards/chosen": 1.0228039026260376, "rewards/margins": 5.325528144836426, "rewards/rejected": -4.302724361419678, "step": 958 }, { "epoch": 0.25, "grad_norm": 42.30472946166992, "kl": 0.0, "learning_rate": 3.745092907615807e-07, "logps/chosen": -271.06011962890625, "logps/rejected": -179.56674194335938, "loss": 0.3717, "rewards/chosen": -0.6451493501663208, "rewards/margins": 2.17818021774292, "rewards/rejected": -2.8233296871185303, "step": 959 }, { "epoch": 0.25, "grad_norm": 44.385440826416016, "kl": 0.0, "learning_rate": 3.743784349646689e-07, "logps/chosen": -278.98663330078125, "logps/rejected": -321.0003356933594, "loss": 0.3034, "rewards/chosen": -0.3178820312023163, "rewards/margins": 2.567906379699707, "rewards/rejected": -2.8857884407043457, "step": 960 }, { "epoch": 0.25, "grad_norm": 30.137678146362305, "kl": 0.0, "learning_rate": 3.742475791677571e-07, "logps/chosen": -188.7030029296875, "logps/rejected": -228.55557250976562, "loss": 0.4215, "rewards/chosen": -0.25972381234169006, "rewards/margins": 2.330620765686035, "rewards/rejected": -2.5903446674346924, "step": 961 }, { "epoch": 0.25, "grad_norm": 31.367036819458008, "kl": 0.0, "learning_rate": 3.741167233708453e-07, "logps/chosen": -173.17393493652344, "logps/rejected": -288.6369323730469, "loss": 0.3492, "rewards/chosen": -0.5615468621253967, "rewards/margins": 3.085667371749878, "rewards/rejected": -3.64721417427063, "step": 962 }, { "epoch": 0.25, "grad_norm": 36.26879119873047, "kl": 0.0, "learning_rate": 3.739858675739335e-07, "logps/chosen": -234.41134643554688, "logps/rejected": -241.38648986816406, "loss": 0.388, "rewards/chosen": -0.6199156641960144, "rewards/margins": 2.953280210494995, "rewards/rejected": -3.5731959342956543, "step": 963 }, { "epoch": 0.25, "grad_norm": 37.49041748046875, "kl": 0.0, "learning_rate": 3.738550117770217e-07, "logps/chosen": -270.44732666015625, "logps/rejected": -258.4364929199219, "loss": 0.3476, "rewards/chosen": -0.3076787292957306, "rewards/margins": 2.230152130126953, "rewards/rejected": -2.5378308296203613, "step": 964 }, { "epoch": 0.25, "grad_norm": 31.639141082763672, "kl": 0.0, "learning_rate": 3.7372415598010995e-07, "logps/chosen": -190.83172607421875, "logps/rejected": -251.1099853515625, "loss": 0.3503, "rewards/chosen": 0.7067262530326843, "rewards/margins": 3.0306055545806885, "rewards/rejected": -2.3238792419433594, "step": 965 }, { "epoch": 0.25, "grad_norm": 33.14474105834961, "kl": 0.0, "learning_rate": 3.7359330018319814e-07, "logps/chosen": -229.0324249267578, "logps/rejected": -255.16261291503906, "loss": 0.2465, "rewards/chosen": -0.09261378645896912, "rewards/margins": 2.950747013092041, "rewards/rejected": -3.043360710144043, "step": 966 }, { "epoch": 0.25, "grad_norm": 22.569007873535156, "kl": 0.0, "learning_rate": 3.734624443862863e-07, "logps/chosen": -254.2834930419922, "logps/rejected": -233.958251953125, "loss": 0.2995, "rewards/chosen": -0.8716516494750977, "rewards/margins": 3.8399643898010254, "rewards/rejected": -4.711616039276123, "step": 967 }, { "epoch": 0.25, "grad_norm": 46.63541793823242, "kl": 0.0, "learning_rate": 3.733315885893745e-07, "logps/chosen": -187.2761993408203, "logps/rejected": -234.6182098388672, "loss": 0.2998, "rewards/chosen": -0.7562770247459412, "rewards/margins": 1.2845125198364258, "rewards/rejected": -2.0407896041870117, "step": 968 }, { "epoch": 0.25, "grad_norm": 37.27079772949219, "kl": 0.0, "learning_rate": 3.732007327924627e-07, "logps/chosen": -147.63406372070312, "logps/rejected": -245.00823974609375, "loss": 0.3315, "rewards/chosen": -0.24929417669773102, "rewards/margins": 3.6097397804260254, "rewards/rejected": -3.8590340614318848, "step": 969 }, { "epoch": 0.25, "grad_norm": 28.529054641723633, "kl": 0.0, "learning_rate": 3.730698769955509e-07, "logps/chosen": -217.55023193359375, "logps/rejected": -245.34091186523438, "loss": 0.2898, "rewards/chosen": 0.5950266122817993, "rewards/margins": 3.8983559608459473, "rewards/rejected": -3.3033292293548584, "step": 970 }, { "epoch": 0.25, "grad_norm": 33.69866943359375, "kl": 0.0, "learning_rate": 3.7293902119863907e-07, "logps/chosen": -273.6669921875, "logps/rejected": -220.08108520507812, "loss": 0.336, "rewards/chosen": -0.6207393407821655, "rewards/margins": 3.184846878051758, "rewards/rejected": -3.805586099624634, "step": 971 }, { "epoch": 0.25, "grad_norm": 33.63166427612305, "kl": 0.0, "learning_rate": 3.7280816540172727e-07, "logps/chosen": -204.73757934570312, "logps/rejected": -367.3917236328125, "loss": 0.1605, "rewards/chosen": 0.2242099791765213, "rewards/margins": 4.352242469787598, "rewards/rejected": -4.128032684326172, "step": 972 }, { "epoch": 0.25, "grad_norm": 37.7935791015625, "kl": 0.0, "learning_rate": 3.7267730960481546e-07, "logps/chosen": -163.07469177246094, "logps/rejected": -307.3078918457031, "loss": 0.3856, "rewards/chosen": -0.6944271922111511, "rewards/margins": 4.600676536560059, "rewards/rejected": -5.295103549957275, "step": 973 }, { "epoch": 0.25, "grad_norm": 27.959571838378906, "kl": 0.0, "learning_rate": 3.7254645380790366e-07, "logps/chosen": -164.4769287109375, "logps/rejected": -240.49453735351562, "loss": 0.3669, "rewards/chosen": -0.3397435247898102, "rewards/margins": 7.074169158935547, "rewards/rejected": -7.413912773132324, "step": 974 }, { "epoch": 0.26, "grad_norm": 28.11006736755371, "kl": 0.0, "learning_rate": 3.7241559801099186e-07, "logps/chosen": -197.59146118164062, "logps/rejected": -236.4452362060547, "loss": 0.3146, "rewards/chosen": 0.7232022881507874, "rewards/margins": 3.5962724685668945, "rewards/rejected": -2.873070240020752, "step": 975 }, { "epoch": 0.26, "grad_norm": 29.414844512939453, "kl": 0.0, "learning_rate": 3.7228474221408005e-07, "logps/chosen": -287.6759338378906, "logps/rejected": -163.3478546142578, "loss": 0.2951, "rewards/chosen": -0.5998216271400452, "rewards/margins": 2.333730936050415, "rewards/rejected": -2.9335525035858154, "step": 976 }, { "epoch": 0.26, "grad_norm": 42.738162994384766, "kl": 0.0, "learning_rate": 3.7215388641716825e-07, "logps/chosen": -211.0493927001953, "logps/rejected": -292.9945068359375, "loss": 0.3142, "rewards/chosen": -0.3580980896949768, "rewards/margins": 2.8586883544921875, "rewards/rejected": -3.2167863845825195, "step": 977 }, { "epoch": 0.26, "grad_norm": 26.3310546875, "kl": 0.0, "learning_rate": 3.720230306202565e-07, "logps/chosen": -208.38685607910156, "logps/rejected": -178.67808532714844, "loss": 0.2584, "rewards/chosen": -0.2977018356323242, "rewards/margins": 2.307711601257324, "rewards/rejected": -2.6054134368896484, "step": 978 }, { "epoch": 0.26, "grad_norm": 28.863971710205078, "kl": 0.0, "learning_rate": 3.718921748233447e-07, "logps/chosen": -194.7991180419922, "logps/rejected": -223.70895385742188, "loss": 0.2059, "rewards/chosen": 0.4047398567199707, "rewards/margins": 4.218716621398926, "rewards/rejected": -3.813977003097534, "step": 979 }, { "epoch": 0.26, "grad_norm": 45.329856872558594, "kl": 0.0, "learning_rate": 3.717613190264329e-07, "logps/chosen": -277.4205627441406, "logps/rejected": -271.2000732421875, "loss": 0.4038, "rewards/chosen": -0.817046046257019, "rewards/margins": 4.592546463012695, "rewards/rejected": -5.409592628479004, "step": 980 }, { "epoch": 0.26, "grad_norm": 37.37732696533203, "kl": 0.0, "learning_rate": 3.716304632295211e-07, "logps/chosen": -165.05406188964844, "logps/rejected": -333.9656677246094, "loss": 0.3201, "rewards/chosen": -0.2642420828342438, "rewards/margins": 3.700796604156494, "rewards/rejected": -3.965038776397705, "step": 981 }, { "epoch": 0.26, "grad_norm": 31.896753311157227, "kl": 0.0, "learning_rate": 3.714996074326093e-07, "logps/chosen": -220.74844360351562, "logps/rejected": -204.1945037841797, "loss": 0.4103, "rewards/chosen": -1.318084716796875, "rewards/margins": 1.4971325397491455, "rewards/rejected": -2.8152172565460205, "step": 982 }, { "epoch": 0.26, "grad_norm": 39.432945251464844, "kl": 0.0, "learning_rate": 3.713687516356974e-07, "logps/chosen": -197.0574951171875, "logps/rejected": -227.69265747070312, "loss": 0.3001, "rewards/chosen": -0.5320191979408264, "rewards/margins": 2.9302799701690674, "rewards/rejected": -3.462299108505249, "step": 983 }, { "epoch": 0.26, "grad_norm": 35.62767028808594, "kl": 0.0, "learning_rate": 3.712378958387856e-07, "logps/chosen": -236.9004364013672, "logps/rejected": -172.47108459472656, "loss": 0.2269, "rewards/chosen": 1.4592981338500977, "rewards/margins": 4.804227828979492, "rewards/rejected": -3.3449299335479736, "step": 984 }, { "epoch": 0.26, "grad_norm": 34.60026550292969, "kl": 0.0, "learning_rate": 3.711070400418738e-07, "logps/chosen": -221.7421112060547, "logps/rejected": -215.35638427734375, "loss": 0.3953, "rewards/chosen": -1.0239485502243042, "rewards/margins": 1.4995425939559937, "rewards/rejected": -2.523491144180298, "step": 985 }, { "epoch": 0.26, "grad_norm": 36.66464614868164, "kl": 0.0, "learning_rate": 3.70976184244962e-07, "logps/chosen": -286.3569030761719, "logps/rejected": -235.4409942626953, "loss": 0.3558, "rewards/chosen": -0.6151680946350098, "rewards/margins": 2.8036887645721436, "rewards/rejected": -3.4188568592071533, "step": 986 }, { "epoch": 0.26, "grad_norm": 31.001144409179688, "kl": 0.0, "learning_rate": 3.708453284480502e-07, "logps/chosen": -172.70333862304688, "logps/rejected": -230.7540740966797, "loss": 0.2102, "rewards/chosen": -1.314502477645874, "rewards/margins": 3.040217161178589, "rewards/rejected": -4.354719638824463, "step": 987 }, { "epoch": 0.26, "grad_norm": 36.64275360107422, "kl": 0.0, "learning_rate": 3.707144726511384e-07, "logps/chosen": -308.76202392578125, "logps/rejected": -272.7708435058594, "loss": 0.3881, "rewards/chosen": 0.16566914319992065, "rewards/margins": 2.5625765323638916, "rewards/rejected": -2.396907329559326, "step": 988 }, { "epoch": 0.26, "grad_norm": 31.150535583496094, "kl": 0.0, "learning_rate": 3.705836168542266e-07, "logps/chosen": -214.9908447265625, "logps/rejected": -304.88226318359375, "loss": 0.224, "rewards/chosen": 2.17590594291687, "rewards/margins": 6.284664154052734, "rewards/rejected": -4.108758449554443, "step": 989 }, { "epoch": 0.26, "grad_norm": 37.415401458740234, "kl": 0.0, "learning_rate": 3.704527610573148e-07, "logps/chosen": -260.45904541015625, "logps/rejected": -256.9763488769531, "loss": 0.3893, "rewards/chosen": 1.2356760501861572, "rewards/margins": 3.375903367996216, "rewards/rejected": -2.1402273178100586, "step": 990 }, { "epoch": 0.26, "grad_norm": 38.0907096862793, "kl": 0.0, "learning_rate": 3.7032190526040305e-07, "logps/chosen": -186.80435180664062, "logps/rejected": -257.00506591796875, "loss": 0.4833, "rewards/chosen": -0.41919440031051636, "rewards/margins": 0.7835555672645569, "rewards/rejected": -1.2027499675750732, "step": 991 }, { "epoch": 0.26, "grad_norm": 30.297321319580078, "kl": 0.0, "learning_rate": 3.7019104946349124e-07, "logps/chosen": -231.01339721679688, "logps/rejected": -323.2631530761719, "loss": 0.171, "rewards/chosen": 0.8052429556846619, "rewards/margins": 5.508130073547363, "rewards/rejected": -4.702887058258057, "step": 992 }, { "epoch": 0.26, "grad_norm": 28.98295783996582, "kl": 0.0, "learning_rate": 3.7006019366657944e-07, "logps/chosen": -195.5869903564453, "logps/rejected": -368.0828857421875, "loss": 0.2751, "rewards/chosen": 0.75277179479599, "rewards/margins": 3.5699214935302734, "rewards/rejected": -2.8171496391296387, "step": 993 }, { "epoch": 0.26, "grad_norm": 41.256099700927734, "kl": 0.0, "learning_rate": 3.6992933786966763e-07, "logps/chosen": -282.90325927734375, "logps/rejected": -265.955078125, "loss": 0.2973, "rewards/chosen": -0.04004257172346115, "rewards/margins": 2.438786268234253, "rewards/rejected": -2.4788289070129395, "step": 994 }, { "epoch": 0.26, "grad_norm": 40.493995666503906, "kl": 0.0, "learning_rate": 3.6979848207275583e-07, "logps/chosen": -198.56317138671875, "logps/rejected": -287.1308288574219, "loss": 0.2858, "rewards/chosen": 0.22279243171215057, "rewards/margins": 3.379775285720825, "rewards/rejected": -3.156982898712158, "step": 995 }, { "epoch": 0.26, "grad_norm": 38.28390121459961, "kl": 0.0, "learning_rate": 3.6966762627584403e-07, "logps/chosen": -252.07237243652344, "logps/rejected": -300.477294921875, "loss": 0.4081, "rewards/chosen": 0.12824928760528564, "rewards/margins": 3.357633590698242, "rewards/rejected": -3.229384422302246, "step": 996 }, { "epoch": 0.26, "grad_norm": 30.633607864379883, "kl": 0.0, "learning_rate": 3.695367704789322e-07, "logps/chosen": -268.8228454589844, "logps/rejected": -261.2051696777344, "loss": 0.2608, "rewards/chosen": 0.4649485945701599, "rewards/margins": 3.7446982860565186, "rewards/rejected": -3.279749631881714, "step": 997 }, { "epoch": 0.26, "grad_norm": 36.42918395996094, "kl": 0.0, "learning_rate": 3.6940591468202037e-07, "logps/chosen": -346.8229675292969, "logps/rejected": -241.703857421875, "loss": 0.2317, "rewards/chosen": 0.23028622567653656, "rewards/margins": 3.4296822547912598, "rewards/rejected": -3.1993961334228516, "step": 998 }, { "epoch": 0.26, "grad_norm": 39.02206802368164, "kl": 0.0, "learning_rate": 3.6927505888510856e-07, "logps/chosen": -216.50253295898438, "logps/rejected": -127.69355773925781, "loss": 0.3738, "rewards/chosen": -0.8492110967636108, "rewards/margins": 1.4893790483474731, "rewards/rejected": -2.338590145111084, "step": 999 }, { "epoch": 0.26, "grad_norm": 30.959394454956055, "kl": 0.0, "learning_rate": 3.6914420308819676e-07, "logps/chosen": -235.33822631835938, "logps/rejected": -237.93800354003906, "loss": 0.293, "rewards/chosen": -0.13448606431484222, "rewards/margins": 3.330787181854248, "rewards/rejected": -3.465273141860962, "step": 1000 }, { "epoch": 0.26, "grad_norm": 40.610050201416016, "kl": 0.0, "learning_rate": 3.6901334729128495e-07, "logps/chosen": -264.3585510253906, "logps/rejected": -255.59652709960938, "loss": 0.3573, "rewards/chosen": 0.12295226007699966, "rewards/margins": 1.9584699869155884, "rewards/rejected": -1.8355177640914917, "step": 1001 }, { "epoch": 0.26, "grad_norm": 22.982267379760742, "kl": 0.0, "learning_rate": 3.6888249149437315e-07, "logps/chosen": -179.1318817138672, "logps/rejected": -223.6998291015625, "loss": 0.2512, "rewards/chosen": 0.2519915997982025, "rewards/margins": 5.339731693267822, "rewards/rejected": -5.087739944458008, "step": 1002 }, { "epoch": 0.26, "grad_norm": 31.051963806152344, "kl": 0.0, "learning_rate": 3.6875163569746135e-07, "logps/chosen": -215.51844787597656, "logps/rejected": -269.37322998046875, "loss": 0.3854, "rewards/chosen": -0.3046747148036957, "rewards/margins": 2.4635965824127197, "rewards/rejected": -2.7682712078094482, "step": 1003 }, { "epoch": 0.26, "grad_norm": 38.849945068359375, "kl": 0.0, "learning_rate": 3.686207799005496e-07, "logps/chosen": -179.42074584960938, "logps/rejected": -280.94488525390625, "loss": 0.3472, "rewards/chosen": 0.19322259724140167, "rewards/margins": 3.179107666015625, "rewards/rejected": -2.9858851432800293, "step": 1004 }, { "epoch": 0.26, "grad_norm": 34.98317337036133, "kl": 0.0, "learning_rate": 3.684899241036378e-07, "logps/chosen": -191.39337158203125, "logps/rejected": -247.249755859375, "loss": 0.3104, "rewards/chosen": -0.00705593079328537, "rewards/margins": 3.7272536754608154, "rewards/rejected": -3.734309673309326, "step": 1005 }, { "epoch": 0.26, "grad_norm": 29.55091667175293, "kl": 0.0, "learning_rate": 3.68359068306726e-07, "logps/chosen": -195.71263122558594, "logps/rejected": -191.72177124023438, "loss": 0.1906, "rewards/chosen": 2.661048412322998, "rewards/margins": 6.463957786560059, "rewards/rejected": -3.8029093742370605, "step": 1006 }, { "epoch": 0.26, "grad_norm": 28.976953506469727, "kl": 0.0, "learning_rate": 3.682282125098142e-07, "logps/chosen": -243.90280151367188, "logps/rejected": -176.75965881347656, "loss": 0.3992, "rewards/chosen": -0.5529724955558777, "rewards/margins": 1.5031301975250244, "rewards/rejected": -2.056102752685547, "step": 1007 }, { "epoch": 0.26, "grad_norm": 35.87976837158203, "kl": 0.0, "learning_rate": 3.680973567129024e-07, "logps/chosen": -242.5333709716797, "logps/rejected": -211.3227996826172, "loss": 0.3383, "rewards/chosen": 0.3571178615093231, "rewards/margins": 3.2810115814208984, "rewards/rejected": -2.923893690109253, "step": 1008 }, { "epoch": 0.26, "grad_norm": 39.02558898925781, "kl": 0.0, "learning_rate": 3.679665009159906e-07, "logps/chosen": -225.7318878173828, "logps/rejected": -179.76136779785156, "loss": 0.3238, "rewards/chosen": 0.4625554084777832, "rewards/margins": 2.3718295097351074, "rewards/rejected": -1.9092742204666138, "step": 1009 }, { "epoch": 0.26, "grad_norm": 31.04306983947754, "kl": 0.0, "learning_rate": 3.6783564511907877e-07, "logps/chosen": -158.1611785888672, "logps/rejected": -218.263916015625, "loss": 0.3687, "rewards/chosen": -0.2737899720668793, "rewards/margins": 1.0832759141921997, "rewards/rejected": -1.3570659160614014, "step": 1010 }, { "epoch": 0.26, "grad_norm": 38.5332145690918, "kl": 0.0, "learning_rate": 3.6770478932216697e-07, "logps/chosen": -216.95333862304688, "logps/rejected": -232.80751037597656, "loss": 0.3252, "rewards/chosen": -1.0354280471801758, "rewards/margins": 1.416306972503662, "rewards/rejected": -2.451735019683838, "step": 1011 }, { "epoch": 0.26, "grad_norm": 37.71039962768555, "kl": 0.0, "learning_rate": 3.6757393352525516e-07, "logps/chosen": -181.9486083984375, "logps/rejected": -266.159423828125, "loss": 0.3085, "rewards/chosen": -0.29809871315956116, "rewards/margins": 3.1903493404388428, "rewards/rejected": -3.488448143005371, "step": 1012 }, { "epoch": 0.27, "grad_norm": 33.82704162597656, "kl": 0.0, "learning_rate": 3.6744307772834336e-07, "logps/chosen": -163.29898071289062, "logps/rejected": -301.127685546875, "loss": 0.2887, "rewards/chosen": -0.5370348691940308, "rewards/margins": 2.407470226287842, "rewards/rejected": -2.944504976272583, "step": 1013 }, { "epoch": 0.27, "grad_norm": 30.01554298400879, "kl": 0.0, "learning_rate": 3.673122219314315e-07, "logps/chosen": -235.93983459472656, "logps/rejected": -293.60986328125, "loss": 0.2571, "rewards/chosen": 0.16475680470466614, "rewards/margins": 3.1383562088012695, "rewards/rejected": -2.973599433898926, "step": 1014 }, { "epoch": 0.27, "grad_norm": 43.945167541503906, "kl": 0.0, "learning_rate": 3.671813661345197e-07, "logps/chosen": -229.14370727539062, "logps/rejected": -220.0166473388672, "loss": 0.3219, "rewards/chosen": -0.44912272691726685, "rewards/margins": 3.00689435005188, "rewards/rejected": -3.456017017364502, "step": 1015 }, { "epoch": 0.27, "grad_norm": 37.656532287597656, "kl": 0.0, "learning_rate": 3.670505103376079e-07, "logps/chosen": -241.76356506347656, "logps/rejected": -229.84144592285156, "loss": 0.2412, "rewards/chosen": 0.739287793636322, "rewards/margins": 5.566799163818359, "rewards/rejected": -4.827511310577393, "step": 1016 }, { "epoch": 0.27, "grad_norm": 35.11753845214844, "kl": 0.0, "learning_rate": 3.6691965454069614e-07, "logps/chosen": -211.39744567871094, "logps/rejected": -246.71319580078125, "loss": 0.2869, "rewards/chosen": -0.1698482483625412, "rewards/margins": 2.793739080429077, "rewards/rejected": -2.9635872840881348, "step": 1017 }, { "epoch": 0.27, "grad_norm": 33.98426055908203, "kl": 0.0, "learning_rate": 3.6678879874378434e-07, "logps/chosen": -219.35289001464844, "logps/rejected": -232.19667053222656, "loss": 0.2576, "rewards/chosen": 0.028564453125, "rewards/margins": 2.506056070327759, "rewards/rejected": -2.477491617202759, "step": 1018 }, { "epoch": 0.27, "grad_norm": 30.304336547851562, "kl": 0.0, "learning_rate": 3.6665794294687254e-07, "logps/chosen": -205.853515625, "logps/rejected": -215.15550231933594, "loss": 0.277, "rewards/chosen": 0.4227977395057678, "rewards/margins": 3.2086880207061768, "rewards/rejected": -2.7858903408050537, "step": 1019 }, { "epoch": 0.27, "grad_norm": 32.362892150878906, "kl": 0.0, "learning_rate": 3.6652708714996073e-07, "logps/chosen": -169.9617919921875, "logps/rejected": -322.0443115234375, "loss": 0.4108, "rewards/chosen": -1.0704345703125, "rewards/margins": 1.948420524597168, "rewards/rejected": -3.018855094909668, "step": 1020 }, { "epoch": 0.27, "grad_norm": 31.939716339111328, "kl": 0.0, "learning_rate": 3.6639623135304893e-07, "logps/chosen": -210.80897521972656, "logps/rejected": -196.7965545654297, "loss": 0.3068, "rewards/chosen": 1.1073076725006104, "rewards/margins": 3.1537821292877197, "rewards/rejected": -2.0464744567871094, "step": 1021 }, { "epoch": 0.27, "grad_norm": 47.13640213012695, "kl": 0.0, "learning_rate": 3.662653755561371e-07, "logps/chosen": -177.97195434570312, "logps/rejected": -257.19281005859375, "loss": 0.3351, "rewards/chosen": 0.2421102672815323, "rewards/margins": 2.5237832069396973, "rewards/rejected": -2.281672954559326, "step": 1022 }, { "epoch": 0.27, "grad_norm": 35.96914291381836, "kl": 0.0, "learning_rate": 3.661345197592253e-07, "logps/chosen": -173.86227416992188, "logps/rejected": -157.81129455566406, "loss": 0.4288, "rewards/chosen": 0.11688518524169922, "rewards/margins": 1.7955470085144043, "rewards/rejected": -1.678661823272705, "step": 1023 }, { "epoch": 0.27, "grad_norm": 31.106721878051758, "kl": 0.0, "learning_rate": 3.660036639623135e-07, "logps/chosen": -214.5399627685547, "logps/rejected": -246.36363220214844, "loss": 0.255, "rewards/chosen": -0.01915731653571129, "rewards/margins": 3.3207294940948486, "rewards/rejected": -3.3398869037628174, "step": 1024 }, { "epoch": 0.27, "grad_norm": 43.176334381103516, "kl": 0.0, "learning_rate": 3.658728081654017e-07, "logps/chosen": -217.10745239257812, "logps/rejected": -252.17526245117188, "loss": 0.3132, "rewards/chosen": -0.032477542757987976, "rewards/margins": 2.099614143371582, "rewards/rejected": -2.132091760635376, "step": 1025 }, { "epoch": 0.27, "grad_norm": 38.87571716308594, "kl": 0.0, "learning_rate": 3.657419523684899e-07, "logps/chosen": -156.24591064453125, "logps/rejected": -326.661865234375, "loss": 0.2864, "rewards/chosen": 0.2051771879196167, "rewards/margins": 3.872175693511963, "rewards/rejected": -3.6669983863830566, "step": 1026 }, { "epoch": 0.27, "grad_norm": 27.340553283691406, "kl": 0.0, "learning_rate": 3.656110965715781e-07, "logps/chosen": -151.21754455566406, "logps/rejected": -253.4945526123047, "loss": 0.3319, "rewards/chosen": -1.0714999437332153, "rewards/margins": 3.459376811981201, "rewards/rejected": -4.530876636505127, "step": 1027 }, { "epoch": 0.27, "grad_norm": 25.070993423461914, "kl": 0.0, "learning_rate": 3.654802407746663e-07, "logps/chosen": -177.9115753173828, "logps/rejected": -257.03033447265625, "loss": 0.3164, "rewards/chosen": -1.1591517925262451, "rewards/margins": 3.417459726333618, "rewards/rejected": -4.576611518859863, "step": 1028 }, { "epoch": 0.27, "grad_norm": 29.812543869018555, "kl": 0.0, "learning_rate": 3.6534938497775444e-07, "logps/chosen": -199.55758666992188, "logps/rejected": -237.88499450683594, "loss": 0.3949, "rewards/chosen": -0.17537379264831543, "rewards/margins": 2.9494917392730713, "rewards/rejected": -3.1248655319213867, "step": 1029 }, { "epoch": 0.27, "grad_norm": 36.289398193359375, "kl": 0.0, "learning_rate": 3.652185291808427e-07, "logps/chosen": -241.36741638183594, "logps/rejected": -276.4466247558594, "loss": 0.2921, "rewards/chosen": 0.03348015993833542, "rewards/margins": 2.9756243228912354, "rewards/rejected": -2.9421441555023193, "step": 1030 }, { "epoch": 0.27, "grad_norm": 36.39860916137695, "kl": 0.0, "learning_rate": 3.650876733839309e-07, "logps/chosen": -308.1175231933594, "logps/rejected": -331.46246337890625, "loss": 0.2929, "rewards/chosen": -1.4403387308120728, "rewards/margins": 0.3089176416397095, "rewards/rejected": -1.7492563724517822, "step": 1031 }, { "epoch": 0.27, "grad_norm": 35.538063049316406, "kl": 0.0, "learning_rate": 3.649568175870191e-07, "logps/chosen": -186.85958862304688, "logps/rejected": -172.8899383544922, "loss": 0.2932, "rewards/chosen": -0.7927916646003723, "rewards/margins": 2.2099575996398926, "rewards/rejected": -3.00274920463562, "step": 1032 }, { "epoch": 0.27, "grad_norm": 40.36717987060547, "kl": 0.0, "learning_rate": 3.648259617901073e-07, "logps/chosen": -242.24998474121094, "logps/rejected": -351.9871520996094, "loss": 0.3347, "rewards/chosen": -0.6761089563369751, "rewards/margins": 2.68052339553833, "rewards/rejected": -3.3566324710845947, "step": 1033 }, { "epoch": 0.27, "grad_norm": 29.186281204223633, "kl": 0.0, "learning_rate": 3.646951059931955e-07, "logps/chosen": -240.44607543945312, "logps/rejected": -237.96688842773438, "loss": 0.3484, "rewards/chosen": -0.43001431226730347, "rewards/margins": 2.2880451679229736, "rewards/rejected": -2.718059539794922, "step": 1034 }, { "epoch": 0.27, "grad_norm": 38.60892868041992, "kl": 0.0, "learning_rate": 3.645642501962837e-07, "logps/chosen": -198.13050842285156, "logps/rejected": -238.33778381347656, "loss": 0.2745, "rewards/chosen": -0.5678102374076843, "rewards/margins": 1.5358705520629883, "rewards/rejected": -2.1036808490753174, "step": 1035 }, { "epoch": 0.27, "grad_norm": 40.15579605102539, "kl": 0.0, "learning_rate": 3.6443339439937187e-07, "logps/chosen": -207.40586853027344, "logps/rejected": -269.68646240234375, "loss": 0.2976, "rewards/chosen": 1.5068060159683228, "rewards/margins": 5.378608226776123, "rewards/rejected": -3.87180233001709, "step": 1036 }, { "epoch": 0.27, "grad_norm": 40.54957962036133, "kl": 0.0, "learning_rate": 3.6430253860246007e-07, "logps/chosen": -225.75035095214844, "logps/rejected": -186.6640167236328, "loss": 0.249, "rewards/chosen": 0.4179876148700714, "rewards/margins": 2.1934192180633545, "rewards/rejected": -1.7754316329956055, "step": 1037 }, { "epoch": 0.27, "grad_norm": 28.730098724365234, "kl": 0.0, "learning_rate": 3.6417168280554826e-07, "logps/chosen": -187.89804077148438, "logps/rejected": -233.9765167236328, "loss": 0.3673, "rewards/chosen": -2.141961097717285, "rewards/margins": 0.7847371101379395, "rewards/rejected": -2.9266982078552246, "step": 1038 }, { "epoch": 0.27, "grad_norm": 47.1996955871582, "kl": 0.0, "learning_rate": 3.6404082700863646e-07, "logps/chosen": -220.11485290527344, "logps/rejected": -254.345703125, "loss": 0.2947, "rewards/chosen": -0.6743428707122803, "rewards/margins": 1.5272424221038818, "rewards/rejected": -2.201585292816162, "step": 1039 }, { "epoch": 0.27, "grad_norm": 28.40711212158203, "kl": 0.0, "learning_rate": 3.6390997121172465e-07, "logps/chosen": -237.76663208007812, "logps/rejected": -225.626220703125, "loss": 0.3574, "rewards/chosen": 3.728612184524536, "rewards/margins": 6.437889099121094, "rewards/rejected": -2.7092769145965576, "step": 1040 }, { "epoch": 0.27, "grad_norm": 36.701297760009766, "kl": 0.0, "learning_rate": 3.6377911541481285e-07, "logps/chosen": -221.3239288330078, "logps/rejected": -281.19451904296875, "loss": 0.3226, "rewards/chosen": 0.43275317549705505, "rewards/margins": 2.6378254890441895, "rewards/rejected": -2.2050724029541016, "step": 1041 }, { "epoch": 0.27, "grad_norm": 36.57932662963867, "kl": 0.0, "learning_rate": 3.636482596179011e-07, "logps/chosen": -248.81182861328125, "logps/rejected": -195.62704467773438, "loss": 0.3701, "rewards/chosen": -0.7044147253036499, "rewards/margins": 1.9177671670913696, "rewards/rejected": -2.6221818923950195, "step": 1042 }, { "epoch": 0.27, "grad_norm": 32.23270797729492, "kl": 0.0, "learning_rate": 3.635174038209893e-07, "logps/chosen": -211.1300048828125, "logps/rejected": -221.2258758544922, "loss": 0.1998, "rewards/chosen": -0.5551096200942993, "rewards/margins": 1.9950312376022339, "rewards/rejected": -2.550140857696533, "step": 1043 }, { "epoch": 0.27, "grad_norm": 39.78229522705078, "kl": 0.0, "learning_rate": 3.633865480240775e-07, "logps/chosen": -249.97067260742188, "logps/rejected": -204.08071899414062, "loss": 0.3696, "rewards/chosen": 0.06202399730682373, "rewards/margins": 1.7861658334732056, "rewards/rejected": -1.7241418361663818, "step": 1044 }, { "epoch": 0.27, "grad_norm": 38.07326889038086, "kl": 0.0, "learning_rate": 3.6325569222716564e-07, "logps/chosen": -237.25929260253906, "logps/rejected": -240.17529296875, "loss": 0.3822, "rewards/chosen": -0.42507678270339966, "rewards/margins": 1.954552412033081, "rewards/rejected": -2.379629135131836, "step": 1045 }, { "epoch": 0.27, "grad_norm": 41.57281494140625, "kl": 0.0, "learning_rate": 3.6312483643025383e-07, "logps/chosen": -183.5970916748047, "logps/rejected": -223.18557739257812, "loss": 0.4239, "rewards/chosen": 0.8696861267089844, "rewards/margins": 2.7113208770751953, "rewards/rejected": -1.8416346311569214, "step": 1046 }, { "epoch": 0.27, "grad_norm": 34.37956619262695, "kl": 0.0, "learning_rate": 3.6299398063334203e-07, "logps/chosen": -139.97079467773438, "logps/rejected": -280.18212890625, "loss": 0.2595, "rewards/chosen": 0.27579593658447266, "rewards/margins": 3.2917022705078125, "rewards/rejected": -3.01590633392334, "step": 1047 }, { "epoch": 0.27, "grad_norm": 38.35115432739258, "kl": 0.0, "learning_rate": 3.628631248364302e-07, "logps/chosen": -272.6831970214844, "logps/rejected": -240.39169311523438, "loss": 0.335, "rewards/chosen": -6.130115032196045, "rewards/margins": -2.784881114959717, "rewards/rejected": -3.345233917236328, "step": 1048 }, { "epoch": 0.27, "grad_norm": 32.5230598449707, "kl": 0.0, "learning_rate": 3.627322690395184e-07, "logps/chosen": -166.0072021484375, "logps/rejected": -243.00039672851562, "loss": 0.2476, "rewards/chosen": 0.14196598529815674, "rewards/margins": 3.079589366912842, "rewards/rejected": -2.9376235008239746, "step": 1049 }, { "epoch": 0.27, "grad_norm": 29.200593948364258, "kl": 0.0, "learning_rate": 3.626014132426066e-07, "logps/chosen": -175.75213623046875, "logps/rejected": -278.5937194824219, "loss": 0.3469, "rewards/chosen": 0.4250580668449402, "rewards/margins": 3.2779133319854736, "rewards/rejected": -2.8528552055358887, "step": 1050 }, { "epoch": 0.28, "grad_norm": 26.384889602661133, "kl": 0.0, "learning_rate": 3.624705574456948e-07, "logps/chosen": -120.09896087646484, "logps/rejected": -238.22848510742188, "loss": 0.3106, "rewards/chosen": -0.45138978958129883, "rewards/margins": 2.8458361625671387, "rewards/rejected": -3.2972259521484375, "step": 1051 }, { "epoch": 0.28, "grad_norm": 39.24557113647461, "kl": 0.0, "learning_rate": 3.62339701648783e-07, "logps/chosen": -196.98329162597656, "logps/rejected": -143.01910400390625, "loss": 0.4611, "rewards/chosen": -1.3231563568115234, "rewards/margins": 0.6955959796905518, "rewards/rejected": -2.018752336502075, "step": 1052 }, { "epoch": 0.28, "grad_norm": 35.793094635009766, "kl": 0.0, "learning_rate": 3.622088458518712e-07, "logps/chosen": -250.4149627685547, "logps/rejected": -282.08978271484375, "loss": 0.2657, "rewards/chosen": 1.1257227659225464, "rewards/margins": 4.489777088165283, "rewards/rejected": -3.3640542030334473, "step": 1053 }, { "epoch": 0.28, "grad_norm": 50.41539001464844, "kl": 0.0, "learning_rate": 3.620779900549594e-07, "logps/chosen": -232.12420654296875, "logps/rejected": -266.841552734375, "loss": 0.3643, "rewards/chosen": -0.20323766767978668, "rewards/margins": 1.5877408981323242, "rewards/rejected": -1.7909785509109497, "step": 1054 }, { "epoch": 0.28, "grad_norm": 45.92751693725586, "kl": 0.0, "learning_rate": 3.6194713425804765e-07, "logps/chosen": -179.73974609375, "logps/rejected": -249.5182647705078, "loss": 0.2827, "rewards/chosen": 0.3262701630592346, "rewards/margins": 3.874574899673462, "rewards/rejected": -3.548304796218872, "step": 1055 }, { "epoch": 0.28, "grad_norm": 36.23213577270508, "kl": 0.0, "learning_rate": 3.6181627846113585e-07, "logps/chosen": -171.02401733398438, "logps/rejected": -240.00747680664062, "loss": 0.2385, "rewards/chosen": 1.3682966232299805, "rewards/margins": 2.987136125564575, "rewards/rejected": -1.6188395023345947, "step": 1056 }, { "epoch": 0.28, "grad_norm": 29.29627799987793, "kl": 0.0, "learning_rate": 3.6168542266422404e-07, "logps/chosen": -210.8943328857422, "logps/rejected": -187.2423095703125, "loss": 0.2572, "rewards/chosen": 0.5274381637573242, "rewards/margins": 3.216721296310425, "rewards/rejected": -2.6892831325531006, "step": 1057 }, { "epoch": 0.28, "grad_norm": 31.17224884033203, "kl": 0.0, "learning_rate": 3.6155456686731224e-07, "logps/chosen": -180.1999969482422, "logps/rejected": -219.22610473632812, "loss": 0.4126, "rewards/chosen": -0.6666109561920166, "rewards/margins": 1.8168065547943115, "rewards/rejected": -2.483417510986328, "step": 1058 }, { "epoch": 0.28, "grad_norm": 33.67176055908203, "kl": 0.0, "learning_rate": 3.6142371107040043e-07, "logps/chosen": -220.28773498535156, "logps/rejected": -259.7289123535156, "loss": 0.2974, "rewards/chosen": -0.13516581058502197, "rewards/margins": 3.0074315071105957, "rewards/rejected": -3.142597198486328, "step": 1059 }, { "epoch": 0.28, "grad_norm": 33.259300231933594, "kl": 0.0, "learning_rate": 3.612928552734886e-07, "logps/chosen": -189.72705078125, "logps/rejected": -164.92654418945312, "loss": 0.4098, "rewards/chosen": -0.050531283020973206, "rewards/margins": 2.0134236812591553, "rewards/rejected": -2.063955068588257, "step": 1060 }, { "epoch": 0.28, "grad_norm": 38.474369049072266, "kl": 0.0, "learning_rate": 3.6116199947657677e-07, "logps/chosen": -211.18026733398438, "logps/rejected": -158.99363708496094, "loss": 0.3069, "rewards/chosen": -0.6650242209434509, "rewards/margins": 1.7212104797363281, "rewards/rejected": -2.386234760284424, "step": 1061 }, { "epoch": 0.28, "grad_norm": 29.082534790039062, "kl": 0.0, "learning_rate": 3.6103114367966497e-07, "logps/chosen": -282.92205810546875, "logps/rejected": -266.47674560546875, "loss": 0.2743, "rewards/chosen": 1.693918228149414, "rewards/margins": 5.3685173988342285, "rewards/rejected": -3.6745991706848145, "step": 1062 }, { "epoch": 0.28, "grad_norm": 35.966251373291016, "kl": 0.0, "learning_rate": 3.6090028788275316e-07, "logps/chosen": -163.5644989013672, "logps/rejected": -240.92135620117188, "loss": 0.2478, "rewards/chosen": 0.8558608889579773, "rewards/margins": 3.8144004344940186, "rewards/rejected": -2.9585394859313965, "step": 1063 }, { "epoch": 0.28, "grad_norm": 29.439090728759766, "kl": 0.0, "learning_rate": 3.6076943208584136e-07, "logps/chosen": -197.8784637451172, "logps/rejected": -206.02719116210938, "loss": 0.2793, "rewards/chosen": -0.06350186467170715, "rewards/margins": 3.4905803203582764, "rewards/rejected": -3.554082155227661, "step": 1064 }, { "epoch": 0.28, "grad_norm": 34.36996841430664, "kl": 0.0, "learning_rate": 3.6063857628892956e-07, "logps/chosen": -149.76780700683594, "logps/rejected": -293.9136047363281, "loss": 0.2786, "rewards/chosen": 0.11478301137685776, "rewards/margins": 2.3451292514801025, "rewards/rejected": -2.230346202850342, "step": 1065 }, { "epoch": 0.28, "grad_norm": 36.8817253112793, "kl": 0.0, "learning_rate": 3.6050772049201775e-07, "logps/chosen": -324.67822265625, "logps/rejected": -313.7707214355469, "loss": 0.2427, "rewards/chosen": 0.6795901656150818, "rewards/margins": 5.313671112060547, "rewards/rejected": -4.63408088684082, "step": 1066 }, { "epoch": 0.28, "grad_norm": 38.049503326416016, "kl": 0.0, "learning_rate": 3.6037686469510595e-07, "logps/chosen": -273.94134521484375, "logps/rejected": -324.9948425292969, "loss": 0.2256, "rewards/chosen": -0.8676828742027283, "rewards/margins": 2.8628861904144287, "rewards/rejected": -3.7305691242218018, "step": 1067 }, { "epoch": 0.28, "grad_norm": 33.18891906738281, "kl": 0.0, "learning_rate": 3.602460088981942e-07, "logps/chosen": -252.3577117919922, "logps/rejected": -173.41571044921875, "loss": 0.3427, "rewards/chosen": 0.029461294412612915, "rewards/margins": 1.8500933647155762, "rewards/rejected": -1.8206321001052856, "step": 1068 }, { "epoch": 0.28, "grad_norm": 35.695560455322266, "kl": 0.0, "learning_rate": 3.601151531012824e-07, "logps/chosen": -273.1550598144531, "logps/rejected": -285.8321533203125, "loss": 0.1906, "rewards/chosen": 1.8435993194580078, "rewards/margins": 5.752671241760254, "rewards/rejected": -3.909071922302246, "step": 1069 }, { "epoch": 0.28, "grad_norm": 34.36787033081055, "kl": 0.0, "learning_rate": 3.599842973043706e-07, "logps/chosen": -214.49176025390625, "logps/rejected": -263.76544189453125, "loss": 0.261, "rewards/chosen": 1.1163358688354492, "rewards/margins": 5.000162124633789, "rewards/rejected": -3.8838260173797607, "step": 1070 }, { "epoch": 0.28, "grad_norm": 31.274845123291016, "kl": 0.0, "learning_rate": 3.598534415074588e-07, "logps/chosen": -269.2871398925781, "logps/rejected": -231.68600463867188, "loss": 0.3206, "rewards/chosen": -1.9677873849868774, "rewards/margins": 0.8992663621902466, "rewards/rejected": -2.867053747177124, "step": 1071 }, { "epoch": 0.28, "grad_norm": 32.18807601928711, "kl": 0.0, "learning_rate": 3.59722585710547e-07, "logps/chosen": -276.0224304199219, "logps/rejected": -250.87071228027344, "loss": 0.4247, "rewards/chosen": 0.2261885702610016, "rewards/margins": 2.891800880432129, "rewards/rejected": -2.66561222076416, "step": 1072 }, { "epoch": 0.28, "grad_norm": 36.16868591308594, "kl": 0.0, "learning_rate": 3.595917299136352e-07, "logps/chosen": -208.01426696777344, "logps/rejected": -285.5256652832031, "loss": 0.2483, "rewards/chosen": 0.5357463955879211, "rewards/margins": 1.908036708831787, "rewards/rejected": -1.3722903728485107, "step": 1073 }, { "epoch": 0.28, "grad_norm": 39.653663635253906, "kl": 0.0, "learning_rate": 3.594608741167234e-07, "logps/chosen": -278.96038818359375, "logps/rejected": -210.6820526123047, "loss": 0.325, "rewards/chosen": 0.039040904492139816, "rewards/margins": 2.643235445022583, "rewards/rejected": -2.6041946411132812, "step": 1074 }, { "epoch": 0.28, "grad_norm": 37.162513732910156, "kl": 0.0, "learning_rate": 3.5933001831981157e-07, "logps/chosen": -233.07406616210938, "logps/rejected": -233.7490997314453, "loss": 0.3276, "rewards/chosen": 0.33511853218078613, "rewards/margins": 2.7062385082244873, "rewards/rejected": -2.371119976043701, "step": 1075 }, { "epoch": 0.28, "grad_norm": 34.30678176879883, "kl": 0.0, "learning_rate": 3.591991625228997e-07, "logps/chosen": -296.16326904296875, "logps/rejected": -230.3595733642578, "loss": 0.2341, "rewards/chosen": 2.3373847007751465, "rewards/margins": 4.555594444274902, "rewards/rejected": -2.2182095050811768, "step": 1076 }, { "epoch": 0.28, "grad_norm": 20.76529884338379, "kl": 0.0, "learning_rate": 3.590683067259879e-07, "logps/chosen": -186.85400390625, "logps/rejected": -223.57244873046875, "loss": 0.1979, "rewards/chosen": 1.0140910148620605, "rewards/margins": 5.6391987800598145, "rewards/rejected": -4.625107765197754, "step": 1077 }, { "epoch": 0.28, "grad_norm": 44.112098693847656, "kl": 0.0, "learning_rate": 3.589374509290761e-07, "logps/chosen": -180.42352294921875, "logps/rejected": -276.9669189453125, "loss": 0.3337, "rewards/chosen": -0.0607970654964447, "rewards/margins": 3.0823521614074707, "rewards/rejected": -3.1431491374969482, "step": 1078 }, { "epoch": 0.28, "grad_norm": 37.582191467285156, "kl": 0.0, "learning_rate": 3.588065951321643e-07, "logps/chosen": -243.1421356201172, "logps/rejected": -354.2049865722656, "loss": 0.2466, "rewards/chosen": -0.5239735245704651, "rewards/margins": 4.440597057342529, "rewards/rejected": -4.96457052230835, "step": 1079 }, { "epoch": 0.28, "grad_norm": 33.97587966918945, "kl": 0.0, "learning_rate": 3.5867573933525255e-07, "logps/chosen": -177.9431915283203, "logps/rejected": -192.5262451171875, "loss": 0.3506, "rewards/chosen": 0.012957848608493805, "rewards/margins": 2.7563772201538086, "rewards/rejected": -2.7434194087982178, "step": 1080 }, { "epoch": 0.28, "grad_norm": 36.664241790771484, "kl": 0.0, "learning_rate": 3.5854488353834075e-07, "logps/chosen": -247.65528869628906, "logps/rejected": -163.8009490966797, "loss": 0.3547, "rewards/chosen": -0.40981823205947876, "rewards/margins": 2.227193593978882, "rewards/rejected": -2.637011766433716, "step": 1081 }, { "epoch": 0.28, "grad_norm": 30.23735237121582, "kl": 0.0, "learning_rate": 3.5841402774142894e-07, "logps/chosen": -216.5425262451172, "logps/rejected": -273.2435607910156, "loss": 0.2448, "rewards/chosen": -0.4988417327404022, "rewards/margins": 3.563955783843994, "rewards/rejected": -4.062797546386719, "step": 1082 }, { "epoch": 0.28, "grad_norm": 34.601112365722656, "kl": 0.0, "learning_rate": 3.5828317194451714e-07, "logps/chosen": -210.7240447998047, "logps/rejected": -196.9040985107422, "loss": 0.2698, "rewards/chosen": 0.395294725894928, "rewards/margins": 3.474808931350708, "rewards/rejected": -3.079514265060425, "step": 1083 }, { "epoch": 0.28, "grad_norm": 28.270124435424805, "kl": 0.0, "learning_rate": 3.5815231614760534e-07, "logps/chosen": -165.27096557617188, "logps/rejected": -188.78155517578125, "loss": 0.2591, "rewards/chosen": 0.3385252356529236, "rewards/margins": 3.490814685821533, "rewards/rejected": -3.152289390563965, "step": 1084 }, { "epoch": 0.28, "grad_norm": 34.028770446777344, "kl": 0.0, "learning_rate": 3.5802146035069353e-07, "logps/chosen": -246.2218780517578, "logps/rejected": -238.57894897460938, "loss": 0.2223, "rewards/chosen": 1.6595675945281982, "rewards/margins": 6.902527809143066, "rewards/rejected": -5.242960453033447, "step": 1085 }, { "epoch": 0.28, "grad_norm": 36.6795539855957, "kl": 0.0, "learning_rate": 3.5789060455378173e-07, "logps/chosen": -303.646484375, "logps/rejected": -332.2771911621094, "loss": 0.2087, "rewards/chosen": 1.6191469430923462, "rewards/margins": 5.439394474029541, "rewards/rejected": -3.8202476501464844, "step": 1086 }, { "epoch": 0.28, "grad_norm": 29.40221405029297, "kl": 0.0, "learning_rate": 3.577597487568699e-07, "logps/chosen": -186.3616485595703, "logps/rejected": -279.948974609375, "loss": 0.2862, "rewards/chosen": -0.8338308334350586, "rewards/margins": 3.1905031204223633, "rewards/rejected": -4.024333953857422, "step": 1087 }, { "epoch": 0.28, "grad_norm": 42.282047271728516, "kl": 0.0, "learning_rate": 3.576288929599581e-07, "logps/chosen": -204.36720275878906, "logps/rejected": -209.05628967285156, "loss": 0.3698, "rewards/chosen": -0.9839210510253906, "rewards/margins": 1.6166069507598877, "rewards/rejected": -2.6005280017852783, "step": 1088 }, { "epoch": 0.29, "grad_norm": 37.4203987121582, "kl": 0.0, "learning_rate": 3.574980371630463e-07, "logps/chosen": -258.50775146484375, "logps/rejected": -273.71368408203125, "loss": 0.2078, "rewards/chosen": 0.680251955986023, "rewards/margins": 4.447729110717773, "rewards/rejected": -3.767477035522461, "step": 1089 }, { "epoch": 0.29, "grad_norm": 28.57037353515625, "kl": 0.0, "learning_rate": 3.573671813661345e-07, "logps/chosen": -210.90475463867188, "logps/rejected": -245.07102966308594, "loss": 0.1402, "rewards/chosen": 0.5093256235122681, "rewards/margins": 4.227346420288086, "rewards/rejected": -3.7180206775665283, "step": 1090 }, { "epoch": 0.29, "grad_norm": 35.79833984375, "kl": 0.0, "learning_rate": 3.5723632556922266e-07, "logps/chosen": -241.64450073242188, "logps/rejected": -228.38893127441406, "loss": 0.2786, "rewards/chosen": 1.0499564409255981, "rewards/margins": 3.9940905570983887, "rewards/rejected": -2.944133996963501, "step": 1091 }, { "epoch": 0.29, "grad_norm": 33.26674270629883, "kl": 0.0, "learning_rate": 3.5710546977231085e-07, "logps/chosen": -398.8047180175781, "logps/rejected": -177.8960723876953, "loss": 0.3536, "rewards/chosen": -1.450081706047058, "rewards/margins": 2.225198745727539, "rewards/rejected": -3.6752805709838867, "step": 1092 }, { "epoch": 0.29, "grad_norm": 34.63792037963867, "kl": 0.0, "learning_rate": 3.569746139753991e-07, "logps/chosen": -271.9081726074219, "logps/rejected": -234.6984100341797, "loss": 0.3375, "rewards/chosen": 1.359961748123169, "rewards/margins": 4.406167030334473, "rewards/rejected": -3.0462050437927246, "step": 1093 }, { "epoch": 0.29, "grad_norm": 29.882545471191406, "kl": 0.0, "learning_rate": 3.568437581784873e-07, "logps/chosen": -212.69424438476562, "logps/rejected": -268.8453063964844, "loss": 0.2343, "rewards/chosen": 1.1743170022964478, "rewards/margins": 4.926886081695557, "rewards/rejected": -3.7525689601898193, "step": 1094 }, { "epoch": 0.29, "grad_norm": 29.734010696411133, "kl": 0.0, "learning_rate": 3.567129023815755e-07, "logps/chosen": -181.9860382080078, "logps/rejected": -233.7906951904297, "loss": 0.3177, "rewards/chosen": -0.8437288403511047, "rewards/margins": 2.7986247539520264, "rewards/rejected": -3.6423535346984863, "step": 1095 }, { "epoch": 0.29, "grad_norm": 36.10123062133789, "kl": 0.0, "learning_rate": 3.565820465846637e-07, "logps/chosen": -188.33181762695312, "logps/rejected": -130.89389038085938, "loss": 0.4587, "rewards/chosen": -0.7544440031051636, "rewards/margins": 3.384410858154297, "rewards/rejected": -4.13885498046875, "step": 1096 }, { "epoch": 0.29, "grad_norm": 32.93227767944336, "kl": 0.0, "learning_rate": 3.564511907877519e-07, "logps/chosen": -238.99197387695312, "logps/rejected": -234.0099334716797, "loss": 0.3542, "rewards/chosen": -0.7831324934959412, "rewards/margins": 3.8155860900878906, "rewards/rejected": -4.598718643188477, "step": 1097 }, { "epoch": 0.29, "grad_norm": 31.56029510498047, "kl": 0.0, "learning_rate": 3.563203349908401e-07, "logps/chosen": -155.75697326660156, "logps/rejected": -248.09512329101562, "loss": 0.3233, "rewards/chosen": 0.1227286234498024, "rewards/margins": 2.778966188430786, "rewards/rejected": -2.6562376022338867, "step": 1098 }, { "epoch": 0.29, "grad_norm": 34.97945785522461, "kl": 0.0, "learning_rate": 3.561894791939283e-07, "logps/chosen": -165.0885772705078, "logps/rejected": -286.6366271972656, "loss": 0.3808, "rewards/chosen": 0.3326500356197357, "rewards/margins": 3.2164618968963623, "rewards/rejected": -2.8838119506835938, "step": 1099 }, { "epoch": 0.29, "grad_norm": 36.84022521972656, "kl": 0.0, "learning_rate": 3.5605862339701647e-07, "logps/chosen": -274.4532775878906, "logps/rejected": -254.0680389404297, "loss": 0.417, "rewards/chosen": 0.12156467139720917, "rewards/margins": 2.132566213607788, "rewards/rejected": -2.0110015869140625, "step": 1100 }, { "epoch": 0.29, "grad_norm": 40.564884185791016, "kl": 0.0, "learning_rate": 3.5592776760010467e-07, "logps/chosen": -211.90956115722656, "logps/rejected": -237.0247039794922, "loss": 0.3465, "rewards/chosen": -0.3848746716976166, "rewards/margins": 1.7771941423416138, "rewards/rejected": -2.1620688438415527, "step": 1101 }, { "epoch": 0.29, "grad_norm": 22.69942855834961, "kl": 0.0, "learning_rate": 3.5579691180319287e-07, "logps/chosen": -239.27420043945312, "logps/rejected": -246.932861328125, "loss": 0.2281, "rewards/chosen": 0.6759759187698364, "rewards/margins": 5.967441558837891, "rewards/rejected": -5.291465759277344, "step": 1102 }, { "epoch": 0.29, "grad_norm": 35.47650909423828, "kl": 0.0, "learning_rate": 3.5566605600628106e-07, "logps/chosen": -215.21624755859375, "logps/rejected": -268.92169189453125, "loss": 0.2809, "rewards/chosen": 0.8522527813911438, "rewards/margins": 4.585559844970703, "rewards/rejected": -3.733306884765625, "step": 1103 }, { "epoch": 0.29, "grad_norm": 39.69252395629883, "kl": 0.0, "learning_rate": 3.5553520020936926e-07, "logps/chosen": -225.89573669433594, "logps/rejected": -233.9873046875, "loss": 0.4371, "rewards/chosen": -0.27535781264305115, "rewards/margins": 2.050819158554077, "rewards/rejected": -2.326176881790161, "step": 1104 }, { "epoch": 0.29, "grad_norm": 34.08661651611328, "kl": 0.0, "learning_rate": 3.5540434441245745e-07, "logps/chosen": -189.98110961914062, "logps/rejected": -253.66513061523438, "loss": 0.3135, "rewards/chosen": 0.5056968927383423, "rewards/margins": 3.583799362182617, "rewards/rejected": -3.0781025886535645, "step": 1105 }, { "epoch": 0.29, "grad_norm": 26.737289428710938, "kl": 0.0, "learning_rate": 3.552734886155457e-07, "logps/chosen": -213.71267700195312, "logps/rejected": -161.48214721679688, "loss": 0.371, "rewards/chosen": -0.6218294501304626, "rewards/margins": 1.985694408416748, "rewards/rejected": -2.6075239181518555, "step": 1106 }, { "epoch": 0.29, "grad_norm": 38.503726959228516, "kl": 0.0, "learning_rate": 3.5514263281863385e-07, "logps/chosen": -183.30770874023438, "logps/rejected": -271.33172607421875, "loss": 0.3733, "rewards/chosen": -0.9077540636062622, "rewards/margins": 3.8997726440429688, "rewards/rejected": -4.807526588439941, "step": 1107 }, { "epoch": 0.29, "grad_norm": 32.68248748779297, "kl": 0.0, "learning_rate": 3.5501177702172204e-07, "logps/chosen": -263.5723876953125, "logps/rejected": -320.6473388671875, "loss": 0.472, "rewards/chosen": -1.8644930124282837, "rewards/margins": 2.3240761756896973, "rewards/rejected": -4.188569068908691, "step": 1108 }, { "epoch": 0.29, "grad_norm": 36.400020599365234, "kl": 0.0, "learning_rate": 3.5488092122481024e-07, "logps/chosen": -245.26560974121094, "logps/rejected": -276.50933837890625, "loss": 0.3022, "rewards/chosen": 0.9690700769424438, "rewards/margins": 3.6182336807250977, "rewards/rejected": -2.6491634845733643, "step": 1109 }, { "epoch": 0.29, "grad_norm": 39.89811706542969, "kl": 0.0, "learning_rate": 3.5475006542789843e-07, "logps/chosen": -296.92352294921875, "logps/rejected": -367.4890441894531, "loss": 0.3288, "rewards/chosen": -1.7777142524719238, "rewards/margins": 3.5788440704345703, "rewards/rejected": -5.356558322906494, "step": 1110 }, { "epoch": 0.29, "grad_norm": 34.54069900512695, "kl": 0.0, "learning_rate": 3.5461920963098663e-07, "logps/chosen": -204.82083129882812, "logps/rejected": -325.15386962890625, "loss": 0.2518, "rewards/chosen": 0.557120680809021, "rewards/margins": 3.710430145263672, "rewards/rejected": -3.1533093452453613, "step": 1111 }, { "epoch": 0.29, "grad_norm": 30.902477264404297, "kl": 0.0, "learning_rate": 3.544883538340748e-07, "logps/chosen": -275.5830993652344, "logps/rejected": -183.99923706054688, "loss": 0.1808, "rewards/chosen": 1.1739755868911743, "rewards/margins": 4.945966720581055, "rewards/rejected": -3.77199125289917, "step": 1112 }, { "epoch": 0.29, "grad_norm": 30.833444595336914, "kl": 0.0, "learning_rate": 3.54357498037163e-07, "logps/chosen": -201.6013946533203, "logps/rejected": -168.2086944580078, "loss": 0.1972, "rewards/chosen": 0.5766014456748962, "rewards/margins": 3.2745020389556885, "rewards/rejected": -2.6979005336761475, "step": 1113 }, { "epoch": 0.29, "grad_norm": 26.51701545715332, "kl": 0.0, "learning_rate": 3.542266422402512e-07, "logps/chosen": -208.7091522216797, "logps/rejected": -217.86351013183594, "loss": 0.1889, "rewards/chosen": 1.0350052118301392, "rewards/margins": 4.19869327545166, "rewards/rejected": -3.1636881828308105, "step": 1114 }, { "epoch": 0.29, "grad_norm": 30.602649688720703, "kl": 0.0, "learning_rate": 3.540957864433394e-07, "logps/chosen": -174.76805114746094, "logps/rejected": -262.15087890625, "loss": 0.1866, "rewards/chosen": 0.5281372666358948, "rewards/margins": 5.422541618347168, "rewards/rejected": -4.894404411315918, "step": 1115 }, { "epoch": 0.29, "grad_norm": 23.70747184753418, "kl": 0.0, "learning_rate": 3.539649306464276e-07, "logps/chosen": -151.63156127929688, "logps/rejected": -231.32972717285156, "loss": 0.2839, "rewards/chosen": -0.6982941627502441, "rewards/margins": 2.6295316219329834, "rewards/rejected": -3.3278257846832275, "step": 1116 }, { "epoch": 0.29, "grad_norm": 37.54524230957031, "kl": 0.0, "learning_rate": 3.538340748495158e-07, "logps/chosen": -252.19773864746094, "logps/rejected": -178.48255920410156, "loss": 0.3343, "rewards/chosen": -1.1702947616577148, "rewards/margins": 1.391571044921875, "rewards/rejected": -2.56186580657959, "step": 1117 }, { "epoch": 0.29, "grad_norm": 31.71670913696289, "kl": 0.0, "learning_rate": 3.53703219052604e-07, "logps/chosen": -174.445068359375, "logps/rejected": -246.0499725341797, "loss": 0.4527, "rewards/chosen": -0.7562252283096313, "rewards/margins": 2.1291980743408203, "rewards/rejected": -2.885423183441162, "step": 1118 }, { "epoch": 0.29, "grad_norm": 33.092411041259766, "kl": 0.0, "learning_rate": 3.5357236325569225e-07, "logps/chosen": -212.7728729248047, "logps/rejected": -231.80722045898438, "loss": 0.3066, "rewards/chosen": -0.33431848883628845, "rewards/margins": 2.203496217727661, "rewards/rejected": -2.5378146171569824, "step": 1119 }, { "epoch": 0.29, "grad_norm": 44.41012954711914, "kl": 0.0, "learning_rate": 3.5344150745878045e-07, "logps/chosen": -224.70591735839844, "logps/rejected": -218.91012573242188, "loss": 0.3321, "rewards/chosen": 0.26089727878570557, "rewards/margins": 2.4730587005615234, "rewards/rejected": -2.2121613025665283, "step": 1120 }, { "epoch": 0.29, "grad_norm": 36.9603385925293, "kl": 0.0, "learning_rate": 3.5331065166186864e-07, "logps/chosen": -256.7554016113281, "logps/rejected": -261.4122314453125, "loss": 0.3427, "rewards/chosen": -0.10907495021820068, "rewards/margins": 3.4610395431518555, "rewards/rejected": -3.5701146125793457, "step": 1121 }, { "epoch": 0.29, "grad_norm": 31.963926315307617, "kl": 0.0, "learning_rate": 3.531797958649568e-07, "logps/chosen": -200.814453125, "logps/rejected": -228.10572814941406, "loss": 0.3222, "rewards/chosen": -0.9516986608505249, "rewards/margins": 2.583322525024414, "rewards/rejected": -3.5350210666656494, "step": 1122 }, { "epoch": 0.29, "grad_norm": 41.13035583496094, "kl": 0.0, "learning_rate": 3.53048940068045e-07, "logps/chosen": -182.65289306640625, "logps/rejected": -259.0008850097656, "loss": 0.2592, "rewards/chosen": 0.5731912851333618, "rewards/margins": 5.2247538566589355, "rewards/rejected": -4.651562690734863, "step": 1123 }, { "epoch": 0.29, "grad_norm": 34.95411682128906, "kl": 0.0, "learning_rate": 3.529180842711332e-07, "logps/chosen": -268.8537292480469, "logps/rejected": -250.743896484375, "loss": 0.2416, "rewards/chosen": -0.5174262523651123, "rewards/margins": 3.3615188598632812, "rewards/rejected": -3.8789451122283936, "step": 1124 }, { "epoch": 0.29, "grad_norm": 30.07512855529785, "kl": 0.0, "learning_rate": 3.527872284742214e-07, "logps/chosen": -233.08389282226562, "logps/rejected": -165.42440795898438, "loss": 0.3159, "rewards/chosen": 0.16913124918937683, "rewards/margins": 3.1799628734588623, "rewards/rejected": -3.010831594467163, "step": 1125 }, { "epoch": 0.29, "grad_norm": 32.58613586425781, "kl": 0.0, "learning_rate": 3.5265637267730957e-07, "logps/chosen": -214.123779296875, "logps/rejected": -241.11907958984375, "loss": 0.2387, "rewards/chosen": -0.24165599048137665, "rewards/margins": 2.814639091491699, "rewards/rejected": -3.056295156478882, "step": 1126 }, { "epoch": 0.29, "grad_norm": 32.11212158203125, "kl": 0.0, "learning_rate": 3.5252551688039777e-07, "logps/chosen": -187.3433380126953, "logps/rejected": -289.6556091308594, "loss": 0.277, "rewards/chosen": 0.13912898302078247, "rewards/margins": 4.586435794830322, "rewards/rejected": -4.4473066329956055, "step": 1127 }, { "epoch": 0.3, "grad_norm": 28.93172836303711, "kl": 0.0, "learning_rate": 3.5239466108348596e-07, "logps/chosen": -157.70382690429688, "logps/rejected": -244.54635620117188, "loss": 0.3391, "rewards/chosen": 0.6886506080627441, "rewards/margins": 4.031140327453613, "rewards/rejected": -3.3424899578094482, "step": 1128 }, { "epoch": 0.3, "grad_norm": 32.03681945800781, "kl": 0.0, "learning_rate": 3.5226380528657416e-07, "logps/chosen": -256.11553955078125, "logps/rejected": -284.6397399902344, "loss": 0.2272, "rewards/chosen": 0.2986472249031067, "rewards/margins": 6.660142421722412, "rewards/rejected": -6.361495018005371, "step": 1129 }, { "epoch": 0.3, "grad_norm": 39.74080276489258, "kl": 0.0, "learning_rate": 3.5213294948966236e-07, "logps/chosen": -210.30718994140625, "logps/rejected": -232.85952758789062, "loss": 0.3897, "rewards/chosen": 0.4224991798400879, "rewards/margins": 2.9178881645202637, "rewards/rejected": -2.495388984680176, "step": 1130 }, { "epoch": 0.3, "grad_norm": 27.7830753326416, "kl": 0.0, "learning_rate": 3.520020936927506e-07, "logps/chosen": -158.5603790283203, "logps/rejected": -174.37652587890625, "loss": 0.4527, "rewards/chosen": -0.919854998588562, "rewards/margins": 1.2761861085891724, "rewards/rejected": -2.1960411071777344, "step": 1131 }, { "epoch": 0.3, "grad_norm": 50.690189361572266, "kl": 0.0, "learning_rate": 3.518712378958388e-07, "logps/chosen": -222.47323608398438, "logps/rejected": -234.22178649902344, "loss": 0.3715, "rewards/chosen": 0.2417455017566681, "rewards/margins": 4.570836544036865, "rewards/rejected": -4.3290910720825195, "step": 1132 }, { "epoch": 0.3, "grad_norm": 41.496700286865234, "kl": 0.0, "learning_rate": 3.51740382098927e-07, "logps/chosen": -145.2799835205078, "logps/rejected": -285.2710266113281, "loss": 0.2416, "rewards/chosen": -0.3389700949192047, "rewards/margins": 2.8799259662628174, "rewards/rejected": -3.2188961505889893, "step": 1133 }, { "epoch": 0.3, "grad_norm": 26.7552547454834, "kl": 0.0, "learning_rate": 3.516095263020152e-07, "logps/chosen": -165.4346466064453, "logps/rejected": -254.95603942871094, "loss": 0.3079, "rewards/chosen": 0.22509948909282684, "rewards/margins": 5.674718379974365, "rewards/rejected": -5.449618816375732, "step": 1134 }, { "epoch": 0.3, "grad_norm": 36.21974182128906, "kl": 0.0, "learning_rate": 3.514786705051034e-07, "logps/chosen": -269.6980895996094, "logps/rejected": -245.57833862304688, "loss": 0.3177, "rewards/chosen": 1.210508942604065, "rewards/margins": 3.6992435455322266, "rewards/rejected": -2.488734483718872, "step": 1135 }, { "epoch": 0.3, "grad_norm": 40.173545837402344, "kl": 0.0, "learning_rate": 3.513478147081916e-07, "logps/chosen": -208.01304626464844, "logps/rejected": -236.66079711914062, "loss": 0.2758, "rewards/chosen": 0.11015648394823074, "rewards/margins": 3.861097812652588, "rewards/rejected": -3.750941276550293, "step": 1136 }, { "epoch": 0.3, "grad_norm": 45.25703811645508, "kl": 0.0, "learning_rate": 3.512169589112798e-07, "logps/chosen": -177.85910034179688, "logps/rejected": -201.8383026123047, "loss": 0.3978, "rewards/chosen": -0.15296238660812378, "rewards/margins": 2.0827102661132812, "rewards/rejected": -2.23567271232605, "step": 1137 }, { "epoch": 0.3, "grad_norm": 27.257539749145508, "kl": 0.0, "learning_rate": 3.510861031143679e-07, "logps/chosen": -211.14024353027344, "logps/rejected": -286.07354736328125, "loss": 0.2634, "rewards/chosen": -0.6804866790771484, "rewards/margins": 2.8411343097686768, "rewards/rejected": -3.521620988845825, "step": 1138 }, { "epoch": 0.3, "grad_norm": 34.01937484741211, "kl": 0.0, "learning_rate": 3.509552473174561e-07, "logps/chosen": -215.672607421875, "logps/rejected": -360.40460205078125, "loss": 0.341, "rewards/chosen": 0.5583714246749878, "rewards/margins": 4.312696933746338, "rewards/rejected": -3.7543256282806396, "step": 1139 }, { "epoch": 0.3, "grad_norm": 30.42136573791504, "kl": 0.0, "learning_rate": 3.508243915205443e-07, "logps/chosen": -247.34075927734375, "logps/rejected": -221.0343475341797, "loss": 0.285, "rewards/chosen": 0.6125203371047974, "rewards/margins": 4.340419292449951, "rewards/rejected": -3.7278990745544434, "step": 1140 }, { "epoch": 0.3, "grad_norm": 42.30283737182617, "kl": 0.0, "learning_rate": 3.506935357236325e-07, "logps/chosen": -246.60385131835938, "logps/rejected": -209.10311889648438, "loss": 0.4, "rewards/chosen": -1.0343382358551025, "rewards/margins": 1.872960090637207, "rewards/rejected": -2.9072983264923096, "step": 1141 }, { "epoch": 0.3, "grad_norm": 32.05286407470703, "kl": 0.0, "learning_rate": 3.505626799267207e-07, "logps/chosen": -206.08543395996094, "logps/rejected": -248.9346923828125, "loss": 0.2648, "rewards/chosen": -1.6441336870193481, "rewards/margins": 0.37358176708221436, "rewards/rejected": -2.0177154541015625, "step": 1142 }, { "epoch": 0.3, "grad_norm": 34.85634994506836, "kl": 0.0, "learning_rate": 3.504318241298089e-07, "logps/chosen": -234.87477111816406, "logps/rejected": -354.82647705078125, "loss": 0.3797, "rewards/chosen": -0.04308699071407318, "rewards/margins": 4.236164569854736, "rewards/rejected": -4.279251575469971, "step": 1143 }, { "epoch": 0.3, "grad_norm": 38.970787048339844, "kl": 0.0, "learning_rate": 3.5030096833289715e-07, "logps/chosen": -294.22186279296875, "logps/rejected": -266.58087158203125, "loss": 0.3057, "rewards/chosen": 0.0297364741563797, "rewards/margins": 2.739408016204834, "rewards/rejected": -2.7096714973449707, "step": 1144 }, { "epoch": 0.3, "grad_norm": 27.94097328186035, "kl": 0.0, "learning_rate": 3.5017011253598535e-07, "logps/chosen": -166.2245330810547, "logps/rejected": -200.76283264160156, "loss": 0.3123, "rewards/chosen": -0.9906286001205444, "rewards/margins": 2.226215362548828, "rewards/rejected": -3.216843843460083, "step": 1145 }, { "epoch": 0.3, "grad_norm": 39.00582504272461, "kl": 0.0, "learning_rate": 3.5003925673907355e-07, "logps/chosen": -161.9521942138672, "logps/rejected": -197.63607788085938, "loss": 0.3922, "rewards/chosen": -0.6772257089614868, "rewards/margins": 1.6258729696273804, "rewards/rejected": -2.303098678588867, "step": 1146 }, { "epoch": 0.3, "grad_norm": 33.07608413696289, "kl": 0.0, "learning_rate": 3.4990840094216174e-07, "logps/chosen": -215.2793731689453, "logps/rejected": -267.1920471191406, "loss": 0.2962, "rewards/chosen": -0.22169288992881775, "rewards/margins": 3.129300832748413, "rewards/rejected": -3.3509936332702637, "step": 1147 }, { "epoch": 0.3, "grad_norm": 26.845361709594727, "kl": 0.0, "learning_rate": 3.4977754514524994e-07, "logps/chosen": -178.5347900390625, "logps/rejected": -216.12083435058594, "loss": 0.3127, "rewards/chosen": 0.09511661529541016, "rewards/margins": 4.102299213409424, "rewards/rejected": -4.007182598114014, "step": 1148 }, { "epoch": 0.3, "grad_norm": 27.727128982543945, "kl": 0.0, "learning_rate": 3.4964668934833813e-07, "logps/chosen": -416.78363037109375, "logps/rejected": -312.7774353027344, "loss": 0.307, "rewards/chosen": -3.3301751613616943, "rewards/margins": 0.24054336547851562, "rewards/rejected": -3.57071852684021, "step": 1149 }, { "epoch": 0.3, "grad_norm": 39.83740997314453, "kl": 0.0, "learning_rate": 3.4951583355142633e-07, "logps/chosen": -150.38316345214844, "logps/rejected": -260.5281066894531, "loss": 0.4156, "rewards/chosen": -0.8923667073249817, "rewards/margins": 1.060227394104004, "rewards/rejected": -1.9525940418243408, "step": 1150 }, { "epoch": 0.3, "grad_norm": 29.897340774536133, "kl": 0.0, "learning_rate": 3.4938497775451453e-07, "logps/chosen": -259.205810546875, "logps/rejected": -210.67149353027344, "loss": 0.2528, "rewards/chosen": 0.46551254391670227, "rewards/margins": 2.836808443069458, "rewards/rejected": -2.371295928955078, "step": 1151 }, { "epoch": 0.3, "grad_norm": 31.324119567871094, "kl": 0.0, "learning_rate": 3.492541219576027e-07, "logps/chosen": -177.623779296875, "logps/rejected": -181.99087524414062, "loss": 0.2261, "rewards/chosen": 0.7093040347099304, "rewards/margins": 3.908512830734253, "rewards/rejected": -3.1992087364196777, "step": 1152 }, { "epoch": 0.3, "grad_norm": 62.75912094116211, "kl": 0.0, "learning_rate": 3.4912326616069087e-07, "logps/chosen": -243.6322479248047, "logps/rejected": -309.91485595703125, "loss": 0.4145, "rewards/chosen": -1.3495761156082153, "rewards/margins": 1.1117101907730103, "rewards/rejected": -2.4612863063812256, "step": 1153 }, { "epoch": 0.3, "grad_norm": 24.441761016845703, "kl": 0.0, "learning_rate": 3.4899241036377906e-07, "logps/chosen": -137.18801879882812, "logps/rejected": -218.70822143554688, "loss": 0.4508, "rewards/chosen": -1.321584701538086, "rewards/margins": 2.427882671356201, "rewards/rejected": -3.749467372894287, "step": 1154 }, { "epoch": 0.3, "grad_norm": 26.579702377319336, "kl": 0.0, "learning_rate": 3.4886155456686726e-07, "logps/chosen": -259.0023193359375, "logps/rejected": -244.285400390625, "loss": 0.3294, "rewards/chosen": -0.7168277502059937, "rewards/margins": 3.6419262886047363, "rewards/rejected": -4.3587541580200195, "step": 1155 }, { "epoch": 0.3, "grad_norm": 34.10115051269531, "kl": 0.0, "learning_rate": 3.4873069876995545e-07, "logps/chosen": -196.53575134277344, "logps/rejected": -263.05865478515625, "loss": 0.3352, "rewards/chosen": 0.4309160113334656, "rewards/margins": 4.168835639953613, "rewards/rejected": -3.737919569015503, "step": 1156 }, { "epoch": 0.3, "grad_norm": 31.891420364379883, "kl": 0.0, "learning_rate": 3.485998429730437e-07, "logps/chosen": -220.2379608154297, "logps/rejected": -201.052734375, "loss": 0.3821, "rewards/chosen": -2.2106711864471436, "rewards/margins": 1.8266870975494385, "rewards/rejected": -4.037358283996582, "step": 1157 }, { "epoch": 0.3, "grad_norm": 27.060434341430664, "kl": 0.0, "learning_rate": 3.484689871761319e-07, "logps/chosen": -279.9279479980469, "logps/rejected": -191.00369262695312, "loss": 0.2911, "rewards/chosen": -1.332848072052002, "rewards/margins": 2.7451157569885254, "rewards/rejected": -4.077963829040527, "step": 1158 }, { "epoch": 0.3, "grad_norm": 34.95372009277344, "kl": 0.0, "learning_rate": 3.483381313792201e-07, "logps/chosen": -285.4616394042969, "logps/rejected": -143.6122283935547, "loss": 0.2803, "rewards/chosen": -0.22251377999782562, "rewards/margins": 3.0675089359283447, "rewards/rejected": -3.290022611618042, "step": 1159 }, { "epoch": 0.3, "grad_norm": 34.428009033203125, "kl": 0.0, "learning_rate": 3.482072755823083e-07, "logps/chosen": -238.25436401367188, "logps/rejected": -227.17388916015625, "loss": 0.2845, "rewards/chosen": 0.45113271474838257, "rewards/margins": 3.127865791320801, "rewards/rejected": -2.6767330169677734, "step": 1160 }, { "epoch": 0.3, "grad_norm": 29.634763717651367, "kl": 0.0, "learning_rate": 3.480764197853965e-07, "logps/chosen": -255.47523498535156, "logps/rejected": -360.51165771484375, "loss": 0.2351, "rewards/chosen": 0.6718313097953796, "rewards/margins": 3.9802281856536865, "rewards/rejected": -3.308396816253662, "step": 1161 }, { "epoch": 0.3, "grad_norm": 32.4753303527832, "kl": 0.0, "learning_rate": 3.479455639884847e-07, "logps/chosen": -266.46209716796875, "logps/rejected": -259.5201110839844, "loss": 0.2627, "rewards/chosen": -1.0025486946105957, "rewards/margins": 2.339106321334839, "rewards/rejected": -3.3416550159454346, "step": 1162 }, { "epoch": 0.3, "grad_norm": 35.363380432128906, "kl": 0.0, "learning_rate": 3.478147081915729e-07, "logps/chosen": -265.4547424316406, "logps/rejected": -208.99771118164062, "loss": 0.3466, "rewards/chosen": 0.7662509083747864, "rewards/margins": 2.833404541015625, "rewards/rejected": -2.0671536922454834, "step": 1163 }, { "epoch": 0.3, "grad_norm": 37.66944122314453, "kl": 0.0, "learning_rate": 3.476838523946611e-07, "logps/chosen": -204.93299865722656, "logps/rejected": -207.90904235839844, "loss": 0.3169, "rewards/chosen": -1.2671194076538086, "rewards/margins": 0.4675861597061157, "rewards/rejected": -1.7347055673599243, "step": 1164 }, { "epoch": 0.3, "grad_norm": 35.521873474121094, "kl": 0.0, "learning_rate": 3.4755299659774927e-07, "logps/chosen": -196.3144073486328, "logps/rejected": -231.2823028564453, "loss": 0.3089, "rewards/chosen": 1.3642683029174805, "rewards/margins": 4.673673629760742, "rewards/rejected": -3.309405565261841, "step": 1165 }, { "epoch": 0.31, "grad_norm": 37.934539794921875, "kl": 0.0, "learning_rate": 3.4742214080083747e-07, "logps/chosen": -213.7936553955078, "logps/rejected": -251.3383026123047, "loss": 0.3177, "rewards/chosen": 0.11157000809907913, "rewards/margins": 3.19706392288208, "rewards/rejected": -3.085493803024292, "step": 1166 }, { "epoch": 0.31, "grad_norm": 34.21260070800781, "kl": 0.0, "learning_rate": 3.4729128500392566e-07, "logps/chosen": -254.15847778320312, "logps/rejected": -162.43576049804688, "loss": 0.434, "rewards/chosen": -0.8147860765457153, "rewards/margins": 2.457930564880371, "rewards/rejected": -3.272716760635376, "step": 1167 }, { "epoch": 0.31, "grad_norm": 39.650821685791016, "kl": 0.0, "learning_rate": 3.471604292070138e-07, "logps/chosen": -192.7276153564453, "logps/rejected": -248.39785766601562, "loss": 0.3297, "rewards/chosen": -0.7319798469543457, "rewards/margins": 2.5251142978668213, "rewards/rejected": -3.257094144821167, "step": 1168 }, { "epoch": 0.31, "grad_norm": 33.93229675292969, "kl": 0.0, "learning_rate": 3.47029573410102e-07, "logps/chosen": -265.5296936035156, "logps/rejected": -164.14395141601562, "loss": 0.3404, "rewards/chosen": -0.1786927878856659, "rewards/margins": 1.8975303173065186, "rewards/rejected": -2.076223134994507, "step": 1169 }, { "epoch": 0.31, "grad_norm": 33.14167785644531, "kl": 0.0, "learning_rate": 3.4689871761319025e-07, "logps/chosen": -175.51358032226562, "logps/rejected": -238.18153381347656, "loss": 0.3263, "rewards/chosen": -0.6082727909088135, "rewards/margins": 2.283324956893921, "rewards/rejected": -2.8915977478027344, "step": 1170 }, { "epoch": 0.31, "grad_norm": 32.023223876953125, "kl": 0.0, "learning_rate": 3.4676786181627845e-07, "logps/chosen": -231.79791259765625, "logps/rejected": -228.96759033203125, "loss": 0.4064, "rewards/chosen": -0.6112353801727295, "rewards/margins": 2.084949016571045, "rewards/rejected": -2.6961843967437744, "step": 1171 }, { "epoch": 0.31, "grad_norm": 30.645387649536133, "kl": 0.0, "learning_rate": 3.4663700601936664e-07, "logps/chosen": -229.69735717773438, "logps/rejected": -265.0238037109375, "loss": 0.3055, "rewards/chosen": 0.29142194986343384, "rewards/margins": 3.7708466053009033, "rewards/rejected": -3.4794247150421143, "step": 1172 }, { "epoch": 0.31, "grad_norm": 32.59995651245117, "kl": 0.0, "learning_rate": 3.4650615022245484e-07, "logps/chosen": -262.04833984375, "logps/rejected": -277.7827453613281, "loss": 0.2863, "rewards/chosen": -1.32673978805542, "rewards/margins": 2.91971492767334, "rewards/rejected": -4.24645471572876, "step": 1173 }, { "epoch": 0.31, "grad_norm": 35.03938674926758, "kl": 0.0, "learning_rate": 3.4637529442554304e-07, "logps/chosen": -189.8060760498047, "logps/rejected": -258.8448181152344, "loss": 0.3792, "rewards/chosen": -0.6095502376556396, "rewards/margins": 1.7947406768798828, "rewards/rejected": -2.4042909145355225, "step": 1174 }, { "epoch": 0.31, "grad_norm": 31.531982421875, "kl": 0.0, "learning_rate": 3.4624443862863123e-07, "logps/chosen": -160.89028930664062, "logps/rejected": -330.67138671875, "loss": 0.2391, "rewards/chosen": 0.9728485941886902, "rewards/margins": 4.4766974449157715, "rewards/rejected": -3.5038487911224365, "step": 1175 }, { "epoch": 0.31, "grad_norm": 24.545883178710938, "kl": 0.0, "learning_rate": 3.4611358283171943e-07, "logps/chosen": -182.7744598388672, "logps/rejected": -288.28857421875, "loss": 0.3344, "rewards/chosen": -0.42739564180374146, "rewards/margins": 4.145176410675049, "rewards/rejected": -4.572572231292725, "step": 1176 }, { "epoch": 0.31, "grad_norm": 31.24026870727539, "kl": 0.0, "learning_rate": 3.459827270348076e-07, "logps/chosen": -211.71585083007812, "logps/rejected": -207.3837890625, "loss": 0.3375, "rewards/chosen": 0.5522348880767822, "rewards/margins": 3.1370089054107666, "rewards/rejected": -2.5847740173339844, "step": 1177 }, { "epoch": 0.31, "grad_norm": 34.2593879699707, "kl": 0.0, "learning_rate": 3.458518712378958e-07, "logps/chosen": -200.26980590820312, "logps/rejected": -212.49972534179688, "loss": 0.3796, "rewards/chosen": -0.28433290123939514, "rewards/margins": 2.662614107131958, "rewards/rejected": -2.9469470977783203, "step": 1178 }, { "epoch": 0.31, "grad_norm": 34.07558059692383, "kl": 0.0, "learning_rate": 3.45721015440984e-07, "logps/chosen": -190.99557495117188, "logps/rejected": -223.3245086669922, "loss": 0.2997, "rewards/chosen": 1.9847080707550049, "rewards/margins": 4.220609664916992, "rewards/rejected": -2.235901355743408, "step": 1179 }, { "epoch": 0.31, "grad_norm": 34.867740631103516, "kl": 0.0, "learning_rate": 3.455901596440722e-07, "logps/chosen": -261.5715026855469, "logps/rejected": -251.77923583984375, "loss": 0.2971, "rewards/chosen": 0.7804626822471619, "rewards/margins": 4.566412925720215, "rewards/rejected": -3.7859504222869873, "step": 1180 }, { "epoch": 0.31, "grad_norm": 34.3149299621582, "kl": 0.0, "learning_rate": 3.454593038471604e-07, "logps/chosen": -183.1739501953125, "logps/rejected": -226.0718231201172, "loss": 0.3276, "rewards/chosen": -0.10726749897003174, "rewards/margins": 2.843820095062256, "rewards/rejected": -2.951087713241577, "step": 1181 }, { "epoch": 0.31, "grad_norm": 26.37593650817871, "kl": 0.0, "learning_rate": 3.4532844805024866e-07, "logps/chosen": -159.28369140625, "logps/rejected": -247.39651489257812, "loss": 0.3558, "rewards/chosen": -0.06649637222290039, "rewards/margins": 3.216395854949951, "rewards/rejected": -3.2828922271728516, "step": 1182 }, { "epoch": 0.31, "grad_norm": 34.705955505371094, "kl": 0.0, "learning_rate": 3.4519759225333685e-07, "logps/chosen": -212.6090545654297, "logps/rejected": -210.63018798828125, "loss": 0.3391, "rewards/chosen": -0.04410066828131676, "rewards/margins": 3.6254005432128906, "rewards/rejected": -3.669501304626465, "step": 1183 }, { "epoch": 0.31, "grad_norm": 40.53861618041992, "kl": 0.0, "learning_rate": 3.45066736456425e-07, "logps/chosen": -221.90635681152344, "logps/rejected": -186.09765625, "loss": 0.4034, "rewards/chosen": -0.9847675561904907, "rewards/margins": 1.2683357000350952, "rewards/rejected": -2.253103256225586, "step": 1184 }, { "epoch": 0.31, "grad_norm": 35.10524368286133, "kl": 0.0, "learning_rate": 3.449358806595132e-07, "logps/chosen": -174.99594116210938, "logps/rejected": -395.3352355957031, "loss": 0.4667, "rewards/chosen": -0.811806321144104, "rewards/margins": 4.3953328132629395, "rewards/rejected": -5.207139015197754, "step": 1185 }, { "epoch": 0.31, "grad_norm": 29.58946990966797, "kl": 0.0, "learning_rate": 3.448050248626014e-07, "logps/chosen": -188.7567138671875, "logps/rejected": -224.7203826904297, "loss": 0.3357, "rewards/chosen": -0.04228636622428894, "rewards/margins": 2.5439279079437256, "rewards/rejected": -2.586214303970337, "step": 1186 }, { "epoch": 0.31, "grad_norm": 28.963594436645508, "kl": 0.0, "learning_rate": 3.446741690656896e-07, "logps/chosen": -222.30162048339844, "logps/rejected": -302.3878173828125, "loss": 0.2565, "rewards/chosen": -0.7175504565238953, "rewards/margins": 2.6879355907440186, "rewards/rejected": -3.4054861068725586, "step": 1187 }, { "epoch": 0.31, "grad_norm": 35.3771858215332, "kl": 0.0, "learning_rate": 3.445433132687778e-07, "logps/chosen": -156.33737182617188, "logps/rejected": -252.5842742919922, "loss": 0.251, "rewards/chosen": 1.3936952352523804, "rewards/margins": 4.000040531158447, "rewards/rejected": -2.6063451766967773, "step": 1188 }, { "epoch": 0.31, "grad_norm": 29.472457885742188, "kl": 0.0, "learning_rate": 3.44412457471866e-07, "logps/chosen": -253.04592895507812, "logps/rejected": -221.87600708007812, "loss": 0.2159, "rewards/chosen": 1.3840115070343018, "rewards/margins": 5.230579376220703, "rewards/rejected": -3.8465676307678223, "step": 1189 }, { "epoch": 0.31, "grad_norm": 35.125526428222656, "kl": 0.0, "learning_rate": 3.442816016749542e-07, "logps/chosen": -195.70407104492188, "logps/rejected": -204.52281188964844, "loss": 0.3822, "rewards/chosen": 0.1059611365199089, "rewards/margins": 2.519016742706299, "rewards/rejected": -2.413055658340454, "step": 1190 }, { "epoch": 0.31, "grad_norm": 34.76149368286133, "kl": 0.0, "learning_rate": 3.4415074587804237e-07, "logps/chosen": -146.63796997070312, "logps/rejected": -223.8983917236328, "loss": 0.2817, "rewards/chosen": 0.023099077865481377, "rewards/margins": 2.1824231147766113, "rewards/rejected": -2.1593239307403564, "step": 1191 }, { "epoch": 0.31, "grad_norm": 31.83498191833496, "kl": 0.0, "learning_rate": 3.4401989008113057e-07, "logps/chosen": -163.8594970703125, "logps/rejected": -251.00015258789062, "loss": 0.2421, "rewards/chosen": -0.8251597285270691, "rewards/margins": 2.2953813076019287, "rewards/rejected": -3.1205410957336426, "step": 1192 }, { "epoch": 0.31, "grad_norm": 39.38047790527344, "kl": 0.0, "learning_rate": 3.4388903428421876e-07, "logps/chosen": -245.26820373535156, "logps/rejected": -191.28118896484375, "loss": 0.2675, "rewards/chosen": 0.16523434221744537, "rewards/margins": 1.8972578048706055, "rewards/rejected": -1.7320234775543213, "step": 1193 }, { "epoch": 0.31, "grad_norm": 34.019287109375, "kl": 0.0, "learning_rate": 3.4375817848730696e-07, "logps/chosen": -169.6418914794922, "logps/rejected": -283.7315368652344, "loss": 0.2766, "rewards/chosen": 0.9147642850875854, "rewards/margins": 3.9462924003601074, "rewards/rejected": -3.0315279960632324, "step": 1194 }, { "epoch": 0.31, "grad_norm": 35.77202606201172, "kl": 0.0, "learning_rate": 3.436273226903952e-07, "logps/chosen": -218.32711791992188, "logps/rejected": -284.1678161621094, "loss": 0.4119, "rewards/chosen": -0.4475761651992798, "rewards/margins": 3.220229148864746, "rewards/rejected": -3.6678051948547363, "step": 1195 }, { "epoch": 0.31, "grad_norm": 38.87939453125, "kl": 0.0, "learning_rate": 3.434964668934834e-07, "logps/chosen": -267.1624755859375, "logps/rejected": -216.53843688964844, "loss": 0.382, "rewards/chosen": -0.9228044748306274, "rewards/margins": 1.4545472860336304, "rewards/rejected": -2.377351760864258, "step": 1196 }, { "epoch": 0.31, "grad_norm": 34.275413513183594, "kl": 0.0, "learning_rate": 3.433656110965716e-07, "logps/chosen": -179.17579650878906, "logps/rejected": -263.31060791015625, "loss": 0.3637, "rewards/chosen": -1.5491220951080322, "rewards/margins": 2.713186502456665, "rewards/rejected": -4.262308597564697, "step": 1197 }, { "epoch": 0.31, "grad_norm": 34.126121520996094, "kl": 0.0, "learning_rate": 3.432347552996598e-07, "logps/chosen": -212.6999969482422, "logps/rejected": -227.0008544921875, "loss": 0.4075, "rewards/chosen": -0.4199311137199402, "rewards/margins": 3.25209903717041, "rewards/rejected": -3.672030210494995, "step": 1198 }, { "epoch": 0.31, "grad_norm": 34.182960510253906, "kl": 0.0, "learning_rate": 3.4310389950274794e-07, "logps/chosen": -250.26296997070312, "logps/rejected": -278.6178894042969, "loss": 0.2245, "rewards/chosen": 1.2832039594650269, "rewards/margins": 4.30702543258667, "rewards/rejected": -3.0238215923309326, "step": 1199 }, { "epoch": 0.31, "grad_norm": 28.641311645507812, "kl": 0.0, "learning_rate": 3.4297304370583614e-07, "logps/chosen": -316.5002136230469, "logps/rejected": -273.9151611328125, "loss": 0.2202, "rewards/chosen": -1.1112972497940063, "rewards/margins": 1.6765261888504028, "rewards/rejected": -2.787823438644409, "step": 1200 }, { "epoch": 0.31, "grad_norm": 32.865848541259766, "kl": 0.0, "learning_rate": 3.4284218790892433e-07, "logps/chosen": -256.8058166503906, "logps/rejected": -209.70956420898438, "loss": 0.2936, "rewards/chosen": -0.7760433554649353, "rewards/margins": 3.5908236503601074, "rewards/rejected": -4.3668670654296875, "step": 1201 }, { "epoch": 0.31, "grad_norm": 36.68121337890625, "kl": 0.0, "learning_rate": 3.4271133211201253e-07, "logps/chosen": -296.3019104003906, "logps/rejected": -201.8861083984375, "loss": 0.3134, "rewards/chosen": -1.5894598960876465, "rewards/margins": 1.1896445751190186, "rewards/rejected": -2.779104471206665, "step": 1202 }, { "epoch": 0.31, "grad_norm": 31.72292137145996, "kl": 0.0, "learning_rate": 3.425804763151007e-07, "logps/chosen": -211.2059783935547, "logps/rejected": -217.85487365722656, "loss": 0.1636, "rewards/chosen": 1.3678613901138306, "rewards/margins": 4.285525321960449, "rewards/rejected": -2.917663812637329, "step": 1203 }, { "epoch": 0.32, "grad_norm": 30.31131935119629, "kl": 0.0, "learning_rate": 3.424496205181889e-07, "logps/chosen": -167.01673889160156, "logps/rejected": -249.53805541992188, "loss": 0.2794, "rewards/chosen": 0.2843502163887024, "rewards/margins": 4.522599697113037, "rewards/rejected": -4.2382493019104, "step": 1204 }, { "epoch": 0.32, "grad_norm": 35.710227966308594, "kl": 0.0, "learning_rate": 3.423187647212771e-07, "logps/chosen": -276.68084716796875, "logps/rejected": -268.0350036621094, "loss": 0.3991, "rewards/chosen": -1.6733345985412598, "rewards/margins": 2.664903163909912, "rewards/rejected": -4.338237762451172, "step": 1205 }, { "epoch": 0.32, "grad_norm": 30.06000518798828, "kl": 0.0, "learning_rate": 3.421879089243653e-07, "logps/chosen": -129.31488037109375, "logps/rejected": -242.54222106933594, "loss": 0.3198, "rewards/chosen": 0.6263553500175476, "rewards/margins": 4.219228267669678, "rewards/rejected": -3.5928728580474854, "step": 1206 }, { "epoch": 0.32, "grad_norm": 30.132490158081055, "kl": 0.0, "learning_rate": 3.420570531274535e-07, "logps/chosen": -215.542236328125, "logps/rejected": -291.1411437988281, "loss": 0.334, "rewards/chosen": -0.12181363999843597, "rewards/margins": 4.176156044006348, "rewards/rejected": -4.297969818115234, "step": 1207 }, { "epoch": 0.32, "grad_norm": 27.812297821044922, "kl": 0.0, "learning_rate": 3.4192619733054176e-07, "logps/chosen": -249.6283416748047, "logps/rejected": -277.0008239746094, "loss": 0.2755, "rewards/chosen": -0.5821962356567383, "rewards/margins": 4.218933582305908, "rewards/rejected": -4.8011298179626465, "step": 1208 }, { "epoch": 0.32, "grad_norm": 29.9333553314209, "kl": 0.0, "learning_rate": 3.4179534153362995e-07, "logps/chosen": -163.0187530517578, "logps/rejected": -219.123046875, "loss": 0.3825, "rewards/chosen": -0.026593446731567383, "rewards/margins": 3.478567361831665, "rewards/rejected": -3.5051608085632324, "step": 1209 }, { "epoch": 0.32, "grad_norm": 38.39295196533203, "kl": 0.0, "learning_rate": 3.4166448573671815e-07, "logps/chosen": -208.79698181152344, "logps/rejected": -254.31642150878906, "loss": 0.1957, "rewards/chosen": 0.22365638613700867, "rewards/margins": 2.581850290298462, "rewards/rejected": -2.358193874359131, "step": 1210 }, { "epoch": 0.32, "grad_norm": 38.24787139892578, "kl": 0.0, "learning_rate": 3.4153362993980635e-07, "logps/chosen": -168.13674926757812, "logps/rejected": -360.1756896972656, "loss": 0.3725, "rewards/chosen": -0.9074063301086426, "rewards/margins": 3.2058396339416504, "rewards/rejected": -4.113245964050293, "step": 1211 }, { "epoch": 0.32, "grad_norm": 31.460186004638672, "kl": 0.0, "learning_rate": 3.4140277414289454e-07, "logps/chosen": -224.10098266601562, "logps/rejected": -221.63059997558594, "loss": 0.3513, "rewards/chosen": 0.035461753606796265, "rewards/margins": 3.605962038040161, "rewards/rejected": -3.570500373840332, "step": 1212 }, { "epoch": 0.32, "grad_norm": 35.52521896362305, "kl": 0.0, "learning_rate": 3.4127191834598274e-07, "logps/chosen": -181.10682678222656, "logps/rejected": -281.4771728515625, "loss": 0.2102, "rewards/chosen": 1.0580134391784668, "rewards/margins": 5.643892765045166, "rewards/rejected": -4.585879325866699, "step": 1213 }, { "epoch": 0.32, "grad_norm": 39.60414505004883, "kl": 0.0, "learning_rate": 3.4114106254907093e-07, "logps/chosen": -282.92169189453125, "logps/rejected": -266.72705078125, "loss": 0.41, "rewards/chosen": -0.517210066318512, "rewards/margins": 2.233038902282715, "rewards/rejected": -2.750248908996582, "step": 1214 }, { "epoch": 0.32, "grad_norm": 29.621784210205078, "kl": 0.0, "learning_rate": 3.410102067521591e-07, "logps/chosen": -267.070556640625, "logps/rejected": -187.371826171875, "loss": 0.3786, "rewards/chosen": 0.6396648287773132, "rewards/margins": 3.1138055324554443, "rewards/rejected": -2.4741406440734863, "step": 1215 }, { "epoch": 0.32, "grad_norm": 30.271541595458984, "kl": 0.0, "learning_rate": 3.4087935095524727e-07, "logps/chosen": -183.39422607421875, "logps/rejected": -191.06455993652344, "loss": 0.2562, "rewards/chosen": -0.22444909811019897, "rewards/margins": 2.9028217792510986, "rewards/rejected": -3.1272709369659424, "step": 1216 }, { "epoch": 0.32, "grad_norm": 44.55641174316406, "kl": 0.0, "learning_rate": 3.4074849515833547e-07, "logps/chosen": -303.41845703125, "logps/rejected": -240.85781860351562, "loss": 0.2687, "rewards/chosen": 0.5954338908195496, "rewards/margins": 4.038144111633301, "rewards/rejected": -3.4427103996276855, "step": 1217 }, { "epoch": 0.32, "grad_norm": 32.984859466552734, "kl": 0.0, "learning_rate": 3.4061763936142367e-07, "logps/chosen": -242.97837829589844, "logps/rejected": -271.81951904296875, "loss": 0.3608, "rewards/chosen": -0.5196236371994019, "rewards/margins": 4.061900615692139, "rewards/rejected": -4.58152437210083, "step": 1218 }, { "epoch": 0.32, "grad_norm": 33.90766525268555, "kl": 0.0, "learning_rate": 3.4048678356451186e-07, "logps/chosen": -212.87364196777344, "logps/rejected": -280.95367431640625, "loss": 0.2355, "rewards/chosen": -0.5813936591148376, "rewards/margins": 4.398910045623779, "rewards/rejected": -4.980303764343262, "step": 1219 }, { "epoch": 0.32, "grad_norm": 35.093475341796875, "kl": 0.0, "learning_rate": 3.4035592776760006e-07, "logps/chosen": -157.0143280029297, "logps/rejected": -259.2233581542969, "loss": 0.3154, "rewards/chosen": -0.6709714531898499, "rewards/margins": 4.702061653137207, "rewards/rejected": -5.373033046722412, "step": 1220 }, { "epoch": 0.32, "grad_norm": 32.3795051574707, "kl": 0.0, "learning_rate": 3.402250719706883e-07, "logps/chosen": -224.70108032226562, "logps/rejected": -208.52987670898438, "loss": 0.2729, "rewards/chosen": 0.46259552240371704, "rewards/margins": 4.860898494720459, "rewards/rejected": -4.398303031921387, "step": 1221 }, { "epoch": 0.32, "grad_norm": 37.3427848815918, "kl": 0.0, "learning_rate": 3.400942161737765e-07, "logps/chosen": -329.36810302734375, "logps/rejected": -238.87973022460938, "loss": 0.323, "rewards/chosen": -1.7284150123596191, "rewards/margins": 1.4195363521575928, "rewards/rejected": -3.147951364517212, "step": 1222 }, { "epoch": 0.32, "grad_norm": 32.58262634277344, "kl": 0.0, "learning_rate": 3.399633603768647e-07, "logps/chosen": -324.0147399902344, "logps/rejected": -252.42791748046875, "loss": 0.3242, "rewards/chosen": 0.827644944190979, "rewards/margins": 4.047224521636963, "rewards/rejected": -3.2195796966552734, "step": 1223 }, { "epoch": 0.32, "grad_norm": 41.58140563964844, "kl": 0.0, "learning_rate": 3.398325045799529e-07, "logps/chosen": -160.6015625, "logps/rejected": -184.90505981445312, "loss": 0.4001, "rewards/chosen": -0.4772702157497406, "rewards/margins": 2.276743173599243, "rewards/rejected": -2.7540132999420166, "step": 1224 }, { "epoch": 0.32, "grad_norm": 34.357051849365234, "kl": 0.0, "learning_rate": 3.397016487830411e-07, "logps/chosen": -260.9039001464844, "logps/rejected": -277.7361145019531, "loss": 0.3566, "rewards/chosen": -0.6111627221107483, "rewards/margins": 3.346715211868286, "rewards/rejected": -3.9578778743743896, "step": 1225 }, { "epoch": 0.32, "grad_norm": 29.176429748535156, "kl": 0.0, "learning_rate": 3.395707929861293e-07, "logps/chosen": -180.9840850830078, "logps/rejected": -256.6327209472656, "loss": 0.2435, "rewards/chosen": 1.7163417339324951, "rewards/margins": 4.8167266845703125, "rewards/rejected": -3.1003851890563965, "step": 1226 }, { "epoch": 0.32, "grad_norm": 32.2241096496582, "kl": 0.0, "learning_rate": 3.394399371892175e-07, "logps/chosen": -216.3617706298828, "logps/rejected": -303.110595703125, "loss": 0.2862, "rewards/chosen": 0.3831513524055481, "rewards/margins": 5.223682880401611, "rewards/rejected": -4.840531349182129, "step": 1227 }, { "epoch": 0.32, "grad_norm": 31.990962982177734, "kl": 0.0, "learning_rate": 3.393090813923057e-07, "logps/chosen": -193.42356872558594, "logps/rejected": -281.7864685058594, "loss": 0.2524, "rewards/chosen": 0.4198690950870514, "rewards/margins": 3.9765639305114746, "rewards/rejected": -3.556694746017456, "step": 1228 }, { "epoch": 0.32, "grad_norm": 35.88570785522461, "kl": 0.0, "learning_rate": 3.391782255953939e-07, "logps/chosen": -199.9151611328125, "logps/rejected": -377.8648986816406, "loss": 0.2915, "rewards/chosen": -0.035070642828941345, "rewards/margins": 4.737933158874512, "rewards/rejected": -4.773003578186035, "step": 1229 }, { "epoch": 0.32, "grad_norm": 34.06972122192383, "kl": 0.0, "learning_rate": 3.39047369798482e-07, "logps/chosen": -178.60128784179688, "logps/rejected": -202.55992126464844, "loss": 0.3516, "rewards/chosen": -0.07218974083662033, "rewards/margins": 2.193382978439331, "rewards/rejected": -2.2655727863311768, "step": 1230 }, { "epoch": 0.32, "grad_norm": 25.618955612182617, "kl": 0.0, "learning_rate": 3.389165140015702e-07, "logps/chosen": -183.26873779296875, "logps/rejected": -279.2188415527344, "loss": 0.2036, "rewards/chosen": 0.7737140655517578, "rewards/margins": 4.777088642120361, "rewards/rejected": -4.0033745765686035, "step": 1231 }, { "epoch": 0.32, "grad_norm": 35.26194763183594, "kl": 0.0, "learning_rate": 3.387856582046584e-07, "logps/chosen": -228.4512481689453, "logps/rejected": -207.5843048095703, "loss": 0.2103, "rewards/chosen": -0.6176943778991699, "rewards/margins": 2.538363218307495, "rewards/rejected": -3.156057596206665, "step": 1232 }, { "epoch": 0.32, "grad_norm": 31.332338333129883, "kl": 0.0, "learning_rate": 3.386548024077466e-07, "logps/chosen": -224.06138610839844, "logps/rejected": -265.4245910644531, "loss": 0.2006, "rewards/chosen": 1.4042233228683472, "rewards/margins": 6.1257853507995605, "rewards/rejected": -4.721561908721924, "step": 1233 }, { "epoch": 0.32, "grad_norm": 31.065032958984375, "kl": 0.0, "learning_rate": 3.3852394661083486e-07, "logps/chosen": -166.82833862304688, "logps/rejected": -242.60174560546875, "loss": 0.2613, "rewards/chosen": 1.596419095993042, "rewards/margins": 5.903311729431152, "rewards/rejected": -4.306892395019531, "step": 1234 }, { "epoch": 0.32, "grad_norm": 46.043609619140625, "kl": 0.0, "learning_rate": 3.3839309081392305e-07, "logps/chosen": -263.305419921875, "logps/rejected": -230.71295166015625, "loss": 0.257, "rewards/chosen": 0.45542898774147034, "rewards/margins": 3.6104214191436768, "rewards/rejected": -3.1549923419952393, "step": 1235 }, { "epoch": 0.32, "grad_norm": 23.832841873168945, "kl": 0.0, "learning_rate": 3.3826223501701125e-07, "logps/chosen": -253.24847412109375, "logps/rejected": -249.77139282226562, "loss": 0.2069, "rewards/chosen": 1.0765002965927124, "rewards/margins": 5.492335796356201, "rewards/rejected": -4.415835380554199, "step": 1236 }, { "epoch": 0.32, "grad_norm": 30.16594123840332, "kl": 0.0, "learning_rate": 3.3813137922009944e-07, "logps/chosen": -155.04786682128906, "logps/rejected": -249.30087280273438, "loss": 0.2036, "rewards/chosen": 0.18218417465686798, "rewards/margins": 4.265267372131348, "rewards/rejected": -4.083083152770996, "step": 1237 }, { "epoch": 0.32, "grad_norm": 28.164379119873047, "kl": 0.0, "learning_rate": 3.3800052342318764e-07, "logps/chosen": -170.76309204101562, "logps/rejected": -257.8999328613281, "loss": 0.2369, "rewards/chosen": -0.30757319927215576, "rewards/margins": 3.608126163482666, "rewards/rejected": -3.9156994819641113, "step": 1238 }, { "epoch": 0.32, "grad_norm": 26.276927947998047, "kl": 0.0, "learning_rate": 3.3786966762627584e-07, "logps/chosen": -264.63726806640625, "logps/rejected": -238.72987365722656, "loss": 0.2766, "rewards/chosen": -0.26351839303970337, "rewards/margins": 4.711465358734131, "rewards/rejected": -4.9749836921691895, "step": 1239 }, { "epoch": 0.32, "grad_norm": 37.53226089477539, "kl": 0.0, "learning_rate": 3.3773881182936403e-07, "logps/chosen": -281.6136779785156, "logps/rejected": -226.8959503173828, "loss": 0.2406, "rewards/chosen": 0.10490907728672028, "rewards/margins": 3.665008783340454, "rewards/rejected": -3.5600996017456055, "step": 1240 }, { "epoch": 0.32, "grad_norm": 31.892934799194336, "kl": 0.0, "learning_rate": 3.3760795603245223e-07, "logps/chosen": -195.93685913085938, "logps/rejected": -157.26161193847656, "loss": 0.2046, "rewards/chosen": 1.778195858001709, "rewards/margins": 4.448366165161133, "rewards/rejected": -2.670170545578003, "step": 1241 }, { "epoch": 0.33, "grad_norm": 30.623804092407227, "kl": 0.0, "learning_rate": 3.374771002355404e-07, "logps/chosen": -290.7236328125, "logps/rejected": -267.4052734375, "loss": 0.1627, "rewards/chosen": 1.2010747194290161, "rewards/margins": 3.951550006866455, "rewards/rejected": -2.7504751682281494, "step": 1242 }, { "epoch": 0.33, "grad_norm": 32.45186996459961, "kl": 0.0, "learning_rate": 3.373462444386286e-07, "logps/chosen": -163.9646759033203, "logps/rejected": -299.09814453125, "loss": 0.3236, "rewards/chosen": 0.41362541913986206, "rewards/margins": 4.08070182800293, "rewards/rejected": -3.667076587677002, "step": 1243 }, { "epoch": 0.33, "grad_norm": 30.40380859375, "kl": 0.0, "learning_rate": 3.372153886417168e-07, "logps/chosen": -238.871337890625, "logps/rejected": -240.1171417236328, "loss": 0.3127, "rewards/chosen": -0.03489166498184204, "rewards/margins": 2.8372724056243896, "rewards/rejected": -2.872164011001587, "step": 1244 }, { "epoch": 0.33, "grad_norm": 35.650115966796875, "kl": 0.0, "learning_rate": 3.37084532844805e-07, "logps/chosen": -289.92877197265625, "logps/rejected": -228.46633911132812, "loss": 0.3788, "rewards/chosen": -0.9537517428398132, "rewards/margins": 1.8257262706756592, "rewards/rejected": -2.779478073120117, "step": 1245 }, { "epoch": 0.33, "grad_norm": 33.27054977416992, "kl": 0.0, "learning_rate": 3.3695367704789316e-07, "logps/chosen": -249.22886657714844, "logps/rejected": -367.12518310546875, "loss": 0.1397, "rewards/chosen": -1.49248206615448, "rewards/margins": 1.9466301202774048, "rewards/rejected": -3.4391121864318848, "step": 1246 }, { "epoch": 0.33, "grad_norm": 37.12660598754883, "kl": 0.0, "learning_rate": 3.368228212509814e-07, "logps/chosen": -214.15615844726562, "logps/rejected": -298.9302673339844, "loss": 0.3329, "rewards/chosen": 0.08646465837955475, "rewards/margins": 2.5704357624053955, "rewards/rejected": -2.483971118927002, "step": 1247 }, { "epoch": 0.33, "grad_norm": 33.774932861328125, "kl": 0.0, "learning_rate": 3.366919654540696e-07, "logps/chosen": -171.0416259765625, "logps/rejected": -205.38577270507812, "loss": 0.3242, "rewards/chosen": -0.309177964925766, "rewards/margins": 3.087322235107422, "rewards/rejected": -3.3965001106262207, "step": 1248 }, { "epoch": 0.33, "grad_norm": 28.201087951660156, "kl": 0.0, "learning_rate": 3.365611096571578e-07, "logps/chosen": -197.46839904785156, "logps/rejected": -156.85015869140625, "loss": 0.3639, "rewards/chosen": -0.526721715927124, "rewards/margins": 2.991612672805786, "rewards/rejected": -3.51833438873291, "step": 1249 }, { "epoch": 0.33, "grad_norm": 40.35578918457031, "kl": 0.0, "learning_rate": 3.36430253860246e-07, "logps/chosen": -216.8326416015625, "logps/rejected": -204.6576385498047, "loss": 0.2314, "rewards/chosen": 0.758324921131134, "rewards/margins": 3.4638853073120117, "rewards/rejected": -2.7055604457855225, "step": 1250 }, { "epoch": 0.33, "grad_norm": 49.18851089477539, "kl": 0.0, "learning_rate": 3.362993980633342e-07, "logps/chosen": -201.96607971191406, "logps/rejected": -231.14260864257812, "loss": 0.4535, "rewards/chosen": -0.826595664024353, "rewards/margins": 2.144320487976074, "rewards/rejected": -2.9709160327911377, "step": 1251 }, { "epoch": 0.33, "grad_norm": 28.66099739074707, "kl": 0.0, "learning_rate": 3.361685422664224e-07, "logps/chosen": -226.81207275390625, "logps/rejected": -262.93695068359375, "loss": 0.2788, "rewards/chosen": -0.13777820765972137, "rewards/margins": 3.4477951526641846, "rewards/rejected": -3.585573434829712, "step": 1252 }, { "epoch": 0.33, "grad_norm": 31.51655387878418, "kl": 0.0, "learning_rate": 3.360376864695106e-07, "logps/chosen": -211.86929321289062, "logps/rejected": -180.88951110839844, "loss": 0.3393, "rewards/chosen": 1.378541111946106, "rewards/margins": 3.041508674621582, "rewards/rejected": -1.662967562675476, "step": 1253 }, { "epoch": 0.33, "grad_norm": 39.14763641357422, "kl": 0.0, "learning_rate": 3.359068306725988e-07, "logps/chosen": -187.16439819335938, "logps/rejected": -303.744873046875, "loss": 0.273, "rewards/chosen": 0.921317994594574, "rewards/margins": 3.8566207885742188, "rewards/rejected": -2.935302734375, "step": 1254 }, { "epoch": 0.33, "grad_norm": 35.68745040893555, "kl": 0.0, "learning_rate": 3.35775974875687e-07, "logps/chosen": -155.5998992919922, "logps/rejected": -257.8075256347656, "loss": 0.2658, "rewards/chosen": 0.012111015617847443, "rewards/margins": 5.436872482299805, "rewards/rejected": -5.4247612953186035, "step": 1255 }, { "epoch": 0.33, "grad_norm": 29.839157104492188, "kl": 0.0, "learning_rate": 3.3564511907877517e-07, "logps/chosen": -293.30322265625, "logps/rejected": -283.6425476074219, "loss": 0.254, "rewards/chosen": 1.0832270383834839, "rewards/margins": 3.56386137008667, "rewards/rejected": -2.4806342124938965, "step": 1256 }, { "epoch": 0.33, "grad_norm": 33.9878044128418, "kl": 0.0, "learning_rate": 3.3551426328186337e-07, "logps/chosen": -195.77926635742188, "logps/rejected": -237.57620239257812, "loss": 0.2727, "rewards/chosen": 0.3099658787250519, "rewards/margins": 3.34023118019104, "rewards/rejected": -3.0302653312683105, "step": 1257 }, { "epoch": 0.33, "grad_norm": 40.25716018676758, "kl": 0.0, "learning_rate": 3.3538340748495156e-07, "logps/chosen": -231.48263549804688, "logps/rejected": -191.4035186767578, "loss": 0.354, "rewards/chosen": 0.30240654945373535, "rewards/margins": 3.148895502090454, "rewards/rejected": -2.8464889526367188, "step": 1258 }, { "epoch": 0.33, "grad_norm": 47.265533447265625, "kl": 0.0, "learning_rate": 3.352525516880398e-07, "logps/chosen": -204.80039978027344, "logps/rejected": -298.1878967285156, "loss": 0.3726, "rewards/chosen": -0.2800760865211487, "rewards/margins": 2.6696648597717285, "rewards/rejected": -2.9497408866882324, "step": 1259 }, { "epoch": 0.33, "grad_norm": 21.470439910888672, "kl": 0.0, "learning_rate": 3.35121695891128e-07, "logps/chosen": -235.9132843017578, "logps/rejected": -189.3260955810547, "loss": 0.1427, "rewards/chosen": 2.9290521144866943, "rewards/margins": 5.725638389587402, "rewards/rejected": -2.796586513519287, "step": 1260 }, { "epoch": 0.33, "grad_norm": 30.650962829589844, "kl": 0.0, "learning_rate": 3.3499084009421615e-07, "logps/chosen": -199.38571166992188, "logps/rejected": -224.87171936035156, "loss": 0.2969, "rewards/chosen": 0.1919896900653839, "rewards/margins": 2.7235448360443115, "rewards/rejected": -2.53155517578125, "step": 1261 }, { "epoch": 0.33, "grad_norm": 33.12488555908203, "kl": 0.0, "learning_rate": 3.3485998429730435e-07, "logps/chosen": -232.99105834960938, "logps/rejected": -231.17054748535156, "loss": 0.2222, "rewards/chosen": 1.200358271598816, "rewards/margins": 5.405186176300049, "rewards/rejected": -4.204827785491943, "step": 1262 }, { "epoch": 0.33, "grad_norm": 27.685657501220703, "kl": 0.0, "learning_rate": 3.3472912850039254e-07, "logps/chosen": -186.10411071777344, "logps/rejected": -265.6203918457031, "loss": 0.2324, "rewards/chosen": -0.5338296890258789, "rewards/margins": 4.319558620452881, "rewards/rejected": -4.85338830947876, "step": 1263 }, { "epoch": 0.33, "grad_norm": 33.48194122314453, "kl": 0.0, "learning_rate": 3.3459827270348074e-07, "logps/chosen": -229.52662658691406, "logps/rejected": -180.10003662109375, "loss": 0.3356, "rewards/chosen": -0.27033787965774536, "rewards/margins": 2.364189386367798, "rewards/rejected": -2.6345272064208984, "step": 1264 }, { "epoch": 0.33, "grad_norm": 33.939762115478516, "kl": 0.0, "learning_rate": 3.3446741690656893e-07, "logps/chosen": -208.50833129882812, "logps/rejected": -187.72377014160156, "loss": 0.3683, "rewards/chosen": -0.9061264395713806, "rewards/margins": 1.2528698444366455, "rewards/rejected": -2.158996343612671, "step": 1265 }, { "epoch": 0.33, "grad_norm": 33.020294189453125, "kl": 0.0, "learning_rate": 3.3433656110965713e-07, "logps/chosen": -325.7091064453125, "logps/rejected": -205.07147216796875, "loss": 0.3448, "rewards/chosen": -0.6361463069915771, "rewards/margins": 2.527127504348755, "rewards/rejected": -3.163273811340332, "step": 1266 }, { "epoch": 0.33, "grad_norm": 34.751121520996094, "kl": 0.0, "learning_rate": 3.3420570531274533e-07, "logps/chosen": -207.54547119140625, "logps/rejected": -274.9513854980469, "loss": 0.3026, "rewards/chosen": 1.1950047016143799, "rewards/margins": 5.298422813415527, "rewards/rejected": -4.103418350219727, "step": 1267 }, { "epoch": 0.33, "grad_norm": 29.884418487548828, "kl": 0.0, "learning_rate": 3.340748495158335e-07, "logps/chosen": -208.39561462402344, "logps/rejected": -266.50848388671875, "loss": 0.2547, "rewards/chosen": -0.3481028974056244, "rewards/margins": 3.2418465614318848, "rewards/rejected": -3.589949369430542, "step": 1268 }, { "epoch": 0.33, "grad_norm": 31.327274322509766, "kl": 0.0, "learning_rate": 3.339439937189217e-07, "logps/chosen": -122.44271087646484, "logps/rejected": -207.86700439453125, "loss": 0.2258, "rewards/chosen": -0.5743510127067566, "rewards/margins": 2.366105318069458, "rewards/rejected": -2.9404563903808594, "step": 1269 }, { "epoch": 0.33, "grad_norm": 32.57371520996094, "kl": 0.0, "learning_rate": 3.338131379220099e-07, "logps/chosen": -242.4744873046875, "logps/rejected": -311.08563232421875, "loss": 0.4399, "rewards/chosen": 0.11520695686340332, "rewards/margins": 2.5848286151885986, "rewards/rejected": -2.4696216583251953, "step": 1270 }, { "epoch": 0.33, "grad_norm": 31.31317901611328, "kl": 0.0, "learning_rate": 3.336822821250981e-07, "logps/chosen": -240.73184204101562, "logps/rejected": -262.8088073730469, "loss": 0.26, "rewards/chosen": 0.389142781496048, "rewards/margins": 2.956101417541504, "rewards/rejected": -2.5669586658477783, "step": 1271 }, { "epoch": 0.33, "grad_norm": 31.12925910949707, "kl": 0.0, "learning_rate": 3.3355142632818636e-07, "logps/chosen": -196.77252197265625, "logps/rejected": -157.89349365234375, "loss": 0.2793, "rewards/chosen": 1.9008492231369019, "rewards/margins": 3.7623605728149414, "rewards/rejected": -1.8615113496780396, "step": 1272 }, { "epoch": 0.33, "grad_norm": 30.95001983642578, "kl": 0.0, "learning_rate": 3.3342057053127456e-07, "logps/chosen": -212.56983947753906, "logps/rejected": -295.71539306640625, "loss": 0.2432, "rewards/chosen": 0.1810222566127777, "rewards/margins": 5.87899112701416, "rewards/rejected": -5.69796895980835, "step": 1273 }, { "epoch": 0.33, "grad_norm": 28.166805267333984, "kl": 0.0, "learning_rate": 3.3328971473436275e-07, "logps/chosen": -232.2152862548828, "logps/rejected": -165.13128662109375, "loss": 0.2392, "rewards/chosen": 2.376075506210327, "rewards/margins": 5.288122177124023, "rewards/rejected": -2.912046432495117, "step": 1274 }, { "epoch": 0.33, "grad_norm": 25.603988647460938, "kl": 0.0, "learning_rate": 3.3315885893745095e-07, "logps/chosen": -156.23667907714844, "logps/rejected": -237.1687469482422, "loss": 0.2811, "rewards/chosen": -1.1099730730056763, "rewards/margins": 3.728555679321289, "rewards/rejected": -4.838528633117676, "step": 1275 }, { "epoch": 0.33, "grad_norm": 33.69292068481445, "kl": 0.0, "learning_rate": 3.3302800314053914e-07, "logps/chosen": -239.6951904296875, "logps/rejected": -345.7257995605469, "loss": 0.2289, "rewards/chosen": 0.387544721364975, "rewards/margins": 4.686365604400635, "rewards/rejected": -4.298820972442627, "step": 1276 }, { "epoch": 0.33, "grad_norm": 41.52256774902344, "kl": 0.0, "learning_rate": 3.328971473436273e-07, "logps/chosen": -274.7806701660156, "logps/rejected": -259.1031494140625, "loss": 0.2678, "rewards/chosen": 3.6112773418426514, "rewards/margins": 6.836021423339844, "rewards/rejected": -3.2247438430786133, "step": 1277 }, { "epoch": 0.33, "grad_norm": 36.408992767333984, "kl": 0.0, "learning_rate": 3.327662915467155e-07, "logps/chosen": -235.43978881835938, "logps/rejected": -193.00564575195312, "loss": 0.3697, "rewards/chosen": -0.9631791710853577, "rewards/margins": 1.1764774322509766, "rewards/rejected": -2.1396565437316895, "step": 1278 }, { "epoch": 0.33, "grad_norm": 30.44072151184082, "kl": 0.0, "learning_rate": 3.326354357498037e-07, "logps/chosen": -218.36167907714844, "logps/rejected": -300.410400390625, "loss": 0.3474, "rewards/chosen": -0.4119375944137573, "rewards/margins": 2.955416679382324, "rewards/rejected": -3.367354393005371, "step": 1279 }, { "epoch": 0.33, "grad_norm": 32.16078567504883, "kl": 0.0, "learning_rate": 3.325045799528919e-07, "logps/chosen": -241.3597412109375, "logps/rejected": -221.12881469726562, "loss": 0.2101, "rewards/chosen": -0.3120342791080475, "rewards/margins": 3.0008411407470703, "rewards/rejected": -3.312875509262085, "step": 1280 }, { "epoch": 0.34, "grad_norm": 27.598134994506836, "kl": 0.0, "learning_rate": 3.3237372415598007e-07, "logps/chosen": -299.2360534667969, "logps/rejected": -197.92079162597656, "loss": 0.2723, "rewards/chosen": -2.645521879196167, "rewards/margins": 2.080192804336548, "rewards/rejected": -4.725714683532715, "step": 1281 }, { "epoch": 0.34, "grad_norm": 30.310110092163086, "kl": 0.0, "learning_rate": 3.3224286835906827e-07, "logps/chosen": -248.118408203125, "logps/rejected": -244.019775390625, "loss": 0.2386, "rewards/chosen": 0.5699302554130554, "rewards/margins": 4.555118560791016, "rewards/rejected": -3.9851884841918945, "step": 1282 }, { "epoch": 0.34, "grad_norm": 32.69074630737305, "kl": 0.0, "learning_rate": 3.3211201256215646e-07, "logps/chosen": -220.3896484375, "logps/rejected": -378.6432800292969, "loss": 0.3685, "rewards/chosen": -0.45064517855644226, "rewards/margins": 4.8300395011901855, "rewards/rejected": -5.280684471130371, "step": 1283 }, { "epoch": 0.34, "grad_norm": 30.25950813293457, "kl": 0.0, "learning_rate": 3.3198115676524466e-07, "logps/chosen": -211.71487426757812, "logps/rejected": -291.50787353515625, "loss": 0.3666, "rewards/chosen": -0.06342494487762451, "rewards/margins": 3.4773359298706055, "rewards/rejected": -3.5407607555389404, "step": 1284 }, { "epoch": 0.34, "grad_norm": 31.38227653503418, "kl": 0.0, "learning_rate": 3.318503009683329e-07, "logps/chosen": -215.906005859375, "logps/rejected": -224.23704528808594, "loss": 0.3256, "rewards/chosen": 0.7493216395378113, "rewards/margins": 4.575001239776611, "rewards/rejected": -3.8256797790527344, "step": 1285 }, { "epoch": 0.34, "grad_norm": 34.842342376708984, "kl": 0.0, "learning_rate": 3.317194451714211e-07, "logps/chosen": -220.79994201660156, "logps/rejected": -257.6636962890625, "loss": 0.3224, "rewards/chosen": 0.5526933670043945, "rewards/margins": 4.311444282531738, "rewards/rejected": -3.7587506771087646, "step": 1286 }, { "epoch": 0.34, "grad_norm": 30.686933517456055, "kl": 0.0, "learning_rate": 3.315885893745093e-07, "logps/chosen": -167.66883850097656, "logps/rejected": -153.66209411621094, "loss": 0.3503, "rewards/chosen": -0.23966237902641296, "rewards/margins": 1.8854963779449463, "rewards/rejected": -2.1251587867736816, "step": 1287 }, { "epoch": 0.34, "grad_norm": 31.066295623779297, "kl": 0.0, "learning_rate": 3.314577335775975e-07, "logps/chosen": -240.11083984375, "logps/rejected": -255.7864532470703, "loss": 0.3307, "rewards/chosen": -0.976582944393158, "rewards/margins": 1.937021017074585, "rewards/rejected": -2.9136040210723877, "step": 1288 }, { "epoch": 0.34, "grad_norm": 35.197872161865234, "kl": 0.0, "learning_rate": 3.313268777806857e-07, "logps/chosen": -232.86688232421875, "logps/rejected": -194.73477172851562, "loss": 0.2723, "rewards/chosen": 2.010469913482666, "rewards/margins": 4.8714823722839355, "rewards/rejected": -2.8610124588012695, "step": 1289 }, { "epoch": 0.34, "grad_norm": 31.749187469482422, "kl": 0.0, "learning_rate": 3.311960219837739e-07, "logps/chosen": -243.93431091308594, "logps/rejected": -220.49685668945312, "loss": 0.3194, "rewards/chosen": -1.9240546226501465, "rewards/margins": 1.2547504901885986, "rewards/rejected": -3.178805112838745, "step": 1290 }, { "epoch": 0.34, "grad_norm": 34.85056686401367, "kl": 0.0, "learning_rate": 3.310651661868621e-07, "logps/chosen": -213.20204162597656, "logps/rejected": -278.9600830078125, "loss": 0.3964, "rewards/chosen": -0.25566422939300537, "rewards/margins": 3.075680732727051, "rewards/rejected": -3.3313448429107666, "step": 1291 }, { "epoch": 0.34, "grad_norm": 24.75638198852539, "kl": 0.0, "learning_rate": 3.3093431038995023e-07, "logps/chosen": -206.9791717529297, "logps/rejected": -248.0471649169922, "loss": 0.4092, "rewards/chosen": -0.372029185295105, "rewards/margins": 3.3591151237487793, "rewards/rejected": -3.7311441898345947, "step": 1292 }, { "epoch": 0.34, "grad_norm": 33.22563171386719, "kl": 0.0, "learning_rate": 3.308034545930384e-07, "logps/chosen": -156.0008544921875, "logps/rejected": -316.6332092285156, "loss": 0.3212, "rewards/chosen": -0.07225000858306885, "rewards/margins": 4.35306453704834, "rewards/rejected": -4.425314426422119, "step": 1293 }, { "epoch": 0.34, "grad_norm": 34.623626708984375, "kl": 0.0, "learning_rate": 3.306725987961266e-07, "logps/chosen": -279.859130859375, "logps/rejected": -233.71697998046875, "loss": 0.3227, "rewards/chosen": 1.6375625133514404, "rewards/margins": 4.174957752227783, "rewards/rejected": -2.5373952388763428, "step": 1294 }, { "epoch": 0.34, "grad_norm": 45.627933502197266, "kl": 0.0, "learning_rate": 3.305417429992148e-07, "logps/chosen": -160.09315490722656, "logps/rejected": -212.97886657714844, "loss": 0.3397, "rewards/chosen": -0.6607236862182617, "rewards/margins": 3.3477578163146973, "rewards/rejected": -4.008481502532959, "step": 1295 }, { "epoch": 0.34, "grad_norm": 33.33060073852539, "kl": 0.0, "learning_rate": 3.30410887202303e-07, "logps/chosen": -139.9228973388672, "logps/rejected": -234.82342529296875, "loss": 0.2886, "rewards/chosen": -0.6339284777641296, "rewards/margins": 2.746208429336548, "rewards/rejected": -3.3801369667053223, "step": 1296 }, { "epoch": 0.34, "grad_norm": 38.89460754394531, "kl": 0.0, "learning_rate": 3.302800314053912e-07, "logps/chosen": -195.2257843017578, "logps/rejected": -192.89590454101562, "loss": 0.4372, "rewards/chosen": -0.28713446855545044, "rewards/margins": 2.1057236194610596, "rewards/rejected": -2.3928580284118652, "step": 1297 }, { "epoch": 0.34, "grad_norm": 36.500423431396484, "kl": 0.0, "learning_rate": 3.3014917560847946e-07, "logps/chosen": -210.28921508789062, "logps/rejected": -235.51065063476562, "loss": 0.3719, "rewards/chosen": -1.0210708379745483, "rewards/margins": 1.5469309091567993, "rewards/rejected": -2.5680017471313477, "step": 1298 }, { "epoch": 0.34, "grad_norm": 22.017080307006836, "kl": 0.0, "learning_rate": 3.3001831981156765e-07, "logps/chosen": -259.6287841796875, "logps/rejected": -318.98065185546875, "loss": 0.3813, "rewards/chosen": -0.38963747024536133, "rewards/margins": 3.8554883003234863, "rewards/rejected": -4.245125770568848, "step": 1299 }, { "epoch": 0.34, "grad_norm": 39.884857177734375, "kl": 0.0, "learning_rate": 3.2988746401465585e-07, "logps/chosen": -259.2381896972656, "logps/rejected": -260.0971374511719, "loss": 0.2882, "rewards/chosen": -0.054092198610305786, "rewards/margins": 2.6869301795959473, "rewards/rejected": -2.7410223484039307, "step": 1300 }, { "epoch": 0.34, "grad_norm": 33.03709411621094, "kl": 0.0, "learning_rate": 3.2975660821774405e-07, "logps/chosen": -166.88330078125, "logps/rejected": -223.3334503173828, "loss": 0.2703, "rewards/chosen": 0.3510906994342804, "rewards/margins": 4.768466949462891, "rewards/rejected": -4.4173760414123535, "step": 1301 }, { "epoch": 0.34, "grad_norm": 30.759967803955078, "kl": 0.0, "learning_rate": 3.2962575242083224e-07, "logps/chosen": -252.9071044921875, "logps/rejected": -229.6643829345703, "loss": 0.2931, "rewards/chosen": 1.2659831047058105, "rewards/margins": 3.1950631141662598, "rewards/rejected": -1.9290800094604492, "step": 1302 }, { "epoch": 0.34, "grad_norm": 33.443843841552734, "kl": 0.0, "learning_rate": 3.2949489662392044e-07, "logps/chosen": -242.87637329101562, "logps/rejected": -233.8732147216797, "loss": 0.1907, "rewards/chosen": 0.24500472843647003, "rewards/margins": 3.0496490001678467, "rewards/rejected": -2.8046443462371826, "step": 1303 }, { "epoch": 0.34, "grad_norm": 35.365848541259766, "kl": 0.0, "learning_rate": 3.2936404082700864e-07, "logps/chosen": -231.86984252929688, "logps/rejected": -308.7398681640625, "loss": 0.2432, "rewards/chosen": 0.4850401282310486, "rewards/margins": 4.330360412597656, "rewards/rejected": -3.845320224761963, "step": 1304 }, { "epoch": 0.34, "grad_norm": 37.317588806152344, "kl": 0.0, "learning_rate": 3.2923318503009683e-07, "logps/chosen": -150.95700073242188, "logps/rejected": -274.0843505859375, "loss": 0.3491, "rewards/chosen": 0.7635312676429749, "rewards/margins": 3.192793846130371, "rewards/rejected": -2.429262638092041, "step": 1305 }, { "epoch": 0.34, "grad_norm": 48.94740295410156, "kl": 0.0, "learning_rate": 3.2910232923318503e-07, "logps/chosen": -346.37908935546875, "logps/rejected": -209.9012908935547, "loss": 0.2711, "rewards/chosen": 2.334627628326416, "rewards/margins": 6.195981025695801, "rewards/rejected": -3.8613533973693848, "step": 1306 }, { "epoch": 0.34, "grad_norm": 30.677465438842773, "kl": 0.0, "learning_rate": 3.289714734362732e-07, "logps/chosen": -222.22210693359375, "logps/rejected": -294.2380676269531, "loss": 0.2369, "rewards/chosen": 0.19016534090042114, "rewards/margins": 3.609548807144165, "rewards/rejected": -3.4193835258483887, "step": 1307 }, { "epoch": 0.34, "grad_norm": 35.83417892456055, "kl": 0.0, "learning_rate": 3.2884061763936137e-07, "logps/chosen": -196.81204223632812, "logps/rejected": -219.13180541992188, "loss": 0.3425, "rewards/chosen": 0.48529767990112305, "rewards/margins": 3.888932704925537, "rewards/rejected": -3.403635025024414, "step": 1308 }, { "epoch": 0.34, "grad_norm": 29.441770553588867, "kl": 0.0, "learning_rate": 3.2870976184244956e-07, "logps/chosen": -184.06394958496094, "logps/rejected": -191.65087890625, "loss": 0.313, "rewards/chosen": -1.2080618143081665, "rewards/margins": 1.6941922903060913, "rewards/rejected": -2.902254104614258, "step": 1309 }, { "epoch": 0.34, "grad_norm": 41.78122329711914, "kl": 0.0, "learning_rate": 3.2857890604553776e-07, "logps/chosen": -259.63751220703125, "logps/rejected": -190.84457397460938, "loss": 0.3762, "rewards/chosen": 0.5477985143661499, "rewards/margins": 3.737063407897949, "rewards/rejected": -3.1892647743225098, "step": 1310 }, { "epoch": 0.34, "grad_norm": 33.553565979003906, "kl": 0.0, "learning_rate": 3.28448050248626e-07, "logps/chosen": -198.94801330566406, "logps/rejected": -414.20306396484375, "loss": 0.347, "rewards/chosen": -0.033007245510816574, "rewards/margins": 3.7820427417755127, "rewards/rejected": -3.815049886703491, "step": 1311 }, { "epoch": 0.34, "grad_norm": 43.8389778137207, "kl": 0.0, "learning_rate": 3.283171944517142e-07, "logps/chosen": -165.3663330078125, "logps/rejected": -285.16845703125, "loss": 0.4081, "rewards/chosen": -1.201613187789917, "rewards/margins": 1.7473170757293701, "rewards/rejected": -2.948930263519287, "step": 1312 }, { "epoch": 0.34, "grad_norm": 30.555959701538086, "kl": 0.0, "learning_rate": 3.281863386548024e-07, "logps/chosen": -217.09132385253906, "logps/rejected": -234.91665649414062, "loss": 0.3565, "rewards/chosen": 1.054399013519287, "rewards/margins": 4.382190704345703, "rewards/rejected": -3.327791452407837, "step": 1313 }, { "epoch": 0.34, "grad_norm": 33.22416687011719, "kl": 0.0, "learning_rate": 3.280554828578906e-07, "logps/chosen": -217.23284912109375, "logps/rejected": -216.6754150390625, "loss": 0.3091, "rewards/chosen": 0.3072512447834015, "rewards/margins": 1.8591370582580566, "rewards/rejected": -1.5518858432769775, "step": 1314 }, { "epoch": 0.34, "grad_norm": 29.256071090698242, "kl": 0.0, "learning_rate": 3.279246270609788e-07, "logps/chosen": -211.4664306640625, "logps/rejected": -235.34336853027344, "loss": 0.2241, "rewards/chosen": 1.3942919969558716, "rewards/margins": 4.8556413650512695, "rewards/rejected": -3.4613492488861084, "step": 1315 }, { "epoch": 0.34, "grad_norm": 45.10913848876953, "kl": 0.0, "learning_rate": 3.27793771264067e-07, "logps/chosen": -262.1194152832031, "logps/rejected": -236.89015197753906, "loss": 0.4242, "rewards/chosen": -0.6220903396606445, "rewards/margins": 1.0707972049713135, "rewards/rejected": -1.692887544631958, "step": 1316 }, { "epoch": 0.34, "grad_norm": 43.61226272583008, "kl": 0.0, "learning_rate": 3.276629154671552e-07, "logps/chosen": -205.19381713867188, "logps/rejected": -300.3736267089844, "loss": 0.3916, "rewards/chosen": -0.4625236392021179, "rewards/margins": 2.3358194828033447, "rewards/rejected": -2.7983431816101074, "step": 1317 }, { "epoch": 0.34, "grad_norm": 35.57749557495117, "kl": 0.0, "learning_rate": 3.275320596702434e-07, "logps/chosen": -215.95855712890625, "logps/rejected": -211.11325073242188, "loss": 0.2919, "rewards/chosen": -0.27239271998405457, "rewards/margins": 3.561589002609253, "rewards/rejected": -3.83398175239563, "step": 1318 }, { "epoch": 0.35, "grad_norm": 30.0214900970459, "kl": 0.0, "learning_rate": 3.274012038733316e-07, "logps/chosen": -162.21311950683594, "logps/rejected": -231.0467987060547, "loss": 0.2538, "rewards/chosen": -0.42144662141799927, "rewards/margins": 3.6177022457122803, "rewards/rejected": -4.039148807525635, "step": 1319 }, { "epoch": 0.35, "grad_norm": 35.25520706176758, "kl": 0.0, "learning_rate": 3.2727034807641977e-07, "logps/chosen": -214.12205505371094, "logps/rejected": -236.38314819335938, "loss": 0.4485, "rewards/chosen": -1.0461833477020264, "rewards/margins": 4.4344940185546875, "rewards/rejected": -5.480677604675293, "step": 1320 }, { "epoch": 0.35, "grad_norm": 27.39512825012207, "kl": 0.0, "learning_rate": 3.2713949227950797e-07, "logps/chosen": -230.88999938964844, "logps/rejected": -208.17617797851562, "loss": 0.2967, "rewards/chosen": -0.46113088726997375, "rewards/margins": 3.70143723487854, "rewards/rejected": -4.162568092346191, "step": 1321 }, { "epoch": 0.35, "grad_norm": 49.641719818115234, "kl": 0.0, "learning_rate": 3.2700863648259616e-07, "logps/chosen": -278.8284912109375, "logps/rejected": -205.60911560058594, "loss": 0.3747, "rewards/chosen": -0.01196346990764141, "rewards/margins": 2.739752769470215, "rewards/rejected": -2.751716136932373, "step": 1322 }, { "epoch": 0.35, "grad_norm": 33.947898864746094, "kl": 0.0, "learning_rate": 3.2687778068568436e-07, "logps/chosen": -194.21571350097656, "logps/rejected": -252.41253662109375, "loss": 0.2927, "rewards/chosen": -0.2291528880596161, "rewards/margins": 3.9392712116241455, "rewards/rejected": -4.168424129486084, "step": 1323 }, { "epoch": 0.35, "grad_norm": 38.89460372924805, "kl": 0.0, "learning_rate": 3.2674692488877256e-07, "logps/chosen": -178.58995056152344, "logps/rejected": -286.2423400878906, "loss": 0.4326, "rewards/chosen": -0.6443737745285034, "rewards/margins": 2.9597129821777344, "rewards/rejected": -3.6040866374969482, "step": 1324 }, { "epoch": 0.35, "grad_norm": 28.817676544189453, "kl": 0.0, "learning_rate": 3.2661606909186075e-07, "logps/chosen": -181.80223083496094, "logps/rejected": -182.43310546875, "loss": 0.3204, "rewards/chosen": -0.6919834613800049, "rewards/margins": 2.9899115562438965, "rewards/rejected": -3.6818950176239014, "step": 1325 }, { "epoch": 0.35, "grad_norm": 37.1805534362793, "kl": 0.0, "learning_rate": 3.2648521329494895e-07, "logps/chosen": -130.12796020507812, "logps/rejected": -197.9749755859375, "loss": 0.4522, "rewards/chosen": -0.9502408504486084, "rewards/margins": 0.5358531475067139, "rewards/rejected": -1.4860939979553223, "step": 1326 }, { "epoch": 0.35, "grad_norm": 42.603267669677734, "kl": 0.0, "learning_rate": 3.2635435749803715e-07, "logps/chosen": -154.728271484375, "logps/rejected": -188.37063598632812, "loss": 0.3996, "rewards/chosen": -0.20417237281799316, "rewards/margins": 1.0332392454147339, "rewards/rejected": -1.237411618232727, "step": 1327 }, { "epoch": 0.35, "grad_norm": 23.203495025634766, "kl": 0.0, "learning_rate": 3.2622350170112534e-07, "logps/chosen": -237.64964294433594, "logps/rejected": -216.24838256835938, "loss": 0.3329, "rewards/chosen": -0.4951002895832062, "rewards/margins": 2.8161814212799072, "rewards/rejected": -3.311281681060791, "step": 1328 }, { "epoch": 0.35, "grad_norm": 32.98259353637695, "kl": 0.0, "learning_rate": 3.2609264590421354e-07, "logps/chosen": -287.97393798828125, "logps/rejected": -265.22216796875, "loss": 0.2701, "rewards/chosen": 1.7656995058059692, "rewards/margins": 5.856935977935791, "rewards/rejected": -4.091236591339111, "step": 1329 }, { "epoch": 0.35, "grad_norm": 31.110828399658203, "kl": 0.0, "learning_rate": 3.2596179010730173e-07, "logps/chosen": -168.94876098632812, "logps/rejected": -231.80029296875, "loss": 0.2973, "rewards/chosen": 0.5853991508483887, "rewards/margins": 4.364181041717529, "rewards/rejected": -3.7787818908691406, "step": 1330 }, { "epoch": 0.35, "grad_norm": 28.867490768432617, "kl": 0.0, "learning_rate": 3.2583093431038993e-07, "logps/chosen": -153.498779296875, "logps/rejected": -174.12374877929688, "loss": 0.3801, "rewards/chosen": -0.5442581176757812, "rewards/margins": 1.9553544521331787, "rewards/rejected": -2.49961256980896, "step": 1331 }, { "epoch": 0.35, "grad_norm": 34.34536361694336, "kl": 0.0, "learning_rate": 3.257000785134781e-07, "logps/chosen": -166.44349670410156, "logps/rejected": -246.9544677734375, "loss": 0.2097, "rewards/chosen": 0.37404146790504456, "rewards/margins": 4.729082107543945, "rewards/rejected": -4.355040550231934, "step": 1332 }, { "epoch": 0.35, "grad_norm": 34.6766471862793, "kl": 0.0, "learning_rate": 3.255692227165663e-07, "logps/chosen": -306.0629577636719, "logps/rejected": -280.69903564453125, "loss": 0.2987, "rewards/chosen": 0.3299980163574219, "rewards/margins": 6.534651279449463, "rewards/rejected": -6.204653263092041, "step": 1333 }, { "epoch": 0.35, "grad_norm": 39.86823654174805, "kl": 0.0, "learning_rate": 3.254383669196545e-07, "logps/chosen": -244.7062530517578, "logps/rejected": -260.9649658203125, "loss": 0.3421, "rewards/chosen": 0.7138856649398804, "rewards/margins": 2.7415547370910645, "rewards/rejected": -2.0276689529418945, "step": 1334 }, { "epoch": 0.35, "grad_norm": 36.28447341918945, "kl": 0.0, "learning_rate": 3.253075111227427e-07, "logps/chosen": -216.41928100585938, "logps/rejected": -272.4765930175781, "loss": 0.282, "rewards/chosen": 1.007466435432434, "rewards/margins": 4.307610988616943, "rewards/rejected": -3.300144672393799, "step": 1335 }, { "epoch": 0.35, "grad_norm": 37.88795471191406, "kl": 0.0, "learning_rate": 3.2517665532583096e-07, "logps/chosen": -272.2811584472656, "logps/rejected": -198.16000366210938, "loss": 0.3557, "rewards/chosen": 0.30693429708480835, "rewards/margins": 2.8492815494537354, "rewards/rejected": -2.5423471927642822, "step": 1336 }, { "epoch": 0.35, "grad_norm": 37.08567810058594, "kl": 0.0, "learning_rate": 3.2504579952891916e-07, "logps/chosen": -127.075439453125, "logps/rejected": -191.78448486328125, "loss": 0.316, "rewards/chosen": -1.1991398334503174, "rewards/margins": 2.050868272781372, "rewards/rejected": -3.2500081062316895, "step": 1337 }, { "epoch": 0.35, "grad_norm": 28.48150062561035, "kl": 0.0, "learning_rate": 3.2491494373200736e-07, "logps/chosen": -139.986572265625, "logps/rejected": -309.28363037109375, "loss": 0.2604, "rewards/chosen": 0.7744281888008118, "rewards/margins": 3.9714620113372803, "rewards/rejected": -3.1970338821411133, "step": 1338 }, { "epoch": 0.35, "grad_norm": 32.136024475097656, "kl": 0.0, "learning_rate": 3.247840879350955e-07, "logps/chosen": -233.92196655273438, "logps/rejected": -303.3210754394531, "loss": 0.3133, "rewards/chosen": -1.318963885307312, "rewards/margins": 2.054591655731201, "rewards/rejected": -3.3735556602478027, "step": 1339 }, { "epoch": 0.35, "grad_norm": 39.1546630859375, "kl": 0.0, "learning_rate": 3.246532321381837e-07, "logps/chosen": -158.99038696289062, "logps/rejected": -258.7633972167969, "loss": 0.41, "rewards/chosen": -0.913709819316864, "rewards/margins": 3.426400899887085, "rewards/rejected": -4.340110778808594, "step": 1340 }, { "epoch": 0.35, "grad_norm": 31.439769744873047, "kl": 0.0, "learning_rate": 3.245223763412719e-07, "logps/chosen": -248.65176391601562, "logps/rejected": -187.52197265625, "loss": 0.1849, "rewards/chosen": -0.4633801281452179, "rewards/margins": 3.0590274333953857, "rewards/rejected": -3.5224075317382812, "step": 1341 }, { "epoch": 0.35, "grad_norm": 24.785945892333984, "kl": 0.0, "learning_rate": 3.243915205443601e-07, "logps/chosen": -163.9761199951172, "logps/rejected": -241.343994140625, "loss": 0.2388, "rewards/chosen": -0.21575519442558289, "rewards/margins": 3.2993061542510986, "rewards/rejected": -3.515061378479004, "step": 1342 }, { "epoch": 0.35, "grad_norm": 38.66981887817383, "kl": 0.0, "learning_rate": 3.242606647474483e-07, "logps/chosen": -245.6348114013672, "logps/rejected": -206.09735107421875, "loss": 0.3331, "rewards/chosen": 0.7461506128311157, "rewards/margins": 2.7707104682922363, "rewards/rejected": -2.02455997467041, "step": 1343 }, { "epoch": 0.35, "grad_norm": 31.069034576416016, "kl": 0.0, "learning_rate": 3.241298089505365e-07, "logps/chosen": -217.89781188964844, "logps/rejected": -283.12060546875, "loss": 0.3239, "rewards/chosen": 0.8098434209823608, "rewards/margins": 4.698688983917236, "rewards/rejected": -3.888845682144165, "step": 1344 }, { "epoch": 0.35, "grad_norm": 33.27344512939453, "kl": 0.0, "learning_rate": 3.239989531536247e-07, "logps/chosen": -275.2389831542969, "logps/rejected": -250.8555145263672, "loss": 0.3039, "rewards/chosen": -0.4530223309993744, "rewards/margins": 3.8131110668182373, "rewards/rejected": -4.2661333084106445, "step": 1345 }, { "epoch": 0.35, "grad_norm": 38.127281188964844, "kl": 0.0, "learning_rate": 3.2386809735671287e-07, "logps/chosen": -169.89163208007812, "logps/rejected": -284.6907653808594, "loss": 0.4194, "rewards/chosen": -0.27509641647338867, "rewards/margins": 3.6556601524353027, "rewards/rejected": -3.9307565689086914, "step": 1346 }, { "epoch": 0.35, "grad_norm": 29.15488052368164, "kl": 0.0, "learning_rate": 3.2373724155980107e-07, "logps/chosen": -197.29884338378906, "logps/rejected": -242.72018432617188, "loss": 0.3905, "rewards/chosen": 0.06425243616104126, "rewards/margins": 3.8348217010498047, "rewards/rejected": -3.770569324493408, "step": 1347 }, { "epoch": 0.35, "grad_norm": 30.490800857543945, "kl": 0.0, "learning_rate": 3.2360638576288926e-07, "logps/chosen": -191.22560119628906, "logps/rejected": -229.977783203125, "loss": 0.3334, "rewards/chosen": -0.10668891668319702, "rewards/margins": 2.346184253692627, "rewards/rejected": -2.4528732299804688, "step": 1348 }, { "epoch": 0.35, "grad_norm": 34.293060302734375, "kl": 0.0, "learning_rate": 3.234755299659775e-07, "logps/chosen": -214.0876922607422, "logps/rejected": -265.4975891113281, "loss": 0.1624, "rewards/chosen": 1.9567590951919556, "rewards/margins": 5.267825603485107, "rewards/rejected": -3.3110663890838623, "step": 1349 }, { "epoch": 0.35, "grad_norm": 39.73982238769531, "kl": 0.0, "learning_rate": 3.233446741690657e-07, "logps/chosen": -167.7155303955078, "logps/rejected": -342.3310546875, "loss": 0.1932, "rewards/chosen": 1.1075279712677002, "rewards/margins": 5.490621566772461, "rewards/rejected": -4.383093357086182, "step": 1350 }, { "epoch": 0.35, "grad_norm": 38.64570236206055, "kl": 0.0, "learning_rate": 3.232138183721539e-07, "logps/chosen": -183.162109375, "logps/rejected": -280.5931091308594, "loss": 0.2884, "rewards/chosen": 1.5205305814743042, "rewards/margins": 4.8635640144348145, "rewards/rejected": -3.3430333137512207, "step": 1351 }, { "epoch": 0.35, "grad_norm": 34.20791244506836, "kl": 0.0, "learning_rate": 3.230829625752421e-07, "logps/chosen": -206.82135009765625, "logps/rejected": -225.54063415527344, "loss": 0.3409, "rewards/chosen": 0.016800865530967712, "rewards/margins": 2.840919017791748, "rewards/rejected": -2.824118137359619, "step": 1352 }, { "epoch": 0.35, "grad_norm": 35.95807647705078, "kl": 0.0, "learning_rate": 3.229521067783303e-07, "logps/chosen": -185.8177490234375, "logps/rejected": -196.95664978027344, "loss": 0.2837, "rewards/chosen": 0.334745854139328, "rewards/margins": 1.8283900022506714, "rewards/rejected": -1.493644118309021, "step": 1353 }, { "epoch": 0.35, "grad_norm": 34.594947814941406, "kl": 0.0, "learning_rate": 3.2282125098141844e-07, "logps/chosen": -233.8248291015625, "logps/rejected": -209.21939086914062, "loss": 0.3388, "rewards/chosen": 0.3423979878425598, "rewards/margins": 2.1111137866973877, "rewards/rejected": -1.768715739250183, "step": 1354 }, { "epoch": 0.35, "grad_norm": 35.39228057861328, "kl": 0.0, "learning_rate": 3.2269039518450664e-07, "logps/chosen": -188.0096435546875, "logps/rejected": -226.69932556152344, "loss": 0.287, "rewards/chosen": 0.7518016695976257, "rewards/margins": 4.171019077301025, "rewards/rejected": -3.419217348098755, "step": 1355 }, { "epoch": 0.35, "grad_norm": 37.23146057128906, "kl": 0.0, "learning_rate": 3.2255953938759483e-07, "logps/chosen": -220.07754516601562, "logps/rejected": -296.0440979003906, "loss": 0.2496, "rewards/chosen": 0.1624390184879303, "rewards/margins": 4.445444107055664, "rewards/rejected": -4.283005237579346, "step": 1356 }, { "epoch": 0.36, "grad_norm": 37.137718200683594, "kl": 0.0, "learning_rate": 3.2242868359068303e-07, "logps/chosen": -179.314453125, "logps/rejected": -221.1763916015625, "loss": 0.2737, "rewards/chosen": 0.19471484422683716, "rewards/margins": 2.744776964187622, "rewards/rejected": -2.5500621795654297, "step": 1357 }, { "epoch": 0.36, "grad_norm": 38.724143981933594, "kl": 0.0, "learning_rate": 3.222978277937712e-07, "logps/chosen": -208.41488647460938, "logps/rejected": -224.366455078125, "loss": 0.237, "rewards/chosen": 1.1091750860214233, "rewards/margins": 5.58030366897583, "rewards/rejected": -4.471128463745117, "step": 1358 }, { "epoch": 0.36, "grad_norm": 38.86799621582031, "kl": 0.0, "learning_rate": 3.221669719968594e-07, "logps/chosen": -221.882568359375, "logps/rejected": -244.26026916503906, "loss": 0.3496, "rewards/chosen": 0.35998934507369995, "rewards/margins": 3.726022958755493, "rewards/rejected": -3.3660335540771484, "step": 1359 }, { "epoch": 0.36, "grad_norm": 28.907777786254883, "kl": 0.0, "learning_rate": 3.220361161999476e-07, "logps/chosen": -229.4911651611328, "logps/rejected": -188.29910278320312, "loss": 0.2566, "rewards/chosen": 0.28083348274230957, "rewards/margins": 2.9774253368377686, "rewards/rejected": -2.696591854095459, "step": 1360 }, { "epoch": 0.36, "grad_norm": 36.41794967651367, "kl": 0.0, "learning_rate": 3.219052604030358e-07, "logps/chosen": -156.39662170410156, "logps/rejected": -194.1817626953125, "loss": 0.3464, "rewards/chosen": 0.58488529920578, "rewards/margins": 2.58624529838562, "rewards/rejected": -2.0013599395751953, "step": 1361 }, { "epoch": 0.36, "grad_norm": 30.247217178344727, "kl": 0.0, "learning_rate": 3.2177440460612406e-07, "logps/chosen": -207.5975799560547, "logps/rejected": -271.22698974609375, "loss": 0.2429, "rewards/chosen": 2.0984890460968018, "rewards/margins": 6.212061882019043, "rewards/rejected": -4.11357307434082, "step": 1362 }, { "epoch": 0.36, "grad_norm": 36.8974723815918, "kl": 0.0, "learning_rate": 3.2164354880921226e-07, "logps/chosen": -251.12509155273438, "logps/rejected": -222.53085327148438, "loss": 0.3591, "rewards/chosen": -0.7981698513031006, "rewards/margins": 2.2022039890289307, "rewards/rejected": -3.0003738403320312, "step": 1363 }, { "epoch": 0.36, "grad_norm": 31.882705688476562, "kl": 0.0, "learning_rate": 3.2151269301230045e-07, "logps/chosen": -241.10394287109375, "logps/rejected": -229.85205078125, "loss": 0.256, "rewards/chosen": -1.05807626247406, "rewards/margins": 2.32078218460083, "rewards/rejected": -3.3788583278656006, "step": 1364 }, { "epoch": 0.36, "grad_norm": 32.5128288269043, "kl": 0.0, "learning_rate": 3.2138183721538865e-07, "logps/chosen": -293.7722473144531, "logps/rejected": -266.4582214355469, "loss": 0.3502, "rewards/chosen": -0.9223523139953613, "rewards/margins": 2.8367271423339844, "rewards/rejected": -3.7590794563293457, "step": 1365 }, { "epoch": 0.36, "grad_norm": 31.339826583862305, "kl": 0.0, "learning_rate": 3.2125098141847685e-07, "logps/chosen": -186.4215545654297, "logps/rejected": -194.24964904785156, "loss": 0.309, "rewards/chosen": -0.3455151915550232, "rewards/margins": 2.1275269985198975, "rewards/rejected": -2.4730422496795654, "step": 1366 }, { "epoch": 0.36, "grad_norm": 27.793371200561523, "kl": 0.0, "learning_rate": 3.2112012562156504e-07, "logps/chosen": -207.349609375, "logps/rejected": -221.176025390625, "loss": 0.3284, "rewards/chosen": -1.0674750804901123, "rewards/margins": 1.9950244426727295, "rewards/rejected": -3.062499523162842, "step": 1367 }, { "epoch": 0.36, "grad_norm": 25.472999572753906, "kl": 0.0, "learning_rate": 3.2098926982465324e-07, "logps/chosen": -225.4566192626953, "logps/rejected": -209.94873046875, "loss": 0.2498, "rewards/chosen": -0.36175042390823364, "rewards/margins": 2.9907066822052, "rewards/rejected": -3.352457046508789, "step": 1368 }, { "epoch": 0.36, "grad_norm": 31.11713981628418, "kl": 0.0, "learning_rate": 3.2085841402774143e-07, "logps/chosen": -164.0789031982422, "logps/rejected": -194.20265197753906, "loss": 0.3384, "rewards/chosen": 0.4543860852718353, "rewards/margins": 2.6434974670410156, "rewards/rejected": -2.1891114711761475, "step": 1369 }, { "epoch": 0.36, "grad_norm": 34.00928497314453, "kl": 0.0, "learning_rate": 3.207275582308296e-07, "logps/chosen": -212.17886352539062, "logps/rejected": -181.32400512695312, "loss": 0.2691, "rewards/chosen": 0.9818140864372253, "rewards/margins": 3.594285011291504, "rewards/rejected": -2.612470865249634, "step": 1370 }, { "epoch": 0.36, "grad_norm": 29.166141510009766, "kl": 0.0, "learning_rate": 3.2059670243391777e-07, "logps/chosen": -249.57412719726562, "logps/rejected": -267.99700927734375, "loss": 0.3086, "rewards/chosen": 0.5101392269134521, "rewards/margins": 4.556434631347656, "rewards/rejected": -4.046295642852783, "step": 1371 }, { "epoch": 0.36, "grad_norm": 31.815664291381836, "kl": 0.0, "learning_rate": 3.2046584663700597e-07, "logps/chosen": -231.37120056152344, "logps/rejected": -383.26568603515625, "loss": 0.2616, "rewards/chosen": 0.07025027275085449, "rewards/margins": 8.557687759399414, "rewards/rejected": -8.48743724822998, "step": 1372 }, { "epoch": 0.36, "grad_norm": 36.147239685058594, "kl": 0.0, "learning_rate": 3.2033499084009417e-07, "logps/chosen": -177.21237182617188, "logps/rejected": -266.6687316894531, "loss": 0.3493, "rewards/chosen": -0.384672611951828, "rewards/margins": 2.363210439682007, "rewards/rejected": -2.7478830814361572, "step": 1373 }, { "epoch": 0.36, "grad_norm": 37.070411682128906, "kl": 0.0, "learning_rate": 3.202041350431824e-07, "logps/chosen": -232.02838134765625, "logps/rejected": -181.37887573242188, "loss": 0.3831, "rewards/chosen": -0.11297231912612915, "rewards/margins": 2.5317389965057373, "rewards/rejected": -2.6447112560272217, "step": 1374 }, { "epoch": 0.36, "grad_norm": 42.010162353515625, "kl": 0.0, "learning_rate": 3.200732792462706e-07, "logps/chosen": -185.11367797851562, "logps/rejected": -175.94879150390625, "loss": 0.3412, "rewards/chosen": 0.004318729043006897, "rewards/margins": 2.7210121154785156, "rewards/rejected": -2.71669340133667, "step": 1375 }, { "epoch": 0.36, "grad_norm": 32.87958908081055, "kl": 0.0, "learning_rate": 3.199424234493588e-07, "logps/chosen": -230.2548065185547, "logps/rejected": -321.4191589355469, "loss": 0.3384, "rewards/chosen": -0.9217988848686218, "rewards/margins": 1.7525115013122559, "rewards/rejected": -2.6743104457855225, "step": 1376 }, { "epoch": 0.36, "grad_norm": 36.438838958740234, "kl": 0.0, "learning_rate": 3.19811567652447e-07, "logps/chosen": -219.46987915039062, "logps/rejected": -230.5147247314453, "loss": 0.2815, "rewards/chosen": 0.13436929881572723, "rewards/margins": 2.3530349731445312, "rewards/rejected": -2.218665599822998, "step": 1377 }, { "epoch": 0.36, "grad_norm": 31.880538940429688, "kl": 0.0, "learning_rate": 3.196807118555352e-07, "logps/chosen": -138.39146423339844, "logps/rejected": -214.18983459472656, "loss": 0.2752, "rewards/chosen": 1.1650010347366333, "rewards/margins": 3.4112720489501953, "rewards/rejected": -2.2462708950042725, "step": 1378 }, { "epoch": 0.36, "grad_norm": 24.90720558166504, "kl": 0.0, "learning_rate": 3.195498560586234e-07, "logps/chosen": -242.847412109375, "logps/rejected": -178.04931640625, "loss": 0.3436, "rewards/chosen": -0.43071645498275757, "rewards/margins": 2.444901943206787, "rewards/rejected": -2.8756184577941895, "step": 1379 }, { "epoch": 0.36, "grad_norm": 42.002777099609375, "kl": 0.0, "learning_rate": 3.194190002617116e-07, "logps/chosen": -255.03749084472656, "logps/rejected": -203.60467529296875, "loss": 0.3988, "rewards/chosen": -0.029839009046554565, "rewards/margins": 1.9282221794128418, "rewards/rejected": -1.9580612182617188, "step": 1380 }, { "epoch": 0.36, "grad_norm": 34.07427215576172, "kl": 0.0, "learning_rate": 3.192881444647998e-07, "logps/chosen": -143.0186767578125, "logps/rejected": -223.99244689941406, "loss": 0.3164, "rewards/chosen": 0.14076310396194458, "rewards/margins": 3.031914472579956, "rewards/rejected": -2.8911514282226562, "step": 1381 }, { "epoch": 0.36, "grad_norm": 29.833765029907227, "kl": 0.0, "learning_rate": 3.19157288667888e-07, "logps/chosen": -210.23605346679688, "logps/rejected": -235.79354858398438, "loss": 0.3206, "rewards/chosen": -0.04645270109176636, "rewards/margins": 4.771703720092773, "rewards/rejected": -4.8181562423706055, "step": 1382 }, { "epoch": 0.36, "grad_norm": 35.3935546875, "kl": 0.0, "learning_rate": 3.190264328709762e-07, "logps/chosen": -313.9126892089844, "logps/rejected": -148.69590759277344, "loss": 0.3035, "rewards/chosen": -0.025645911693572998, "rewards/margins": 2.0939083099365234, "rewards/rejected": -2.119554281234741, "step": 1383 }, { "epoch": 0.36, "grad_norm": 33.280887603759766, "kl": 0.0, "learning_rate": 3.188955770740644e-07, "logps/chosen": -209.35018920898438, "logps/rejected": -344.81787109375, "loss": 0.2728, "rewards/chosen": 0.5633559226989746, "rewards/margins": 11.289161682128906, "rewards/rejected": -10.72580623626709, "step": 1384 }, { "epoch": 0.36, "grad_norm": 29.389549255371094, "kl": 0.0, "learning_rate": 3.187647212771525e-07, "logps/chosen": -143.76699829101562, "logps/rejected": -267.39080810546875, "loss": 0.3333, "rewards/chosen": -0.05079573392868042, "rewards/margins": 3.496812582015991, "rewards/rejected": -3.5476083755493164, "step": 1385 }, { "epoch": 0.36, "grad_norm": 27.688262939453125, "kl": 0.0, "learning_rate": 3.186338654802407e-07, "logps/chosen": -173.18344116210938, "logps/rejected": -175.218017578125, "loss": 0.2628, "rewards/chosen": 0.21559491753578186, "rewards/margins": 1.7542529106140137, "rewards/rejected": -1.5386580228805542, "step": 1386 }, { "epoch": 0.36, "grad_norm": 26.76343536376953, "kl": 0.0, "learning_rate": 3.1850300968332896e-07, "logps/chosen": -156.0584259033203, "logps/rejected": -314.71502685546875, "loss": 0.1997, "rewards/chosen": 1.3652375936508179, "rewards/margins": 7.026503562927246, "rewards/rejected": -5.661265850067139, "step": 1387 }, { "epoch": 0.36, "grad_norm": 33.4283332824707, "kl": 0.0, "learning_rate": 3.1837215388641716e-07, "logps/chosen": -196.38980102539062, "logps/rejected": -226.2455291748047, "loss": 0.153, "rewards/chosen": 3.0454821586608887, "rewards/margins": 6.030291557312012, "rewards/rejected": -2.984809160232544, "step": 1388 }, { "epoch": 0.36, "grad_norm": 38.99360656738281, "kl": 0.0, "learning_rate": 3.1824129808950536e-07, "logps/chosen": -282.0292663574219, "logps/rejected": -421.4825439453125, "loss": 0.2815, "rewards/chosen": 0.8324300646781921, "rewards/margins": 3.952428102493286, "rewards/rejected": -3.119997978210449, "step": 1389 }, { "epoch": 0.36, "grad_norm": 43.50868606567383, "kl": 0.0, "learning_rate": 3.1811044229259355e-07, "logps/chosen": -124.22837829589844, "logps/rejected": -212.58775329589844, "loss": 0.3436, "rewards/chosen": 0.3585464656352997, "rewards/margins": 2.7998228073120117, "rewards/rejected": -2.4412763118743896, "step": 1390 }, { "epoch": 0.36, "grad_norm": 44.76431655883789, "kl": 0.0, "learning_rate": 3.1797958649568175e-07, "logps/chosen": -224.62722778320312, "logps/rejected": -197.28936767578125, "loss": 0.4756, "rewards/chosen": -1.3198111057281494, "rewards/margins": -0.12586522102355957, "rewards/rejected": -1.1939458847045898, "step": 1391 }, { "epoch": 0.36, "grad_norm": 33.97430419921875, "kl": 0.0, "learning_rate": 3.1784873069876994e-07, "logps/chosen": -203.94140625, "logps/rejected": -204.45584106445312, "loss": 0.3566, "rewards/chosen": 0.5176467895507812, "rewards/margins": 1.5917928218841553, "rewards/rejected": -1.074146032333374, "step": 1392 }, { "epoch": 0.36, "grad_norm": 34.34186553955078, "kl": 0.0, "learning_rate": 3.1771787490185814e-07, "logps/chosen": -232.93101501464844, "logps/rejected": -186.81085205078125, "loss": 0.366, "rewards/chosen": 0.39034122228622437, "rewards/margins": 1.8955557346343994, "rewards/rejected": -1.5052144527435303, "step": 1393 }, { "epoch": 0.36, "grad_norm": 34.1844367980957, "kl": 0.0, "learning_rate": 3.1758701910494634e-07, "logps/chosen": -183.45620727539062, "logps/rejected": -236.90887451171875, "loss": 0.2629, "rewards/chosen": 0.44305187463760376, "rewards/margins": 4.251132488250732, "rewards/rejected": -3.8080806732177734, "step": 1394 }, { "epoch": 0.37, "grad_norm": 43.349403381347656, "kl": 0.0, "learning_rate": 3.1745616330803453e-07, "logps/chosen": -195.5172882080078, "logps/rejected": -221.8050537109375, "loss": 0.2069, "rewards/chosen": 1.6076291799545288, "rewards/margins": 4.366086006164551, "rewards/rejected": -2.7584567070007324, "step": 1395 }, { "epoch": 0.37, "grad_norm": 32.73597717285156, "kl": 0.0, "learning_rate": 3.1732530751112273e-07, "logps/chosen": -155.1674041748047, "logps/rejected": -318.8786315917969, "loss": 0.3455, "rewards/chosen": -0.8497037887573242, "rewards/margins": 2.198971748352051, "rewards/rejected": -3.048675537109375, "step": 1396 }, { "epoch": 0.37, "grad_norm": 33.25680160522461, "kl": 0.0, "learning_rate": 3.171944517142109e-07, "logps/chosen": -204.45106506347656, "logps/rejected": -172.80410766601562, "loss": 0.2137, "rewards/chosen": 1.4785555601119995, "rewards/margins": 3.709897041320801, "rewards/rejected": -2.231341600418091, "step": 1397 }, { "epoch": 0.37, "grad_norm": 28.279253005981445, "kl": 0.0, "learning_rate": 3.170635959172991e-07, "logps/chosen": -173.07913208007812, "logps/rejected": -171.8015594482422, "loss": 0.3284, "rewards/chosen": -0.3849499225616455, "rewards/margins": 1.7102558612823486, "rewards/rejected": -2.095205783843994, "step": 1398 }, { "epoch": 0.37, "grad_norm": 29.989870071411133, "kl": 0.0, "learning_rate": 3.169327401203873e-07, "logps/chosen": -340.0752868652344, "logps/rejected": -181.56031799316406, "loss": 0.2352, "rewards/chosen": 3.104139804840088, "rewards/margins": 6.055088996887207, "rewards/rejected": -2.95094895362854, "step": 1399 }, { "epoch": 0.37, "grad_norm": 36.66115188598633, "kl": 0.0, "learning_rate": 3.1680188432347557e-07, "logps/chosen": -242.0625, "logps/rejected": -340.12542724609375, "loss": 0.3103, "rewards/chosen": 0.20509715378284454, "rewards/margins": 3.771449089050293, "rewards/rejected": -3.566351890563965, "step": 1400 }, { "epoch": 0.37, "grad_norm": 30.042667388916016, "kl": 0.0, "learning_rate": 3.166710285265637e-07, "logps/chosen": -234.99282836914062, "logps/rejected": -247.1533203125, "loss": 0.2896, "rewards/chosen": -0.4193776547908783, "rewards/margins": 4.4713287353515625, "rewards/rejected": -4.890706539154053, "step": 1401 }, { "epoch": 0.37, "grad_norm": 40.00863265991211, "kl": 0.0, "learning_rate": 3.165401727296519e-07, "logps/chosen": -206.06680297851562, "logps/rejected": -228.10067749023438, "loss": 0.4289, "rewards/chosen": -0.21271708607673645, "rewards/margins": 2.1718521118164062, "rewards/rejected": -2.3845691680908203, "step": 1402 }, { "epoch": 0.37, "grad_norm": 31.949710845947266, "kl": 0.0, "learning_rate": 3.164093169327401e-07, "logps/chosen": -302.5041198730469, "logps/rejected": -295.21661376953125, "loss": 0.2195, "rewards/chosen": 0.8447118401527405, "rewards/margins": 4.163388252258301, "rewards/rejected": -3.318676471710205, "step": 1403 }, { "epoch": 0.37, "grad_norm": 30.53171157836914, "kl": 0.0, "learning_rate": 3.162784611358283e-07, "logps/chosen": -221.29522705078125, "logps/rejected": -240.13375854492188, "loss": 0.3361, "rewards/chosen": 0.3290228247642517, "rewards/margins": 3.6443843841552734, "rewards/rejected": -3.315361499786377, "step": 1404 }, { "epoch": 0.37, "grad_norm": 29.357757568359375, "kl": 0.0, "learning_rate": 3.161476053389165e-07, "logps/chosen": -222.23890686035156, "logps/rejected": -256.6720275878906, "loss": 0.3401, "rewards/chosen": -0.36933091282844543, "rewards/margins": 2.098792314529419, "rewards/rejected": -2.468123197555542, "step": 1405 }, { "epoch": 0.37, "grad_norm": 38.137054443359375, "kl": 0.0, "learning_rate": 3.160167495420047e-07, "logps/chosen": -224.1837615966797, "logps/rejected": -277.242919921875, "loss": 0.218, "rewards/chosen": 0.6834139227867126, "rewards/margins": 3.4380838871002197, "rewards/rejected": -2.7546699047088623, "step": 1406 }, { "epoch": 0.37, "grad_norm": 32.083553314208984, "kl": 0.0, "learning_rate": 3.158858937450929e-07, "logps/chosen": -161.6896209716797, "logps/rejected": -279.6732177734375, "loss": 0.2805, "rewards/chosen": 0.5990890860557556, "rewards/margins": 5.596165180206299, "rewards/rejected": -4.997076034545898, "step": 1407 }, { "epoch": 0.37, "grad_norm": 30.056991577148438, "kl": 0.0, "learning_rate": 3.157550379481811e-07, "logps/chosen": -184.24851989746094, "logps/rejected": -239.89739990234375, "loss": 0.2644, "rewards/chosen": 0.6428428888320923, "rewards/margins": 3.2653703689575195, "rewards/rejected": -2.622527599334717, "step": 1408 }, { "epoch": 0.37, "grad_norm": 38.32441711425781, "kl": 0.0, "learning_rate": 3.156241821512693e-07, "logps/chosen": -213.1717071533203, "logps/rejected": -268.25689697265625, "loss": 0.1622, "rewards/chosen": 2.8008837699890137, "rewards/margins": 6.9828104972839355, "rewards/rejected": -4.181926727294922, "step": 1409 }, { "epoch": 0.37, "grad_norm": 32.889060974121094, "kl": 0.0, "learning_rate": 3.154933263543575e-07, "logps/chosen": -222.70004272460938, "logps/rejected": -199.03073120117188, "loss": 0.4285, "rewards/chosen": -0.5681661367416382, "rewards/margins": 1.663326382637024, "rewards/rejected": -2.231492519378662, "step": 1410 }, { "epoch": 0.37, "grad_norm": 32.03827667236328, "kl": 0.0, "learning_rate": 3.1536247055744567e-07, "logps/chosen": -199.5806427001953, "logps/rejected": -234.4297637939453, "loss": 0.2155, "rewards/chosen": 1.4614603519439697, "rewards/margins": 4.551479339599609, "rewards/rejected": -3.0900187492370605, "step": 1411 }, { "epoch": 0.37, "grad_norm": 28.646682739257812, "kl": 0.0, "learning_rate": 3.152316147605339e-07, "logps/chosen": -185.1912384033203, "logps/rejected": -371.14825439453125, "loss": 0.3057, "rewards/chosen": -0.053407806903123856, "rewards/margins": 4.185282230377197, "rewards/rejected": -4.23868989944458, "step": 1412 }, { "epoch": 0.37, "grad_norm": 36.860076904296875, "kl": 0.0, "learning_rate": 3.151007589636221e-07, "logps/chosen": -191.5133514404297, "logps/rejected": -253.78466796875, "loss": 0.2214, "rewards/chosen": 2.807755708694458, "rewards/margins": 5.15962028503418, "rewards/rejected": -2.3518643379211426, "step": 1413 }, { "epoch": 0.37, "grad_norm": 29.74126625061035, "kl": 0.0, "learning_rate": 3.149699031667103e-07, "logps/chosen": -219.7583770751953, "logps/rejected": -290.28253173828125, "loss": 0.2206, "rewards/chosen": 0.3516443073749542, "rewards/margins": 4.822754859924316, "rewards/rejected": -4.4711103439331055, "step": 1414 }, { "epoch": 0.37, "grad_norm": 33.049774169921875, "kl": 0.0, "learning_rate": 3.148390473697985e-07, "logps/chosen": -243.57969665527344, "logps/rejected": -400.4478454589844, "loss": 0.2593, "rewards/chosen": 1.0341131687164307, "rewards/margins": 7.01644229888916, "rewards/rejected": -5.982329368591309, "step": 1415 }, { "epoch": 0.37, "grad_norm": 27.80577278137207, "kl": 0.0, "learning_rate": 3.1470819157288665e-07, "logps/chosen": -198.178955078125, "logps/rejected": -237.63461303710938, "loss": 0.2456, "rewards/chosen": 0.9747270941734314, "rewards/margins": 4.922013282775879, "rewards/rejected": -3.9472861289978027, "step": 1416 }, { "epoch": 0.37, "grad_norm": 37.619537353515625, "kl": 0.0, "learning_rate": 3.1457733577597485e-07, "logps/chosen": -212.923583984375, "logps/rejected": -216.25405883789062, "loss": 0.3014, "rewards/chosen": -0.093365877866745, "rewards/margins": 2.6899845600128174, "rewards/rejected": -2.7833504676818848, "step": 1417 }, { "epoch": 0.37, "grad_norm": 32.109161376953125, "kl": 0.0, "learning_rate": 3.1444647997906304e-07, "logps/chosen": -199.86090087890625, "logps/rejected": -230.55740356445312, "loss": 0.2963, "rewards/chosen": 0.27489233016967773, "rewards/margins": 2.1686692237854004, "rewards/rejected": -1.8937770128250122, "step": 1418 }, { "epoch": 0.37, "grad_norm": 35.83468246459961, "kl": 0.0, "learning_rate": 3.1431562418215124e-07, "logps/chosen": -225.78807067871094, "logps/rejected": -251.05801391601562, "loss": 0.3557, "rewards/chosen": 0.7054110765457153, "rewards/margins": 3.20662260055542, "rewards/rejected": -2.501211643218994, "step": 1419 }, { "epoch": 0.37, "grad_norm": 40.44602966308594, "kl": 0.0, "learning_rate": 3.1418476838523943e-07, "logps/chosen": -268.45751953125, "logps/rejected": -209.78469848632812, "loss": 0.3089, "rewards/chosen": 1.3857576847076416, "rewards/margins": 3.0642151832580566, "rewards/rejected": -1.678457498550415, "step": 1420 }, { "epoch": 0.37, "grad_norm": 33.158668518066406, "kl": 0.0, "learning_rate": 3.1405391258832763e-07, "logps/chosen": -183.05435180664062, "logps/rejected": -246.34567260742188, "loss": 0.3375, "rewards/chosen": 0.30931970477104187, "rewards/margins": 3.5352070331573486, "rewards/rejected": -3.2258872985839844, "step": 1421 }, { "epoch": 0.37, "grad_norm": 37.4815559387207, "kl": 0.0, "learning_rate": 3.1392305679141583e-07, "logps/chosen": -197.08316040039062, "logps/rejected": -252.58250427246094, "loss": 0.2512, "rewards/chosen": 1.102900743484497, "rewards/margins": 3.982532501220703, "rewards/rejected": -2.879631757736206, "step": 1422 }, { "epoch": 0.37, "grad_norm": 33.83235168457031, "kl": 0.0, "learning_rate": 3.13792200994504e-07, "logps/chosen": -212.65773010253906, "logps/rejected": -306.92974853515625, "loss": 0.2468, "rewards/chosen": 0.312684565782547, "rewards/margins": 3.8462066650390625, "rewards/rejected": -3.533522129058838, "step": 1423 }, { "epoch": 0.37, "grad_norm": 32.37043380737305, "kl": 0.0, "learning_rate": 3.136613451975922e-07, "logps/chosen": -202.6951904296875, "logps/rejected": -128.4596405029297, "loss": 0.2921, "rewards/chosen": -0.11523666977882385, "rewards/margins": 1.6435047388076782, "rewards/rejected": -1.7587413787841797, "step": 1424 }, { "epoch": 0.37, "grad_norm": 28.053985595703125, "kl": 0.0, "learning_rate": 3.1353048940068047e-07, "logps/chosen": -216.8121795654297, "logps/rejected": -221.2266387939453, "loss": 0.1942, "rewards/chosen": 0.6265994310379028, "rewards/margins": 4.50843620300293, "rewards/rejected": -3.8818366527557373, "step": 1425 }, { "epoch": 0.37, "grad_norm": 39.29544448852539, "kl": 0.0, "learning_rate": 3.1339963360376866e-07, "logps/chosen": -183.68960571289062, "logps/rejected": -299.44647216796875, "loss": 0.2205, "rewards/chosen": 0.5857275128364563, "rewards/margins": 3.8476221561431885, "rewards/rejected": -3.261894702911377, "step": 1426 }, { "epoch": 0.37, "grad_norm": 41.37693786621094, "kl": 0.0, "learning_rate": 3.1326877780685686e-07, "logps/chosen": -191.17079162597656, "logps/rejected": -236.34121704101562, "loss": 0.2157, "rewards/chosen": 0.22563637793064117, "rewards/margins": 3.2330408096313477, "rewards/rejected": -3.007404327392578, "step": 1427 }, { "epoch": 0.37, "grad_norm": 31.09884262084961, "kl": 0.0, "learning_rate": 3.1313792200994506e-07, "logps/chosen": -195.26025390625, "logps/rejected": -295.53948974609375, "loss": 0.3539, "rewards/chosen": -0.5408454537391663, "rewards/margins": 2.64743971824646, "rewards/rejected": -3.1882851123809814, "step": 1428 }, { "epoch": 0.37, "grad_norm": 31.855932235717773, "kl": 0.0, "learning_rate": 3.1300706621303325e-07, "logps/chosen": -185.37232971191406, "logps/rejected": -397.4236145019531, "loss": 0.2468, "rewards/chosen": 0.33413439989089966, "rewards/margins": 4.738489151000977, "rewards/rejected": -4.404354572296143, "step": 1429 }, { "epoch": 0.37, "grad_norm": 26.01105499267578, "kl": 0.0, "learning_rate": 3.1287621041612145e-07, "logps/chosen": -191.81692504882812, "logps/rejected": -241.3495635986328, "loss": 0.1754, "rewards/chosen": 0.1391974687576294, "rewards/margins": 3.8511571884155273, "rewards/rejected": -3.7119598388671875, "step": 1430 }, { "epoch": 0.37, "grad_norm": 33.91474151611328, "kl": 0.0, "learning_rate": 3.127453546192096e-07, "logps/chosen": -314.10211181640625, "logps/rejected": -266.388671875, "loss": 0.2915, "rewards/chosen": 0.3027981221675873, "rewards/margins": 4.647043228149414, "rewards/rejected": -4.344244956970215, "step": 1431 }, { "epoch": 0.37, "grad_norm": 29.76590347290039, "kl": 0.0, "learning_rate": 3.126144988222978e-07, "logps/chosen": -160.12327575683594, "logps/rejected": -245.9610137939453, "loss": 0.3936, "rewards/chosen": -0.5193794965744019, "rewards/margins": 2.19705867767334, "rewards/rejected": -2.716438055038452, "step": 1432 }, { "epoch": 0.38, "grad_norm": 32.13069534301758, "kl": 0.0, "learning_rate": 3.12483643025386e-07, "logps/chosen": -240.64173889160156, "logps/rejected": -193.10438537597656, "loss": 0.2948, "rewards/chosen": 0.27804023027420044, "rewards/margins": 3.268437385559082, "rewards/rejected": -2.9903972148895264, "step": 1433 }, { "epoch": 0.38, "grad_norm": 27.502201080322266, "kl": 0.0, "learning_rate": 3.123527872284742e-07, "logps/chosen": -235.20938110351562, "logps/rejected": -323.3491516113281, "loss": 0.2264, "rewards/chosen": 1.5059895515441895, "rewards/margins": 5.093905448913574, "rewards/rejected": -3.587916135787964, "step": 1434 }, { "epoch": 0.38, "grad_norm": 35.56685256958008, "kl": 0.0, "learning_rate": 3.122219314315624e-07, "logps/chosen": -182.975830078125, "logps/rejected": -133.8419189453125, "loss": 0.2527, "rewards/chosen": 0.5755879282951355, "rewards/margins": 3.9616384506225586, "rewards/rejected": -3.3860504627227783, "step": 1435 }, { "epoch": 0.38, "grad_norm": 29.447450637817383, "kl": 0.0, "learning_rate": 3.1209107563465057e-07, "logps/chosen": -191.61111450195312, "logps/rejected": -312.0847473144531, "loss": 0.2663, "rewards/chosen": -0.32649320363998413, "rewards/margins": 4.456490993499756, "rewards/rejected": -4.782984256744385, "step": 1436 }, { "epoch": 0.38, "grad_norm": 39.114532470703125, "kl": 0.0, "learning_rate": 3.1196021983773877e-07, "logps/chosen": -248.420654296875, "logps/rejected": -363.3230285644531, "loss": 0.2007, "rewards/chosen": 1.6235179901123047, "rewards/margins": 5.3430280685424805, "rewards/rejected": -3.719510078430176, "step": 1437 }, { "epoch": 0.38, "grad_norm": 26.85320281982422, "kl": 0.0, "learning_rate": 3.11829364040827e-07, "logps/chosen": -168.63673400878906, "logps/rejected": -158.07162475585938, "loss": 0.3226, "rewards/chosen": 0.7311351299285889, "rewards/margins": 3.2565670013427734, "rewards/rejected": -2.5254318714141846, "step": 1438 }, { "epoch": 0.38, "grad_norm": 35.99043273925781, "kl": 0.0, "learning_rate": 3.116985082439152e-07, "logps/chosen": -275.37347412109375, "logps/rejected": -253.6946563720703, "loss": 0.2417, "rewards/chosen": 0.5541010499000549, "rewards/margins": 4.043929576873779, "rewards/rejected": -3.48982834815979, "step": 1439 }, { "epoch": 0.38, "grad_norm": 39.426124572753906, "kl": 0.0, "learning_rate": 3.115676524470034e-07, "logps/chosen": -251.13905334472656, "logps/rejected": -192.73565673828125, "loss": 0.3679, "rewards/chosen": 0.6304320096969604, "rewards/margins": 2.555609941482544, "rewards/rejected": -1.9251779317855835, "step": 1440 }, { "epoch": 0.38, "grad_norm": 27.877065658569336, "kl": 0.0, "learning_rate": 3.114367966500916e-07, "logps/chosen": -208.22756958007812, "logps/rejected": -168.75772094726562, "loss": 0.319, "rewards/chosen": -0.7844001650810242, "rewards/margins": 2.3113536834716797, "rewards/rejected": -3.0957539081573486, "step": 1441 }, { "epoch": 0.38, "grad_norm": 42.60310745239258, "kl": 0.0, "learning_rate": 3.113059408531798e-07, "logps/chosen": -238.57073974609375, "logps/rejected": -259.5094909667969, "loss": 0.4606, "rewards/chosen": -0.1071540042757988, "rewards/margins": 1.33914053440094, "rewards/rejected": -1.4462945461273193, "step": 1442 }, { "epoch": 0.38, "grad_norm": 31.70445442199707, "kl": 0.0, "learning_rate": 3.11175085056268e-07, "logps/chosen": -188.24851989746094, "logps/rejected": -266.7518310546875, "loss": 0.3175, "rewards/chosen": 0.5412964820861816, "rewards/margins": 3.8777060508728027, "rewards/rejected": -3.336409568786621, "step": 1443 }, { "epoch": 0.38, "grad_norm": 42.61405944824219, "kl": 0.0, "learning_rate": 3.110442292593562e-07, "logps/chosen": -159.64474487304688, "logps/rejected": -288.9511413574219, "loss": 0.2807, "rewards/chosen": 0.07189539819955826, "rewards/margins": 4.065637588500977, "rewards/rejected": -3.993741989135742, "step": 1444 }, { "epoch": 0.38, "grad_norm": 33.7481689453125, "kl": 0.0, "learning_rate": 3.109133734624444e-07, "logps/chosen": -176.5513916015625, "logps/rejected": -195.16612243652344, "loss": 0.3098, "rewards/chosen": -0.15705662965774536, "rewards/margins": 2.0167901515960693, "rewards/rejected": -2.17384672164917, "step": 1445 }, { "epoch": 0.38, "grad_norm": 31.605884552001953, "kl": 0.0, "learning_rate": 3.107825176655326e-07, "logps/chosen": -246.95982360839844, "logps/rejected": -293.61956787109375, "loss": 0.2951, "rewards/chosen": 0.4098077118396759, "rewards/margins": 4.766162395477295, "rewards/rejected": -4.356354713439941, "step": 1446 }, { "epoch": 0.38, "grad_norm": 37.16173553466797, "kl": 0.0, "learning_rate": 3.1065166186862073e-07, "logps/chosen": -241.196044921875, "logps/rejected": -211.0733642578125, "loss": 0.2888, "rewards/chosen": -0.24421033263206482, "rewards/margins": 4.125661373138428, "rewards/rejected": -4.369871616363525, "step": 1447 }, { "epoch": 0.38, "grad_norm": 39.998992919921875, "kl": 0.0, "learning_rate": 3.105208060717089e-07, "logps/chosen": -293.6846618652344, "logps/rejected": -206.80398559570312, "loss": 0.2356, "rewards/chosen": -0.6223787665367126, "rewards/margins": 2.1629295349121094, "rewards/rejected": -2.785308361053467, "step": 1448 }, { "epoch": 0.38, "grad_norm": 52.20500183105469, "kl": 0.0, "learning_rate": 3.103899502747971e-07, "logps/chosen": -192.3514404296875, "logps/rejected": -288.9823913574219, "loss": 0.2946, "rewards/chosen": -0.11612102389335632, "rewards/margins": 3.5733249187469482, "rewards/rejected": -3.689445972442627, "step": 1449 }, { "epoch": 0.38, "grad_norm": 31.664487838745117, "kl": 0.0, "learning_rate": 3.102590944778853e-07, "logps/chosen": -199.2932891845703, "logps/rejected": -276.72418212890625, "loss": 0.2768, "rewards/chosen": 0.11254112422466278, "rewards/margins": 3.0545997619628906, "rewards/rejected": -2.942058563232422, "step": 1450 }, { "epoch": 0.38, "grad_norm": 38.79372787475586, "kl": 0.0, "learning_rate": 3.1012823868097357e-07, "logps/chosen": -289.4332275390625, "logps/rejected": -180.63380432128906, "loss": 0.4345, "rewards/chosen": -0.293634831905365, "rewards/margins": 0.8389768004417419, "rewards/rejected": -1.132611632347107, "step": 1451 }, { "epoch": 0.38, "grad_norm": 34.717159271240234, "kl": 0.0, "learning_rate": 3.0999738288406176e-07, "logps/chosen": -122.23812103271484, "logps/rejected": -292.23309326171875, "loss": 0.2905, "rewards/chosen": -0.4397892653942108, "rewards/margins": 2.1818888187408447, "rewards/rejected": -2.621678113937378, "step": 1452 }, { "epoch": 0.38, "grad_norm": 26.660770416259766, "kl": 0.0, "learning_rate": 3.0986652708714996e-07, "logps/chosen": -157.556640625, "logps/rejected": -295.0622863769531, "loss": 0.2446, "rewards/chosen": 1.0527524948120117, "rewards/margins": 3.977877616882324, "rewards/rejected": -2.9251251220703125, "step": 1453 }, { "epoch": 0.38, "grad_norm": 29.619903564453125, "kl": 0.0, "learning_rate": 3.0973567129023815e-07, "logps/chosen": -192.88714599609375, "logps/rejected": -202.2433624267578, "loss": 0.2015, "rewards/chosen": 0.7536472678184509, "rewards/margins": 3.491544246673584, "rewards/rejected": -2.7378969192504883, "step": 1454 }, { "epoch": 0.38, "grad_norm": 34.550262451171875, "kl": 0.0, "learning_rate": 3.0960481549332635e-07, "logps/chosen": -225.03179931640625, "logps/rejected": -210.81072998046875, "loss": 0.3427, "rewards/chosen": 2.0873336791992188, "rewards/margins": 4.194190979003906, "rewards/rejected": -2.1068575382232666, "step": 1455 }, { "epoch": 0.38, "grad_norm": 39.06039047241211, "kl": 0.0, "learning_rate": 3.0947395969641455e-07, "logps/chosen": -235.35470581054688, "logps/rejected": -283.54583740234375, "loss": 0.2915, "rewards/chosen": 0.08544189482927322, "rewards/margins": 3.5063464641571045, "rewards/rejected": -3.4209046363830566, "step": 1456 }, { "epoch": 0.38, "grad_norm": 37.075130462646484, "kl": 0.0, "learning_rate": 3.0934310389950274e-07, "logps/chosen": -140.18020629882812, "logps/rejected": -242.32400512695312, "loss": 0.3857, "rewards/chosen": -0.3754658102989197, "rewards/margins": 1.4958202838897705, "rewards/rejected": -1.8712860345840454, "step": 1457 }, { "epoch": 0.38, "grad_norm": 30.9268741607666, "kl": 0.0, "learning_rate": 3.0921224810259094e-07, "logps/chosen": -211.39682006835938, "logps/rejected": -248.64735412597656, "loss": 0.3061, "rewards/chosen": 0.48306411504745483, "rewards/margins": 3.4601712226867676, "rewards/rejected": -2.977107048034668, "step": 1458 }, { "epoch": 0.38, "grad_norm": 27.856122970581055, "kl": 0.0, "learning_rate": 3.0908139230567914e-07, "logps/chosen": -225.61920166015625, "logps/rejected": -236.6941680908203, "loss": 0.3685, "rewards/chosen": -0.30282920598983765, "rewards/margins": 2.0515077114105225, "rewards/rejected": -2.354336977005005, "step": 1459 }, { "epoch": 0.38, "grad_norm": 29.09127426147461, "kl": 0.0, "learning_rate": 3.0895053650876733e-07, "logps/chosen": -157.025390625, "logps/rejected": -247.02685546875, "loss": 0.3274, "rewards/chosen": 0.052327901124954224, "rewards/margins": 2.787691354751587, "rewards/rejected": -2.735363483428955, "step": 1460 }, { "epoch": 0.38, "grad_norm": 31.07024383544922, "kl": 0.0, "learning_rate": 3.0881968071185553e-07, "logps/chosen": -137.29736328125, "logps/rejected": -241.9585723876953, "loss": 0.1837, "rewards/chosen": 0.11936046183109283, "rewards/margins": 3.824509859085083, "rewards/rejected": -3.7051494121551514, "step": 1461 }, { "epoch": 0.38, "grad_norm": 36.02007293701172, "kl": 0.0, "learning_rate": 3.0868882491494367e-07, "logps/chosen": -166.37362670898438, "logps/rejected": -258.8514099121094, "loss": 0.1196, "rewards/chosen": 1.0134023427963257, "rewards/margins": 4.809149742126465, "rewards/rejected": -3.7957472801208496, "step": 1462 }, { "epoch": 0.38, "grad_norm": 30.613178253173828, "kl": 0.0, "learning_rate": 3.0855796911803187e-07, "logps/chosen": -274.941650390625, "logps/rejected": -276.4577941894531, "loss": 0.3452, "rewards/chosen": 0.25566530227661133, "rewards/margins": 5.204866409301758, "rewards/rejected": -4.9492011070251465, "step": 1463 }, { "epoch": 0.38, "grad_norm": 28.9886417388916, "kl": 0.0, "learning_rate": 3.084271133211201e-07, "logps/chosen": -115.51466369628906, "logps/rejected": -223.60037231445312, "loss": 0.2394, "rewards/chosen": 1.5236237049102783, "rewards/margins": 5.302260875701904, "rewards/rejected": -3.778637170791626, "step": 1464 }, { "epoch": 0.38, "grad_norm": 27.671178817749023, "kl": 0.0, "learning_rate": 3.082962575242083e-07, "logps/chosen": -185.50547790527344, "logps/rejected": -286.89459228515625, "loss": 0.3222, "rewards/chosen": 0.7556943297386169, "rewards/margins": 3.5602149963378906, "rewards/rejected": -2.804520606994629, "step": 1465 }, { "epoch": 0.38, "grad_norm": 29.94826889038086, "kl": 0.0, "learning_rate": 3.081654017272965e-07, "logps/chosen": -247.38006591796875, "logps/rejected": -202.25840759277344, "loss": 0.1946, "rewards/chosen": 1.0844428539276123, "rewards/margins": 5.710987091064453, "rewards/rejected": -4.626543998718262, "step": 1466 }, { "epoch": 0.38, "grad_norm": 25.045530319213867, "kl": 0.0, "learning_rate": 3.080345459303847e-07, "logps/chosen": -220.7410125732422, "logps/rejected": -191.14923095703125, "loss": 0.3061, "rewards/chosen": -1.2042438983917236, "rewards/margins": 0.7004872560501099, "rewards/rejected": -1.9047311544418335, "step": 1467 }, { "epoch": 0.38, "grad_norm": 35.02234649658203, "kl": 0.0, "learning_rate": 3.079036901334729e-07, "logps/chosen": -198.464599609375, "logps/rejected": -260.91424560546875, "loss": 0.3501, "rewards/chosen": 0.9540805220603943, "rewards/margins": 3.882875919342041, "rewards/rejected": -2.928795337677002, "step": 1468 }, { "epoch": 0.38, "grad_norm": 33.84938430786133, "kl": 0.0, "learning_rate": 3.077728343365611e-07, "logps/chosen": -152.50367736816406, "logps/rejected": -298.82403564453125, "loss": 0.3167, "rewards/chosen": -0.22875815629959106, "rewards/margins": 4.95879602432251, "rewards/rejected": -5.187554359436035, "step": 1469 }, { "epoch": 0.38, "grad_norm": 35.08100128173828, "kl": 0.0, "learning_rate": 3.076419785396493e-07, "logps/chosen": -295.23663330078125, "logps/rejected": -158.22804260253906, "loss": 0.3239, "rewards/chosen": 1.505220651626587, "rewards/margins": 4.030706405639648, "rewards/rejected": -2.5254859924316406, "step": 1470 }, { "epoch": 0.38, "grad_norm": 33.56315231323242, "kl": 0.0, "learning_rate": 3.075111227427375e-07, "logps/chosen": -125.35154724121094, "logps/rejected": -183.9165802001953, "loss": 0.2298, "rewards/chosen": 1.0760307312011719, "rewards/margins": 4.734567642211914, "rewards/rejected": -3.6585371494293213, "step": 1471 }, { "epoch": 0.39, "grad_norm": 46.655113220214844, "kl": 0.0, "learning_rate": 3.073802669458257e-07, "logps/chosen": -321.3465576171875, "logps/rejected": -221.3088836669922, "loss": 0.4142, "rewards/chosen": 0.4878466725349426, "rewards/margins": 2.085667371749878, "rewards/rejected": -1.5978206396102905, "step": 1472 }, { "epoch": 0.39, "grad_norm": 35.99465560913086, "kl": 0.0, "learning_rate": 3.072494111489139e-07, "logps/chosen": -208.22398376464844, "logps/rejected": -208.73878479003906, "loss": 0.2003, "rewards/chosen": 1.324711561203003, "rewards/margins": 4.5703043937683105, "rewards/rejected": -3.2455928325653076, "step": 1473 }, { "epoch": 0.39, "grad_norm": 32.40290451049805, "kl": 0.0, "learning_rate": 3.071185553520021e-07, "logps/chosen": -204.934814453125, "logps/rejected": -203.62197875976562, "loss": 0.3736, "rewards/chosen": -0.7168243527412415, "rewards/margins": 2.0529723167419434, "rewards/rejected": -2.76979660987854, "step": 1474 }, { "epoch": 0.39, "grad_norm": 28.98914909362793, "kl": 0.0, "learning_rate": 3.0698769955509027e-07, "logps/chosen": -138.81582641601562, "logps/rejected": -237.28030395507812, "loss": 0.279, "rewards/chosen": 0.6532577276229858, "rewards/margins": 3.616654396057129, "rewards/rejected": -2.9633965492248535, "step": 1475 }, { "epoch": 0.39, "grad_norm": 34.76724624633789, "kl": 0.0, "learning_rate": 3.068568437581785e-07, "logps/chosen": -257.87762451171875, "logps/rejected": -243.04725646972656, "loss": 0.219, "rewards/chosen": 0.8108853697776794, "rewards/margins": 3.9558494091033936, "rewards/rejected": -3.1449639797210693, "step": 1476 }, { "epoch": 0.39, "grad_norm": 22.456140518188477, "kl": 0.0, "learning_rate": 3.067259879612667e-07, "logps/chosen": -159.96438598632812, "logps/rejected": -202.48800659179688, "loss": 0.2281, "rewards/chosen": 0.7046743035316467, "rewards/margins": 4.733370780944824, "rewards/rejected": -4.028696537017822, "step": 1477 }, { "epoch": 0.39, "grad_norm": 28.346275329589844, "kl": 0.0, "learning_rate": 3.0659513216435486e-07, "logps/chosen": -163.13380432128906, "logps/rejected": -276.5238952636719, "loss": 0.2247, "rewards/chosen": 1.4744484424591064, "rewards/margins": 5.781862258911133, "rewards/rejected": -4.3074140548706055, "step": 1478 }, { "epoch": 0.39, "grad_norm": 26.092206954956055, "kl": 0.0, "learning_rate": 3.0646427636744306e-07, "logps/chosen": -268.532470703125, "logps/rejected": -272.1234436035156, "loss": 0.2369, "rewards/chosen": 1.1939743757247925, "rewards/margins": 5.351585388183594, "rewards/rejected": -4.157610893249512, "step": 1479 }, { "epoch": 0.39, "grad_norm": 36.25248718261719, "kl": 0.0, "learning_rate": 3.0633342057053125e-07, "logps/chosen": -230.22296142578125, "logps/rejected": -373.62445068359375, "loss": 0.2736, "rewards/chosen": -1.2396442890167236, "rewards/margins": 4.840928077697754, "rewards/rejected": -6.080572605133057, "step": 1480 }, { "epoch": 0.39, "grad_norm": 30.960378646850586, "kl": 0.0, "learning_rate": 3.0620256477361945e-07, "logps/chosen": -168.9617462158203, "logps/rejected": -217.56732177734375, "loss": 0.3062, "rewards/chosen": 0.2559138834476471, "rewards/margins": 3.4393367767333984, "rewards/rejected": -3.183422803878784, "step": 1481 }, { "epoch": 0.39, "grad_norm": 27.86515998840332, "kl": 0.0, "learning_rate": 3.0607170897670765e-07, "logps/chosen": -219.33834838867188, "logps/rejected": -238.18690490722656, "loss": 0.1837, "rewards/chosen": -0.1820228099822998, "rewards/margins": 2.337970733642578, "rewards/rejected": -2.519993543624878, "step": 1482 }, { "epoch": 0.39, "grad_norm": 30.410737991333008, "kl": 0.0, "learning_rate": 3.0594085317979584e-07, "logps/chosen": -152.96058654785156, "logps/rejected": -216.08642578125, "loss": 0.2986, "rewards/chosen": -0.333772748708725, "rewards/margins": 3.3381154537200928, "rewards/rejected": -3.6718881130218506, "step": 1483 }, { "epoch": 0.39, "grad_norm": 45.52098846435547, "kl": 0.0, "learning_rate": 3.0580999738288404e-07, "logps/chosen": -235.53439331054688, "logps/rejected": -242.63815307617188, "loss": 0.2718, "rewards/chosen": 1.6429567337036133, "rewards/margins": 4.876601219177246, "rewards/rejected": -3.233644485473633, "step": 1484 }, { "epoch": 0.39, "grad_norm": 33.77592849731445, "kl": 0.0, "learning_rate": 3.0567914158597223e-07, "logps/chosen": -222.42430114746094, "logps/rejected": -269.167724609375, "loss": 0.3382, "rewards/chosen": 0.28296148777008057, "rewards/margins": 3.6499133110046387, "rewards/rejected": -3.3669517040252686, "step": 1485 }, { "epoch": 0.39, "grad_norm": 34.32917785644531, "kl": 0.0, "learning_rate": 3.0554828578906043e-07, "logps/chosen": -245.42189025878906, "logps/rejected": -234.22384643554688, "loss": 0.2435, "rewards/chosen": 1.0255693197250366, "rewards/margins": 2.8453550338745117, "rewards/rejected": -1.8197858333587646, "step": 1486 }, { "epoch": 0.39, "grad_norm": 40.217166900634766, "kl": 0.0, "learning_rate": 3.054174299921486e-07, "logps/chosen": -209.5061798095703, "logps/rejected": -204.7925262451172, "loss": 0.347, "rewards/chosen": 0.07800575345754623, "rewards/margins": 3.161608934402466, "rewards/rejected": -3.0836031436920166, "step": 1487 }, { "epoch": 0.39, "grad_norm": 34.6190185546875, "kl": 0.0, "learning_rate": 3.052865741952368e-07, "logps/chosen": -317.90216064453125, "logps/rejected": -208.20858764648438, "loss": 0.3854, "rewards/chosen": -0.6948710680007935, "rewards/margins": 1.9840534925460815, "rewards/rejected": -2.678924560546875, "step": 1488 }, { "epoch": 0.39, "grad_norm": 33.66693115234375, "kl": 0.0, "learning_rate": 3.0515571839832507e-07, "logps/chosen": -131.39944458007812, "logps/rejected": -219.7893524169922, "loss": 0.272, "rewards/chosen": 1.1535712480545044, "rewards/margins": 3.9656424522399902, "rewards/rejected": -2.8120710849761963, "step": 1489 }, { "epoch": 0.39, "grad_norm": 27.822031021118164, "kl": 0.0, "learning_rate": 3.0502486260141327e-07, "logps/chosen": -205.55569458007812, "logps/rejected": -261.0909729003906, "loss": 0.4186, "rewards/chosen": -0.6630886793136597, "rewards/margins": 1.5766831636428833, "rewards/rejected": -2.239771842956543, "step": 1490 }, { "epoch": 0.39, "grad_norm": 43.77429962158203, "kl": 0.0, "learning_rate": 3.0489400680450146e-07, "logps/chosen": -233.81439208984375, "logps/rejected": -243.33828735351562, "loss": 0.3327, "rewards/chosen": -0.9625567197799683, "rewards/margins": 4.0890398025512695, "rewards/rejected": -5.051596641540527, "step": 1491 }, { "epoch": 0.39, "grad_norm": 35.22560501098633, "kl": 0.0, "learning_rate": 3.0476315100758966e-07, "logps/chosen": -270.0503234863281, "logps/rejected": -254.5584259033203, "loss": 0.2098, "rewards/chosen": -0.6624004244804382, "rewards/margins": 5.66503381729126, "rewards/rejected": -6.327434062957764, "step": 1492 }, { "epoch": 0.39, "grad_norm": 38.9492073059082, "kl": 0.0, "learning_rate": 3.046322952106778e-07, "logps/chosen": -220.76625061035156, "logps/rejected": -245.36216735839844, "loss": 0.3059, "rewards/chosen": 0.005003486294299364, "rewards/margins": 3.092819929122925, "rewards/rejected": -3.0878164768218994, "step": 1493 }, { "epoch": 0.39, "grad_norm": 48.36807632446289, "kl": 0.0, "learning_rate": 3.04501439413766e-07, "logps/chosen": -230.12994384765625, "logps/rejected": -240.77676391601562, "loss": 0.4136, "rewards/chosen": 1.2152693271636963, "rewards/margins": 1.9342682361602783, "rewards/rejected": -0.718998908996582, "step": 1494 }, { "epoch": 0.39, "grad_norm": 38.448631286621094, "kl": 0.0, "learning_rate": 3.043705836168542e-07, "logps/chosen": -296.76776123046875, "logps/rejected": -231.88865661621094, "loss": 0.2302, "rewards/chosen": 1.8158690929412842, "rewards/margins": 4.516867637634277, "rewards/rejected": -2.700998544692993, "step": 1495 }, { "epoch": 0.39, "grad_norm": 34.014060974121094, "kl": 0.0, "learning_rate": 3.042397278199424e-07, "logps/chosen": -133.33018493652344, "logps/rejected": -316.0521240234375, "loss": 0.2851, "rewards/chosen": 1.3611581325531006, "rewards/margins": 4.80728006362915, "rewards/rejected": -3.44612193107605, "step": 1496 }, { "epoch": 0.39, "grad_norm": 24.460899353027344, "kl": 0.0, "learning_rate": 3.041088720230306e-07, "logps/chosen": -225.04161071777344, "logps/rejected": -238.4369354248047, "loss": 0.1244, "rewards/chosen": 2.800896406173706, "rewards/margins": 6.36253547668457, "rewards/rejected": -3.5616390705108643, "step": 1497 }, { "epoch": 0.39, "grad_norm": 28.581289291381836, "kl": 0.0, "learning_rate": 3.039780162261188e-07, "logps/chosen": -245.81613159179688, "logps/rejected": -208.173095703125, "loss": 0.3861, "rewards/chosen": -0.5592122673988342, "rewards/margins": 2.6268491744995117, "rewards/rejected": -3.186061382293701, "step": 1498 }, { "epoch": 0.39, "grad_norm": 29.391639709472656, "kl": 0.0, "learning_rate": 3.03847160429207e-07, "logps/chosen": -277.779541015625, "logps/rejected": -284.0148620605469, "loss": 0.3022, "rewards/chosen": -0.8414028882980347, "rewards/margins": 1.9396697282791138, "rewards/rejected": -2.7810726165771484, "step": 1499 }, { "epoch": 0.39, "grad_norm": 40.777618408203125, "kl": 0.0, "learning_rate": 3.037163046322952e-07, "logps/chosen": -203.26284790039062, "logps/rejected": -250.16270446777344, "loss": 0.3193, "rewards/chosen": 0.002718701958656311, "rewards/margins": 2.7231438159942627, "rewards/rejected": -2.7204251289367676, "step": 1500 }, { "epoch": 0.39, "grad_norm": 27.155214309692383, "kl": 0.0, "learning_rate": 3.0358544883538337e-07, "logps/chosen": -213.1066131591797, "logps/rejected": -231.6353759765625, "loss": 0.1685, "rewards/chosen": 0.4949774444103241, "rewards/margins": 4.511791229248047, "rewards/rejected": -4.0168137550354, "step": 1501 }, { "epoch": 0.39, "grad_norm": 29.21578598022461, "kl": 0.0, "learning_rate": 3.034545930384716e-07, "logps/chosen": -184.6173553466797, "logps/rejected": -220.74658203125, "loss": 0.2779, "rewards/chosen": -0.2819909155368805, "rewards/margins": 3.6761438846588135, "rewards/rejected": -3.958134889602661, "step": 1502 }, { "epoch": 0.39, "grad_norm": 38.4527702331543, "kl": 0.0, "learning_rate": 3.033237372415598e-07, "logps/chosen": -261.9404296875, "logps/rejected": -197.33087158203125, "loss": 0.357, "rewards/chosen": 0.11832320690155029, "rewards/margins": 2.584242820739746, "rewards/rejected": -2.4659194946289062, "step": 1503 }, { "epoch": 0.39, "grad_norm": 28.692373275756836, "kl": 0.0, "learning_rate": 3.03192881444648e-07, "logps/chosen": -186.4076690673828, "logps/rejected": -177.31590270996094, "loss": 0.1979, "rewards/chosen": -0.825257420539856, "rewards/margins": 2.0939512252807617, "rewards/rejected": -2.919208526611328, "step": 1504 }, { "epoch": 0.39, "grad_norm": 39.02003479003906, "kl": 0.0, "learning_rate": 3.030620256477362e-07, "logps/chosen": -253.4713134765625, "logps/rejected": -160.2345733642578, "loss": 0.407, "rewards/chosen": -0.8305866122245789, "rewards/margins": 0.5052904486656189, "rewards/rejected": -1.3358770608901978, "step": 1505 }, { "epoch": 0.39, "grad_norm": 41.82636642456055, "kl": 0.0, "learning_rate": 3.029311698508244e-07, "logps/chosen": -75.3058853149414, "logps/rejected": -257.3697814941406, "loss": 0.2283, "rewards/chosen": 1.5601531267166138, "rewards/margins": 3.760997772216797, "rewards/rejected": -2.2008447647094727, "step": 1506 }, { "epoch": 0.39, "grad_norm": 37.105777740478516, "kl": 0.0, "learning_rate": 3.028003140539126e-07, "logps/chosen": -292.6695556640625, "logps/rejected": -244.73265075683594, "loss": 0.3224, "rewards/chosen": -1.939176321029663, "rewards/margins": 3.3143680095672607, "rewards/rejected": -5.253544330596924, "step": 1507 }, { "epoch": 0.39, "grad_norm": 58.40880584716797, "kl": 0.0, "learning_rate": 3.026694582570008e-07, "logps/chosen": -144.26760864257812, "logps/rejected": -260.7916259765625, "loss": 0.2336, "rewards/chosen": 0.3313431739807129, "rewards/margins": 4.2500715255737305, "rewards/rejected": -3.9187281131744385, "step": 1508 }, { "epoch": 0.39, "grad_norm": 36.10960006713867, "kl": 0.0, "learning_rate": 3.0253860246008894e-07, "logps/chosen": -216.76353454589844, "logps/rejected": -235.7379150390625, "loss": 0.2212, "rewards/chosen": 0.9106661677360535, "rewards/margins": 4.330042839050293, "rewards/rejected": -3.419376850128174, "step": 1509 }, { "epoch": 0.4, "grad_norm": 35.00156784057617, "kl": 0.0, "learning_rate": 3.0240774666317714e-07, "logps/chosen": -247.1843719482422, "logps/rejected": -234.66162109375, "loss": 0.3298, "rewards/chosen": 0.6620055437088013, "rewards/margins": 2.648651599884033, "rewards/rejected": -1.986646056175232, "step": 1510 }, { "epoch": 0.4, "grad_norm": 33.013389587402344, "kl": 0.0, "learning_rate": 3.0227689086626533e-07, "logps/chosen": -267.90496826171875, "logps/rejected": -251.13446044921875, "loss": 0.2749, "rewards/chosen": 1.9397257566452026, "rewards/margins": 5.9958577156066895, "rewards/rejected": -4.056131839752197, "step": 1511 }, { "epoch": 0.4, "grad_norm": 24.649215698242188, "kl": 0.0, "learning_rate": 3.0214603506935353e-07, "logps/chosen": -128.24273681640625, "logps/rejected": -199.8554229736328, "loss": 0.247, "rewards/chosen": 0.5000770688056946, "rewards/margins": 4.609408378601074, "rewards/rejected": -4.109331130981445, "step": 1512 }, { "epoch": 0.4, "grad_norm": 49.26161193847656, "kl": 0.0, "learning_rate": 3.020151792724417e-07, "logps/chosen": -228.639892578125, "logps/rejected": -158.06222534179688, "loss": 0.4441, "rewards/chosen": -0.19630053639411926, "rewards/margins": 1.6684128046035767, "rewards/rejected": -1.8647133111953735, "step": 1513 }, { "epoch": 0.4, "grad_norm": 34.833702087402344, "kl": 0.0, "learning_rate": 3.018843234755299e-07, "logps/chosen": -253.34954833984375, "logps/rejected": -217.84603881835938, "loss": 0.4131, "rewards/chosen": -1.3816742897033691, "rewards/margins": 1.0617821216583252, "rewards/rejected": -2.4434564113616943, "step": 1514 }, { "epoch": 0.4, "grad_norm": 28.9056396484375, "kl": 0.0, "learning_rate": 3.0175346767861817e-07, "logps/chosen": -172.053955078125, "logps/rejected": -256.8983154296875, "loss": 0.2978, "rewards/chosen": -1.2212709188461304, "rewards/margins": 1.9896234273910522, "rewards/rejected": -3.2108943462371826, "step": 1515 }, { "epoch": 0.4, "grad_norm": 28.236330032348633, "kl": 0.0, "learning_rate": 3.0162261188170637e-07, "logps/chosen": -263.5512390136719, "logps/rejected": -252.2602996826172, "loss": 0.2911, "rewards/chosen": -0.8032323718070984, "rewards/margins": 2.6989030838012695, "rewards/rejected": -3.5021355152130127, "step": 1516 }, { "epoch": 0.4, "grad_norm": 27.29069709777832, "kl": 0.0, "learning_rate": 3.0149175608479456e-07, "logps/chosen": -195.29360961914062, "logps/rejected": -219.7592010498047, "loss": 0.2787, "rewards/chosen": -0.900688648223877, "rewards/margins": 3.03352952003479, "rewards/rejected": -3.934218168258667, "step": 1517 }, { "epoch": 0.4, "grad_norm": 37.780662536621094, "kl": 0.0, "learning_rate": 3.0136090028788276e-07, "logps/chosen": -224.90914916992188, "logps/rejected": -257.0303955078125, "loss": 0.3451, "rewards/chosen": -0.3784347176551819, "rewards/margins": 4.28721284866333, "rewards/rejected": -4.665647506713867, "step": 1518 }, { "epoch": 0.4, "grad_norm": 35.100135803222656, "kl": 0.0, "learning_rate": 3.0123004449097095e-07, "logps/chosen": -210.57568359375, "logps/rejected": -216.138916015625, "loss": 0.3389, "rewards/chosen": 0.2311960607767105, "rewards/margins": 3.171760320663452, "rewards/rejected": -2.9405641555786133, "step": 1519 }, { "epoch": 0.4, "grad_norm": 29.088823318481445, "kl": 0.0, "learning_rate": 3.0109918869405915e-07, "logps/chosen": -164.3828125, "logps/rejected": -214.0646514892578, "loss": 0.2862, "rewards/chosen": 0.353228896856308, "rewards/margins": 3.9613986015319824, "rewards/rejected": -3.6081697940826416, "step": 1520 }, { "epoch": 0.4, "grad_norm": 33.2971305847168, "kl": 0.0, "learning_rate": 3.0096833289714735e-07, "logps/chosen": -220.14793395996094, "logps/rejected": -321.53192138671875, "loss": 0.2287, "rewards/chosen": 0.3785732686519623, "rewards/margins": 5.689975738525391, "rewards/rejected": -5.311402320861816, "step": 1521 }, { "epoch": 0.4, "grad_norm": 42.90241622924805, "kl": 0.0, "learning_rate": 3.0083747710023554e-07, "logps/chosen": -195.2041778564453, "logps/rejected": -241.295654296875, "loss": 0.2787, "rewards/chosen": 1.4876168966293335, "rewards/margins": 4.329094409942627, "rewards/rejected": -2.841477632522583, "step": 1522 }, { "epoch": 0.4, "grad_norm": 33.62684631347656, "kl": 0.0, "learning_rate": 3.0070662130332374e-07, "logps/chosen": -272.9668273925781, "logps/rejected": -261.4366760253906, "loss": 0.3369, "rewards/chosen": -0.7848519086837769, "rewards/margins": 3.7503604888916016, "rewards/rejected": -4.535212516784668, "step": 1523 }, { "epoch": 0.4, "grad_norm": 34.81737518310547, "kl": 0.0, "learning_rate": 3.005757655064119e-07, "logps/chosen": -173.27880859375, "logps/rejected": -293.0011901855469, "loss": 0.3702, "rewards/chosen": -0.3220140337944031, "rewards/margins": 5.201068878173828, "rewards/rejected": -5.523082733154297, "step": 1524 }, { "epoch": 0.4, "grad_norm": 39.21683120727539, "kl": 0.0, "learning_rate": 3.004449097095001e-07, "logps/chosen": -164.2547607421875, "logps/rejected": -258.0382080078125, "loss": 0.3019, "rewards/chosen": 0.24068419635295868, "rewards/margins": 2.7888166904449463, "rewards/rejected": -2.5481324195861816, "step": 1525 }, { "epoch": 0.4, "grad_norm": 25.111703872680664, "kl": 0.0, "learning_rate": 3.0031405391258827e-07, "logps/chosen": -181.2828369140625, "logps/rejected": -229.8302001953125, "loss": 0.1429, "rewards/chosen": 3.282607316970825, "rewards/margins": 7.605630874633789, "rewards/rejected": -4.323023796081543, "step": 1526 }, { "epoch": 0.4, "grad_norm": 31.849605560302734, "kl": 0.0, "learning_rate": 3.0018319811567647e-07, "logps/chosen": -239.89051818847656, "logps/rejected": -295.36663818359375, "loss": 0.397, "rewards/chosen": -0.9494417905807495, "rewards/margins": 1.8495646715164185, "rewards/rejected": -2.799006462097168, "step": 1527 }, { "epoch": 0.4, "grad_norm": 35.73810958862305, "kl": 0.0, "learning_rate": 3.000523423187647e-07, "logps/chosen": -194.4100341796875, "logps/rejected": -226.74893188476562, "loss": 0.3235, "rewards/chosen": 0.38581228256225586, "rewards/margins": 3.156494140625, "rewards/rejected": -2.770681858062744, "step": 1528 }, { "epoch": 0.4, "grad_norm": 34.644859313964844, "kl": 0.0, "learning_rate": 2.999214865218529e-07, "logps/chosen": -202.15171813964844, "logps/rejected": -245.19308471679688, "loss": 0.2077, "rewards/chosen": 1.9559669494628906, "rewards/margins": 4.376315116882324, "rewards/rejected": -2.4203479290008545, "step": 1529 }, { "epoch": 0.4, "grad_norm": 34.94104766845703, "kl": 0.0, "learning_rate": 2.997906307249411e-07, "logps/chosen": -188.9501190185547, "logps/rejected": -271.6665344238281, "loss": 0.3114, "rewards/chosen": -0.6005575060844421, "rewards/margins": 3.01674485206604, "rewards/rejected": -3.617302417755127, "step": 1530 }, { "epoch": 0.4, "grad_norm": 38.38316345214844, "kl": 0.0, "learning_rate": 2.996597749280293e-07, "logps/chosen": -214.96990966796875, "logps/rejected": -289.9461975097656, "loss": 0.3475, "rewards/chosen": -0.06996187567710876, "rewards/margins": 2.5924227237701416, "rewards/rejected": -2.662384510040283, "step": 1531 }, { "epoch": 0.4, "grad_norm": 33.79578399658203, "kl": 0.0, "learning_rate": 2.995289191311175e-07, "logps/chosen": -281.37237548828125, "logps/rejected": -356.375, "loss": 0.3213, "rewards/chosen": 0.9624449014663696, "rewards/margins": 6.715054988861084, "rewards/rejected": -5.752610206604004, "step": 1532 }, { "epoch": 0.4, "grad_norm": 35.78695297241211, "kl": 0.0, "learning_rate": 2.993980633342057e-07, "logps/chosen": -208.66139221191406, "logps/rejected": -307.55810546875, "loss": 0.1853, "rewards/chosen": 2.171360492706299, "rewards/margins": 4.887650489807129, "rewards/rejected": -2.71628999710083, "step": 1533 }, { "epoch": 0.4, "grad_norm": 33.36848831176758, "kl": 0.0, "learning_rate": 2.992672075372939e-07, "logps/chosen": -276.29547119140625, "logps/rejected": -322.4742431640625, "loss": 0.2057, "rewards/chosen": 1.1996976137161255, "rewards/margins": 4.880166053771973, "rewards/rejected": -3.6804683208465576, "step": 1534 }, { "epoch": 0.4, "grad_norm": 35.62516784667969, "kl": 0.0, "learning_rate": 2.991363517403821e-07, "logps/chosen": -186.53976440429688, "logps/rejected": -294.9364013671875, "loss": 0.3682, "rewards/chosen": -0.6345914006233215, "rewards/margins": 3.435218572616577, "rewards/rejected": -4.069809913635254, "step": 1535 }, { "epoch": 0.4, "grad_norm": 35.93933868408203, "kl": 0.0, "learning_rate": 2.990054959434703e-07, "logps/chosen": -292.94549560546875, "logps/rejected": -264.8148498535156, "loss": 0.2498, "rewards/chosen": 1.626547932624817, "rewards/margins": 4.3725175857543945, "rewards/rejected": -2.745969772338867, "step": 1536 }, { "epoch": 0.4, "grad_norm": 66.67236328125, "kl": 0.0, "learning_rate": 2.988746401465585e-07, "logps/chosen": -252.86598205566406, "logps/rejected": -177.5896759033203, "loss": 0.3376, "rewards/chosen": -0.02513676881790161, "rewards/margins": 2.472538471221924, "rewards/rejected": -2.4976751804351807, "step": 1537 }, { "epoch": 0.4, "grad_norm": 32.29557418823242, "kl": 0.0, "learning_rate": 2.987437843496467e-07, "logps/chosen": -196.1752166748047, "logps/rejected": -275.33575439453125, "loss": 0.2844, "rewards/chosen": 1.8063740730285645, "rewards/margins": 4.412812232971191, "rewards/rejected": -2.606438398361206, "step": 1538 }, { "epoch": 0.4, "grad_norm": 29.864439010620117, "kl": 0.0, "learning_rate": 2.986129285527349e-07, "logps/chosen": -245.36810302734375, "logps/rejected": -219.60891723632812, "loss": 0.2431, "rewards/chosen": 1.5060614347457886, "rewards/margins": 5.396974086761475, "rewards/rejected": -3.8909127712249756, "step": 1539 }, { "epoch": 0.4, "grad_norm": 34.409236907958984, "kl": 0.0, "learning_rate": 2.98482072755823e-07, "logps/chosen": -183.43905639648438, "logps/rejected": -157.3790740966797, "loss": 0.2875, "rewards/chosen": 1.2466129064559937, "rewards/margins": 2.63576078414917, "rewards/rejected": -1.3891478776931763, "step": 1540 }, { "epoch": 0.4, "grad_norm": 46.68579864501953, "kl": 0.0, "learning_rate": 2.9835121695891127e-07, "logps/chosen": -272.5766906738281, "logps/rejected": -255.7216339111328, "loss": 0.2431, "rewards/chosen": 0.9345579147338867, "rewards/margins": 3.078080892562866, "rewards/rejected": -2.1435229778289795, "step": 1541 }, { "epoch": 0.4, "grad_norm": 24.752788543701172, "kl": 0.0, "learning_rate": 2.9822036116199946e-07, "logps/chosen": -180.98768615722656, "logps/rejected": -276.1407775878906, "loss": 0.2213, "rewards/chosen": 0.8837758302688599, "rewards/margins": 4.952155113220215, "rewards/rejected": -4.0683794021606445, "step": 1542 }, { "epoch": 0.4, "grad_norm": 57.56275939941406, "kl": 0.0, "learning_rate": 2.9808950536508766e-07, "logps/chosen": -176.87188720703125, "logps/rejected": -266.2235107421875, "loss": 0.3233, "rewards/chosen": -0.016633659601211548, "rewards/margins": 3.9170687198638916, "rewards/rejected": -3.9337024688720703, "step": 1543 }, { "epoch": 0.4, "grad_norm": 32.71670150756836, "kl": 0.0, "learning_rate": 2.9795864956817586e-07, "logps/chosen": -222.072265625, "logps/rejected": -275.0723876953125, "loss": 0.297, "rewards/chosen": 0.750746488571167, "rewards/margins": 7.154358863830566, "rewards/rejected": -6.40361213684082, "step": 1544 }, { "epoch": 0.4, "grad_norm": 33.7441520690918, "kl": 0.0, "learning_rate": 2.9782779377126405e-07, "logps/chosen": -183.69985961914062, "logps/rejected": -291.5790100097656, "loss": 0.2655, "rewards/chosen": -0.3251463770866394, "rewards/margins": 3.752892255783081, "rewards/rejected": -4.078038692474365, "step": 1545 }, { "epoch": 0.4, "grad_norm": 26.29779815673828, "kl": 0.0, "learning_rate": 2.9769693797435225e-07, "logps/chosen": -258.9979248046875, "logps/rejected": -377.3835144042969, "loss": 0.2908, "rewards/chosen": 0.3104372024536133, "rewards/margins": 4.86877965927124, "rewards/rejected": -4.558342456817627, "step": 1546 }, { "epoch": 0.4, "grad_norm": 27.509925842285156, "kl": 0.0, "learning_rate": 2.9756608217744044e-07, "logps/chosen": -262.73876953125, "logps/rejected": -230.32789611816406, "loss": 0.2273, "rewards/chosen": -1.16804039478302, "rewards/margins": 2.2382097244262695, "rewards/rejected": -3.406250238418579, "step": 1547 }, { "epoch": 0.41, "grad_norm": 40.13040542602539, "kl": 0.0, "learning_rate": 2.9743522638052864e-07, "logps/chosen": -296.4833984375, "logps/rejected": -290.46551513671875, "loss": 0.2746, "rewards/chosen": 0.9714206457138062, "rewards/margins": 4.574386119842529, "rewards/rejected": -3.6029653549194336, "step": 1548 }, { "epoch": 0.41, "grad_norm": 38.02606201171875, "kl": 0.0, "learning_rate": 2.9730437058361684e-07, "logps/chosen": -194.1744384765625, "logps/rejected": -297.8863830566406, "loss": 0.2987, "rewards/chosen": 0.05778183043003082, "rewards/margins": 4.536423206329346, "rewards/rejected": -4.478641510009766, "step": 1549 }, { "epoch": 0.41, "grad_norm": 43.55545425415039, "kl": 0.0, "learning_rate": 2.9717351478670503e-07, "logps/chosen": -299.7167053222656, "logps/rejected": -213.52081298828125, "loss": 0.3064, "rewards/chosen": 0.4842572510242462, "rewards/margins": 3.815340042114258, "rewards/rejected": -3.331082820892334, "step": 1550 }, { "epoch": 0.41, "grad_norm": 32.664939880371094, "kl": 0.0, "learning_rate": 2.9704265898979323e-07, "logps/chosen": -279.3871765136719, "logps/rejected": -205.69642639160156, "loss": 0.2493, "rewards/chosen": 0.3699782192707062, "rewards/margins": 3.5103328227996826, "rewards/rejected": -3.140354633331299, "step": 1551 }, { "epoch": 0.41, "grad_norm": 34.73197555541992, "kl": 0.0, "learning_rate": 2.969118031928814e-07, "logps/chosen": -270.1496276855469, "logps/rejected": -314.9805908203125, "loss": 0.3478, "rewards/chosen": 0.297321081161499, "rewards/margins": 4.266029357910156, "rewards/rejected": -3.9687085151672363, "step": 1552 }, { "epoch": 0.41, "grad_norm": 38.26441192626953, "kl": 0.0, "learning_rate": 2.967809473959697e-07, "logps/chosen": -146.8294219970703, "logps/rejected": -241.314697265625, "loss": 0.223, "rewards/chosen": 1.411135196685791, "rewards/margins": 5.363457679748535, "rewards/rejected": -3.9523227214813232, "step": 1553 }, { "epoch": 0.41, "grad_norm": 41.40995407104492, "kl": 0.0, "learning_rate": 2.9665009159905787e-07, "logps/chosen": -245.00399780273438, "logps/rejected": -275.0325012207031, "loss": 0.2849, "rewards/chosen": 0.1878739595413208, "rewards/margins": 3.4663233757019043, "rewards/rejected": -3.278449296951294, "step": 1554 }, { "epoch": 0.41, "grad_norm": 42.86524963378906, "kl": 0.0, "learning_rate": 2.96519235802146e-07, "logps/chosen": -219.91879272460938, "logps/rejected": -148.5173797607422, "loss": 0.359, "rewards/chosen": -0.5962321162223816, "rewards/margins": 1.0270888805389404, "rewards/rejected": -1.6233209371566772, "step": 1555 }, { "epoch": 0.41, "grad_norm": 39.4825553894043, "kl": 0.0, "learning_rate": 2.963883800052342e-07, "logps/chosen": -149.03903198242188, "logps/rejected": -281.20672607421875, "loss": 0.1473, "rewards/chosen": 2.119748830795288, "rewards/margins": 4.9517822265625, "rewards/rejected": -2.832033157348633, "step": 1556 }, { "epoch": 0.41, "grad_norm": 28.854963302612305, "kl": 0.0, "learning_rate": 2.962575242083224e-07, "logps/chosen": -158.43087768554688, "logps/rejected": -260.1464538574219, "loss": 0.2625, "rewards/chosen": -0.2116086781024933, "rewards/margins": 2.700449228286743, "rewards/rejected": -2.912057876586914, "step": 1557 }, { "epoch": 0.41, "grad_norm": 35.32528305053711, "kl": 0.0, "learning_rate": 2.961266684114106e-07, "logps/chosen": -278.7086486816406, "logps/rejected": -288.8902587890625, "loss": 0.2674, "rewards/chosen": 1.1448341608047485, "rewards/margins": 5.730721473693848, "rewards/rejected": -4.585887432098389, "step": 1558 }, { "epoch": 0.41, "grad_norm": 25.97184944152832, "kl": 0.0, "learning_rate": 2.959958126144988e-07, "logps/chosen": -300.5465087890625, "logps/rejected": -219.9793701171875, "loss": 0.3383, "rewards/chosen": 0.6771482229232788, "rewards/margins": 4.751331806182861, "rewards/rejected": -4.074183464050293, "step": 1559 }, { "epoch": 0.41, "grad_norm": 34.6771240234375, "kl": 0.0, "learning_rate": 2.95864956817587e-07, "logps/chosen": -193.79806518554688, "logps/rejected": -223.5035400390625, "loss": 0.3794, "rewards/chosen": -0.3675355315208435, "rewards/margins": 1.791252613067627, "rewards/rejected": -2.1587882041931152, "step": 1560 }, { "epoch": 0.41, "grad_norm": 41.60268783569336, "kl": 0.0, "learning_rate": 2.957341010206752e-07, "logps/chosen": -245.3753662109375, "logps/rejected": -261.7906799316406, "loss": 0.3893, "rewards/chosen": -0.6306812763214111, "rewards/margins": 1.5200133323669434, "rewards/rejected": -2.1506946086883545, "step": 1561 }, { "epoch": 0.41, "grad_norm": 36.107933044433594, "kl": 0.0, "learning_rate": 2.956032452237634e-07, "logps/chosen": -274.00250244140625, "logps/rejected": -252.59808349609375, "loss": 0.4207, "rewards/chosen": -0.3567908704280853, "rewards/margins": 3.2441518306732178, "rewards/rejected": -3.600942611694336, "step": 1562 }, { "epoch": 0.41, "grad_norm": 30.820205688476562, "kl": 0.0, "learning_rate": 2.954723894268516e-07, "logps/chosen": -361.6520080566406, "logps/rejected": -216.23590087890625, "loss": 0.3088, "rewards/chosen": -2.397629976272583, "rewards/margins": 1.9007060527801514, "rewards/rejected": -4.298336029052734, "step": 1563 }, { "epoch": 0.41, "grad_norm": 34.62519073486328, "kl": 0.0, "learning_rate": 2.953415336299398e-07, "logps/chosen": -258.6853332519531, "logps/rejected": -197.72561645507812, "loss": 0.3942, "rewards/chosen": -0.9516178965568542, "rewards/margins": 2.312492609024048, "rewards/rejected": -3.264110565185547, "step": 1564 }, { "epoch": 0.41, "grad_norm": 36.56438446044922, "kl": 0.0, "learning_rate": 2.95210677833028e-07, "logps/chosen": -248.93716430664062, "logps/rejected": -186.44081115722656, "loss": 0.2179, "rewards/chosen": 1.0057936906814575, "rewards/margins": 3.8947253227233887, "rewards/rejected": -2.8889315128326416, "step": 1565 }, { "epoch": 0.41, "grad_norm": 35.079612731933594, "kl": 0.0, "learning_rate": 2.950798220361162e-07, "logps/chosen": -300.0610046386719, "logps/rejected": -149.28273010253906, "loss": 0.2715, "rewards/chosen": -0.047494616359472275, "rewards/margins": 2.8899199962615967, "rewards/rejected": -2.9374146461486816, "step": 1566 }, { "epoch": 0.41, "grad_norm": 37.34353256225586, "kl": 0.0, "learning_rate": 2.949489662392044e-07, "logps/chosen": -259.10064697265625, "logps/rejected": -260.9805603027344, "loss": 0.2721, "rewards/chosen": 0.1995670050382614, "rewards/margins": 4.065165996551514, "rewards/rejected": -3.8655989170074463, "step": 1567 }, { "epoch": 0.41, "grad_norm": 34.23500442504883, "kl": 0.0, "learning_rate": 2.948181104422926e-07, "logps/chosen": -254.81307983398438, "logps/rejected": -244.6279754638672, "loss": 0.3378, "rewards/chosen": -0.6847414970397949, "rewards/margins": 1.1335030794143677, "rewards/rejected": -1.8182445764541626, "step": 1568 }, { "epoch": 0.41, "grad_norm": 33.09608459472656, "kl": 0.0, "learning_rate": 2.946872546453808e-07, "logps/chosen": -192.080810546875, "logps/rejected": -347.98486328125, "loss": 0.2522, "rewards/chosen": 0.757779598236084, "rewards/margins": 4.43095588684082, "rewards/rejected": -3.6731760501861572, "step": 1569 }, { "epoch": 0.41, "grad_norm": 35.08234786987305, "kl": 0.0, "learning_rate": 2.94556398848469e-07, "logps/chosen": -202.5784912109375, "logps/rejected": -241.56146240234375, "loss": 0.2955, "rewards/chosen": 0.7428146600723267, "rewards/margins": 3.106171131134033, "rewards/rejected": -2.363356590270996, "step": 1570 }, { "epoch": 0.41, "grad_norm": 39.55061721801758, "kl": 0.0, "learning_rate": 2.9442554305155715e-07, "logps/chosen": -203.7303924560547, "logps/rejected": -218.20925903320312, "loss": 0.3215, "rewards/chosen": -0.3977167308330536, "rewards/margins": 3.4592111110687256, "rewards/rejected": -3.8569278717041016, "step": 1571 }, { "epoch": 0.41, "grad_norm": 32.61384963989258, "kl": 0.0, "learning_rate": 2.9429468725464535e-07, "logps/chosen": -154.0523223876953, "logps/rejected": -324.3094787597656, "loss": 0.2933, "rewards/chosen": -0.45588475465774536, "rewards/margins": 2.8898463249206543, "rewards/rejected": -3.345731019973755, "step": 1572 }, { "epoch": 0.41, "grad_norm": 33.56401824951172, "kl": 0.0, "learning_rate": 2.9416383145773354e-07, "logps/chosen": -252.0158233642578, "logps/rejected": -242.02838134765625, "loss": 0.2804, "rewards/chosen": 0.04096754267811775, "rewards/margins": 4.675113201141357, "rewards/rejected": -4.634145736694336, "step": 1573 }, { "epoch": 0.41, "grad_norm": 29.832853317260742, "kl": 0.0, "learning_rate": 2.9403297566082174e-07, "logps/chosen": -174.89089965820312, "logps/rejected": -227.717529296875, "loss": 0.2085, "rewards/chosen": 0.28123578429222107, "rewards/margins": 4.435916900634766, "rewards/rejected": -4.154681205749512, "step": 1574 }, { "epoch": 0.41, "grad_norm": 51.532833099365234, "kl": 0.0, "learning_rate": 2.9390211986390993e-07, "logps/chosen": -189.41305541992188, "logps/rejected": -229.99093627929688, "loss": 0.254, "rewards/chosen": 0.8586848974227905, "rewards/margins": 4.504140853881836, "rewards/rejected": -3.645456075668335, "step": 1575 }, { "epoch": 0.41, "grad_norm": 34.82904052734375, "kl": 0.0, "learning_rate": 2.9377126406699813e-07, "logps/chosen": -235.12171936035156, "logps/rejected": -253.74807739257812, "loss": 0.2706, "rewards/chosen": 1.0586671829223633, "rewards/margins": 3.163533926010132, "rewards/rejected": -2.1048667430877686, "step": 1576 }, { "epoch": 0.41, "grad_norm": 39.55526351928711, "kl": 0.0, "learning_rate": 2.9364040827008633e-07, "logps/chosen": -207.1605224609375, "logps/rejected": -192.63937377929688, "loss": 0.3382, "rewards/chosen": 0.6457839012145996, "rewards/margins": 2.901211738586426, "rewards/rejected": -2.255427837371826, "step": 1577 }, { "epoch": 0.41, "grad_norm": 33.488189697265625, "kl": 0.0, "learning_rate": 2.935095524731745e-07, "logps/chosen": -185.49661254882812, "logps/rejected": -205.3595428466797, "loss": 0.2738, "rewards/chosen": 0.4008524715900421, "rewards/margins": 2.8835244178771973, "rewards/rejected": -2.4826719760894775, "step": 1578 }, { "epoch": 0.41, "grad_norm": 29.02250862121582, "kl": 0.0, "learning_rate": 2.9337869667626277e-07, "logps/chosen": -157.56988525390625, "logps/rejected": -229.55596923828125, "loss": 0.2364, "rewards/chosen": 0.9958530068397522, "rewards/margins": 5.4190287590026855, "rewards/rejected": -4.423175811767578, "step": 1579 }, { "epoch": 0.41, "grad_norm": 27.68988037109375, "kl": 0.0, "learning_rate": 2.9324784087935097e-07, "logps/chosen": -243.2689666748047, "logps/rejected": -219.78640747070312, "loss": 0.1673, "rewards/chosen": 1.2246966361999512, "rewards/margins": 4.660741806030273, "rewards/rejected": -3.436044931411743, "step": 1580 }, { "epoch": 0.41, "grad_norm": 34.465538024902344, "kl": 0.0, "learning_rate": 2.9311698508243916e-07, "logps/chosen": -306.2691345214844, "logps/rejected": -232.6453094482422, "loss": 0.2769, "rewards/chosen": 1.0992822647094727, "rewards/margins": 5.033297061920166, "rewards/rejected": -3.9340147972106934, "step": 1581 }, { "epoch": 0.41, "grad_norm": 37.50482940673828, "kl": 0.0, "learning_rate": 2.9298612928552736e-07, "logps/chosen": -142.67239379882812, "logps/rejected": -199.15847778320312, "loss": 0.3397, "rewards/chosen": -0.2452094852924347, "rewards/margins": 2.48954439163208, "rewards/rejected": -2.7347538471221924, "step": 1582 }, { "epoch": 0.41, "grad_norm": 32.987525939941406, "kl": 0.0, "learning_rate": 2.9285527348861556e-07, "logps/chosen": -178.70310974121094, "logps/rejected": -236.485107421875, "loss": 0.2765, "rewards/chosen": 0.8258715867996216, "rewards/margins": 4.397680759429932, "rewards/rejected": -3.5718092918395996, "step": 1583 }, { "epoch": 0.41, "grad_norm": 29.570829391479492, "kl": 0.0, "learning_rate": 2.9272441769170375e-07, "logps/chosen": -214.9452362060547, "logps/rejected": -255.57749938964844, "loss": 0.3128, "rewards/chosen": -0.2915063500404358, "rewards/margins": 2.860386848449707, "rewards/rejected": -3.151893138885498, "step": 1584 }, { "epoch": 0.41, "grad_norm": 31.92060089111328, "kl": 0.0, "learning_rate": 2.9259356189479195e-07, "logps/chosen": -207.015625, "logps/rejected": -247.09225463867188, "loss": 0.2024, "rewards/chosen": 1.528673529624939, "rewards/margins": 7.904787540435791, "rewards/rejected": -6.3761138916015625, "step": 1585 }, { "epoch": 0.42, "grad_norm": 31.320079803466797, "kl": 0.0, "learning_rate": 2.924627060978801e-07, "logps/chosen": -304.74462890625, "logps/rejected": -261.03466796875, "loss": 0.2907, "rewards/chosen": 1.158564567565918, "rewards/margins": 4.468474388122559, "rewards/rejected": -3.3099098205566406, "step": 1586 }, { "epoch": 0.42, "grad_norm": 32.51560592651367, "kl": 0.0, "learning_rate": 2.923318503009683e-07, "logps/chosen": -239.3182373046875, "logps/rejected": -222.99497985839844, "loss": 0.1696, "rewards/chosen": 0.8448255658149719, "rewards/margins": 4.6665825843811035, "rewards/rejected": -3.8217570781707764, "step": 1587 }, { "epoch": 0.42, "grad_norm": 30.755525588989258, "kl": 0.0, "learning_rate": 2.922009945040565e-07, "logps/chosen": -154.37701416015625, "logps/rejected": -225.18515014648438, "loss": 0.183, "rewards/chosen": -1.5603611469268799, "rewards/margins": 2.311732530593872, "rewards/rejected": -3.872093677520752, "step": 1588 }, { "epoch": 0.42, "grad_norm": 27.795978546142578, "kl": 0.0, "learning_rate": 2.920701387071447e-07, "logps/chosen": -164.93106079101562, "logps/rejected": -216.1005859375, "loss": 0.3562, "rewards/chosen": 0.017641305923461914, "rewards/margins": 3.949174165725708, "rewards/rejected": -3.931532859802246, "step": 1589 }, { "epoch": 0.42, "grad_norm": 32.155128479003906, "kl": 0.0, "learning_rate": 2.919392829102329e-07, "logps/chosen": -161.41615295410156, "logps/rejected": -250.1803436279297, "loss": 0.4051, "rewards/chosen": -0.46750572323799133, "rewards/margins": 2.207667350769043, "rewards/rejected": -2.675173044204712, "step": 1590 }, { "epoch": 0.42, "grad_norm": 32.22792053222656, "kl": 0.0, "learning_rate": 2.9180842711332107e-07, "logps/chosen": -145.3040008544922, "logps/rejected": -203.88059997558594, "loss": 0.2157, "rewards/chosen": 0.2076568603515625, "rewards/margins": 3.002925157546997, "rewards/rejected": -2.7952682971954346, "step": 1591 }, { "epoch": 0.42, "grad_norm": 32.03038024902344, "kl": 0.0, "learning_rate": 2.916775713164093e-07, "logps/chosen": -233.9254913330078, "logps/rejected": -181.0751953125, "loss": 0.254, "rewards/chosen": 0.3560665249824524, "rewards/margins": 3.7291817665100098, "rewards/rejected": -3.373115301132202, "step": 1592 }, { "epoch": 0.42, "grad_norm": 37.68912124633789, "kl": 0.0, "learning_rate": 2.915467155194975e-07, "logps/chosen": -261.59320068359375, "logps/rejected": -266.7197570800781, "loss": 0.3849, "rewards/chosen": 1.6894758939743042, "rewards/margins": 4.872840881347656, "rewards/rejected": -3.1833648681640625, "step": 1593 }, { "epoch": 0.42, "grad_norm": 33.84444046020508, "kl": 0.0, "learning_rate": 2.914158597225857e-07, "logps/chosen": -152.94725036621094, "logps/rejected": -311.8249816894531, "loss": 0.3181, "rewards/chosen": -0.022468430921435356, "rewards/margins": 2.707664728164673, "rewards/rejected": -2.730133056640625, "step": 1594 }, { "epoch": 0.42, "grad_norm": 37.6119384765625, "kl": 0.0, "learning_rate": 2.912850039256739e-07, "logps/chosen": -184.41940307617188, "logps/rejected": -260.7890625, "loss": 0.3186, "rewards/chosen": 0.874193012714386, "rewards/margins": 4.659364223480225, "rewards/rejected": -3.7851712703704834, "step": 1595 }, { "epoch": 0.42, "grad_norm": 37.459774017333984, "kl": 0.0, "learning_rate": 2.911541481287621e-07, "logps/chosen": -184.00550842285156, "logps/rejected": -214.40687561035156, "loss": 0.4171, "rewards/chosen": 0.19082167744636536, "rewards/margins": 2.454958438873291, "rewards/rejected": -2.264136791229248, "step": 1596 }, { "epoch": 0.42, "grad_norm": 38.99348449707031, "kl": 0.0, "learning_rate": 2.910232923318503e-07, "logps/chosen": -286.60687255859375, "logps/rejected": -240.58551025390625, "loss": 0.3764, "rewards/chosen": -0.34489428997039795, "rewards/margins": 3.8633813858032227, "rewards/rejected": -4.20827579498291, "step": 1597 }, { "epoch": 0.42, "grad_norm": 33.473636627197266, "kl": 0.0, "learning_rate": 2.908924365349385e-07, "logps/chosen": -260.33843994140625, "logps/rejected": -172.78604125976562, "loss": 0.3135, "rewards/chosen": 1.2632750272750854, "rewards/margins": 3.5282578468322754, "rewards/rejected": -2.2649829387664795, "step": 1598 }, { "epoch": 0.42, "grad_norm": 31.902114868164062, "kl": 0.0, "learning_rate": 2.907615807380267e-07, "logps/chosen": -172.07113647460938, "logps/rejected": -185.29881286621094, "loss": 0.3438, "rewards/chosen": -0.7174520492553711, "rewards/margins": 1.5515670776367188, "rewards/rejected": -2.26901912689209, "step": 1599 }, { "epoch": 0.42, "grad_norm": 37.15164566040039, "kl": 0.0, "learning_rate": 2.906307249411149e-07, "logps/chosen": -130.16259765625, "logps/rejected": -296.18731689453125, "loss": 0.3613, "rewards/chosen": 0.2204550802707672, "rewards/margins": 4.622489929199219, "rewards/rejected": -4.402034759521484, "step": 1600 }, { "epoch": 0.42, "grad_norm": 34.73413848876953, "kl": 0.0, "learning_rate": 2.904998691442031e-07, "logps/chosen": -184.0595703125, "logps/rejected": -321.77734375, "loss": 0.3076, "rewards/chosen": 1.8415918350219727, "rewards/margins": 6.024647235870361, "rewards/rejected": -4.183055400848389, "step": 1601 }, { "epoch": 0.42, "grad_norm": 33.50428009033203, "kl": 0.0, "learning_rate": 2.9036901334729123e-07, "logps/chosen": -250.47787475585938, "logps/rejected": -304.5994873046875, "loss": 0.3039, "rewards/chosen": -0.5124570727348328, "rewards/margins": 2.7285592555999756, "rewards/rejected": -3.241016387939453, "step": 1602 }, { "epoch": 0.42, "grad_norm": 34.6931037902832, "kl": 0.0, "learning_rate": 2.902381575503794e-07, "logps/chosen": -223.81039428710938, "logps/rejected": -283.75616455078125, "loss": 0.3198, "rewards/chosen": 1.386765718460083, "rewards/margins": 3.858922243118286, "rewards/rejected": -2.472156524658203, "step": 1603 }, { "epoch": 0.42, "grad_norm": 32.15188217163086, "kl": 0.0, "learning_rate": 2.901073017534677e-07, "logps/chosen": -141.97601318359375, "logps/rejected": -281.28228759765625, "loss": 0.2395, "rewards/chosen": 0.25665849447250366, "rewards/margins": 3.629164695739746, "rewards/rejected": -3.3725061416625977, "step": 1604 }, { "epoch": 0.42, "grad_norm": 33.40724182128906, "kl": 0.0, "learning_rate": 2.8997644595655587e-07, "logps/chosen": -256.0122375488281, "logps/rejected": -179.1112060546875, "loss": 0.2602, "rewards/chosen": 3.1538848876953125, "rewards/margins": 5.853321075439453, "rewards/rejected": -2.6994359493255615, "step": 1605 }, { "epoch": 0.42, "grad_norm": 29.446321487426758, "kl": 0.0, "learning_rate": 2.8984559015964407e-07, "logps/chosen": -253.1598663330078, "logps/rejected": -259.9168701171875, "loss": 0.2396, "rewards/chosen": 1.2982040643692017, "rewards/margins": 5.060348033905029, "rewards/rejected": -3.762143850326538, "step": 1606 }, { "epoch": 0.42, "grad_norm": 40.72247314453125, "kl": 0.0, "learning_rate": 2.8971473436273226e-07, "logps/chosen": -217.13844299316406, "logps/rejected": -189.36697387695312, "loss": 0.3974, "rewards/chosen": -0.5850549936294556, "rewards/margins": 1.4722565412521362, "rewards/rejected": -2.057311534881592, "step": 1607 }, { "epoch": 0.42, "grad_norm": 35.95814895629883, "kl": 0.0, "learning_rate": 2.8958387856582046e-07, "logps/chosen": -222.97215270996094, "logps/rejected": -227.67852783203125, "loss": 0.2516, "rewards/chosen": 1.4065536260604858, "rewards/margins": 3.951564311981201, "rewards/rejected": -2.545010566711426, "step": 1608 }, { "epoch": 0.42, "grad_norm": 34.521671295166016, "kl": 0.0, "learning_rate": 2.8945302276890866e-07, "logps/chosen": -234.58786010742188, "logps/rejected": -275.44482421875, "loss": 0.2856, "rewards/chosen": -0.958917498588562, "rewards/margins": 2.5073065757751465, "rewards/rejected": -3.466223955154419, "step": 1609 }, { "epoch": 0.42, "grad_norm": 30.042179107666016, "kl": 0.0, "learning_rate": 2.8932216697199685e-07, "logps/chosen": -130.65647888183594, "logps/rejected": -274.7435302734375, "loss": 0.1877, "rewards/chosen": 0.5573607683181763, "rewards/margins": 3.556046962738037, "rewards/rejected": -2.9986863136291504, "step": 1610 }, { "epoch": 0.42, "grad_norm": 34.71845245361328, "kl": 0.0, "learning_rate": 2.8919131117508505e-07, "logps/chosen": -289.10089111328125, "logps/rejected": -286.6903991699219, "loss": 0.2166, "rewards/chosen": -0.21719495952129364, "rewards/margins": 4.7296366691589355, "rewards/rejected": -4.946831703186035, "step": 1611 }, { "epoch": 0.42, "grad_norm": 33.65607452392578, "kl": 0.0, "learning_rate": 2.8906045537817324e-07, "logps/chosen": -244.82997131347656, "logps/rejected": -242.37782287597656, "loss": 0.3627, "rewards/chosen": -0.37126827239990234, "rewards/margins": 3.5149543285369873, "rewards/rejected": -3.8862226009368896, "step": 1612 }, { "epoch": 0.42, "grad_norm": 38.01091384887695, "kl": 0.0, "learning_rate": 2.8892959958126144e-07, "logps/chosen": -264.7205810546875, "logps/rejected": -233.71192932128906, "loss": 0.361, "rewards/chosen": -0.027559369802474976, "rewards/margins": 3.0955469608306885, "rewards/rejected": -3.1231062412261963, "step": 1613 }, { "epoch": 0.42, "grad_norm": 37.26441955566406, "kl": 0.0, "learning_rate": 2.8879874378434964e-07, "logps/chosen": -205.59405517578125, "logps/rejected": -183.545166015625, "loss": 0.271, "rewards/chosen": 0.7251332998275757, "rewards/margins": 4.0589494705200195, "rewards/rejected": -3.3338160514831543, "step": 1614 }, { "epoch": 0.42, "grad_norm": 39.93470764160156, "kl": 0.0, "learning_rate": 2.8866788798743783e-07, "logps/chosen": -238.8513641357422, "logps/rejected": -219.5674285888672, "loss": 0.2655, "rewards/chosen": 0.4453073740005493, "rewards/margins": 2.642704963684082, "rewards/rejected": -2.197397470474243, "step": 1615 }, { "epoch": 0.42, "grad_norm": 33.35372543334961, "kl": 0.0, "learning_rate": 2.8853703219052603e-07, "logps/chosen": -186.39279174804688, "logps/rejected": -284.56536865234375, "loss": 0.2743, "rewards/chosen": 0.2608222961425781, "rewards/margins": 3.18856143951416, "rewards/rejected": -2.927739143371582, "step": 1616 }, { "epoch": 0.42, "grad_norm": 30.4285945892334, "kl": 0.0, "learning_rate": 2.884061763936142e-07, "logps/chosen": -184.7376251220703, "logps/rejected": -223.98312377929688, "loss": 0.2033, "rewards/chosen": 1.5121128559112549, "rewards/margins": 5.503354549407959, "rewards/rejected": -3.991241693496704, "step": 1617 }, { "epoch": 0.42, "grad_norm": 41.996219635009766, "kl": 0.0, "learning_rate": 2.882753205967024e-07, "logps/chosen": -256.65380859375, "logps/rejected": -216.48294067382812, "loss": 0.3603, "rewards/chosen": 0.522615909576416, "rewards/margins": 2.6449623107910156, "rewards/rejected": -2.1223464012145996, "step": 1618 }, { "epoch": 0.42, "grad_norm": 37.05546569824219, "kl": 0.0, "learning_rate": 2.881444647997906e-07, "logps/chosen": -246.1167449951172, "logps/rejected": -245.74496459960938, "loss": 0.199, "rewards/chosen": 0.8783416748046875, "rewards/margins": 4.541228294372559, "rewards/rejected": -3.66288685798645, "step": 1619 }, { "epoch": 0.42, "grad_norm": 39.39576721191406, "kl": 0.0, "learning_rate": 2.880136090028788e-07, "logps/chosen": -233.797119140625, "logps/rejected": -301.4216003417969, "loss": 0.3432, "rewards/chosen": -0.2727748453617096, "rewards/margins": 2.818376064300537, "rewards/rejected": -3.091150999069214, "step": 1620 }, { "epoch": 0.42, "grad_norm": 44.32394027709961, "kl": 0.0, "learning_rate": 2.87882753205967e-07, "logps/chosen": -172.6717529296875, "logps/rejected": -227.44888305664062, "loss": 0.3056, "rewards/chosen": 2.0932228565216064, "rewards/margins": 4.66046667098999, "rewards/rejected": -2.567243814468384, "step": 1621 }, { "epoch": 0.42, "grad_norm": 30.677011489868164, "kl": 0.0, "learning_rate": 2.877518974090552e-07, "logps/chosen": -221.4599151611328, "logps/rejected": -274.02984619140625, "loss": 0.2468, "rewards/chosen": 1.1671116352081299, "rewards/margins": 4.75871467590332, "rewards/rejected": -3.5916030406951904, "step": 1622 }, { "epoch": 0.42, "grad_norm": 33.010372161865234, "kl": 0.0, "learning_rate": 2.876210416121434e-07, "logps/chosen": -184.25941467285156, "logps/rejected": -311.1826477050781, "loss": 0.3287, "rewards/chosen": 0.30492183566093445, "rewards/margins": 3.5320022106170654, "rewards/rejected": -3.2270803451538086, "step": 1623 }, { "epoch": 0.43, "grad_norm": 28.4550838470459, "kl": 0.0, "learning_rate": 2.874901858152316e-07, "logps/chosen": -156.66748046875, "logps/rejected": -173.7084197998047, "loss": 0.2516, "rewards/chosen": 0.15340088307857513, "rewards/margins": 3.6066744327545166, "rewards/rejected": -3.4532735347747803, "step": 1624 }, { "epoch": 0.43, "grad_norm": 23.95566749572754, "kl": 0.0, "learning_rate": 2.873593300183198e-07, "logps/chosen": -226.8877716064453, "logps/rejected": -283.3754577636719, "loss": 0.2624, "rewards/chosen": -0.21693488955497742, "rewards/margins": 3.2981135845184326, "rewards/rejected": -3.5150485038757324, "step": 1625 }, { "epoch": 0.43, "grad_norm": 30.599740982055664, "kl": 0.0, "learning_rate": 2.87228474221408e-07, "logps/chosen": -221.4315643310547, "logps/rejected": -248.6028594970703, "loss": 0.2275, "rewards/chosen": 0.7559409141540527, "rewards/margins": 4.82999324798584, "rewards/rejected": -4.074052333831787, "step": 1626 }, { "epoch": 0.43, "grad_norm": 39.68366241455078, "kl": 0.0, "learning_rate": 2.870976184244962e-07, "logps/chosen": -265.5011291503906, "logps/rejected": -273.00732421875, "loss": 0.2918, "rewards/chosen": -0.13745827972888947, "rewards/margins": 3.9944634437561035, "rewards/rejected": -4.131921768188477, "step": 1627 }, { "epoch": 0.43, "grad_norm": 33.131900787353516, "kl": 0.0, "learning_rate": 2.869667626275844e-07, "logps/chosen": -255.82313537597656, "logps/rejected": -212.59637451171875, "loss": 0.3587, "rewards/chosen": -0.8583320379257202, "rewards/margins": 1.7786017656326294, "rewards/rejected": -2.6369338035583496, "step": 1628 }, { "epoch": 0.43, "grad_norm": 33.5303840637207, "kl": 0.0, "learning_rate": 2.868359068306726e-07, "logps/chosen": -262.96636962890625, "logps/rejected": -253.9119873046875, "loss": 0.2916, "rewards/chosen": -0.17899669706821442, "rewards/margins": 3.351792097091675, "rewards/rejected": -3.5307888984680176, "step": 1629 }, { "epoch": 0.43, "grad_norm": 34.768192291259766, "kl": 0.0, "learning_rate": 2.867050510337608e-07, "logps/chosen": -226.05429077148438, "logps/rejected": -276.16412353515625, "loss": 0.2543, "rewards/chosen": 0.31507861614227295, "rewards/margins": 3.9771056175231934, "rewards/rejected": -3.662026882171631, "step": 1630 }, { "epoch": 0.43, "grad_norm": 33.05791473388672, "kl": 0.0, "learning_rate": 2.86574195236849e-07, "logps/chosen": -180.6664581298828, "logps/rejected": -284.9426574707031, "loss": 0.2811, "rewards/chosen": 0.8875211477279663, "rewards/margins": 5.335653305053711, "rewards/rejected": -4.448132038116455, "step": 1631 }, { "epoch": 0.43, "grad_norm": 23.5435733795166, "kl": 0.0, "learning_rate": 2.864433394399372e-07, "logps/chosen": -223.41973876953125, "logps/rejected": -323.1280517578125, "loss": 0.1187, "rewards/chosen": 2.1701741218566895, "rewards/margins": 6.089311599731445, "rewards/rejected": -3.919137716293335, "step": 1632 }, { "epoch": 0.43, "grad_norm": 32.40892028808594, "kl": 0.0, "learning_rate": 2.8631248364302536e-07, "logps/chosen": -264.6060485839844, "logps/rejected": -249.17063903808594, "loss": 0.3056, "rewards/chosen": -0.5933135747909546, "rewards/margins": 3.272615909576416, "rewards/rejected": -3.86592960357666, "step": 1633 }, { "epoch": 0.43, "grad_norm": 32.43721008300781, "kl": 0.0, "learning_rate": 2.8618162784611356e-07, "logps/chosen": -224.7136993408203, "logps/rejected": -233.19293212890625, "loss": 0.2528, "rewards/chosen": 0.22769694030284882, "rewards/margins": 3.5428829193115234, "rewards/rejected": -3.315186023712158, "step": 1634 }, { "epoch": 0.43, "grad_norm": 31.453256607055664, "kl": 0.0, "learning_rate": 2.8605077204920175e-07, "logps/chosen": -193.77178955078125, "logps/rejected": -260.65252685546875, "loss": 0.3186, "rewards/chosen": -0.4263240098953247, "rewards/margins": 4.058472156524658, "rewards/rejected": -4.484796047210693, "step": 1635 }, { "epoch": 0.43, "grad_norm": 36.352272033691406, "kl": 0.0, "learning_rate": 2.8591991625228995e-07, "logps/chosen": -212.51431274414062, "logps/rejected": -172.3106231689453, "loss": 0.2725, "rewards/chosen": 0.6394997835159302, "rewards/margins": 3.36637020111084, "rewards/rejected": -2.72687029838562, "step": 1636 }, { "epoch": 0.43, "grad_norm": 30.131851196289062, "kl": 0.0, "learning_rate": 2.8578906045537815e-07, "logps/chosen": -253.44989013671875, "logps/rejected": -225.4195098876953, "loss": 0.2125, "rewards/chosen": 2.016209125518799, "rewards/margins": 6.881429195404053, "rewards/rejected": -4.865220069885254, "step": 1637 }, { "epoch": 0.43, "grad_norm": 29.6743106842041, "kl": 0.0, "learning_rate": 2.8565820465846634e-07, "logps/chosen": -205.046630859375, "logps/rejected": -268.38018798828125, "loss": 0.3314, "rewards/chosen": 0.4701835513114929, "rewards/margins": 4.412789821624756, "rewards/rejected": -3.9426064491271973, "step": 1638 }, { "epoch": 0.43, "grad_norm": 26.07879638671875, "kl": 0.0, "learning_rate": 2.8552734886155454e-07, "logps/chosen": -194.8833465576172, "logps/rejected": -234.7974395751953, "loss": 0.2276, "rewards/chosen": 0.6492434144020081, "rewards/margins": 5.1089558601379395, "rewards/rejected": -4.459712505340576, "step": 1639 }, { "epoch": 0.43, "grad_norm": 33.90639114379883, "kl": 0.0, "learning_rate": 2.8539649306464273e-07, "logps/chosen": -244.555908203125, "logps/rejected": -222.04403686523438, "loss": 0.2405, "rewards/chosen": 1.0766985416412354, "rewards/margins": 5.581542015075684, "rewards/rejected": -4.504843711853027, "step": 1640 }, { "epoch": 0.43, "grad_norm": 42.474082946777344, "kl": 0.0, "learning_rate": 2.8526563726773093e-07, "logps/chosen": -248.51821899414062, "logps/rejected": -244.5760040283203, "loss": 0.3104, "rewards/chosen": -0.07312363386154175, "rewards/margins": 3.078744888305664, "rewards/rejected": -3.1518685817718506, "step": 1641 }, { "epoch": 0.43, "grad_norm": 30.39196014404297, "kl": 0.0, "learning_rate": 2.851347814708191e-07, "logps/chosen": -208.0784912109375, "logps/rejected": -297.77337646484375, "loss": 0.2939, "rewards/chosen": -1.3725045919418335, "rewards/margins": 2.603248119354248, "rewards/rejected": -3.975752592086792, "step": 1642 }, { "epoch": 0.43, "grad_norm": 47.98642349243164, "kl": 0.0, "learning_rate": 2.850039256739074e-07, "logps/chosen": -230.6311798095703, "logps/rejected": -256.6865539550781, "loss": 0.2826, "rewards/chosen": 2.421473979949951, "rewards/margins": 3.9391822814941406, "rewards/rejected": -1.5177083015441895, "step": 1643 }, { "epoch": 0.43, "grad_norm": 41.3521614074707, "kl": 0.0, "learning_rate": 2.8487306987699557e-07, "logps/chosen": -248.044677734375, "logps/rejected": -151.1188201904297, "loss": 0.3185, "rewards/chosen": 0.3247153162956238, "rewards/margins": 1.6584868431091309, "rewards/rejected": -1.3337714672088623, "step": 1644 }, { "epoch": 0.43, "grad_norm": 36.328460693359375, "kl": 0.0, "learning_rate": 2.8474221408008377e-07, "logps/chosen": -200.56027221679688, "logps/rejected": -236.47607421875, "loss": 0.3498, "rewards/chosen": 0.5588264465332031, "rewards/margins": 3.169299602508545, "rewards/rejected": -2.610473155975342, "step": 1645 }, { "epoch": 0.43, "grad_norm": 28.64386749267578, "kl": 0.0, "learning_rate": 2.8461135828317196e-07, "logps/chosen": -160.4168701171875, "logps/rejected": -263.1020202636719, "loss": 0.1606, "rewards/chosen": 2.1916143894195557, "rewards/margins": 6.299075126647949, "rewards/rejected": -4.1074604988098145, "step": 1646 }, { "epoch": 0.43, "grad_norm": 38.80425262451172, "kl": 0.0, "learning_rate": 2.8448050248626016e-07, "logps/chosen": -367.1888427734375, "logps/rejected": -211.77659606933594, "loss": 0.2512, "rewards/chosen": 1.4123479127883911, "rewards/margins": 6.332348346710205, "rewards/rejected": -4.9200005531311035, "step": 1647 }, { "epoch": 0.43, "grad_norm": 39.87413787841797, "kl": 0.0, "learning_rate": 2.843496466893483e-07, "logps/chosen": -147.86988830566406, "logps/rejected": -287.0220031738281, "loss": 0.294, "rewards/chosen": 0.49987924098968506, "rewards/margins": 4.298526763916016, "rewards/rejected": -3.798647403717041, "step": 1648 }, { "epoch": 0.43, "grad_norm": 32.95948791503906, "kl": 0.0, "learning_rate": 2.842187908924365e-07, "logps/chosen": -231.50198364257812, "logps/rejected": -171.44778442382812, "loss": 0.3698, "rewards/chosen": 0.3435051143169403, "rewards/margins": 2.4760706424713135, "rewards/rejected": -2.132565498352051, "step": 1649 }, { "epoch": 0.43, "grad_norm": 30.405078887939453, "kl": 0.0, "learning_rate": 2.840879350955247e-07, "logps/chosen": -105.76648712158203, "logps/rejected": -305.2538757324219, "loss": 0.2833, "rewards/chosen": 0.7598097920417786, "rewards/margins": 4.988456726074219, "rewards/rejected": -4.228646755218506, "step": 1650 }, { "epoch": 0.43, "grad_norm": 31.918275833129883, "kl": 0.0, "learning_rate": 2.839570792986129e-07, "logps/chosen": -237.28346252441406, "logps/rejected": -207.51666259765625, "loss": 0.3442, "rewards/chosen": 0.7215090990066528, "rewards/margins": 3.5226168632507324, "rewards/rejected": -2.80110764503479, "step": 1651 }, { "epoch": 0.43, "grad_norm": 33.554229736328125, "kl": 0.0, "learning_rate": 2.838262235017011e-07, "logps/chosen": -167.70065307617188, "logps/rejected": -213.15670776367188, "loss": 0.2661, "rewards/chosen": 1.394936442375183, "rewards/margins": 4.518024921417236, "rewards/rejected": -3.1230885982513428, "step": 1652 }, { "epoch": 0.43, "grad_norm": 32.72843551635742, "kl": 0.0, "learning_rate": 2.836953677047893e-07, "logps/chosen": -136.93429565429688, "logps/rejected": -387.04998779296875, "loss": 0.3315, "rewards/chosen": 0.10951113700866699, "rewards/margins": 6.089513778686523, "rewards/rejected": -5.980002403259277, "step": 1653 }, { "epoch": 0.43, "grad_norm": 31.057981491088867, "kl": 0.0, "learning_rate": 2.835645119078775e-07, "logps/chosen": -210.75360107421875, "logps/rejected": -283.0705871582031, "loss": 0.2595, "rewards/chosen": 1.8047876358032227, "rewards/margins": 4.513606071472168, "rewards/rejected": -2.708818197250366, "step": 1654 }, { "epoch": 0.43, "grad_norm": 32.37018966674805, "kl": 0.0, "learning_rate": 2.8343365611096573e-07, "logps/chosen": -169.20822143554688, "logps/rejected": -194.51992797851562, "loss": 0.2947, "rewards/chosen": 0.5286230444908142, "rewards/margins": 3.582944631576538, "rewards/rejected": -3.054321527481079, "step": 1655 }, { "epoch": 0.43, "grad_norm": 26.04457664489746, "kl": 0.0, "learning_rate": 2.833028003140539e-07, "logps/chosen": -249.79750061035156, "logps/rejected": -288.1142272949219, "loss": 0.2435, "rewards/chosen": 0.1828829050064087, "rewards/margins": 4.325873851776123, "rewards/rejected": -4.142991065979004, "step": 1656 }, { "epoch": 0.43, "grad_norm": 39.51694869995117, "kl": 0.0, "learning_rate": 2.831719445171421e-07, "logps/chosen": -181.7620849609375, "logps/rejected": -284.8299865722656, "loss": 0.224, "rewards/chosen": 1.3785738945007324, "rewards/margins": 6.513654708862305, "rewards/rejected": -5.135080814361572, "step": 1657 }, { "epoch": 0.43, "grad_norm": 24.030521392822266, "kl": 0.0, "learning_rate": 2.830410887202303e-07, "logps/chosen": -266.92938232421875, "logps/rejected": -246.81884765625, "loss": 0.2942, "rewards/chosen": 0.030593067407608032, "rewards/margins": 4.214050769805908, "rewards/rejected": -4.183457851409912, "step": 1658 }, { "epoch": 0.43, "grad_norm": 39.13831329345703, "kl": 0.0, "learning_rate": 2.829102329233185e-07, "logps/chosen": -185.76181030273438, "logps/rejected": -130.00856018066406, "loss": 0.2973, "rewards/chosen": 0.9576006531715393, "rewards/margins": 3.937793254852295, "rewards/rejected": -2.9801926612854004, "step": 1659 }, { "epoch": 0.43, "grad_norm": 33.2174186706543, "kl": 0.0, "learning_rate": 2.827793771264067e-07, "logps/chosen": -254.79701232910156, "logps/rejected": -287.5552062988281, "loss": 0.2675, "rewards/chosen": 1.2208576202392578, "rewards/margins": 5.56520938873291, "rewards/rejected": -4.344351768493652, "step": 1660 }, { "epoch": 0.43, "grad_norm": 35.70877456665039, "kl": 0.0, "learning_rate": 2.826485213294949e-07, "logps/chosen": -235.49278259277344, "logps/rejected": -189.52891540527344, "loss": 0.2972, "rewards/chosen": 0.6834301352500916, "rewards/margins": 3.502739906311035, "rewards/rejected": -2.819309711456299, "step": 1661 }, { "epoch": 0.43, "grad_norm": 38.852294921875, "kl": 0.0, "learning_rate": 2.825176655325831e-07, "logps/chosen": -183.91111755371094, "logps/rejected": -270.0567932128906, "loss": 0.4571, "rewards/chosen": -0.81363844871521, "rewards/margins": 0.5745820999145508, "rewards/rejected": -1.3882205486297607, "step": 1662 }, { "epoch": 0.44, "grad_norm": 34.467350006103516, "kl": 0.0, "learning_rate": 2.8238680973567124e-07, "logps/chosen": -158.2981719970703, "logps/rejected": -207.15960693359375, "loss": 0.4017, "rewards/chosen": 0.1074838638305664, "rewards/margins": 2.8124186992645264, "rewards/rejected": -2.70493483543396, "step": 1663 }, { "epoch": 0.44, "grad_norm": 30.37034034729004, "kl": 0.0, "learning_rate": 2.8225595393875944e-07, "logps/chosen": -167.8849639892578, "logps/rejected": -258.62823486328125, "loss": 0.2809, "rewards/chosen": 0.7363007664680481, "rewards/margins": 3.5093798637390137, "rewards/rejected": -2.7730791568756104, "step": 1664 }, { "epoch": 0.44, "grad_norm": 31.080869674682617, "kl": 0.0, "learning_rate": 2.8212509814184764e-07, "logps/chosen": -172.04812622070312, "logps/rejected": -267.6941223144531, "loss": 0.2494, "rewards/chosen": -0.2119612991809845, "rewards/margins": 3.346021890640259, "rewards/rejected": -3.557983160018921, "step": 1665 }, { "epoch": 0.44, "grad_norm": 33.17937088012695, "kl": 0.0, "learning_rate": 2.8199424234493583e-07, "logps/chosen": -184.60275268554688, "logps/rejected": -273.01202392578125, "loss": 0.2754, "rewards/chosen": 0.02937476895749569, "rewards/margins": 2.6486494541168213, "rewards/rejected": -2.619274616241455, "step": 1666 }, { "epoch": 0.44, "grad_norm": 37.046207427978516, "kl": 0.0, "learning_rate": 2.8186338654802403e-07, "logps/chosen": -208.9612579345703, "logps/rejected": -206.72579956054688, "loss": 0.1766, "rewards/chosen": 1.7866507768630981, "rewards/margins": 5.889621257781982, "rewards/rejected": -4.102970600128174, "step": 1667 }, { "epoch": 0.44, "grad_norm": 33.24836730957031, "kl": 0.0, "learning_rate": 2.817325307511123e-07, "logps/chosen": -246.86883544921875, "logps/rejected": -160.70315551757812, "loss": 0.3167, "rewards/chosen": 0.5075183510780334, "rewards/margins": 3.0673842430114746, "rewards/rejected": -2.559865951538086, "step": 1668 }, { "epoch": 0.44, "grad_norm": 32.354652404785156, "kl": 0.0, "learning_rate": 2.8160167495420047e-07, "logps/chosen": -188.59686279296875, "logps/rejected": -239.78671264648438, "loss": 0.306, "rewards/chosen": -0.3811313211917877, "rewards/margins": 3.6232974529266357, "rewards/rejected": -4.004428863525391, "step": 1669 }, { "epoch": 0.44, "grad_norm": 42.71672058105469, "kl": 0.0, "learning_rate": 2.8147081915728867e-07, "logps/chosen": -184.1849822998047, "logps/rejected": -235.50355529785156, "loss": 0.2995, "rewards/chosen": 0.20322510600090027, "rewards/margins": 2.809774875640869, "rewards/rejected": -2.6065497398376465, "step": 1670 }, { "epoch": 0.44, "grad_norm": 36.16896438598633, "kl": 0.0, "learning_rate": 2.8133996336037687e-07, "logps/chosen": -271.17999267578125, "logps/rejected": -224.8990478515625, "loss": 0.1511, "rewards/chosen": 1.3531864881515503, "rewards/margins": 4.555708885192871, "rewards/rejected": -3.2025222778320312, "step": 1671 }, { "epoch": 0.44, "grad_norm": 34.2087516784668, "kl": 0.0, "learning_rate": 2.8120910756346506e-07, "logps/chosen": -136.76390075683594, "logps/rejected": -273.3305358886719, "loss": 0.1379, "rewards/chosen": 1.9782171249389648, "rewards/margins": 5.933925628662109, "rewards/rejected": -3.9557087421417236, "step": 1672 }, { "epoch": 0.44, "grad_norm": 60.45607376098633, "kl": 0.0, "learning_rate": 2.8107825176655326e-07, "logps/chosen": -197.25955200195312, "logps/rejected": -257.97955322265625, "loss": 0.2683, "rewards/chosen": 1.5958689451217651, "rewards/margins": 6.442742824554443, "rewards/rejected": -4.846873760223389, "step": 1673 }, { "epoch": 0.44, "grad_norm": 65.07658386230469, "kl": 0.0, "learning_rate": 2.8094739596964145e-07, "logps/chosen": -146.33827209472656, "logps/rejected": -255.15676879882812, "loss": 0.3031, "rewards/chosen": -0.05155050754547119, "rewards/margins": 3.9145121574401855, "rewards/rejected": -3.966062545776367, "step": 1674 }, { "epoch": 0.44, "grad_norm": 32.69587707519531, "kl": 0.0, "learning_rate": 2.8081654017272965e-07, "logps/chosen": -273.9563903808594, "logps/rejected": -226.44253540039062, "loss": 0.2102, "rewards/chosen": 0.08316595107316971, "rewards/margins": 5.162535667419434, "rewards/rejected": -5.07936954498291, "step": 1675 }, { "epoch": 0.44, "grad_norm": 45.86758804321289, "kl": 0.0, "learning_rate": 2.8068568437581785e-07, "logps/chosen": -202.95440673828125, "logps/rejected": -185.07928466796875, "loss": 0.2519, "rewards/chosen": 1.0496774911880493, "rewards/margins": 3.4434638023376465, "rewards/rejected": -2.3937861919403076, "step": 1676 }, { "epoch": 0.44, "grad_norm": 47.8294563293457, "kl": 0.0, "learning_rate": 2.8055482857890604e-07, "logps/chosen": -176.25680541992188, "logps/rejected": -215.69883728027344, "loss": 0.2564, "rewards/chosen": -0.19729217886924744, "rewards/margins": 3.721970319747925, "rewards/rejected": -3.919262409210205, "step": 1677 }, { "epoch": 0.44, "grad_norm": 38.45645523071289, "kl": 0.0, "learning_rate": 2.8042397278199424e-07, "logps/chosen": -204.14828491210938, "logps/rejected": -286.4856872558594, "loss": 0.2808, "rewards/chosen": 0.12513354420661926, "rewards/margins": 1.9805330038070679, "rewards/rejected": -1.855399489402771, "step": 1678 }, { "epoch": 0.44, "grad_norm": 45.83350372314453, "kl": 0.0, "learning_rate": 2.802931169850824e-07, "logps/chosen": -217.64407348632812, "logps/rejected": -231.1699676513672, "loss": 0.3138, "rewards/chosen": 1.533355712890625, "rewards/margins": 3.561047077178955, "rewards/rejected": -2.02769136428833, "step": 1679 }, { "epoch": 0.44, "grad_norm": 55.16447067260742, "kl": 0.0, "learning_rate": 2.801622611881706e-07, "logps/chosen": -186.09637451171875, "logps/rejected": -243.02772521972656, "loss": 0.2658, "rewards/chosen": 1.1252343654632568, "rewards/margins": 4.571606636047363, "rewards/rejected": -3.4463725090026855, "step": 1680 }, { "epoch": 0.44, "grad_norm": 34.269187927246094, "kl": 0.0, "learning_rate": 2.8003140539125883e-07, "logps/chosen": -175.88824462890625, "logps/rejected": -248.73648071289062, "loss": 0.3089, "rewards/chosen": 0.7217296957969666, "rewards/margins": 4.206638336181641, "rewards/rejected": -3.4849085807800293, "step": 1681 }, { "epoch": 0.44, "grad_norm": 28.83681297302246, "kl": 0.0, "learning_rate": 2.79900549594347e-07, "logps/chosen": -217.58006286621094, "logps/rejected": -239.70579528808594, "loss": 0.1725, "rewards/chosen": 0.40505197644233704, "rewards/margins": 4.14391565322876, "rewards/rejected": -3.738863706588745, "step": 1682 }, { "epoch": 0.44, "grad_norm": 48.4196891784668, "kl": 0.0, "learning_rate": 2.797696937974352e-07, "logps/chosen": -257.90679931640625, "logps/rejected": -296.9246826171875, "loss": 0.2987, "rewards/chosen": 2.1349244117736816, "rewards/margins": 6.551563739776611, "rewards/rejected": -4.41663932800293, "step": 1683 }, { "epoch": 0.44, "grad_norm": 44.74823760986328, "kl": 0.0, "learning_rate": 2.796388380005234e-07, "logps/chosen": -266.4541931152344, "logps/rejected": -273.9454345703125, "loss": 0.1819, "rewards/chosen": 1.0883949995040894, "rewards/margins": 5.361190319061279, "rewards/rejected": -4.2727952003479, "step": 1684 }, { "epoch": 0.44, "grad_norm": 37.19141387939453, "kl": 0.0, "learning_rate": 2.795079822036116e-07, "logps/chosen": -178.24195861816406, "logps/rejected": -273.5054626464844, "loss": 0.3304, "rewards/chosen": 0.9247137308120728, "rewards/margins": 5.101280212402344, "rewards/rejected": -4.1765666007995605, "step": 1685 }, { "epoch": 0.44, "grad_norm": 35.41918182373047, "kl": 0.0, "learning_rate": 2.793771264066998e-07, "logps/chosen": -194.33016967773438, "logps/rejected": -309.4744567871094, "loss": 0.2389, "rewards/chosen": 0.6592968702316284, "rewards/margins": 6.351325988769531, "rewards/rejected": -5.692028999328613, "step": 1686 }, { "epoch": 0.44, "grad_norm": 39.674442291259766, "kl": 0.0, "learning_rate": 2.79246270609788e-07, "logps/chosen": -243.66392517089844, "logps/rejected": -168.36773681640625, "loss": 0.3489, "rewards/chosen": 0.40796035528182983, "rewards/margins": 2.883101224899292, "rewards/rejected": -2.4751408100128174, "step": 1687 }, { "epoch": 0.44, "grad_norm": 30.215177536010742, "kl": 0.0, "learning_rate": 2.791154148128762e-07, "logps/chosen": -166.00108337402344, "logps/rejected": -360.5684509277344, "loss": 0.3205, "rewards/chosen": 0.15947306156158447, "rewards/margins": 4.606205463409424, "rewards/rejected": -4.446732521057129, "step": 1688 }, { "epoch": 0.44, "grad_norm": 39.63730239868164, "kl": 0.0, "learning_rate": 2.789845590159644e-07, "logps/chosen": -229.3401336669922, "logps/rejected": -291.5890808105469, "loss": 0.2245, "rewards/chosen": 0.7700627446174622, "rewards/margins": 3.624380111694336, "rewards/rejected": -2.8543174266815186, "step": 1689 }, { "epoch": 0.44, "grad_norm": 35.81988525390625, "kl": 0.0, "learning_rate": 2.788537032190526e-07, "logps/chosen": -183.9010467529297, "logps/rejected": -276.3099365234375, "loss": 0.2818, "rewards/chosen": -0.11290199309587479, "rewards/margins": 4.365841865539551, "rewards/rejected": -4.478744029998779, "step": 1690 }, { "epoch": 0.44, "grad_norm": 40.292118072509766, "kl": 0.0, "learning_rate": 2.787228474221408e-07, "logps/chosen": -292.67730712890625, "logps/rejected": -255.4952392578125, "loss": 0.2497, "rewards/chosen": 0.150015190243721, "rewards/margins": 3.031562089920044, "rewards/rejected": -2.881546974182129, "step": 1691 }, { "epoch": 0.44, "grad_norm": 35.17213821411133, "kl": 0.0, "learning_rate": 2.78591991625229e-07, "logps/chosen": -194.00588989257812, "logps/rejected": -233.5550994873047, "loss": 0.2567, "rewards/chosen": 0.7114308476448059, "rewards/margins": 5.897915363311768, "rewards/rejected": -5.186484336853027, "step": 1692 }, { "epoch": 0.44, "grad_norm": 35.00627517700195, "kl": 0.0, "learning_rate": 2.7846113582831723e-07, "logps/chosen": -139.06361389160156, "logps/rejected": -284.79034423828125, "loss": 0.2337, "rewards/chosen": 1.307264804840088, "rewards/margins": 4.419478416442871, "rewards/rejected": -3.112213611602783, "step": 1693 }, { "epoch": 0.44, "grad_norm": 37.7496337890625, "kl": 0.0, "learning_rate": 2.783302800314054e-07, "logps/chosen": -239.04811096191406, "logps/rejected": -277.545166015625, "loss": 0.3579, "rewards/chosen": -0.09821644425392151, "rewards/margins": 3.1371920108795166, "rewards/rejected": -3.2354085445404053, "step": 1694 }, { "epoch": 0.44, "grad_norm": 42.01094436645508, "kl": 0.0, "learning_rate": 2.7819942423449357e-07, "logps/chosen": -301.322509765625, "logps/rejected": -200.0469512939453, "loss": 0.3572, "rewards/chosen": 1.5365756750106812, "rewards/margins": 4.429783344268799, "rewards/rejected": -2.893207550048828, "step": 1695 }, { "epoch": 0.44, "grad_norm": 33.279029846191406, "kl": 0.0, "learning_rate": 2.7806856843758177e-07, "logps/chosen": -258.8411865234375, "logps/rejected": -286.9688720703125, "loss": 0.1982, "rewards/chosen": 2.530242443084717, "rewards/margins": 5.06218147277832, "rewards/rejected": -2.5319392681121826, "step": 1696 }, { "epoch": 0.44, "grad_norm": 40.62521743774414, "kl": 0.0, "learning_rate": 2.7793771264066996e-07, "logps/chosen": -223.89183044433594, "logps/rejected": -249.8245391845703, "loss": 0.1846, "rewards/chosen": 1.793093204498291, "rewards/margins": 4.362424850463867, "rewards/rejected": -2.5693318843841553, "step": 1697 }, { "epoch": 0.44, "grad_norm": 27.853527069091797, "kl": 0.0, "learning_rate": 2.7780685684375816e-07, "logps/chosen": -175.04196166992188, "logps/rejected": -233.70567321777344, "loss": 0.2352, "rewards/chosen": 0.5203903317451477, "rewards/margins": 4.373768329620361, "rewards/rejected": -3.8533778190612793, "step": 1698 }, { "epoch": 0.44, "grad_norm": 36.20708084106445, "kl": 0.0, "learning_rate": 2.7767600104684636e-07, "logps/chosen": -243.81565856933594, "logps/rejected": -267.9309387207031, "loss": 0.3228, "rewards/chosen": 0.4174182415008545, "rewards/margins": 3.5275862216949463, "rewards/rejected": -3.110167980194092, "step": 1699 }, { "epoch": 0.44, "grad_norm": 31.369003295898438, "kl": 0.0, "learning_rate": 2.7754514524993455e-07, "logps/chosen": -150.89146423339844, "logps/rejected": -216.03077697753906, "loss": 0.19, "rewards/chosen": 0.7634617686271667, "rewards/margins": 3.220472574234009, "rewards/rejected": -2.4570107460021973, "step": 1700 }, { "epoch": 0.45, "grad_norm": 37.170772552490234, "kl": 0.0, "learning_rate": 2.7741428945302275e-07, "logps/chosen": -246.51806640625, "logps/rejected": -232.55331420898438, "loss": 0.2736, "rewards/chosen": 2.2666428089141846, "rewards/margins": 4.5983099937438965, "rewards/rejected": -2.331667184829712, "step": 1701 }, { "epoch": 0.45, "grad_norm": 26.145320892333984, "kl": 0.0, "learning_rate": 2.7728343365611094e-07, "logps/chosen": -138.7021484375, "logps/rejected": -237.99859619140625, "loss": 0.1992, "rewards/chosen": 1.8896682262420654, "rewards/margins": 4.866306304931641, "rewards/rejected": -2.976638078689575, "step": 1702 }, { "epoch": 0.45, "grad_norm": 31.039020538330078, "kl": 0.0, "learning_rate": 2.7715257785919914e-07, "logps/chosen": -202.62603759765625, "logps/rejected": -258.099609375, "loss": 0.1471, "rewards/chosen": 0.5484234690666199, "rewards/margins": 4.4137139320373535, "rewards/rejected": -3.865290403366089, "step": 1703 }, { "epoch": 0.45, "grad_norm": 45.64496994018555, "kl": 0.0, "learning_rate": 2.7702172206228734e-07, "logps/chosen": -198.77719116210938, "logps/rejected": -246.3597412109375, "loss": 0.2946, "rewards/chosen": 0.9636480808258057, "rewards/margins": 4.638223648071289, "rewards/rejected": -3.6745755672454834, "step": 1704 }, { "epoch": 0.45, "grad_norm": 34.864662170410156, "kl": 0.0, "learning_rate": 2.7689086626537553e-07, "logps/chosen": -227.4167938232422, "logps/rejected": -191.95358276367188, "loss": 0.3061, "rewards/chosen": 0.04578828811645508, "rewards/margins": 3.3845396041870117, "rewards/rejected": -3.3387513160705566, "step": 1705 }, { "epoch": 0.45, "grad_norm": 28.03066062927246, "kl": 0.0, "learning_rate": 2.767600104684638e-07, "logps/chosen": -272.80523681640625, "logps/rejected": -262.7962341308594, "loss": 0.3551, "rewards/chosen": -1.5040392875671387, "rewards/margins": 1.8724522590637207, "rewards/rejected": -3.3764915466308594, "step": 1706 }, { "epoch": 0.45, "grad_norm": 32.483184814453125, "kl": 0.0, "learning_rate": 2.76629154671552e-07, "logps/chosen": -178.73095703125, "logps/rejected": -269.7213134765625, "loss": 0.2432, "rewards/chosen": 0.8758571743965149, "rewards/margins": 3.153795003890991, "rewards/rejected": -2.277937889099121, "step": 1707 }, { "epoch": 0.45, "grad_norm": 32.03137969970703, "kl": 0.0, "learning_rate": 2.764982988746402e-07, "logps/chosen": -249.5450897216797, "logps/rejected": -209.1124725341797, "loss": 0.325, "rewards/chosen": -0.8352705836296082, "rewards/margins": 3.4512455463409424, "rewards/rejected": -4.286516189575195, "step": 1708 }, { "epoch": 0.45, "grad_norm": 35.8421516418457, "kl": 0.0, "learning_rate": 2.7636744307772837e-07, "logps/chosen": -259.5403747558594, "logps/rejected": -254.30734252929688, "loss": 0.3344, "rewards/chosen": -1.6663615703582764, "rewards/margins": 2.7338154315948486, "rewards/rejected": -4.400177001953125, "step": 1709 }, { "epoch": 0.45, "grad_norm": 30.6213321685791, "kl": 0.0, "learning_rate": 2.762365872808165e-07, "logps/chosen": -256.17169189453125, "logps/rejected": -184.3090057373047, "loss": 0.3511, "rewards/chosen": 0.3080146908760071, "rewards/margins": 2.139042615890503, "rewards/rejected": -1.8310279846191406, "step": 1710 }, { "epoch": 0.45, "grad_norm": 32.349365234375, "kl": 0.0, "learning_rate": 2.761057314839047e-07, "logps/chosen": -240.81570434570312, "logps/rejected": -275.6083068847656, "loss": 0.2586, "rewards/chosen": 2.547614574432373, "rewards/margins": 6.020605087280273, "rewards/rejected": -3.4729907512664795, "step": 1711 }, { "epoch": 0.45, "grad_norm": 27.55760383605957, "kl": 0.0, "learning_rate": 2.759748756869929e-07, "logps/chosen": -106.76872253417969, "logps/rejected": -210.98382568359375, "loss": 0.3021, "rewards/chosen": 1.2487657070159912, "rewards/margins": 4.193385124206543, "rewards/rejected": -2.944619655609131, "step": 1712 }, { "epoch": 0.45, "grad_norm": 36.03867721557617, "kl": 0.0, "learning_rate": 2.758440198900811e-07, "logps/chosen": -115.28714752197266, "logps/rejected": -229.01974487304688, "loss": 0.2283, "rewards/chosen": 1.1162008047103882, "rewards/margins": 3.21551513671875, "rewards/rejected": -2.0993142127990723, "step": 1713 }, { "epoch": 0.45, "grad_norm": 29.898643493652344, "kl": 0.0, "learning_rate": 2.757131640931693e-07, "logps/chosen": -173.66696166992188, "logps/rejected": -237.84140014648438, "loss": 0.2725, "rewards/chosen": 0.8104872107505798, "rewards/margins": 4.533384799957275, "rewards/rejected": -3.72289776802063, "step": 1714 }, { "epoch": 0.45, "grad_norm": 32.10867691040039, "kl": 0.0, "learning_rate": 2.755823082962575e-07, "logps/chosen": -227.56259155273438, "logps/rejected": -196.11566162109375, "loss": 0.2026, "rewards/chosen": 0.22661995887756348, "rewards/margins": 2.7698512077331543, "rewards/rejected": -2.543231248855591, "step": 1715 }, { "epoch": 0.45, "grad_norm": 53.901634216308594, "kl": 0.0, "learning_rate": 2.754514524993457e-07, "logps/chosen": -241.9610137939453, "logps/rejected": -256.36456298828125, "loss": 0.3786, "rewards/chosen": -1.0330169200897217, "rewards/margins": 2.7314131259918213, "rewards/rejected": -3.764430046081543, "step": 1716 }, { "epoch": 0.45, "grad_norm": 35.57015609741211, "kl": 0.0, "learning_rate": 2.753205967024339e-07, "logps/chosen": -222.4459686279297, "logps/rejected": -249.87548828125, "loss": 0.2803, "rewards/chosen": -0.750407874584198, "rewards/margins": 2.9313929080963135, "rewards/rejected": -3.6818008422851562, "step": 1717 }, { "epoch": 0.45, "grad_norm": 32.74998092651367, "kl": 0.0, "learning_rate": 2.751897409055221e-07, "logps/chosen": -225.82725524902344, "logps/rejected": -207.35427856445312, "loss": 0.2025, "rewards/chosen": 0.190630704164505, "rewards/margins": 4.504660129547119, "rewards/rejected": -4.314029216766357, "step": 1718 }, { "epoch": 0.45, "grad_norm": 30.85138702392578, "kl": 0.0, "learning_rate": 2.7505888510861033e-07, "logps/chosen": -276.42974853515625, "logps/rejected": -217.53627014160156, "loss": 0.2485, "rewards/chosen": 3.411306381225586, "rewards/margins": 6.3637566566467285, "rewards/rejected": -2.9524502754211426, "step": 1719 }, { "epoch": 0.45, "grad_norm": 24.205673217773438, "kl": 0.0, "learning_rate": 2.7492802931169853e-07, "logps/chosen": -173.72286987304688, "logps/rejected": -234.66258239746094, "loss": 0.4371, "rewards/chosen": -0.9166773557662964, "rewards/margins": 2.8205437660217285, "rewards/rejected": -3.7372212409973145, "step": 1720 }, { "epoch": 0.45, "grad_norm": 29.204938888549805, "kl": 0.0, "learning_rate": 2.747971735147867e-07, "logps/chosen": -190.07568359375, "logps/rejected": -196.97793579101562, "loss": 0.2964, "rewards/chosen": 0.2621425688266754, "rewards/margins": 2.3583216667175293, "rewards/rejected": -2.0961790084838867, "step": 1721 }, { "epoch": 0.45, "grad_norm": 53.4695930480957, "kl": 0.0, "learning_rate": 2.746663177178749e-07, "logps/chosen": -144.24964904785156, "logps/rejected": -246.5261688232422, "loss": 0.3908, "rewards/chosen": -1.2694023847579956, "rewards/margins": 1.4854682683944702, "rewards/rejected": -2.754870653152466, "step": 1722 }, { "epoch": 0.45, "grad_norm": 41.367244720458984, "kl": 0.0, "learning_rate": 2.745354619209631e-07, "logps/chosen": -209.56777954101562, "logps/rejected": -255.2227020263672, "loss": 0.3476, "rewards/chosen": 1.0162454843521118, "rewards/margins": 3.2657556533813477, "rewards/rejected": -2.2495102882385254, "step": 1723 }, { "epoch": 0.45, "grad_norm": 34.461116790771484, "kl": 0.0, "learning_rate": 2.744046061240513e-07, "logps/chosen": -206.70022583007812, "logps/rejected": -241.9964141845703, "loss": 0.2432, "rewards/chosen": 0.9106854200363159, "rewards/margins": 5.031620502471924, "rewards/rejected": -4.120934963226318, "step": 1724 }, { "epoch": 0.45, "grad_norm": 30.99492835998535, "kl": 0.0, "learning_rate": 2.7427375032713945e-07, "logps/chosen": -164.06423950195312, "logps/rejected": -181.23422241210938, "loss": 0.2385, "rewards/chosen": 0.9010778069496155, "rewards/margins": 3.430784225463867, "rewards/rejected": -2.5297064781188965, "step": 1725 }, { "epoch": 0.45, "grad_norm": 33.725914001464844, "kl": 0.0, "learning_rate": 2.7414289453022765e-07, "logps/chosen": -194.82861328125, "logps/rejected": -357.1357116699219, "loss": 0.3359, "rewards/chosen": -0.22986668348312378, "rewards/margins": 2.912917137145996, "rewards/rejected": -3.1427838802337646, "step": 1726 }, { "epoch": 0.45, "grad_norm": 29.54648208618164, "kl": 0.0, "learning_rate": 2.7401203873331585e-07, "logps/chosen": -227.89065551757812, "logps/rejected": -245.01356506347656, "loss": 0.2869, "rewards/chosen": 2.4143357276916504, "rewards/margins": 6.6730875968933105, "rewards/rejected": -4.25875186920166, "step": 1727 }, { "epoch": 0.45, "grad_norm": 33.45779037475586, "kl": 0.0, "learning_rate": 2.7388118293640404e-07, "logps/chosen": -226.79429626464844, "logps/rejected": -247.69236755371094, "loss": 0.2158, "rewards/chosen": 0.29212290048599243, "rewards/margins": 3.065626859664917, "rewards/rejected": -2.7735040187835693, "step": 1728 }, { "epoch": 0.45, "grad_norm": 30.237781524658203, "kl": 0.0, "learning_rate": 2.7375032713949224e-07, "logps/chosen": -179.5382080078125, "logps/rejected": -395.1669006347656, "loss": 0.2689, "rewards/chosen": -0.9008402228355408, "rewards/margins": 4.793771266937256, "rewards/rejected": -5.694611549377441, "step": 1729 }, { "epoch": 0.45, "grad_norm": 46.390830993652344, "kl": 0.0, "learning_rate": 2.7361947134258044e-07, "logps/chosen": -272.663330078125, "logps/rejected": -317.2637634277344, "loss": 0.2308, "rewards/chosen": 1.130327582359314, "rewards/margins": 5.195208549499512, "rewards/rejected": -4.064880847930908, "step": 1730 }, { "epoch": 0.45, "grad_norm": 29.513668060302734, "kl": 0.0, "learning_rate": 2.7348861554566863e-07, "logps/chosen": -218.89425659179688, "logps/rejected": -167.77731323242188, "loss": 0.335, "rewards/chosen": 0.20457467436790466, "rewards/margins": 2.1242728233337402, "rewards/rejected": -1.9196981191635132, "step": 1731 }, { "epoch": 0.45, "grad_norm": 36.8236083984375, "kl": 0.0, "learning_rate": 2.733577597487569e-07, "logps/chosen": -163.4239501953125, "logps/rejected": -268.9611511230469, "loss": 0.2571, "rewards/chosen": 1.3878527879714966, "rewards/margins": 5.625244617462158, "rewards/rejected": -4.237391948699951, "step": 1732 }, { "epoch": 0.45, "grad_norm": 36.46755599975586, "kl": 0.0, "learning_rate": 2.732269039518451e-07, "logps/chosen": -219.85479736328125, "logps/rejected": -258.7679443359375, "loss": 0.286, "rewards/chosen": 0.35300034284591675, "rewards/margins": 3.6059610843658447, "rewards/rejected": -3.252960681915283, "step": 1733 }, { "epoch": 0.45, "grad_norm": 37.581642150878906, "kl": 0.0, "learning_rate": 2.7309604815493327e-07, "logps/chosen": -187.14065551757812, "logps/rejected": -280.6612548828125, "loss": 0.2148, "rewards/chosen": 0.6708101630210876, "rewards/margins": 4.311281204223633, "rewards/rejected": -3.6404712200164795, "step": 1734 }, { "epoch": 0.45, "grad_norm": 43.155517578125, "kl": 0.0, "learning_rate": 2.7296519235802147e-07, "logps/chosen": -223.60923767089844, "logps/rejected": -187.82534790039062, "loss": 0.2841, "rewards/chosen": 1.4644616842269897, "rewards/margins": 4.722436428070068, "rewards/rejected": -3.257974863052368, "step": 1735 }, { "epoch": 0.45, "grad_norm": 35.82574462890625, "kl": 0.0, "learning_rate": 2.7283433656110966e-07, "logps/chosen": -220.65611267089844, "logps/rejected": -231.52175903320312, "loss": 0.156, "rewards/chosen": 1.0742979049682617, "rewards/margins": 4.472984790802002, "rewards/rejected": -3.3986868858337402, "step": 1736 }, { "epoch": 0.45, "grad_norm": 36.58572769165039, "kl": 0.0, "learning_rate": 2.7270348076419786e-07, "logps/chosen": -241.86561584472656, "logps/rejected": -254.5074005126953, "loss": 0.2847, "rewards/chosen": -0.3873230814933777, "rewards/margins": 4.528369426727295, "rewards/rejected": -4.915692329406738, "step": 1737 }, { "epoch": 0.45, "grad_norm": 38.22736740112305, "kl": 0.0, "learning_rate": 2.7257262496728606e-07, "logps/chosen": -266.9388427734375, "logps/rejected": -192.8319091796875, "loss": 0.3288, "rewards/chosen": 0.3637565076351166, "rewards/margins": 3.0919291973114014, "rewards/rejected": -2.728172779083252, "step": 1738 }, { "epoch": 0.46, "grad_norm": 27.466691970825195, "kl": 0.0, "learning_rate": 2.7244176917037425e-07, "logps/chosen": -129.4464569091797, "logps/rejected": -252.09034729003906, "loss": 0.2556, "rewards/chosen": 1.1280877590179443, "rewards/margins": 5.6138916015625, "rewards/rejected": -4.485803604125977, "step": 1739 }, { "epoch": 0.46, "grad_norm": 31.164216995239258, "kl": 0.0, "learning_rate": 2.7231091337346245e-07, "logps/chosen": -205.83444213867188, "logps/rejected": -282.27777099609375, "loss": 0.3561, "rewards/chosen": 0.007905125617980957, "rewards/margins": 3.2738075256347656, "rewards/rejected": -3.265902519226074, "step": 1740 }, { "epoch": 0.46, "grad_norm": 39.175819396972656, "kl": 0.0, "learning_rate": 2.721800575765506e-07, "logps/chosen": -189.10853576660156, "logps/rejected": -349.58447265625, "loss": 0.1715, "rewards/chosen": 0.36496955156326294, "rewards/margins": 5.270139694213867, "rewards/rejected": -4.90516996383667, "step": 1741 }, { "epoch": 0.46, "grad_norm": 29.220705032348633, "kl": 0.0, "learning_rate": 2.720492017796388e-07, "logps/chosen": -239.35317993164062, "logps/rejected": -262.12664794921875, "loss": 0.3557, "rewards/chosen": -0.8873398303985596, "rewards/margins": 3.00297474861145, "rewards/rejected": -3.8903145790100098, "step": 1742 }, { "epoch": 0.46, "grad_norm": 30.227474212646484, "kl": 0.0, "learning_rate": 2.71918345982727e-07, "logps/chosen": -168.98941040039062, "logps/rejected": -206.9694366455078, "loss": 0.1996, "rewards/chosen": 0.669614851474762, "rewards/margins": 5.056918144226074, "rewards/rejected": -4.387303352355957, "step": 1743 }, { "epoch": 0.46, "grad_norm": 33.103397369384766, "kl": 0.0, "learning_rate": 2.717874901858152e-07, "logps/chosen": -265.4107971191406, "logps/rejected": -283.2393798828125, "loss": 0.3026, "rewards/chosen": -0.4820014238357544, "rewards/margins": 4.627739429473877, "rewards/rejected": -5.109740734100342, "step": 1744 }, { "epoch": 0.46, "grad_norm": 37.08218765258789, "kl": 0.0, "learning_rate": 2.7165663438890343e-07, "logps/chosen": -183.61221313476562, "logps/rejected": -359.53314208984375, "loss": 0.3758, "rewards/chosen": 0.01924392580986023, "rewards/margins": 2.7295875549316406, "rewards/rejected": -2.710343599319458, "step": 1745 }, { "epoch": 0.46, "grad_norm": 44.7434196472168, "kl": 0.0, "learning_rate": 2.715257785919916e-07, "logps/chosen": -260.5285339355469, "logps/rejected": -237.41989135742188, "loss": 0.3212, "rewards/chosen": 0.23573371767997742, "rewards/margins": 3.459974765777588, "rewards/rejected": -3.224241018295288, "step": 1746 }, { "epoch": 0.46, "grad_norm": 44.934471130371094, "kl": 0.0, "learning_rate": 2.713949227950798e-07, "logps/chosen": -170.7688751220703, "logps/rejected": -228.92611694335938, "loss": 0.3384, "rewards/chosen": 0.4019045829772949, "rewards/margins": 3.453260898590088, "rewards/rejected": -3.051356315612793, "step": 1747 }, { "epoch": 0.46, "grad_norm": 43.90964126586914, "kl": 0.0, "learning_rate": 2.71264066998168e-07, "logps/chosen": -199.80911254882812, "logps/rejected": -300.0240783691406, "loss": 0.3233, "rewards/chosen": -0.9098714590072632, "rewards/margins": 2.1194257736206055, "rewards/rejected": -3.029297351837158, "step": 1748 }, { "epoch": 0.46, "grad_norm": 35.426116943359375, "kl": 0.0, "learning_rate": 2.711332112012562e-07, "logps/chosen": -218.34246826171875, "logps/rejected": -288.5013732910156, "loss": 0.2749, "rewards/chosen": 0.5279122591018677, "rewards/margins": 4.1315693855285645, "rewards/rejected": -3.6036572456359863, "step": 1749 }, { "epoch": 0.46, "grad_norm": 40.60729217529297, "kl": 0.0, "learning_rate": 2.710023554043444e-07, "logps/chosen": -169.89242553710938, "logps/rejected": -271.1588134765625, "loss": 0.3116, "rewards/chosen": 0.8719571828842163, "rewards/margins": 4.5694427490234375, "rewards/rejected": -3.6974854469299316, "step": 1750 }, { "epoch": 0.46, "grad_norm": 35.51378631591797, "kl": 0.0, "learning_rate": 2.708714996074326e-07, "logps/chosen": -209.006103515625, "logps/rejected": -205.5970916748047, "loss": 0.2389, "rewards/chosen": -0.20795372128486633, "rewards/margins": 2.6677086353302, "rewards/rejected": -2.875662326812744, "step": 1751 }, { "epoch": 0.46, "grad_norm": 27.783546447753906, "kl": 0.0, "learning_rate": 2.707406438105208e-07, "logps/chosen": -197.99685668945312, "logps/rejected": -267.1977844238281, "loss": 0.1513, "rewards/chosen": 1.9851152896881104, "rewards/margins": 5.490628719329834, "rewards/rejected": -3.5055134296417236, "step": 1752 }, { "epoch": 0.46, "grad_norm": 34.98855209350586, "kl": 0.0, "learning_rate": 2.70609788013609e-07, "logps/chosen": -233.90687561035156, "logps/rejected": -234.8693389892578, "loss": 0.1836, "rewards/chosen": 0.2806207835674286, "rewards/margins": 4.119460582733154, "rewards/rejected": -3.8388397693634033, "step": 1753 }, { "epoch": 0.46, "grad_norm": 33.62569046020508, "kl": 0.0, "learning_rate": 2.704789322166972e-07, "logps/chosen": -250.41143798828125, "logps/rejected": -276.34075927734375, "loss": 0.2799, "rewards/chosen": 1.218125581741333, "rewards/margins": 6.132073402404785, "rewards/rejected": -4.913948059082031, "step": 1754 }, { "epoch": 0.46, "grad_norm": 28.395240783691406, "kl": 0.0, "learning_rate": 2.703480764197854e-07, "logps/chosen": -160.5257110595703, "logps/rejected": -197.5235137939453, "loss": 0.2393, "rewards/chosen": 0.7340290546417236, "rewards/margins": 5.709027290344238, "rewards/rejected": -4.974998474121094, "step": 1755 }, { "epoch": 0.46, "grad_norm": 31.5032901763916, "kl": 0.0, "learning_rate": 2.7021722062287353e-07, "logps/chosen": -313.3423767089844, "logps/rejected": -189.52862548828125, "loss": 0.4041, "rewards/chosen": 0.41564011573791504, "rewards/margins": 3.1868736743927, "rewards/rejected": -2.771233558654785, "step": 1756 }, { "epoch": 0.46, "grad_norm": 39.107112884521484, "kl": 0.0, "learning_rate": 2.7008636482596173e-07, "logps/chosen": -245.70704650878906, "logps/rejected": -172.10501098632812, "loss": 0.3122, "rewards/chosen": 0.5999122858047485, "rewards/margins": 3.459867000579834, "rewards/rejected": -2.859954833984375, "step": 1757 }, { "epoch": 0.46, "grad_norm": 32.38665771484375, "kl": 0.0, "learning_rate": 2.6995550902905e-07, "logps/chosen": -158.6022186279297, "logps/rejected": -247.68646240234375, "loss": 0.2896, "rewards/chosen": -1.248726725578308, "rewards/margins": 3.0644445419311523, "rewards/rejected": -4.31317138671875, "step": 1758 }, { "epoch": 0.46, "grad_norm": 39.92535400390625, "kl": 0.0, "learning_rate": 2.698246532321382e-07, "logps/chosen": -182.69566345214844, "logps/rejected": -335.43505859375, "loss": 0.1626, "rewards/chosen": 1.3375664949417114, "rewards/margins": 6.369048595428467, "rewards/rejected": -5.031482219696045, "step": 1759 }, { "epoch": 0.46, "grad_norm": 28.15011215209961, "kl": 0.0, "learning_rate": 2.6969379743522637e-07, "logps/chosen": -182.77301025390625, "logps/rejected": -227.71070861816406, "loss": 0.2726, "rewards/chosen": 0.5093837380409241, "rewards/margins": 4.273017883300781, "rewards/rejected": -3.763633966445923, "step": 1760 }, { "epoch": 0.46, "grad_norm": 45.147037506103516, "kl": 0.0, "learning_rate": 2.6956294163831457e-07, "logps/chosen": -178.78170776367188, "logps/rejected": -242.56234741210938, "loss": 0.3427, "rewards/chosen": 0.8544749021530151, "rewards/margins": 4.4317498207092285, "rewards/rejected": -3.577275037765503, "step": 1761 }, { "epoch": 0.46, "grad_norm": 42.30143356323242, "kl": 0.0, "learning_rate": 2.6943208584140276e-07, "logps/chosen": -325.6743469238281, "logps/rejected": -199.9776611328125, "loss": 0.2854, "rewards/chosen": 0.4093678891658783, "rewards/margins": 2.5241336822509766, "rewards/rejected": -2.1147658824920654, "step": 1762 }, { "epoch": 0.46, "grad_norm": 31.528766632080078, "kl": 0.0, "learning_rate": 2.6930123004449096e-07, "logps/chosen": -166.7415771484375, "logps/rejected": -233.80612182617188, "loss": 0.2507, "rewards/chosen": 0.9934232831001282, "rewards/margins": 5.178330898284912, "rewards/rejected": -4.18490743637085, "step": 1763 }, { "epoch": 0.46, "grad_norm": 28.84537696838379, "kl": 0.0, "learning_rate": 2.6917037424757916e-07, "logps/chosen": -225.96017456054688, "logps/rejected": -223.4440155029297, "loss": 0.1548, "rewards/chosen": 2.732337236404419, "rewards/margins": 7.202681541442871, "rewards/rejected": -4.470344543457031, "step": 1764 }, { "epoch": 0.46, "grad_norm": 42.9340705871582, "kl": 0.0, "learning_rate": 2.6903951845066735e-07, "logps/chosen": -250.65216064453125, "logps/rejected": -240.86422729492188, "loss": 0.3431, "rewards/chosen": 0.7281718254089355, "rewards/margins": 2.66864013671875, "rewards/rejected": -1.9404683113098145, "step": 1765 }, { "epoch": 0.46, "grad_norm": 23.091651916503906, "kl": 0.0, "learning_rate": 2.6890866265375555e-07, "logps/chosen": -188.62490844726562, "logps/rejected": -295.15179443359375, "loss": 0.2365, "rewards/chosen": 1.522935152053833, "rewards/margins": 4.846446990966797, "rewards/rejected": -3.323512077331543, "step": 1766 }, { "epoch": 0.46, "grad_norm": 34.22391128540039, "kl": 0.0, "learning_rate": 2.6877780685684374e-07, "logps/chosen": -237.4374237060547, "logps/rejected": -255.03697204589844, "loss": 0.259, "rewards/chosen": 1.1581065654754639, "rewards/margins": 5.033398628234863, "rewards/rejected": -3.8752918243408203, "step": 1767 }, { "epoch": 0.46, "grad_norm": 32.2965202331543, "kl": 0.0, "learning_rate": 2.6864695105993194e-07, "logps/chosen": -227.5531005859375, "logps/rejected": -224.8191680908203, "loss": 0.3165, "rewards/chosen": 0.9469077587127686, "rewards/margins": 4.652983665466309, "rewards/rejected": -3.706075668334961, "step": 1768 }, { "epoch": 0.46, "grad_norm": 32.94017791748047, "kl": 0.0, "learning_rate": 2.6851609526302014e-07, "logps/chosen": -219.24993896484375, "logps/rejected": -199.1254119873047, "loss": 0.2666, "rewards/chosen": 0.9429276585578918, "rewards/margins": 5.1648478507995605, "rewards/rejected": -4.221920013427734, "step": 1769 }, { "epoch": 0.46, "grad_norm": 22.41168975830078, "kl": 0.0, "learning_rate": 2.683852394661084e-07, "logps/chosen": -125.49591064453125, "logps/rejected": -276.9358215332031, "loss": 0.2313, "rewards/chosen": 0.19119969010353088, "rewards/margins": 4.500891208648682, "rewards/rejected": -4.309691429138184, "step": 1770 }, { "epoch": 0.46, "grad_norm": 42.297550201416016, "kl": 0.0, "learning_rate": 2.682543836691966e-07, "logps/chosen": -145.0494384765625, "logps/rejected": -273.5556640625, "loss": 0.2749, "rewards/chosen": 0.5761336088180542, "rewards/margins": 4.615227699279785, "rewards/rejected": -4.039093971252441, "step": 1771 }, { "epoch": 0.46, "grad_norm": 34.109439849853516, "kl": 0.0, "learning_rate": 2.681235278722847e-07, "logps/chosen": -151.40411376953125, "logps/rejected": -197.5521240234375, "loss": 0.2645, "rewards/chosen": -0.12928418815135956, "rewards/margins": 2.1960153579711914, "rewards/rejected": -2.3252995014190674, "step": 1772 }, { "epoch": 0.46, "grad_norm": 32.29287338256836, "kl": 0.0, "learning_rate": 2.679926720753729e-07, "logps/chosen": -236.3354949951172, "logps/rejected": -219.23182678222656, "loss": 0.2023, "rewards/chosen": 1.2813626527786255, "rewards/margins": 3.9799628257751465, "rewards/rejected": -2.6986002922058105, "step": 1773 }, { "epoch": 0.46, "grad_norm": 31.360984802246094, "kl": 0.0, "learning_rate": 2.678618162784611e-07, "logps/chosen": -206.96392822265625, "logps/rejected": -266.0826416015625, "loss": 0.2598, "rewards/chosen": 0.18017417192459106, "rewards/margins": 3.8584301471710205, "rewards/rejected": -3.678256034851074, "step": 1774 }, { "epoch": 0.46, "grad_norm": 38.270694732666016, "kl": 0.0, "learning_rate": 2.677309604815493e-07, "logps/chosen": -201.98492431640625, "logps/rejected": -284.6060485839844, "loss": 0.3326, "rewards/chosen": 0.169302299618721, "rewards/margins": 2.283196210861206, "rewards/rejected": -2.113893985748291, "step": 1775 }, { "epoch": 0.46, "grad_norm": 27.302358627319336, "kl": 0.0, "learning_rate": 2.676001046846375e-07, "logps/chosen": -152.28955078125, "logps/rejected": -331.47662353515625, "loss": 0.1946, "rewards/chosen": 0.48952898383140564, "rewards/margins": 4.17086124420166, "rewards/rejected": -3.6813323497772217, "step": 1776 }, { "epoch": 0.47, "grad_norm": 28.06827163696289, "kl": 0.0, "learning_rate": 2.674692488877257e-07, "logps/chosen": -253.61843872070312, "logps/rejected": -256.72442626953125, "loss": 0.3883, "rewards/chosen": -0.4867596924304962, "rewards/margins": 3.532467842102051, "rewards/rejected": -4.019227504730225, "step": 1777 }, { "epoch": 0.47, "grad_norm": 28.32488250732422, "kl": 0.0, "learning_rate": 2.673383930908139e-07, "logps/chosen": -273.54400634765625, "logps/rejected": -182.26773071289062, "loss": 0.3289, "rewards/chosen": 0.13828414678573608, "rewards/margins": 2.294508934020996, "rewards/rejected": -2.1562247276306152, "step": 1778 }, { "epoch": 0.47, "grad_norm": 29.519807815551758, "kl": 0.0, "learning_rate": 2.672075372939021e-07, "logps/chosen": -183.9783477783203, "logps/rejected": -324.3849792480469, "loss": 0.3371, "rewards/chosen": -0.7770126461982727, "rewards/margins": 5.614282131195068, "rewards/rejected": -6.391294956207275, "step": 1779 }, { "epoch": 0.47, "grad_norm": 35.09743881225586, "kl": 0.0, "learning_rate": 2.670766814969903e-07, "logps/chosen": -126.46038055419922, "logps/rejected": -287.030517578125, "loss": 0.3135, "rewards/chosen": 0.18409352004528046, "rewards/margins": 3.7861502170562744, "rewards/rejected": -3.6020567417144775, "step": 1780 }, { "epoch": 0.47, "grad_norm": 36.5549201965332, "kl": 0.0, "learning_rate": 2.669458257000785e-07, "logps/chosen": -218.15708923339844, "logps/rejected": -216.2779998779297, "loss": 0.2754, "rewards/chosen": 0.7770873308181763, "rewards/margins": 3.693025588989258, "rewards/rejected": -2.915938138961792, "step": 1781 }, { "epoch": 0.47, "grad_norm": 34.714717864990234, "kl": 0.0, "learning_rate": 2.668149699031667e-07, "logps/chosen": -266.4847412109375, "logps/rejected": -261.268798828125, "loss": 0.3335, "rewards/chosen": 0.46225249767303467, "rewards/margins": 2.131314992904663, "rewards/rejected": -1.6690624952316284, "step": 1782 }, { "epoch": 0.47, "grad_norm": 29.801822662353516, "kl": 0.0, "learning_rate": 2.6668411410625493e-07, "logps/chosen": -214.0922393798828, "logps/rejected": -276.224609375, "loss": 0.3039, "rewards/chosen": 0.2218581587076187, "rewards/margins": 3.801252603530884, "rewards/rejected": -3.5793943405151367, "step": 1783 }, { "epoch": 0.47, "grad_norm": 27.524829864501953, "kl": 0.0, "learning_rate": 2.6655325830934313e-07, "logps/chosen": -186.962890625, "logps/rejected": -252.66331481933594, "loss": 0.2315, "rewards/chosen": -0.20698793232440948, "rewards/margins": 4.5711774826049805, "rewards/rejected": -4.778165340423584, "step": 1784 }, { "epoch": 0.47, "grad_norm": 28.168418884277344, "kl": 0.0, "learning_rate": 2.664224025124313e-07, "logps/chosen": -116.75592803955078, "logps/rejected": -254.7372283935547, "loss": 0.2833, "rewards/chosen": -0.1261283904314041, "rewards/margins": 1.8637630939483643, "rewards/rejected": -1.989891529083252, "step": 1785 }, { "epoch": 0.47, "grad_norm": 30.8496036529541, "kl": 0.0, "learning_rate": 2.662915467155195e-07, "logps/chosen": -305.5096130371094, "logps/rejected": -225.94699096679688, "loss": 0.1714, "rewards/chosen": 3.5320422649383545, "rewards/margins": 7.76275634765625, "rewards/rejected": -4.230713844299316, "step": 1786 }, { "epoch": 0.47, "grad_norm": 30.151742935180664, "kl": 0.0, "learning_rate": 2.6616069091860767e-07, "logps/chosen": -148.5421905517578, "logps/rejected": -241.40003967285156, "loss": 0.2474, "rewards/chosen": 1.5538842678070068, "rewards/margins": 4.725393295288086, "rewards/rejected": -3.171509027481079, "step": 1787 }, { "epoch": 0.47, "grad_norm": 29.541250228881836, "kl": 0.0, "learning_rate": 2.6602983512169586e-07, "logps/chosen": -237.12210083007812, "logps/rejected": -302.9940490722656, "loss": 0.2254, "rewards/chosen": 0.5165008902549744, "rewards/margins": 7.595618724822998, "rewards/rejected": -7.079117774963379, "step": 1788 }, { "epoch": 0.47, "grad_norm": 26.707609176635742, "kl": 0.0, "learning_rate": 2.6589897932478406e-07, "logps/chosen": -168.9550018310547, "logps/rejected": -295.4699401855469, "loss": 0.2857, "rewards/chosen": 1.3008852005004883, "rewards/margins": 5.830302715301514, "rewards/rejected": -4.529417514801025, "step": 1789 }, { "epoch": 0.47, "grad_norm": 36.683982849121094, "kl": 0.0, "learning_rate": 2.6576812352787225e-07, "logps/chosen": -150.46591186523438, "logps/rejected": -191.10302734375, "loss": 0.361, "rewards/chosen": -0.044897012412548065, "rewards/margins": 2.294562339782715, "rewards/rejected": -2.3394594192504883, "step": 1790 }, { "epoch": 0.47, "grad_norm": 34.28142166137695, "kl": 0.0, "learning_rate": 2.6563726773096045e-07, "logps/chosen": -201.0179901123047, "logps/rejected": -262.7911071777344, "loss": 0.2706, "rewards/chosen": -0.44374069571495056, "rewards/margins": 6.126780986785889, "rewards/rejected": -6.570521831512451, "step": 1791 }, { "epoch": 0.47, "grad_norm": 28.13138771057129, "kl": 0.0, "learning_rate": 2.6550641193404865e-07, "logps/chosen": -272.98065185546875, "logps/rejected": -279.70684814453125, "loss": 0.2232, "rewards/chosen": 1.1704599857330322, "rewards/margins": 5.974841117858887, "rewards/rejected": -4.804380893707275, "step": 1792 }, { "epoch": 0.47, "grad_norm": 36.26626968383789, "kl": 0.0, "learning_rate": 2.6537555613713684e-07, "logps/chosen": -191.0433349609375, "logps/rejected": -271.6612854003906, "loss": 0.2392, "rewards/chosen": 1.1138032674789429, "rewards/margins": 4.200105667114258, "rewards/rejected": -3.0863025188446045, "step": 1793 }, { "epoch": 0.47, "grad_norm": 41.627620697021484, "kl": 0.0, "learning_rate": 2.6524470034022504e-07, "logps/chosen": -176.78253173828125, "logps/rejected": -245.73622131347656, "loss": 0.3471, "rewards/chosen": -1.435298204421997, "rewards/margins": 1.6505019664764404, "rewards/rejected": -3.0858001708984375, "step": 1794 }, { "epoch": 0.47, "grad_norm": 29.305988311767578, "kl": 0.0, "learning_rate": 2.6511384454331323e-07, "logps/chosen": -209.2613983154297, "logps/rejected": -171.62149047851562, "loss": 0.2582, "rewards/chosen": 0.047622814774513245, "rewards/margins": 2.757302761077881, "rewards/rejected": -2.7096798419952393, "step": 1795 }, { "epoch": 0.47, "grad_norm": 33.22182083129883, "kl": 0.0, "learning_rate": 2.649829887464015e-07, "logps/chosen": -156.6744384765625, "logps/rejected": -246.99066162109375, "loss": 0.3714, "rewards/chosen": -1.121241569519043, "rewards/margins": 4.636855125427246, "rewards/rejected": -5.758096694946289, "step": 1796 }, { "epoch": 0.47, "grad_norm": 33.60297775268555, "kl": 0.0, "learning_rate": 2.648521329494897e-07, "logps/chosen": -240.12210083007812, "logps/rejected": -282.920654296875, "loss": 0.2713, "rewards/chosen": 1.0724228620529175, "rewards/margins": 4.727004528045654, "rewards/rejected": -3.6545817852020264, "step": 1797 }, { "epoch": 0.47, "grad_norm": 37.307891845703125, "kl": 0.0, "learning_rate": 2.647212771525779e-07, "logps/chosen": -168.95399475097656, "logps/rejected": -182.30612182617188, "loss": 0.1923, "rewards/chosen": 1.4341641664505005, "rewards/margins": 4.384321689605713, "rewards/rejected": -2.950157642364502, "step": 1798 }, { "epoch": 0.47, "grad_norm": 39.907894134521484, "kl": 0.0, "learning_rate": 2.6459042135566607e-07, "logps/chosen": -244.38983154296875, "logps/rejected": -295.0066833496094, "loss": 0.2557, "rewards/chosen": 1.7292518615722656, "rewards/margins": 4.0041303634643555, "rewards/rejected": -2.27487850189209, "step": 1799 }, { "epoch": 0.47, "grad_norm": 29.53258514404297, "kl": 0.0, "learning_rate": 2.6445956555875427e-07, "logps/chosen": -147.8897705078125, "logps/rejected": -284.5010070800781, "loss": 0.3865, "rewards/chosen": -0.3604433536529541, "rewards/margins": 1.9473583698272705, "rewards/rejected": -2.3078017234802246, "step": 1800 }, { "epoch": 0.47, "grad_norm": 39.97542190551758, "kl": 0.0, "learning_rate": 2.6432870976184246e-07, "logps/chosen": -262.21710205078125, "logps/rejected": -261.5345458984375, "loss": 0.269, "rewards/chosen": 0.1343774050474167, "rewards/margins": 2.3693342208862305, "rewards/rejected": -2.234956741333008, "step": 1801 }, { "epoch": 0.47, "grad_norm": 42.28630828857422, "kl": 0.0, "learning_rate": 2.6419785396493066e-07, "logps/chosen": -169.6084442138672, "logps/rejected": -211.6229248046875, "loss": 0.4199, "rewards/chosen": -0.3721810281276703, "rewards/margins": 2.8687891960144043, "rewards/rejected": -3.2409701347351074, "step": 1802 }, { "epoch": 0.47, "grad_norm": 28.243871688842773, "kl": 0.0, "learning_rate": 2.640669981680188e-07, "logps/chosen": -220.0828857421875, "logps/rejected": -210.42112731933594, "loss": 0.3123, "rewards/chosen": -0.20898278057575226, "rewards/margins": 2.981920003890991, "rewards/rejected": -3.1909027099609375, "step": 1803 }, { "epoch": 0.47, "grad_norm": 29.4338436126709, "kl": 0.0, "learning_rate": 2.63936142371107e-07, "logps/chosen": -184.46368408203125, "logps/rejected": -190.14401245117188, "loss": 0.337, "rewards/chosen": 0.3699902296066284, "rewards/margins": 2.436811923980713, "rewards/rejected": -2.066821575164795, "step": 1804 }, { "epoch": 0.47, "grad_norm": 31.90092658996582, "kl": 0.0, "learning_rate": 2.638052865741952e-07, "logps/chosen": -219.99734497070312, "logps/rejected": -307.57318115234375, "loss": 0.2219, "rewards/chosen": 1.241980791091919, "rewards/margins": 6.160677909851074, "rewards/rejected": -4.918696880340576, "step": 1805 }, { "epoch": 0.47, "grad_norm": 34.10798645019531, "kl": 0.0, "learning_rate": 2.636744307772834e-07, "logps/chosen": -262.48138427734375, "logps/rejected": -194.05780029296875, "loss": 0.2215, "rewards/chosen": -1.38141667842865, "rewards/margins": 1.2182193994522095, "rewards/rejected": -2.5996360778808594, "step": 1806 }, { "epoch": 0.47, "grad_norm": 37.151607513427734, "kl": 0.0, "learning_rate": 2.635435749803716e-07, "logps/chosen": -222.68251037597656, "logps/rejected": -282.22100830078125, "loss": 0.197, "rewards/chosen": 0.9315857887268066, "rewards/margins": 4.866264820098877, "rewards/rejected": -3.9346790313720703, "step": 1807 }, { "epoch": 0.47, "grad_norm": 45.96467208862305, "kl": 0.0, "learning_rate": 2.634127191834598e-07, "logps/chosen": -260.59710693359375, "logps/rejected": -211.7723388671875, "loss": 0.3401, "rewards/chosen": 0.4457830488681793, "rewards/margins": 2.8025193214416504, "rewards/rejected": -2.356736183166504, "step": 1808 }, { "epoch": 0.47, "grad_norm": 36.53901672363281, "kl": 0.0, "learning_rate": 2.6328186338654803e-07, "logps/chosen": -229.66275024414062, "logps/rejected": -205.681396484375, "loss": 0.4063, "rewards/chosen": -0.6894323229789734, "rewards/margins": 1.5290522575378418, "rewards/rejected": -2.21848464012146, "step": 1809 }, { "epoch": 0.47, "grad_norm": 38.05723190307617, "kl": 0.0, "learning_rate": 2.6315100758963623e-07, "logps/chosen": -233.05426025390625, "logps/rejected": -192.11688232421875, "loss": 0.3905, "rewards/chosen": -0.518449068069458, "rewards/margins": 1.3643622398376465, "rewards/rejected": -1.8828113079071045, "step": 1810 }, { "epoch": 0.47, "grad_norm": 39.44814682006836, "kl": 0.0, "learning_rate": 2.630201517927244e-07, "logps/chosen": -165.95632934570312, "logps/rejected": -271.4263916015625, "loss": 0.2897, "rewards/chosen": 0.6428464651107788, "rewards/margins": 4.1496405601501465, "rewards/rejected": -3.5067942142486572, "step": 1811 }, { "epoch": 0.47, "grad_norm": 30.81804656982422, "kl": 0.0, "learning_rate": 2.628892959958126e-07, "logps/chosen": -235.5777587890625, "logps/rejected": -223.3262939453125, "loss": 0.2791, "rewards/chosen": 0.08995027095079422, "rewards/margins": 3.556304931640625, "rewards/rejected": -3.4663546085357666, "step": 1812 }, { "epoch": 0.47, "grad_norm": 33.08573913574219, "kl": 0.0, "learning_rate": 2.627584401989008e-07, "logps/chosen": -225.62367248535156, "logps/rejected": -210.410400390625, "loss": 0.2657, "rewards/chosen": -0.42150986194610596, "rewards/margins": 2.2237415313720703, "rewards/rejected": -2.645251512527466, "step": 1813 }, { "epoch": 0.47, "grad_norm": 32.53742218017578, "kl": 0.0, "learning_rate": 2.62627584401989e-07, "logps/chosen": -182.93365478515625, "logps/rejected": -269.56402587890625, "loss": 0.3082, "rewards/chosen": 0.7558895349502563, "rewards/margins": 4.601802349090576, "rewards/rejected": -3.8459129333496094, "step": 1814 }, { "epoch": 0.48, "grad_norm": 35.428428649902344, "kl": 0.0, "learning_rate": 2.624967286050772e-07, "logps/chosen": -179.17361450195312, "logps/rejected": -357.9075012207031, "loss": 0.3118, "rewards/chosen": -0.028548792004585266, "rewards/margins": 4.181148052215576, "rewards/rejected": -4.2096967697143555, "step": 1815 }, { "epoch": 0.48, "grad_norm": 32.03295135498047, "kl": 0.0, "learning_rate": 2.623658728081654e-07, "logps/chosen": -155.4049530029297, "logps/rejected": -197.88656616210938, "loss": 0.2179, "rewards/chosen": 1.8276541233062744, "rewards/margins": 4.273235321044922, "rewards/rejected": -2.4455809593200684, "step": 1816 }, { "epoch": 0.48, "grad_norm": 28.16794204711914, "kl": 0.0, "learning_rate": 2.622350170112536e-07, "logps/chosen": -196.6329803466797, "logps/rejected": -276.8796691894531, "loss": 0.3296, "rewards/chosen": -0.34935376048088074, "rewards/margins": 2.847820281982422, "rewards/rejected": -3.197174072265625, "step": 1817 }, { "epoch": 0.48, "grad_norm": 24.694307327270508, "kl": 0.0, "learning_rate": 2.6210416121434174e-07, "logps/chosen": -167.74771118164062, "logps/rejected": -248.10716247558594, "loss": 0.2452, "rewards/chosen": 0.8411151170730591, "rewards/margins": 4.5710296630859375, "rewards/rejected": -3.729914665222168, "step": 1818 }, { "epoch": 0.48, "grad_norm": 32.40667724609375, "kl": 0.0, "learning_rate": 2.6197330541742994e-07, "logps/chosen": -167.05648803710938, "logps/rejected": -260.3861083984375, "loss": 0.3702, "rewards/chosen": -0.3283267021179199, "rewards/margins": 1.5936791896820068, "rewards/rejected": -1.9220058917999268, "step": 1819 }, { "epoch": 0.48, "grad_norm": 29.981260299682617, "kl": 0.0, "learning_rate": 2.6184244962051814e-07, "logps/chosen": -205.59954833984375, "logps/rejected": -240.790771484375, "loss": 0.3203, "rewards/chosen": 0.49631327390670776, "rewards/margins": 3.0999534130096436, "rewards/rejected": -2.603640079498291, "step": 1820 }, { "epoch": 0.48, "grad_norm": 39.830020904541016, "kl": 0.0, "learning_rate": 2.6171159382360633e-07, "logps/chosen": -309.5193176269531, "logps/rejected": -225.8451690673828, "loss": 0.4059, "rewards/chosen": -0.7414736747741699, "rewards/margins": 1.5210020542144775, "rewards/rejected": -2.2624757289886475, "step": 1821 }, { "epoch": 0.48, "grad_norm": 28.099843978881836, "kl": 0.0, "learning_rate": 2.615807380266946e-07, "logps/chosen": -139.35086059570312, "logps/rejected": -258.74658203125, "loss": 0.3531, "rewards/chosen": 0.06932894885540009, "rewards/margins": 3.5908191204071045, "rewards/rejected": -3.5214900970458984, "step": 1822 }, { "epoch": 0.48, "grad_norm": 33.78807067871094, "kl": 0.0, "learning_rate": 2.614498822297828e-07, "logps/chosen": -235.4106903076172, "logps/rejected": -252.2579803466797, "loss": 0.2059, "rewards/chosen": 1.0817266702651978, "rewards/margins": 3.7266507148742676, "rewards/rejected": -2.6449241638183594, "step": 1823 }, { "epoch": 0.48, "grad_norm": 26.09669303894043, "kl": 0.0, "learning_rate": 2.61319026432871e-07, "logps/chosen": -201.91397094726562, "logps/rejected": -203.9112091064453, "loss": 0.2103, "rewards/chosen": 0.38464394211769104, "rewards/margins": 4.490221977233887, "rewards/rejected": -4.1055779457092285, "step": 1824 }, { "epoch": 0.48, "grad_norm": 32.747703552246094, "kl": 0.0, "learning_rate": 2.6118817063595917e-07, "logps/chosen": -130.88514709472656, "logps/rejected": -275.74755859375, "loss": 0.2718, "rewards/chosen": 0.1479092687368393, "rewards/margins": 5.113210201263428, "rewards/rejected": -4.965301036834717, "step": 1825 }, { "epoch": 0.48, "grad_norm": 33.221134185791016, "kl": 0.0, "learning_rate": 2.6105731483904737e-07, "logps/chosen": -268.6279296875, "logps/rejected": -209.37034606933594, "loss": 0.2728, "rewards/chosen": -0.01809675432741642, "rewards/margins": 3.1155669689178467, "rewards/rejected": -3.1336636543273926, "step": 1826 }, { "epoch": 0.48, "grad_norm": 33.30061340332031, "kl": 0.0, "learning_rate": 2.6092645904213556e-07, "logps/chosen": -270.9553527832031, "logps/rejected": -303.281005859375, "loss": 0.2219, "rewards/chosen": 1.86402428150177, "rewards/margins": 5.579380989074707, "rewards/rejected": -3.7153568267822266, "step": 1827 }, { "epoch": 0.48, "grad_norm": 35.51240921020508, "kl": 0.0, "learning_rate": 2.6079560324522376e-07, "logps/chosen": -212.87295532226562, "logps/rejected": -247.5990753173828, "loss": 0.2996, "rewards/chosen": 0.5374855399131775, "rewards/margins": 4.391886234283447, "rewards/rejected": -3.854400634765625, "step": 1828 }, { "epoch": 0.48, "grad_norm": 43.96573257446289, "kl": 0.0, "learning_rate": 2.6066474744831195e-07, "logps/chosen": -307.2767333984375, "logps/rejected": -211.69451904296875, "loss": 0.338, "rewards/chosen": -0.34222811460494995, "rewards/margins": 2.2243869304656982, "rewards/rejected": -2.566615104675293, "step": 1829 }, { "epoch": 0.48, "grad_norm": 34.17534255981445, "kl": 0.0, "learning_rate": 2.6053389165140015e-07, "logps/chosen": -189.63507080078125, "logps/rejected": -239.6513214111328, "loss": 0.2713, "rewards/chosen": -0.28839311003685, "rewards/margins": 3.142786741256714, "rewards/rejected": -3.4311797618865967, "step": 1830 }, { "epoch": 0.48, "grad_norm": 34.881080627441406, "kl": 0.0, "learning_rate": 2.6040303585448835e-07, "logps/chosen": -201.80218505859375, "logps/rejected": -237.1852569580078, "loss": 0.2662, "rewards/chosen": 0.23056668043136597, "rewards/margins": 3.316105604171753, "rewards/rejected": -3.085538864135742, "step": 1831 }, { "epoch": 0.48, "grad_norm": 35.58689880371094, "kl": 0.0, "learning_rate": 2.6027218005757654e-07, "logps/chosen": -205.9733123779297, "logps/rejected": -214.0948944091797, "loss": 0.3541, "rewards/chosen": 0.2090885192155838, "rewards/margins": 3.7775912284851074, "rewards/rejected": -3.56850266456604, "step": 1832 }, { "epoch": 0.48, "grad_norm": 46.46804428100586, "kl": 0.0, "learning_rate": 2.6014132426066474e-07, "logps/chosen": -243.96986389160156, "logps/rejected": -321.12237548828125, "loss": 0.2543, "rewards/chosen": 0.33036065101623535, "rewards/margins": 4.61346435546875, "rewards/rejected": -4.283103942871094, "step": 1833 }, { "epoch": 0.48, "grad_norm": 37.85405349731445, "kl": 0.0, "learning_rate": 2.600104684637529e-07, "logps/chosen": -262.3682556152344, "logps/rejected": -252.35533142089844, "loss": 0.2585, "rewards/chosen": 0.8365837335586548, "rewards/margins": 3.0271658897399902, "rewards/rejected": -2.190582036972046, "step": 1834 }, { "epoch": 0.48, "grad_norm": 39.19338607788086, "kl": 0.0, "learning_rate": 2.5987961266684113e-07, "logps/chosen": -260.9456481933594, "logps/rejected": -229.8648681640625, "loss": 0.3005, "rewards/chosen": 1.6565935611724854, "rewards/margins": 4.466734409332275, "rewards/rejected": -2.81014084815979, "step": 1835 }, { "epoch": 0.48, "grad_norm": 29.94069480895996, "kl": 0.0, "learning_rate": 2.5974875686992933e-07, "logps/chosen": -269.1368103027344, "logps/rejected": -189.29193115234375, "loss": 0.2599, "rewards/chosen": -0.17925623059272766, "rewards/margins": 3.1845247745513916, "rewards/rejected": -3.363780975341797, "step": 1836 }, { "epoch": 0.48, "grad_norm": 29.240211486816406, "kl": 0.0, "learning_rate": 2.596179010730175e-07, "logps/chosen": -244.43478393554688, "logps/rejected": -251.92523193359375, "loss": 0.3398, "rewards/chosen": -0.38684120774269104, "rewards/margins": 5.424040794372559, "rewards/rejected": -5.810882091522217, "step": 1837 }, { "epoch": 0.48, "grad_norm": 37.174434661865234, "kl": 0.0, "learning_rate": 2.594870452761057e-07, "logps/chosen": -188.85650634765625, "logps/rejected": -329.2757873535156, "loss": 0.4007, "rewards/chosen": -0.41793757677078247, "rewards/margins": 3.9053256511688232, "rewards/rejected": -4.323263168334961, "step": 1838 }, { "epoch": 0.48, "grad_norm": 41.535221099853516, "kl": 0.0, "learning_rate": 2.593561894791939e-07, "logps/chosen": -244.72862243652344, "logps/rejected": -299.87762451171875, "loss": 0.223, "rewards/chosen": 1.2217128276824951, "rewards/margins": 5.825761795043945, "rewards/rejected": -4.604048728942871, "step": 1839 }, { "epoch": 0.48, "grad_norm": 28.986780166625977, "kl": 0.0, "learning_rate": 2.592253336822821e-07, "logps/chosen": -204.30938720703125, "logps/rejected": -151.59121704101562, "loss": 0.3252, "rewards/chosen": 1.5701173543930054, "rewards/margins": 3.4767918586730957, "rewards/rejected": -1.9066746234893799, "step": 1840 }, { "epoch": 0.48, "grad_norm": 36.32473373413086, "kl": 0.0, "learning_rate": 2.590944778853703e-07, "logps/chosen": -207.05661010742188, "logps/rejected": -250.57901000976562, "loss": 0.4133, "rewards/chosen": -0.5402141809463501, "rewards/margins": 1.7857950925827026, "rewards/rejected": -2.3260092735290527, "step": 1841 }, { "epoch": 0.48, "grad_norm": 38.50576400756836, "kl": 0.0, "learning_rate": 2.589636220884585e-07, "logps/chosen": -277.8896484375, "logps/rejected": -229.23648071289062, "loss": 0.2772, "rewards/chosen": 0.07086262106895447, "rewards/margins": 4.372156620025635, "rewards/rejected": -4.301293849945068, "step": 1842 }, { "epoch": 0.48, "grad_norm": 31.36016845703125, "kl": 0.0, "learning_rate": 2.588327662915467e-07, "logps/chosen": -140.7576446533203, "logps/rejected": -238.2123260498047, "loss": 0.2594, "rewards/chosen": 1.0603423118591309, "rewards/margins": 2.653862476348877, "rewards/rejected": -1.593520164489746, "step": 1843 }, { "epoch": 0.48, "grad_norm": 38.02727127075195, "kl": 0.0, "learning_rate": 2.587019104946349e-07, "logps/chosen": -184.4716796875, "logps/rejected": -265.66009521484375, "loss": 0.3522, "rewards/chosen": -1.331416130065918, "rewards/margins": 2.9817352294921875, "rewards/rejected": -4.3131513595581055, "step": 1844 }, { "epoch": 0.48, "grad_norm": 33.963863372802734, "kl": 0.0, "learning_rate": 2.585710546977231e-07, "logps/chosen": -233.77108764648438, "logps/rejected": -203.09811401367188, "loss": 0.2908, "rewards/chosen": 0.03152484446763992, "rewards/margins": 4.830499649047852, "rewards/rejected": -4.798974990844727, "step": 1845 }, { "epoch": 0.48, "grad_norm": 32.82160186767578, "kl": 0.0, "learning_rate": 2.584401989008113e-07, "logps/chosen": -275.6189270019531, "logps/rejected": -189.2215118408203, "loss": 0.1936, "rewards/chosen": 2.4405970573425293, "rewards/margins": 6.280233860015869, "rewards/rejected": -3.83963680267334, "step": 1846 }, { "epoch": 0.48, "grad_norm": 28.793840408325195, "kl": 0.0, "learning_rate": 2.5830934310389954e-07, "logps/chosen": -202.758056640625, "logps/rejected": -209.53219604492188, "loss": 0.2467, "rewards/chosen": 0.4155222177505493, "rewards/margins": 4.262606620788574, "rewards/rejected": -3.8470845222473145, "step": 1847 }, { "epoch": 0.48, "grad_norm": 31.604778289794922, "kl": 0.0, "learning_rate": 2.5817848730698773e-07, "logps/chosen": -119.1492919921875, "logps/rejected": -248.53244018554688, "loss": 0.2821, "rewards/chosen": 1.3229519128799438, "rewards/margins": 3.7250118255615234, "rewards/rejected": -2.402060031890869, "step": 1848 }, { "epoch": 0.48, "grad_norm": 36.85615539550781, "kl": 0.0, "learning_rate": 2.580476315100759e-07, "logps/chosen": -279.9113464355469, "logps/rejected": -217.8267059326172, "loss": 0.2948, "rewards/chosen": 1.4072452783584595, "rewards/margins": 3.7849130630493164, "rewards/rejected": -2.3776676654815674, "step": 1849 }, { "epoch": 0.48, "grad_norm": 26.794940948486328, "kl": 0.0, "learning_rate": 2.5791677571316407e-07, "logps/chosen": -222.50216674804688, "logps/rejected": -312.8692321777344, "loss": 0.3357, "rewards/chosen": -0.4899716377258301, "rewards/margins": 5.189242362976074, "rewards/rejected": -5.679214000701904, "step": 1850 }, { "epoch": 0.48, "grad_norm": 31.417387008666992, "kl": 0.0, "learning_rate": 2.5778591991625227e-07, "logps/chosen": -93.98246002197266, "logps/rejected": -269.1598205566406, "loss": 0.3274, "rewards/chosen": -0.26879116892814636, "rewards/margins": 2.833065986633301, "rewards/rejected": -3.1018571853637695, "step": 1851 }, { "epoch": 0.48, "grad_norm": 38.49124526977539, "kl": 0.0, "learning_rate": 2.5765506411934046e-07, "logps/chosen": -267.2807312011719, "logps/rejected": -243.086669921875, "loss": 0.2353, "rewards/chosen": 2.298083782196045, "rewards/margins": 5.859939098358154, "rewards/rejected": -3.5618553161621094, "step": 1852 }, { "epoch": 0.48, "grad_norm": 39.02790832519531, "kl": 0.0, "learning_rate": 2.5752420832242866e-07, "logps/chosen": -215.70635986328125, "logps/rejected": -261.94781494140625, "loss": 0.3084, "rewards/chosen": 0.24146094918251038, "rewards/margins": 2.3436214923858643, "rewards/rejected": -2.1021604537963867, "step": 1853 }, { "epoch": 0.49, "grad_norm": 35.632415771484375, "kl": 0.0, "learning_rate": 2.5739335252551686e-07, "logps/chosen": -245.4604034423828, "logps/rejected": -216.57431030273438, "loss": 0.252, "rewards/chosen": 0.37408319115638733, "rewards/margins": 3.8141672611236572, "rewards/rejected": -3.4400839805603027, "step": 1854 }, { "epoch": 0.49, "grad_norm": 38.72943115234375, "kl": 0.0, "learning_rate": 2.5726249672860505e-07, "logps/chosen": -253.3831329345703, "logps/rejected": -220.69468688964844, "loss": 0.1547, "rewards/chosen": 0.9969064593315125, "rewards/margins": 5.076277732849121, "rewards/rejected": -4.079371452331543, "step": 1855 }, { "epoch": 0.49, "grad_norm": 33.00349807739258, "kl": 0.0, "learning_rate": 2.5713164093169325e-07, "logps/chosen": -178.433349609375, "logps/rejected": -243.36041259765625, "loss": 0.348, "rewards/chosen": -0.0945533812046051, "rewards/margins": 2.8740203380584717, "rewards/rejected": -2.968573808670044, "step": 1856 }, { "epoch": 0.49, "grad_norm": 23.94053077697754, "kl": 0.0, "learning_rate": 2.5700078513478144e-07, "logps/chosen": -238.0264434814453, "logps/rejected": -362.50677490234375, "loss": 0.2621, "rewards/chosen": 0.5840424299240112, "rewards/margins": 4.776507377624512, "rewards/rejected": -4.192464828491211, "step": 1857 }, { "epoch": 0.49, "grad_norm": 29.512399673461914, "kl": 0.0, "learning_rate": 2.5686992933786964e-07, "logps/chosen": -187.4696044921875, "logps/rejected": -281.3097839355469, "loss": 0.185, "rewards/chosen": 1.7165203094482422, "rewards/margins": 5.96409273147583, "rewards/rejected": -4.247572422027588, "step": 1858 }, { "epoch": 0.49, "grad_norm": 34.900535583496094, "kl": 0.0, "learning_rate": 2.5673907354095784e-07, "logps/chosen": -243.0150146484375, "logps/rejected": -188.5065155029297, "loss": 0.2916, "rewards/chosen": -0.3352580666542053, "rewards/margins": 2.3534703254699707, "rewards/rejected": -2.6887283325195312, "step": 1859 }, { "epoch": 0.49, "grad_norm": 37.05907440185547, "kl": 0.0, "learning_rate": 2.566082177440461e-07, "logps/chosen": -173.6273956298828, "logps/rejected": -222.750244140625, "loss": 0.32, "rewards/chosen": 1.3401601314544678, "rewards/margins": 3.5735867023468018, "rewards/rejected": -2.233426570892334, "step": 1860 }, { "epoch": 0.49, "grad_norm": 38.520572662353516, "kl": 0.0, "learning_rate": 2.564773619471343e-07, "logps/chosen": -198.89744567871094, "logps/rejected": -206.3017578125, "loss": 0.2883, "rewards/chosen": 0.6857025623321533, "rewards/margins": 2.900019884109497, "rewards/rejected": -2.2143173217773438, "step": 1861 }, { "epoch": 0.49, "grad_norm": 24.21077537536621, "kl": 0.0, "learning_rate": 2.563465061502225e-07, "logps/chosen": -205.73321533203125, "logps/rejected": -201.75450134277344, "loss": 0.2302, "rewards/chosen": 1.6169886589050293, "rewards/margins": 5.4056196212768555, "rewards/rejected": -3.788630962371826, "step": 1862 }, { "epoch": 0.49, "grad_norm": 27.92758560180664, "kl": 0.0, "learning_rate": 2.562156503533107e-07, "logps/chosen": -169.76693725585938, "logps/rejected": -300.3410949707031, "loss": 0.2816, "rewards/chosen": 1.1886805295944214, "rewards/margins": 6.535387992858887, "rewards/rejected": -5.346707344055176, "step": 1863 }, { "epoch": 0.49, "grad_norm": 40.24781036376953, "kl": 0.0, "learning_rate": 2.5608479455639887e-07, "logps/chosen": -308.61517333984375, "logps/rejected": -213.69386291503906, "loss": 0.3359, "rewards/chosen": -1.7393323183059692, "rewards/margins": 0.8687006235122681, "rewards/rejected": -2.6080329418182373, "step": 1864 }, { "epoch": 0.49, "grad_norm": 37.39738082885742, "kl": 0.0, "learning_rate": 2.55953938759487e-07, "logps/chosen": -237.5323486328125, "logps/rejected": -267.2157287597656, "loss": 0.314, "rewards/chosen": 1.9053438901901245, "rewards/margins": 5.274196147918701, "rewards/rejected": -3.368852138519287, "step": 1865 }, { "epoch": 0.49, "grad_norm": 27.616085052490234, "kl": 0.0, "learning_rate": 2.558230829625752e-07, "logps/chosen": -195.3375244140625, "logps/rejected": -274.4885559082031, "loss": 0.2278, "rewards/chosen": 1.3364429473876953, "rewards/margins": 3.6697354316711426, "rewards/rejected": -2.3332924842834473, "step": 1866 }, { "epoch": 0.49, "grad_norm": 31.84263801574707, "kl": 0.0, "learning_rate": 2.556922271656634e-07, "logps/chosen": -255.9803924560547, "logps/rejected": -363.9133605957031, "loss": 0.2285, "rewards/chosen": 0.1768454760313034, "rewards/margins": 5.845008373260498, "rewards/rejected": -5.668162822723389, "step": 1867 }, { "epoch": 0.49, "grad_norm": 24.04751968383789, "kl": 0.0, "learning_rate": 2.555613713687516e-07, "logps/chosen": -240.5629425048828, "logps/rejected": -328.81195068359375, "loss": 0.1552, "rewards/chosen": 0.5476657748222351, "rewards/margins": 5.483957767486572, "rewards/rejected": -4.9362921714782715, "step": 1868 }, { "epoch": 0.49, "grad_norm": 48.056461334228516, "kl": 0.0, "learning_rate": 2.554305155718398e-07, "logps/chosen": -198.63595581054688, "logps/rejected": -255.7877655029297, "loss": 0.2434, "rewards/chosen": 1.6382763385772705, "rewards/margins": 4.527665138244629, "rewards/rejected": -2.8893887996673584, "step": 1869 }, { "epoch": 0.49, "grad_norm": 32.82905578613281, "kl": 0.0, "learning_rate": 2.55299659774928e-07, "logps/chosen": -203.86563110351562, "logps/rejected": -264.9864501953125, "loss": 0.2917, "rewards/chosen": -0.517257809638977, "rewards/margins": 2.667818069458008, "rewards/rejected": -3.1850759983062744, "step": 1870 }, { "epoch": 0.49, "grad_norm": 25.104469299316406, "kl": 0.0, "learning_rate": 2.551688039780162e-07, "logps/chosen": -129.41441345214844, "logps/rejected": -279.703857421875, "loss": 0.3841, "rewards/chosen": -0.38443830609321594, "rewards/margins": 3.2106776237487793, "rewards/rejected": -3.595115900039673, "step": 1871 }, { "epoch": 0.49, "grad_norm": 29.48250961303711, "kl": 0.0, "learning_rate": 2.550379481811044e-07, "logps/chosen": -251.05799865722656, "logps/rejected": -246.1929168701172, "loss": 0.1636, "rewards/chosen": 2.388549566268921, "rewards/margins": 5.352682590484619, "rewards/rejected": -2.9641330242156982, "step": 1872 }, { "epoch": 0.49, "grad_norm": 40.20368957519531, "kl": 0.0, "learning_rate": 2.5490709238419264e-07, "logps/chosen": -231.3787384033203, "logps/rejected": -276.19232177734375, "loss": 0.2379, "rewards/chosen": 1.4038299322128296, "rewards/margins": 5.262545585632324, "rewards/rejected": -3.858715534210205, "step": 1873 }, { "epoch": 0.49, "grad_norm": 24.63336181640625, "kl": 0.0, "learning_rate": 2.5477623658728083e-07, "logps/chosen": -157.3682403564453, "logps/rejected": -257.8160400390625, "loss": 0.1499, "rewards/chosen": 2.0958898067474365, "rewards/margins": 5.802646160125732, "rewards/rejected": -3.706756353378296, "step": 1874 }, { "epoch": 0.49, "grad_norm": 27.928619384765625, "kl": 0.0, "learning_rate": 2.5464538079036903e-07, "logps/chosen": -163.24595642089844, "logps/rejected": -290.6197204589844, "loss": 0.1987, "rewards/chosen": 0.922084391117096, "rewards/margins": 4.095943450927734, "rewards/rejected": -3.173859119415283, "step": 1875 }, { "epoch": 0.49, "grad_norm": 36.277679443359375, "kl": 0.0, "learning_rate": 2.545145249934572e-07, "logps/chosen": -198.1995849609375, "logps/rejected": -264.11358642578125, "loss": 0.2921, "rewards/chosen": -0.16467052698135376, "rewards/margins": 4.354956150054932, "rewards/rejected": -4.519626617431641, "step": 1876 }, { "epoch": 0.49, "grad_norm": 38.40446853637695, "kl": 0.0, "learning_rate": 2.543836691965454e-07, "logps/chosen": -187.0970458984375, "logps/rejected": -296.1387634277344, "loss": 0.257, "rewards/chosen": 0.20785601437091827, "rewards/margins": 4.458132743835449, "rewards/rejected": -4.250276565551758, "step": 1877 }, { "epoch": 0.49, "grad_norm": 39.18281936645508, "kl": 0.0, "learning_rate": 2.542528133996336e-07, "logps/chosen": -186.9453125, "logps/rejected": -246.5155487060547, "loss": 0.3307, "rewards/chosen": -0.34305471181869507, "rewards/margins": 1.5182170867919922, "rewards/rejected": -1.8612717390060425, "step": 1878 }, { "epoch": 0.49, "grad_norm": 27.042150497436523, "kl": 0.0, "learning_rate": 2.541219576027218e-07, "logps/chosen": -189.15548706054688, "logps/rejected": -219.12112426757812, "loss": 0.1848, "rewards/chosen": 1.8135871887207031, "rewards/margins": 5.438680171966553, "rewards/rejected": -3.6250929832458496, "step": 1879 }, { "epoch": 0.49, "grad_norm": 42.695098876953125, "kl": 0.0, "learning_rate": 2.5399110180580995e-07, "logps/chosen": -186.45420837402344, "logps/rejected": -312.56304931640625, "loss": 0.2228, "rewards/chosen": 0.7028865814208984, "rewards/margins": 4.877556324005127, "rewards/rejected": -4.1746697425842285, "step": 1880 }, { "epoch": 0.49, "grad_norm": 38.940799713134766, "kl": 0.0, "learning_rate": 2.5386024600889815e-07, "logps/chosen": -347.6214599609375, "logps/rejected": -166.71929931640625, "loss": 0.2926, "rewards/chosen": -2.7828896045684814, "rewards/margins": -0.96140456199646, "rewards/rejected": -1.8214850425720215, "step": 1881 }, { "epoch": 0.49, "grad_norm": 30.760910034179688, "kl": 0.0, "learning_rate": 2.5372939021198635e-07, "logps/chosen": -205.28262329101562, "logps/rejected": -237.68161010742188, "loss": 0.3346, "rewards/chosen": -0.400518000125885, "rewards/margins": 4.106273174285889, "rewards/rejected": -4.506791114807129, "step": 1882 }, { "epoch": 0.49, "grad_norm": 39.819332122802734, "kl": 0.0, "learning_rate": 2.5359853441507454e-07, "logps/chosen": -278.8540344238281, "logps/rejected": -275.9216003417969, "loss": 0.2436, "rewards/chosen": 1.4045870304107666, "rewards/margins": 2.893503189086914, "rewards/rejected": -1.488916039466858, "step": 1883 }, { "epoch": 0.49, "grad_norm": 31.362224578857422, "kl": 0.0, "learning_rate": 2.5346767861816274e-07, "logps/chosen": -208.73666381835938, "logps/rejected": -221.90631103515625, "loss": 0.2985, "rewards/chosen": 1.8967316150665283, "rewards/margins": 6.240492820739746, "rewards/rejected": -4.343760967254639, "step": 1884 }, { "epoch": 0.49, "grad_norm": 31.99814224243164, "kl": 0.0, "learning_rate": 2.5333682282125094e-07, "logps/chosen": -212.91397094726562, "logps/rejected": -280.9819030761719, "loss": 0.3056, "rewards/chosen": -0.8252590894699097, "rewards/margins": 3.0601096153259277, "rewards/rejected": -3.885368824005127, "step": 1885 }, { "epoch": 0.49, "grad_norm": 36.41081237792969, "kl": 0.0, "learning_rate": 2.532059670243392e-07, "logps/chosen": -218.00038146972656, "logps/rejected": -217.99234008789062, "loss": 0.2384, "rewards/chosen": 0.8008521199226379, "rewards/margins": 4.729746341705322, "rewards/rejected": -3.92889404296875, "step": 1886 }, { "epoch": 0.49, "grad_norm": 37.86720657348633, "kl": 0.0, "learning_rate": 2.530751112274274e-07, "logps/chosen": -269.68780517578125, "logps/rejected": -251.1253204345703, "loss": 0.2796, "rewards/chosen": 0.7860704660415649, "rewards/margins": 3.951198101043701, "rewards/rejected": -3.165127754211426, "step": 1887 }, { "epoch": 0.49, "grad_norm": 31.15866470336914, "kl": 0.0, "learning_rate": 2.529442554305156e-07, "logps/chosen": -224.89462280273438, "logps/rejected": -279.7936706542969, "loss": 0.2781, "rewards/chosen": 0.3493259847164154, "rewards/margins": 5.08610200881958, "rewards/rejected": -4.736775875091553, "step": 1888 }, { "epoch": 0.49, "grad_norm": 34.115726470947266, "kl": 0.0, "learning_rate": 2.5281339963360377e-07, "logps/chosen": -234.70863342285156, "logps/rejected": -174.17152404785156, "loss": 0.2653, "rewards/chosen": 1.474958062171936, "rewards/margins": 3.463298797607422, "rewards/rejected": -1.9883406162261963, "step": 1889 }, { "epoch": 0.49, "grad_norm": 36.890350341796875, "kl": 0.0, "learning_rate": 2.5268254383669197e-07, "logps/chosen": -204.939208984375, "logps/rejected": -262.5244140625, "loss": 0.2881, "rewards/chosen": 0.6666855812072754, "rewards/margins": 4.215951919555664, "rewards/rejected": -3.5492665767669678, "step": 1890 }, { "epoch": 0.49, "grad_norm": 33.597843170166016, "kl": 0.0, "learning_rate": 2.5255168803978016e-07, "logps/chosen": -148.05075073242188, "logps/rejected": -116.547607421875, "loss": 0.2184, "rewards/chosen": 0.7623181939125061, "rewards/margins": 2.7595696449279785, "rewards/rejected": -1.9972515106201172, "step": 1891 }, { "epoch": 0.5, "grad_norm": 35.052284240722656, "kl": 0.0, "learning_rate": 2.5242083224286836e-07, "logps/chosen": -166.26486206054688, "logps/rejected": -197.2524871826172, "loss": 0.296, "rewards/chosen": 0.15562376379966736, "rewards/margins": 2.3366987705230713, "rewards/rejected": -2.181075096130371, "step": 1892 }, { "epoch": 0.5, "grad_norm": 32.913978576660156, "kl": 0.0, "learning_rate": 2.5228997644595656e-07, "logps/chosen": -143.8211669921875, "logps/rejected": -236.24700927734375, "loss": 0.3891, "rewards/chosen": 0.37257999181747437, "rewards/margins": 2.2927846908569336, "rewards/rejected": -1.9202046394348145, "step": 1893 }, { "epoch": 0.5, "grad_norm": 30.473838806152344, "kl": 0.0, "learning_rate": 2.5215912064904475e-07, "logps/chosen": -192.12429809570312, "logps/rejected": -227.31834411621094, "loss": 0.2552, "rewards/chosen": 2.3903250694274902, "rewards/margins": 6.382128715515137, "rewards/rejected": -3.9918034076690674, "step": 1894 }, { "epoch": 0.5, "grad_norm": 39.04948043823242, "kl": 0.0, "learning_rate": 2.5202826485213295e-07, "logps/chosen": -233.09219360351562, "logps/rejected": -185.82948303222656, "loss": 0.3997, "rewards/chosen": -0.4558795392513275, "rewards/margins": 2.3500428199768066, "rewards/rejected": -2.805922269821167, "step": 1895 }, { "epoch": 0.5, "grad_norm": 34.34318161010742, "kl": 0.0, "learning_rate": 2.518974090552211e-07, "logps/chosen": -171.54519653320312, "logps/rejected": -185.059814453125, "loss": 0.2155, "rewards/chosen": 2.059077739715576, "rewards/margins": 5.530086994171143, "rewards/rejected": -3.4710092544555664, "step": 1896 }, { "epoch": 0.5, "grad_norm": 38.1912956237793, "kl": 0.0, "learning_rate": 2.517665532583093e-07, "logps/chosen": -270.6682434082031, "logps/rejected": -306.77264404296875, "loss": 0.3807, "rewards/chosen": 1.2388911247253418, "rewards/margins": 3.8831653594970703, "rewards/rejected": -2.6442742347717285, "step": 1897 }, { "epoch": 0.5, "grad_norm": 30.48811912536621, "kl": 0.0, "learning_rate": 2.5163569746139754e-07, "logps/chosen": -209.73785400390625, "logps/rejected": -205.18353271484375, "loss": 0.3538, "rewards/chosen": 1.776361107826233, "rewards/margins": 3.798926830291748, "rewards/rejected": -2.0225658416748047, "step": 1898 }, { "epoch": 0.5, "grad_norm": 46.357425689697266, "kl": 0.0, "learning_rate": 2.5150484166448573e-07, "logps/chosen": -147.869140625, "logps/rejected": -235.92543029785156, "loss": 0.1959, "rewards/chosen": 1.779581069946289, "rewards/margins": 4.554662704467773, "rewards/rejected": -2.7750816345214844, "step": 1899 }, { "epoch": 0.5, "grad_norm": 40.692535400390625, "kl": 0.0, "learning_rate": 2.5137398586757393e-07, "logps/chosen": -180.42958068847656, "logps/rejected": -214.12777709960938, "loss": 0.38, "rewards/chosen": -0.06748411059379578, "rewards/margins": 1.6282284259796143, "rewards/rejected": -1.6957125663757324, "step": 1900 }, { "epoch": 0.5, "grad_norm": 37.360252380371094, "kl": 0.0, "learning_rate": 2.512431300706621e-07, "logps/chosen": -148.78396606445312, "logps/rejected": -240.42269897460938, "loss": 0.2743, "rewards/chosen": 0.6462602615356445, "rewards/margins": 3.7638111114501953, "rewards/rejected": -3.117550849914551, "step": 1901 }, { "epoch": 0.5, "grad_norm": 31.662979125976562, "kl": 0.0, "learning_rate": 2.511122742737503e-07, "logps/chosen": -178.69883728027344, "logps/rejected": -310.7706298828125, "loss": 0.2792, "rewards/chosen": 0.750849723815918, "rewards/margins": 4.345252990722656, "rewards/rejected": -3.594403028488159, "step": 1902 }, { "epoch": 0.5, "grad_norm": 34.83212661743164, "kl": 0.0, "learning_rate": 2.509814184768385e-07, "logps/chosen": -272.0216979980469, "logps/rejected": -305.064208984375, "loss": 0.231, "rewards/chosen": 0.39868250489234924, "rewards/margins": 4.167855739593506, "rewards/rejected": -3.7691731452941895, "step": 1903 }, { "epoch": 0.5, "grad_norm": 47.436729431152344, "kl": 0.0, "learning_rate": 2.508505626799267e-07, "logps/chosen": -206.93185424804688, "logps/rejected": -327.25689697265625, "loss": 0.3349, "rewards/chosen": 0.07472336292266846, "rewards/margins": 3.3958563804626465, "rewards/rejected": -3.3211331367492676, "step": 1904 }, { "epoch": 0.5, "grad_norm": 27.917442321777344, "kl": 0.0, "learning_rate": 2.507197068830149e-07, "logps/chosen": -222.82229614257812, "logps/rejected": -137.33285522460938, "loss": 0.2192, "rewards/chosen": -0.539939284324646, "rewards/margins": 2.519014835357666, "rewards/rejected": -3.0589540004730225, "step": 1905 }, { "epoch": 0.5, "grad_norm": 33.432159423828125, "kl": 0.0, "learning_rate": 2.505888510861031e-07, "logps/chosen": -246.05108642578125, "logps/rejected": -261.0732727050781, "loss": 0.3226, "rewards/chosen": 0.5959483981132507, "rewards/margins": 2.7738301753997803, "rewards/rejected": -2.1778817176818848, "step": 1906 }, { "epoch": 0.5, "grad_norm": 36.27235412597656, "kl": 0.0, "learning_rate": 2.504579952891913e-07, "logps/chosen": -227.25546264648438, "logps/rejected": -291.0820007324219, "loss": 0.2343, "rewards/chosen": 0.1316702663898468, "rewards/margins": 5.5357184410095215, "rewards/rejected": -5.404047966003418, "step": 1907 }, { "epoch": 0.5, "grad_norm": 45.1259765625, "kl": 0.0, "learning_rate": 2.503271394922795e-07, "logps/chosen": -191.4156951904297, "logps/rejected": -201.90771484375, "loss": 0.3261, "rewards/chosen": 1.4199970960617065, "rewards/margins": 3.0559487342834473, "rewards/rejected": -1.6359517574310303, "step": 1908 }, { "epoch": 0.5, "grad_norm": 43.99619674682617, "kl": 0.0, "learning_rate": 2.501962836953677e-07, "logps/chosen": -210.96713256835938, "logps/rejected": -218.38539123535156, "loss": 0.2506, "rewards/chosen": 0.7000874876976013, "rewards/margins": 4.751967906951904, "rewards/rejected": -4.051880359649658, "step": 1909 }, { "epoch": 0.5, "grad_norm": 34.98705291748047, "kl": 0.0, "learning_rate": 2.500654278984559e-07, "logps/chosen": -138.10946655273438, "logps/rejected": -166.25003051757812, "loss": 0.4363, "rewards/chosen": -0.312425434589386, "rewards/margins": 1.5524520874023438, "rewards/rejected": -1.864877462387085, "step": 1910 }, { "epoch": 0.5, "grad_norm": 33.36783981323242, "kl": 0.0, "learning_rate": 2.499345721015441e-07, "logps/chosen": -225.1962890625, "logps/rejected": -365.34783935546875, "loss": 0.2549, "rewards/chosen": -0.11971984803676605, "rewards/margins": 4.3747735023498535, "rewards/rejected": -4.49449348449707, "step": 1911 }, { "epoch": 0.5, "grad_norm": 33.521934509277344, "kl": 0.0, "learning_rate": 2.498037163046323e-07, "logps/chosen": -195.3630828857422, "logps/rejected": -289.0223693847656, "loss": 0.3226, "rewards/chosen": -0.13952356576919556, "rewards/margins": 2.2491676807403564, "rewards/rejected": -2.3886911869049072, "step": 1912 }, { "epoch": 0.5, "grad_norm": 31.397092819213867, "kl": 0.0, "learning_rate": 2.496728605077205e-07, "logps/chosen": -268.779541015625, "logps/rejected": -265.0915222167969, "loss": 0.2835, "rewards/chosen": -1.829577088356018, "rewards/margins": 3.003638744354248, "rewards/rejected": -4.833215713500977, "step": 1913 }, { "epoch": 0.5, "grad_norm": 37.29655075073242, "kl": 0.0, "learning_rate": 2.495420047108087e-07, "logps/chosen": -223.605712890625, "logps/rejected": -172.59487915039062, "loss": 0.2707, "rewards/chosen": -0.2785741686820984, "rewards/margins": 3.410541534423828, "rewards/rejected": -3.6891157627105713, "step": 1914 }, { "epoch": 0.5, "grad_norm": 28.963512420654297, "kl": 0.0, "learning_rate": 2.4941114891389687e-07, "logps/chosen": -220.93190002441406, "logps/rejected": -276.0335388183594, "loss": 0.2289, "rewards/chosen": 0.43622493743896484, "rewards/margins": 5.225882053375244, "rewards/rejected": -4.789657115936279, "step": 1915 }, { "epoch": 0.5, "grad_norm": 32.86867141723633, "kl": 0.0, "learning_rate": 2.4928029311698507e-07, "logps/chosen": -170.21336364746094, "logps/rejected": -254.4930419921875, "loss": 0.3359, "rewards/chosen": -0.02794739231467247, "rewards/margins": 3.419856548309326, "rewards/rejected": -3.4478039741516113, "step": 1916 }, { "epoch": 0.5, "grad_norm": 29.831283569335938, "kl": 0.0, "learning_rate": 2.4914943732007326e-07, "logps/chosen": -217.07911682128906, "logps/rejected": -244.5806427001953, "loss": 0.3747, "rewards/chosen": 0.47169598937034607, "rewards/margins": 3.60986065864563, "rewards/rejected": -3.138164758682251, "step": 1917 }, { "epoch": 0.5, "grad_norm": 29.670040130615234, "kl": 0.0, "learning_rate": 2.4901858152316146e-07, "logps/chosen": -168.487548828125, "logps/rejected": -217.94676208496094, "loss": 0.2863, "rewards/chosen": -0.3014002740383148, "rewards/margins": 3.647155284881592, "rewards/rejected": -3.9485554695129395, "step": 1918 }, { "epoch": 0.5, "grad_norm": 27.14519691467285, "kl": 0.0, "learning_rate": 2.4888772572624966e-07, "logps/chosen": -136.63169860839844, "logps/rejected": -230.33084106445312, "loss": 0.2316, "rewards/chosen": 0.6357839107513428, "rewards/margins": 6.009737968444824, "rewards/rejected": -5.373953819274902, "step": 1919 }, { "epoch": 0.5, "grad_norm": 50.06220245361328, "kl": 0.0, "learning_rate": 2.4875686992933785e-07, "logps/chosen": -264.7572326660156, "logps/rejected": -206.96310424804688, "loss": 0.3289, "rewards/chosen": -1.4153270721435547, "rewards/margins": 1.3842098712921143, "rewards/rejected": -2.799536943435669, "step": 1920 }, { "epoch": 0.5, "grad_norm": 37.91456604003906, "kl": 0.0, "learning_rate": 2.4862601413242605e-07, "logps/chosen": -232.99554443359375, "logps/rejected": -212.722412109375, "loss": 0.2997, "rewards/chosen": 1.4048177003860474, "rewards/margins": 2.8434221744537354, "rewards/rejected": -1.438604474067688, "step": 1921 }, { "epoch": 0.5, "grad_norm": 26.27916145324707, "kl": 0.0, "learning_rate": 2.4849515833551424e-07, "logps/chosen": -194.2124786376953, "logps/rejected": -242.9920654296875, "loss": 0.2118, "rewards/chosen": 0.6471109390258789, "rewards/margins": 4.211012840270996, "rewards/rejected": -3.563901662826538, "step": 1922 }, { "epoch": 0.5, "grad_norm": 33.186946868896484, "kl": 0.0, "learning_rate": 2.4836430253860244e-07, "logps/chosen": -223.27500915527344, "logps/rejected": -232.27818298339844, "loss": 0.3531, "rewards/chosen": 0.3251649737358093, "rewards/margins": 3.1628851890563965, "rewards/rejected": -2.8377201557159424, "step": 1923 }, { "epoch": 0.5, "grad_norm": 31.578989028930664, "kl": 0.0, "learning_rate": 2.4823344674169064e-07, "logps/chosen": -218.21646118164062, "logps/rejected": -155.0374298095703, "loss": 0.2636, "rewards/chosen": 0.9165531992912292, "rewards/margins": 3.4226291179656982, "rewards/rejected": -2.506075859069824, "step": 1924 }, { "epoch": 0.5, "grad_norm": 32.22088623046875, "kl": 0.0, "learning_rate": 2.4810259094477883e-07, "logps/chosen": -186.41725158691406, "logps/rejected": -283.95074462890625, "loss": 0.2782, "rewards/chosen": 0.27410784363746643, "rewards/margins": 3.7751660346984863, "rewards/rejected": -3.5010581016540527, "step": 1925 }, { "epoch": 0.5, "grad_norm": 31.215028762817383, "kl": 0.0, "learning_rate": 2.4797173514786703e-07, "logps/chosen": -230.7908172607422, "logps/rejected": -231.13796997070312, "loss": 0.2081, "rewards/chosen": 0.9891664385795593, "rewards/margins": 3.502727746963501, "rewards/rejected": -2.513561248779297, "step": 1926 }, { "epoch": 0.5, "grad_norm": 30.78005027770996, "kl": 0.0, "learning_rate": 2.478408793509552e-07, "logps/chosen": -268.7662048339844, "logps/rejected": -245.4067840576172, "loss": 0.2993, "rewards/chosen": 1.0699352025985718, "rewards/margins": 5.307055473327637, "rewards/rejected": -4.237120151519775, "step": 1927 }, { "epoch": 0.5, "grad_norm": 36.56181335449219, "kl": 0.0, "learning_rate": 2.477100235540434e-07, "logps/chosen": -168.37936401367188, "logps/rejected": -200.1322021484375, "loss": 0.3212, "rewards/chosen": 1.214407205581665, "rewards/margins": 3.4486401081085205, "rewards/rejected": -2.2342329025268555, "step": 1928 }, { "epoch": 0.5, "grad_norm": 32.369869232177734, "kl": 0.0, "learning_rate": 2.475791677571316e-07, "logps/chosen": -172.96864318847656, "logps/rejected": -308.66448974609375, "loss": 0.2533, "rewards/chosen": 1.525896430015564, "rewards/margins": 5.966020107269287, "rewards/rejected": -4.440123558044434, "step": 1929 }, { "epoch": 0.51, "grad_norm": 38.35368728637695, "kl": 0.0, "learning_rate": 2.474483119602198e-07, "logps/chosen": -219.0493927001953, "logps/rejected": -232.3117218017578, "loss": 0.2026, "rewards/chosen": 2.0987181663513184, "rewards/margins": 6.051818370819092, "rewards/rejected": -3.9531002044677734, "step": 1930 }, { "epoch": 0.51, "grad_norm": 30.660757064819336, "kl": 0.0, "learning_rate": 2.47317456163308e-07, "logps/chosen": -157.80386352539062, "logps/rejected": -218.20924377441406, "loss": 0.3078, "rewards/chosen": 1.5564416646957397, "rewards/margins": 4.841548442840576, "rewards/rejected": -3.285106897354126, "step": 1931 }, { "epoch": 0.51, "grad_norm": 25.03354835510254, "kl": 0.0, "learning_rate": 2.471866003663962e-07, "logps/chosen": -246.04446411132812, "logps/rejected": -182.98593139648438, "loss": 0.3044, "rewards/chosen": 0.08799563348293304, "rewards/margins": 3.25834321975708, "rewards/rejected": -3.1703476905822754, "step": 1932 }, { "epoch": 0.51, "grad_norm": 38.93879699707031, "kl": 0.0, "learning_rate": 2.470557445694844e-07, "logps/chosen": -197.60214233398438, "logps/rejected": -254.27537536621094, "loss": 0.3131, "rewards/chosen": 0.9349378943443298, "rewards/margins": 4.313072204589844, "rewards/rejected": -3.378134250640869, "step": 1933 }, { "epoch": 0.51, "grad_norm": 36.97758483886719, "kl": 0.0, "learning_rate": 2.469248887725726e-07, "logps/chosen": -185.9903564453125, "logps/rejected": -231.436767578125, "loss": 0.3016, "rewards/chosen": 1.0533219575881958, "rewards/margins": 3.887955665588379, "rewards/rejected": -2.8346335887908936, "step": 1934 }, { "epoch": 0.51, "grad_norm": 28.87652015686035, "kl": 0.0, "learning_rate": 2.467940329756608e-07, "logps/chosen": -187.3426055908203, "logps/rejected": -204.7564239501953, "loss": 0.3289, "rewards/chosen": 0.27323493361473083, "rewards/margins": 3.4943530559539795, "rewards/rejected": -3.221118211746216, "step": 1935 }, { "epoch": 0.51, "grad_norm": 26.652814865112305, "kl": 0.0, "learning_rate": 2.4666317717874904e-07, "logps/chosen": -243.4410858154297, "logps/rejected": -184.99859619140625, "loss": 0.2302, "rewards/chosen": 1.6804203987121582, "rewards/margins": 4.532587051391602, "rewards/rejected": -2.8521664142608643, "step": 1936 }, { "epoch": 0.51, "grad_norm": 41.8165168762207, "kl": 0.0, "learning_rate": 2.4653232138183724e-07, "logps/chosen": -239.2473907470703, "logps/rejected": -203.16265869140625, "loss": 0.2688, "rewards/chosen": 0.7100369930267334, "rewards/margins": 3.5892412662506104, "rewards/rejected": -2.879204273223877, "step": 1937 }, { "epoch": 0.51, "grad_norm": 40.11600875854492, "kl": 0.0, "learning_rate": 2.464014655849254e-07, "logps/chosen": -173.17868041992188, "logps/rejected": -304.7061462402344, "loss": 0.2404, "rewards/chosen": 0.33373430371284485, "rewards/margins": 5.660027503967285, "rewards/rejected": -5.326292991638184, "step": 1938 }, { "epoch": 0.51, "grad_norm": 38.29595184326172, "kl": 0.0, "learning_rate": 2.462706097880136e-07, "logps/chosen": -195.17929077148438, "logps/rejected": -195.31170654296875, "loss": 0.3822, "rewards/chosen": 0.5516433119773865, "rewards/margins": 3.5008597373962402, "rewards/rejected": -2.949216365814209, "step": 1939 }, { "epoch": 0.51, "grad_norm": 28.157041549682617, "kl": 0.0, "learning_rate": 2.4613975399110177e-07, "logps/chosen": -164.0011749267578, "logps/rejected": -276.69927978515625, "loss": 0.2932, "rewards/chosen": 0.31394287943840027, "rewards/margins": 3.9898202419281006, "rewards/rejected": -3.675877332687378, "step": 1940 }, { "epoch": 0.51, "grad_norm": 26.90674591064453, "kl": 0.0, "learning_rate": 2.4600889819418997e-07, "logps/chosen": -157.42709350585938, "logps/rejected": -193.3916015625, "loss": 0.2054, "rewards/chosen": 0.030436119064688683, "rewards/margins": 3.6355412006378174, "rewards/rejected": -3.60510516166687, "step": 1941 }, { "epoch": 0.51, "grad_norm": 37.283145904541016, "kl": 0.0, "learning_rate": 2.4587804239727817e-07, "logps/chosen": -256.7583923339844, "logps/rejected": -203.76365661621094, "loss": 0.305, "rewards/chosen": 1.2997283935546875, "rewards/margins": 2.928647994995117, "rewards/rejected": -1.6289196014404297, "step": 1942 }, { "epoch": 0.51, "grad_norm": 36.678279876708984, "kl": 0.0, "learning_rate": 2.457471866003664e-07, "logps/chosen": -220.28211975097656, "logps/rejected": -180.16119384765625, "loss": 0.3362, "rewards/chosen": 0.16220007836818695, "rewards/margins": 2.24269700050354, "rewards/rejected": -2.0804970264434814, "step": 1943 }, { "epoch": 0.51, "grad_norm": 28.942739486694336, "kl": 0.0, "learning_rate": 2.456163308034546e-07, "logps/chosen": -156.3596649169922, "logps/rejected": -227.49583435058594, "loss": 0.2208, "rewards/chosen": 0.7010388374328613, "rewards/margins": 3.153454303741455, "rewards/rejected": -2.4524154663085938, "step": 1944 }, { "epoch": 0.51, "grad_norm": 26.555570602416992, "kl": 0.0, "learning_rate": 2.454854750065428e-07, "logps/chosen": -144.39047241210938, "logps/rejected": -280.34881591796875, "loss": 0.2719, "rewards/chosen": 1.1340817213058472, "rewards/margins": 4.722748279571533, "rewards/rejected": -3.5886664390563965, "step": 1945 }, { "epoch": 0.51, "grad_norm": 39.58638000488281, "kl": 0.0, "learning_rate": 2.4535461920963095e-07, "logps/chosen": -204.01065063476562, "logps/rejected": -162.374267578125, "loss": 0.2145, "rewards/chosen": -0.44569912552833557, "rewards/margins": 3.055454969406128, "rewards/rejected": -3.5011541843414307, "step": 1946 }, { "epoch": 0.51, "grad_norm": 43.68654251098633, "kl": 0.0, "learning_rate": 2.4522376341271915e-07, "logps/chosen": -295.45184326171875, "logps/rejected": -255.74960327148438, "loss": 0.3089, "rewards/chosen": 0.3848971128463745, "rewards/margins": 3.178168773651123, "rewards/rejected": -2.793271541595459, "step": 1947 }, { "epoch": 0.51, "grad_norm": 32.19191360473633, "kl": 0.0, "learning_rate": 2.4509290761580734e-07, "logps/chosen": -210.06488037109375, "logps/rejected": -217.54795837402344, "loss": 0.3451, "rewards/chosen": 0.453418493270874, "rewards/margins": 4.383719444274902, "rewards/rejected": -3.9303009510040283, "step": 1948 }, { "epoch": 0.51, "grad_norm": 35.40753173828125, "kl": 0.0, "learning_rate": 2.449620518188956e-07, "logps/chosen": -277.6213684082031, "logps/rejected": -188.61752319335938, "loss": 0.2245, "rewards/chosen": 2.0205652713775635, "rewards/margins": 4.447739601135254, "rewards/rejected": -2.4271743297576904, "step": 1949 }, { "epoch": 0.51, "grad_norm": 34.40073776245117, "kl": 0.0, "learning_rate": 2.448311960219838e-07, "logps/chosen": -233.07374572753906, "logps/rejected": -211.34178161621094, "loss": 0.2372, "rewards/chosen": 0.8760273456573486, "rewards/margins": 3.128060817718506, "rewards/rejected": -2.2520334720611572, "step": 1950 }, { "epoch": 0.51, "grad_norm": 31.40346908569336, "kl": 0.0, "learning_rate": 2.44700340225072e-07, "logps/chosen": -213.6735076904297, "logps/rejected": -291.5716552734375, "loss": 0.2644, "rewards/chosen": 0.4644780457019806, "rewards/margins": 4.4314141273498535, "rewards/rejected": -3.9669361114501953, "step": 1951 }, { "epoch": 0.51, "grad_norm": 29.35367774963379, "kl": 0.0, "learning_rate": 2.445694844281602e-07, "logps/chosen": -187.635986328125, "logps/rejected": -354.99658203125, "loss": 0.2588, "rewards/chosen": 1.0316107273101807, "rewards/margins": 6.271823883056641, "rewards/rejected": -5.240213394165039, "step": 1952 }, { "epoch": 0.51, "grad_norm": 34.39826583862305, "kl": 0.0, "learning_rate": 2.444386286312484e-07, "logps/chosen": -279.107666015625, "logps/rejected": -217.9149169921875, "loss": 0.2955, "rewards/chosen": -0.4340619146823883, "rewards/margins": 1.6407526731491089, "rewards/rejected": -2.074814558029175, "step": 1953 }, { "epoch": 0.51, "grad_norm": 36.55764389038086, "kl": 0.0, "learning_rate": 2.443077728343365e-07, "logps/chosen": -185.63668823242188, "logps/rejected": -254.9978790283203, "loss": 0.2592, "rewards/chosen": 0.41983142495155334, "rewards/margins": 5.603705883026123, "rewards/rejected": -5.183874607086182, "step": 1954 }, { "epoch": 0.51, "grad_norm": 39.05849075317383, "kl": 0.0, "learning_rate": 2.441769170374247e-07, "logps/chosen": -125.34358215332031, "logps/rejected": -198.91729736328125, "loss": 0.2841, "rewards/chosen": 1.3333288431167603, "rewards/margins": 3.365635871887207, "rewards/rejected": -2.0323069095611572, "step": 1955 }, { "epoch": 0.51, "grad_norm": 39.851844787597656, "kl": 0.0, "learning_rate": 2.4404606124051296e-07, "logps/chosen": -264.37908935546875, "logps/rejected": -268.1690673828125, "loss": 0.325, "rewards/chosen": 1.1693298816680908, "rewards/margins": 3.3857524394989014, "rewards/rejected": -2.2164225578308105, "step": 1956 }, { "epoch": 0.51, "grad_norm": 31.71510887145996, "kl": 0.0, "learning_rate": 2.4391520544360116e-07, "logps/chosen": -243.44525146484375, "logps/rejected": -193.47544860839844, "loss": 0.2813, "rewards/chosen": -1.3031715154647827, "rewards/margins": 1.8220585584640503, "rewards/rejected": -3.125230073928833, "step": 1957 }, { "epoch": 0.51, "grad_norm": 33.640567779541016, "kl": 0.0, "learning_rate": 2.4378434964668936e-07, "logps/chosen": -217.40499877929688, "logps/rejected": -174.684326171875, "loss": 0.283, "rewards/chosen": 0.24121162295341492, "rewards/margins": 2.657731294631958, "rewards/rejected": -2.4165196418762207, "step": 1958 }, { "epoch": 0.51, "grad_norm": 35.96133041381836, "kl": 0.0, "learning_rate": 2.4365349384977755e-07, "logps/chosen": -214.48333740234375, "logps/rejected": -197.01153564453125, "loss": 0.1731, "rewards/chosen": 1.1828800439834595, "rewards/margins": 3.951263904571533, "rewards/rejected": -2.7683839797973633, "step": 1959 }, { "epoch": 0.51, "grad_norm": 33.87704849243164, "kl": 0.0, "learning_rate": 2.4352263805286575e-07, "logps/chosen": -209.52987670898438, "logps/rejected": -191.13827514648438, "loss": 0.3245, "rewards/chosen": 0.3229031562805176, "rewards/margins": 2.702648162841797, "rewards/rejected": -2.3797450065612793, "step": 1960 }, { "epoch": 0.51, "grad_norm": 32.43837356567383, "kl": 0.0, "learning_rate": 2.433917822559539e-07, "logps/chosen": -142.2443389892578, "logps/rejected": -217.48886108398438, "loss": 0.3319, "rewards/chosen": -0.2942158579826355, "rewards/margins": 3.3216006755828857, "rewards/rejected": -3.615816593170166, "step": 1961 }, { "epoch": 0.51, "grad_norm": 48.2795524597168, "kl": 0.0, "learning_rate": 2.4326092645904214e-07, "logps/chosen": -225.362548828125, "logps/rejected": -381.4612731933594, "loss": 0.2663, "rewards/chosen": 0.8754479885101318, "rewards/margins": 6.36458683013916, "rewards/rejected": -5.489138603210449, "step": 1962 }, { "epoch": 0.51, "grad_norm": 53.284034729003906, "kl": 0.0, "learning_rate": 2.4313007066213034e-07, "logps/chosen": -186.80157470703125, "logps/rejected": -194.40985107421875, "loss": 0.3662, "rewards/chosen": 0.3198693096637726, "rewards/margins": 2.4725139141082764, "rewards/rejected": -2.152644634246826, "step": 1963 }, { "epoch": 0.51, "grad_norm": 26.204788208007812, "kl": 0.0, "learning_rate": 2.4299921486521853e-07, "logps/chosen": -235.0901641845703, "logps/rejected": -214.7635955810547, "loss": 0.1917, "rewards/chosen": 0.1780397891998291, "rewards/margins": 4.566169738769531, "rewards/rejected": -4.388129711151123, "step": 1964 }, { "epoch": 0.51, "grad_norm": 34.75926208496094, "kl": 0.0, "learning_rate": 2.4286835906830673e-07, "logps/chosen": -254.1787567138672, "logps/rejected": -289.5296325683594, "loss": 0.1977, "rewards/chosen": -0.25179824233055115, "rewards/margins": 3.519669771194458, "rewards/rejected": -3.771467924118042, "step": 1965 }, { "epoch": 0.51, "grad_norm": 26.714811325073242, "kl": 0.0, "learning_rate": 2.427375032713949e-07, "logps/chosen": -163.204833984375, "logps/rejected": -167.9423828125, "loss": 0.2762, "rewards/chosen": 0.9019087553024292, "rewards/margins": 4.088323593139648, "rewards/rejected": -3.186414957046509, "step": 1966 }, { "epoch": 0.51, "grad_norm": 29.82887840270996, "kl": 0.0, "learning_rate": 2.426066474744831e-07, "logps/chosen": -211.63839721679688, "logps/rejected": -247.35690307617188, "loss": 0.2539, "rewards/chosen": 1.2622199058532715, "rewards/margins": 5.667209625244141, "rewards/rejected": -4.404989719390869, "step": 1967 }, { "epoch": 0.52, "grad_norm": 32.24669647216797, "kl": 0.0, "learning_rate": 2.424757916775713e-07, "logps/chosen": -313.1291198730469, "logps/rejected": -282.4967956542969, "loss": 0.1948, "rewards/chosen": 0.557210385799408, "rewards/margins": 5.114065647125244, "rewards/rejected": -4.556855201721191, "step": 1968 }, { "epoch": 0.52, "grad_norm": 33.87299728393555, "kl": 0.0, "learning_rate": 2.423449358806595e-07, "logps/chosen": -227.81365966796875, "logps/rejected": -222.52450561523438, "loss": 0.3365, "rewards/chosen": -0.6963803768157959, "rewards/margins": 2.368621587753296, "rewards/rejected": -3.065001964569092, "step": 1969 }, { "epoch": 0.52, "grad_norm": 43.42044448852539, "kl": 0.0, "learning_rate": 2.422140800837477e-07, "logps/chosen": -228.2924041748047, "logps/rejected": -372.4189453125, "loss": 0.4293, "rewards/chosen": -0.8277515172958374, "rewards/margins": 1.1175405979156494, "rewards/rejected": -1.9452921152114868, "step": 1970 }, { "epoch": 0.52, "grad_norm": 33.60139846801758, "kl": 0.0, "learning_rate": 2.420832242868359e-07, "logps/chosen": -259.1628723144531, "logps/rejected": -381.32757568359375, "loss": 0.2342, "rewards/chosen": 1.377594232559204, "rewards/margins": 5.173920631408691, "rewards/rejected": -3.7963266372680664, "step": 1971 }, { "epoch": 0.52, "grad_norm": 33.57109069824219, "kl": 0.0, "learning_rate": 2.419523684899241e-07, "logps/chosen": -205.43441772460938, "logps/rejected": -240.46177673339844, "loss": 0.2658, "rewards/chosen": -0.9131128787994385, "rewards/margins": 2.3757729530334473, "rewards/rejected": -3.2888858318328857, "step": 1972 }, { "epoch": 0.52, "grad_norm": 42.60993576049805, "kl": 0.0, "learning_rate": 2.418215126930123e-07, "logps/chosen": -217.70196533203125, "logps/rejected": -162.82742309570312, "loss": 0.3996, "rewards/chosen": 0.12623104453086853, "rewards/margins": 0.7846701145172119, "rewards/rejected": -0.658439040184021, "step": 1973 }, { "epoch": 0.52, "grad_norm": 34.66673278808594, "kl": 0.0, "learning_rate": 2.416906568961005e-07, "logps/chosen": -217.13880920410156, "logps/rejected": -273.5459899902344, "loss": 0.3047, "rewards/chosen": 0.6674838066101074, "rewards/margins": 3.589686632156372, "rewards/rejected": -2.9222028255462646, "step": 1974 }, { "epoch": 0.52, "grad_norm": 32.35926055908203, "kl": 0.0, "learning_rate": 2.415598010991887e-07, "logps/chosen": -344.4316711425781, "logps/rejected": -287.30889892578125, "loss": 0.1567, "rewards/chosen": 2.5555834770202637, "rewards/margins": 7.409801959991455, "rewards/rejected": -4.854218482971191, "step": 1975 }, { "epoch": 0.52, "grad_norm": 29.469270706176758, "kl": 0.0, "learning_rate": 2.414289453022769e-07, "logps/chosen": -267.01593017578125, "logps/rejected": -216.3834686279297, "loss": 0.2763, "rewards/chosen": -0.6539653539657593, "rewards/margins": 0.95839524269104, "rewards/rejected": -1.6123605966567993, "step": 1976 }, { "epoch": 0.52, "grad_norm": 44.44554901123047, "kl": 0.0, "learning_rate": 2.412980895053651e-07, "logps/chosen": -265.7334899902344, "logps/rejected": -310.4556579589844, "loss": 0.2784, "rewards/chosen": -0.21513530611991882, "rewards/margins": 4.380249977111816, "rewards/rejected": -4.5953850746154785, "step": 1977 }, { "epoch": 0.52, "grad_norm": 25.45243263244629, "kl": 0.0, "learning_rate": 2.411672337084533e-07, "logps/chosen": -185.72653198242188, "logps/rejected": -275.7221374511719, "loss": 0.2038, "rewards/chosen": 1.7371035814285278, "rewards/margins": 5.086659908294678, "rewards/rejected": -3.3495562076568604, "step": 1978 }, { "epoch": 0.52, "grad_norm": 25.313249588012695, "kl": 0.0, "learning_rate": 2.410363779115415e-07, "logps/chosen": -195.99685668945312, "logps/rejected": -190.29600524902344, "loss": 0.1704, "rewards/chosen": 2.642261266708374, "rewards/margins": 6.334344387054443, "rewards/rejected": -3.6920831203460693, "step": 1979 }, { "epoch": 0.52, "grad_norm": 32.64925765991211, "kl": 0.0, "learning_rate": 2.4090552211462967e-07, "logps/chosen": -182.5782470703125, "logps/rejected": -299.02032470703125, "loss": 0.3535, "rewards/chosen": -0.3439632058143616, "rewards/margins": 3.923374652862549, "rewards/rejected": -4.267337799072266, "step": 1980 }, { "epoch": 0.52, "grad_norm": 26.917070388793945, "kl": 0.0, "learning_rate": 2.4077466631771787e-07, "logps/chosen": -154.9143829345703, "logps/rejected": -252.35838317871094, "loss": 0.354, "rewards/chosen": -0.6761507987976074, "rewards/margins": 5.603362083435059, "rewards/rejected": -6.279512882232666, "step": 1981 }, { "epoch": 0.52, "grad_norm": 39.749664306640625, "kl": 0.0, "learning_rate": 2.4064381052080606e-07, "logps/chosen": -228.64013671875, "logps/rejected": -169.25645446777344, "loss": 0.3207, "rewards/chosen": 0.6242986917495728, "rewards/margins": 2.633760929107666, "rewards/rejected": -2.009462356567383, "step": 1982 }, { "epoch": 0.52, "grad_norm": 40.571128845214844, "kl": 0.0, "learning_rate": 2.4051295472389426e-07, "logps/chosen": -202.51095581054688, "logps/rejected": -189.1254119873047, "loss": 0.366, "rewards/chosen": 0.6944351196289062, "rewards/margins": 2.6559300422668457, "rewards/rejected": -1.96149480342865, "step": 1983 }, { "epoch": 0.52, "grad_norm": 27.367752075195312, "kl": 0.0, "learning_rate": 2.4038209892698245e-07, "logps/chosen": -159.35598754882812, "logps/rejected": -217.9905548095703, "loss": 0.209, "rewards/chosen": 1.7716405391693115, "rewards/margins": 5.866267204284668, "rewards/rejected": -4.094626426696777, "step": 1984 }, { "epoch": 0.52, "grad_norm": 36.464874267578125, "kl": 0.0, "learning_rate": 2.4025124313007065e-07, "logps/chosen": -159.65773010253906, "logps/rejected": -226.83433532714844, "loss": 0.2122, "rewards/chosen": 1.215417742729187, "rewards/margins": 3.594834804534912, "rewards/rejected": -2.3794169425964355, "step": 1985 }, { "epoch": 0.52, "grad_norm": 36.60029220581055, "kl": 0.0, "learning_rate": 2.4012038733315885e-07, "logps/chosen": -210.5471649169922, "logps/rejected": -252.05821228027344, "loss": 0.3524, "rewards/chosen": -0.19484469294548035, "rewards/margins": 1.9677438735961914, "rewards/rejected": -2.162588596343994, "step": 1986 }, { "epoch": 0.52, "grad_norm": 39.36339569091797, "kl": 0.0, "learning_rate": 2.3998953153624704e-07, "logps/chosen": -206.82473754882812, "logps/rejected": -286.58221435546875, "loss": 0.3572, "rewards/chosen": 1.0052763223648071, "rewards/margins": 3.1620450019836426, "rewards/rejected": -2.156768798828125, "step": 1987 }, { "epoch": 0.52, "grad_norm": 40.26015090942383, "kl": 0.0, "learning_rate": 2.3985867573933524e-07, "logps/chosen": -249.87423706054688, "logps/rejected": -213.3260498046875, "loss": 0.2833, "rewards/chosen": -0.04886872321367264, "rewards/margins": 2.056130886077881, "rewards/rejected": -2.104999542236328, "step": 1988 }, { "epoch": 0.52, "grad_norm": 35.690330505371094, "kl": 0.0, "learning_rate": 2.3972781994242343e-07, "logps/chosen": -246.7486572265625, "logps/rejected": -248.2810821533203, "loss": 0.3034, "rewards/chosen": -0.03401753306388855, "rewards/margins": 1.7448437213897705, "rewards/rejected": -1.7788612842559814, "step": 1989 }, { "epoch": 0.52, "grad_norm": 29.125974655151367, "kl": 0.0, "learning_rate": 2.3959696414551163e-07, "logps/chosen": -183.20132446289062, "logps/rejected": -304.69183349609375, "loss": 0.1834, "rewards/chosen": 2.362597942352295, "rewards/margins": 5.717899322509766, "rewards/rejected": -3.3553013801574707, "step": 1990 }, { "epoch": 0.52, "grad_norm": 47.09006881713867, "kl": 0.0, "learning_rate": 2.3946610834859983e-07, "logps/chosen": -255.5098876953125, "logps/rejected": -304.6872863769531, "loss": 0.2865, "rewards/chosen": 0.13334283232688904, "rewards/margins": 4.940859794616699, "rewards/rejected": -4.807517051696777, "step": 1991 }, { "epoch": 0.52, "grad_norm": 30.52964210510254, "kl": 0.0, "learning_rate": 2.39335252551688e-07, "logps/chosen": -208.18450927734375, "logps/rejected": -293.5606384277344, "loss": 0.1683, "rewards/chosen": 1.081275224685669, "rewards/margins": 6.437618255615234, "rewards/rejected": -5.356342792510986, "step": 1992 }, { "epoch": 0.52, "grad_norm": 35.834815979003906, "kl": 0.0, "learning_rate": 2.392043967547762e-07, "logps/chosen": -233.533203125, "logps/rejected": -275.6778259277344, "loss": 0.2766, "rewards/chosen": 1.9862544536590576, "rewards/margins": 7.9831438064575195, "rewards/rejected": -5.996889591217041, "step": 1993 }, { "epoch": 0.52, "grad_norm": 41.54240798950195, "kl": 0.0, "learning_rate": 2.390735409578644e-07, "logps/chosen": -175.21755981445312, "logps/rejected": -177.35218811035156, "loss": 0.272, "rewards/chosen": 1.9858801364898682, "rewards/margins": 5.943943977355957, "rewards/rejected": -3.958063840866089, "step": 1994 }, { "epoch": 0.52, "grad_norm": 33.839927673339844, "kl": 0.0, "learning_rate": 2.389426851609526e-07, "logps/chosen": -187.34593200683594, "logps/rejected": -250.72268676757812, "loss": 0.3413, "rewards/chosen": 0.8257482051849365, "rewards/margins": 4.029740333557129, "rewards/rejected": -3.2039921283721924, "step": 1995 }, { "epoch": 0.52, "grad_norm": 34.77016830444336, "kl": 0.0, "learning_rate": 2.388118293640408e-07, "logps/chosen": -247.8369903564453, "logps/rejected": -240.98202514648438, "loss": 0.2067, "rewards/chosen": 2.7072157859802246, "rewards/margins": 6.077670097351074, "rewards/rejected": -3.3704545497894287, "step": 1996 }, { "epoch": 0.52, "grad_norm": 35.83280944824219, "kl": 0.0, "learning_rate": 2.38680973567129e-07, "logps/chosen": -189.8119354248047, "logps/rejected": -286.9617004394531, "loss": 0.2035, "rewards/chosen": 0.5847533345222473, "rewards/margins": 3.042118549346924, "rewards/rejected": -2.4573652744293213, "step": 1997 }, { "epoch": 0.52, "grad_norm": 30.720943450927734, "kl": 0.0, "learning_rate": 2.385501177702172e-07, "logps/chosen": -263.184814453125, "logps/rejected": -183.3404998779297, "loss": 0.3176, "rewards/chosen": -0.8872728943824768, "rewards/margins": 4.405872344970703, "rewards/rejected": -5.293145179748535, "step": 1998 }, { "epoch": 0.52, "grad_norm": 36.902984619140625, "kl": 0.0, "learning_rate": 2.384192619733054e-07, "logps/chosen": -342.3307800292969, "logps/rejected": -220.6918487548828, "loss": 0.2243, "rewards/chosen": 2.224860668182373, "rewards/margins": 7.383242130279541, "rewards/rejected": -5.158381462097168, "step": 1999 }, { "epoch": 0.52, "grad_norm": 31.698623657226562, "kl": 0.0, "learning_rate": 2.382884061763936e-07, "logps/chosen": -281.65863037109375, "logps/rejected": -207.4318084716797, "loss": 0.3269, "rewards/chosen": 2.86019229888916, "rewards/margins": 4.098602294921875, "rewards/rejected": -1.2384099960327148, "step": 2000 }, { "epoch": 0.52, "grad_norm": 35.31499099731445, "kl": 0.0, "learning_rate": 2.381575503794818e-07, "logps/chosen": -230.57676696777344, "logps/rejected": -317.56005859375, "loss": 0.25, "rewards/chosen": 1.5604783296585083, "rewards/margins": 4.952313423156738, "rewards/rejected": -3.3918352127075195, "step": 2001 }, { "epoch": 0.52, "grad_norm": 31.48984146118164, "kl": 0.0, "learning_rate": 2.3802669458256998e-07, "logps/chosen": -212.90480041503906, "logps/rejected": -433.2575988769531, "loss": 0.2678, "rewards/chosen": 0.2492951601743698, "rewards/margins": 3.0489494800567627, "rewards/rejected": -2.799654245376587, "step": 2002 }, { "epoch": 0.52, "grad_norm": 41.93503952026367, "kl": 0.0, "learning_rate": 2.3789583878565818e-07, "logps/chosen": -235.5082244873047, "logps/rejected": -195.15249633789062, "loss": 0.3205, "rewards/chosen": 0.1757785975933075, "rewards/margins": 1.9791057109832764, "rewards/rejected": -1.8033270835876465, "step": 2003 }, { "epoch": 0.52, "grad_norm": 24.260042190551758, "kl": 0.0, "learning_rate": 2.377649829887464e-07, "logps/chosen": -269.15264892578125, "logps/rejected": -240.45811462402344, "loss": 0.2631, "rewards/chosen": 0.8864780068397522, "rewards/margins": 4.7506513595581055, "rewards/rejected": -3.864173173904419, "step": 2004 }, { "epoch": 0.52, "grad_norm": 24.483179092407227, "kl": 0.0, "learning_rate": 2.376341271918346e-07, "logps/chosen": -153.36785888671875, "logps/rejected": -258.3946838378906, "loss": 0.2444, "rewards/chosen": -0.7696568369865417, "rewards/margins": 3.203350067138672, "rewards/rejected": -3.9730069637298584, "step": 2005 }, { "epoch": 0.52, "grad_norm": 39.20796203613281, "kl": 0.0, "learning_rate": 2.375032713949228e-07, "logps/chosen": -132.20347595214844, "logps/rejected": -303.88922119140625, "loss": 0.2446, "rewards/chosen": 0.35151904821395874, "rewards/margins": 3.453958511352539, "rewards/rejected": -3.1024394035339355, "step": 2006 }, { "epoch": 0.53, "grad_norm": 31.88499641418457, "kl": 0.0, "learning_rate": 2.37372415598011e-07, "logps/chosen": -227.24574279785156, "logps/rejected": -222.66433715820312, "loss": 0.297, "rewards/chosen": -1.9475051164627075, "rewards/margins": 1.7535721063613892, "rewards/rejected": -3.7010772228240967, "step": 2007 }, { "epoch": 0.53, "grad_norm": 36.38870620727539, "kl": 0.0, "learning_rate": 2.3724155980109916e-07, "logps/chosen": -213.55064392089844, "logps/rejected": -368.173095703125, "loss": 0.2498, "rewards/chosen": 0.9353142380714417, "rewards/margins": 6.506036758422852, "rewards/rejected": -5.570722579956055, "step": 2008 }, { "epoch": 0.53, "grad_norm": 34.4116325378418, "kl": 0.0, "learning_rate": 2.3711070400418736e-07, "logps/chosen": -186.77804565429688, "logps/rejected": -120.74478912353516, "loss": 0.3031, "rewards/chosen": 0.9410187005996704, "rewards/margins": 3.602015972137451, "rewards/rejected": -2.660997152328491, "step": 2009 }, { "epoch": 0.53, "grad_norm": 34.9754753112793, "kl": 0.0, "learning_rate": 2.3697984820727558e-07, "logps/chosen": -235.28622436523438, "logps/rejected": -229.31048583984375, "loss": 0.275, "rewards/chosen": 0.7856313586235046, "rewards/margins": 3.691113233566284, "rewards/rejected": -2.9054818153381348, "step": 2010 }, { "epoch": 0.53, "grad_norm": 32.74319076538086, "kl": 0.0, "learning_rate": 2.3684899241036378e-07, "logps/chosen": -200.742431640625, "logps/rejected": -311.3709411621094, "loss": 0.2691, "rewards/chosen": 0.4585956931114197, "rewards/margins": 3.353062152862549, "rewards/rejected": -2.8944664001464844, "step": 2011 }, { "epoch": 0.53, "grad_norm": 38.48353958129883, "kl": 0.0, "learning_rate": 2.3671813661345197e-07, "logps/chosen": -240.978759765625, "logps/rejected": -241.54025268554688, "loss": 0.2153, "rewards/chosen": 1.544427514076233, "rewards/margins": 5.172610759735107, "rewards/rejected": -3.628183126449585, "step": 2012 }, { "epoch": 0.53, "grad_norm": 29.140609741210938, "kl": 0.0, "learning_rate": 2.3658728081654017e-07, "logps/chosen": -243.23280334472656, "logps/rejected": -175.568115234375, "loss": 0.3854, "rewards/chosen": -0.7309738397598267, "rewards/margins": 1.0130811929702759, "rewards/rejected": -1.7440550327301025, "step": 2013 }, { "epoch": 0.53, "grad_norm": 36.87703323364258, "kl": 0.0, "learning_rate": 2.3645642501962836e-07, "logps/chosen": -300.458251953125, "logps/rejected": -286.5090026855469, "loss": 0.3353, "rewards/chosen": 1.6609442234039307, "rewards/margins": 4.945582389831543, "rewards/rejected": -3.2846384048461914, "step": 2014 }, { "epoch": 0.53, "grad_norm": 46.88383865356445, "kl": 0.0, "learning_rate": 2.3632556922271653e-07, "logps/chosen": -124.17665100097656, "logps/rejected": -381.4186096191406, "loss": 0.4301, "rewards/chosen": -0.28269121050834656, "rewards/margins": 2.143458127975464, "rewards/rejected": -2.426149368286133, "step": 2015 }, { "epoch": 0.53, "grad_norm": 30.590614318847656, "kl": 0.0, "learning_rate": 2.3619471342580473e-07, "logps/chosen": -165.1231689453125, "logps/rejected": -216.70668029785156, "loss": 0.363, "rewards/chosen": -0.5194066166877747, "rewards/margins": 3.826328992843628, "rewards/rejected": -4.345735549926758, "step": 2016 }, { "epoch": 0.53, "grad_norm": 28.982065200805664, "kl": 0.0, "learning_rate": 2.3606385762889295e-07, "logps/chosen": -106.90189361572266, "logps/rejected": -241.30169677734375, "loss": 0.2629, "rewards/chosen": 1.0548369884490967, "rewards/margins": 2.681565999984741, "rewards/rejected": -1.6267290115356445, "step": 2017 }, { "epoch": 0.53, "grad_norm": 40.08494186401367, "kl": 0.0, "learning_rate": 2.3593300183198115e-07, "logps/chosen": -279.9969482421875, "logps/rejected": -170.47064208984375, "loss": 0.2288, "rewards/chosen": 2.9944510459899902, "rewards/margins": 5.17792272567749, "rewards/rejected": -2.1834716796875, "step": 2018 }, { "epoch": 0.53, "grad_norm": 59.07280349731445, "kl": 0.0, "learning_rate": 2.3580214603506934e-07, "logps/chosen": -207.0133514404297, "logps/rejected": -319.8271484375, "loss": 0.3363, "rewards/chosen": 0.8179835081100464, "rewards/margins": 3.3634209632873535, "rewards/rejected": -2.5454375743865967, "step": 2019 }, { "epoch": 0.53, "grad_norm": 34.716835021972656, "kl": 0.0, "learning_rate": 2.3567129023815754e-07, "logps/chosen": -285.7257080078125, "logps/rejected": -236.57012939453125, "loss": 0.3969, "rewards/chosen": -0.056455135345458984, "rewards/margins": 3.922239065170288, "rewards/rejected": -3.978694200515747, "step": 2020 }, { "epoch": 0.53, "grad_norm": 30.732534408569336, "kl": 0.0, "learning_rate": 2.3554043444124574e-07, "logps/chosen": -191.0402069091797, "logps/rejected": -263.2533264160156, "loss": 0.2471, "rewards/chosen": 0.4087807834148407, "rewards/margins": 4.085519790649414, "rewards/rejected": -3.67673921585083, "step": 2021 }, { "epoch": 0.53, "grad_norm": 30.43977165222168, "kl": 0.0, "learning_rate": 2.3540957864433393e-07, "logps/chosen": -137.8092498779297, "logps/rejected": -281.7326965332031, "loss": 0.1874, "rewards/chosen": 0.5577578544616699, "rewards/margins": 4.101840496063232, "rewards/rejected": -3.5440826416015625, "step": 2022 }, { "epoch": 0.53, "grad_norm": 34.099342346191406, "kl": 0.0, "learning_rate": 2.3527872284742213e-07, "logps/chosen": -203.07217407226562, "logps/rejected": -312.94195556640625, "loss": 0.381, "rewards/chosen": 0.886745810508728, "rewards/margins": 2.430570602416992, "rewards/rejected": -1.5438249111175537, "step": 2023 }, { "epoch": 0.53, "grad_norm": 35.84495162963867, "kl": 0.0, "learning_rate": 2.3514786705051032e-07, "logps/chosen": -228.089111328125, "logps/rejected": -355.3266296386719, "loss": 0.1612, "rewards/chosen": 0.7639932036399841, "rewards/margins": 4.759679317474365, "rewards/rejected": -3.9956860542297363, "step": 2024 }, { "epoch": 0.53, "grad_norm": 30.479677200317383, "kl": 0.0, "learning_rate": 2.3501701125359852e-07, "logps/chosen": -221.3619842529297, "logps/rejected": -189.91445922851562, "loss": 0.1739, "rewards/chosen": 1.7698726654052734, "rewards/margins": 4.553598403930664, "rewards/rejected": -2.7837259769439697, "step": 2025 }, { "epoch": 0.53, "grad_norm": 30.979618072509766, "kl": 0.0, "learning_rate": 2.3488615545668672e-07, "logps/chosen": -182.97657775878906, "logps/rejected": -242.29254150390625, "loss": 0.2574, "rewards/chosen": 2.3239691257476807, "rewards/margins": 6.539175033569336, "rewards/rejected": -4.215206146240234, "step": 2026 }, { "epoch": 0.53, "grad_norm": 34.30022430419922, "kl": 0.0, "learning_rate": 2.347552996597749e-07, "logps/chosen": -222.5321807861328, "logps/rejected": -224.00125122070312, "loss": 0.2416, "rewards/chosen": 0.36926016211509705, "rewards/margins": 4.0446457862854, "rewards/rejected": -3.6753857135772705, "step": 2027 }, { "epoch": 0.53, "grad_norm": 31.23023223876953, "kl": 0.0, "learning_rate": 2.346244438628631e-07, "logps/chosen": -190.40428161621094, "logps/rejected": -179.3951873779297, "loss": 0.2639, "rewards/chosen": 2.5482754707336426, "rewards/margins": 5.174324989318848, "rewards/rejected": -2.626049757003784, "step": 2028 }, { "epoch": 0.53, "grad_norm": 40.01976013183594, "kl": 0.0, "learning_rate": 2.3449358806595133e-07, "logps/chosen": -181.07247924804688, "logps/rejected": -252.67135620117188, "loss": 0.2582, "rewards/chosen": 1.9590002298355103, "rewards/margins": 4.202696323394775, "rewards/rejected": -2.2436962127685547, "step": 2029 }, { "epoch": 0.53, "grad_norm": 32.337642669677734, "kl": 0.0, "learning_rate": 2.3436273226903953e-07, "logps/chosen": -259.14239501953125, "logps/rejected": -253.57275390625, "loss": 0.3372, "rewards/chosen": 0.023569345474243164, "rewards/margins": 3.4797098636627197, "rewards/rejected": -3.4561405181884766, "step": 2030 }, { "epoch": 0.53, "grad_norm": 36.94684982299805, "kl": 0.0, "learning_rate": 2.342318764721277e-07, "logps/chosen": -193.77459716796875, "logps/rejected": -276.48101806640625, "loss": 0.3417, "rewards/chosen": 1.8443284034729004, "rewards/margins": 5.706932067871094, "rewards/rejected": -3.8626036643981934, "step": 2031 }, { "epoch": 0.53, "grad_norm": 26.014928817749023, "kl": 0.0, "learning_rate": 2.341010206752159e-07, "logps/chosen": -182.404296875, "logps/rejected": -355.6706848144531, "loss": 0.2146, "rewards/chosen": 1.1796051263809204, "rewards/margins": 5.5185322761535645, "rewards/rejected": -4.338927268981934, "step": 2032 }, { "epoch": 0.53, "grad_norm": 31.432388305664062, "kl": 0.0, "learning_rate": 2.339701648783041e-07, "logps/chosen": -187.7768096923828, "logps/rejected": -205.853759765625, "loss": 0.21, "rewards/chosen": 1.4805999994277954, "rewards/margins": 4.251741409301758, "rewards/rejected": -2.771141529083252, "step": 2033 }, { "epoch": 0.53, "grad_norm": 38.272300720214844, "kl": 0.0, "learning_rate": 2.3383930908139229e-07, "logps/chosen": -227.22837829589844, "logps/rejected": -221.27516174316406, "loss": 0.3342, "rewards/chosen": 0.6269685626029968, "rewards/margins": 2.780792474746704, "rewards/rejected": -2.1538238525390625, "step": 2034 }, { "epoch": 0.53, "grad_norm": 30.997676849365234, "kl": 0.0, "learning_rate": 2.3370845328448048e-07, "logps/chosen": -223.91384887695312, "logps/rejected": -177.48019409179688, "loss": 0.2491, "rewards/chosen": -0.05373985320329666, "rewards/margins": 2.6699798107147217, "rewards/rejected": -2.723719596862793, "step": 2035 }, { "epoch": 0.53, "grad_norm": 37.9314079284668, "kl": 0.0, "learning_rate": 2.335775974875687e-07, "logps/chosen": -276.69659423828125, "logps/rejected": -320.3892517089844, "loss": 0.3087, "rewards/chosen": 1.2256563901901245, "rewards/margins": 4.2694549560546875, "rewards/rejected": -3.0437984466552734, "step": 2036 }, { "epoch": 0.53, "grad_norm": 30.487159729003906, "kl": 0.0, "learning_rate": 2.334467416906569e-07, "logps/chosen": -151.29331970214844, "logps/rejected": -225.7119598388672, "loss": 0.2399, "rewards/chosen": 1.0992461442947388, "rewards/margins": 4.55507230758667, "rewards/rejected": -3.4558262825012207, "step": 2037 }, { "epoch": 0.53, "grad_norm": 33.59539031982422, "kl": 0.0, "learning_rate": 2.333158858937451e-07, "logps/chosen": -179.46453857421875, "logps/rejected": -236.49371337890625, "loss": 0.3458, "rewards/chosen": 0.380405068397522, "rewards/margins": 3.274649143218994, "rewards/rejected": -2.8942441940307617, "step": 2038 }, { "epoch": 0.53, "grad_norm": 26.793468475341797, "kl": 0.0, "learning_rate": 2.3318503009683327e-07, "logps/chosen": -223.2361602783203, "logps/rejected": -212.54644775390625, "loss": 0.2385, "rewards/chosen": 1.6074248552322388, "rewards/margins": 3.5587990283966064, "rewards/rejected": -1.9513741731643677, "step": 2039 }, { "epoch": 0.53, "grad_norm": 25.713003158569336, "kl": 0.0, "learning_rate": 2.3305417429992146e-07, "logps/chosen": -252.0117645263672, "logps/rejected": -209.342529296875, "loss": 0.2839, "rewards/chosen": -0.6636760830879211, "rewards/margins": 3.629101037979126, "rewards/rejected": -4.292777061462402, "step": 2040 }, { "epoch": 0.53, "grad_norm": 26.062259674072266, "kl": 0.0, "learning_rate": 2.3292331850300966e-07, "logps/chosen": -199.529296875, "logps/rejected": -284.480712890625, "loss": 0.1446, "rewards/chosen": 3.976663112640381, "rewards/margins": 10.23494815826416, "rewards/rejected": -6.258285045623779, "step": 2041 }, { "epoch": 0.53, "grad_norm": 40.40095520019531, "kl": 0.0, "learning_rate": 2.3279246270609788e-07, "logps/chosen": -225.0578155517578, "logps/rejected": -151.56898498535156, "loss": 0.306, "rewards/chosen": -0.09351640939712524, "rewards/margins": 3.9907732009887695, "rewards/rejected": -4.08428955078125, "step": 2042 }, { "epoch": 0.53, "grad_norm": 43.919803619384766, "kl": 0.0, "learning_rate": 2.3266160690918608e-07, "logps/chosen": -231.98822021484375, "logps/rejected": -263.60198974609375, "loss": 0.3163, "rewards/chosen": 0.3000814914703369, "rewards/margins": 5.877663612365723, "rewards/rejected": -5.577582359313965, "step": 2043 }, { "epoch": 0.53, "grad_norm": 39.102787017822266, "kl": 0.0, "learning_rate": 2.3253075111227427e-07, "logps/chosen": -257.454833984375, "logps/rejected": -203.32229614257812, "loss": 0.2321, "rewards/chosen": 0.9678674936294556, "rewards/margins": 5.673610210418701, "rewards/rejected": -4.705742835998535, "step": 2044 }, { "epoch": 0.54, "grad_norm": 25.78129005432129, "kl": 0.0, "learning_rate": 2.3239989531536247e-07, "logps/chosen": -182.70339965820312, "logps/rejected": -234.2674102783203, "loss": 0.1706, "rewards/chosen": 1.4865150451660156, "rewards/margins": 5.233919620513916, "rewards/rejected": -3.7474045753479004, "step": 2045 }, { "epoch": 0.54, "grad_norm": 35.66816329956055, "kl": 0.0, "learning_rate": 2.3226903951845064e-07, "logps/chosen": -217.499267578125, "logps/rejected": -160.31105041503906, "loss": 0.309, "rewards/chosen": -0.4123508632183075, "rewards/margins": 2.4263663291931152, "rewards/rejected": -2.838717222213745, "step": 2046 }, { "epoch": 0.54, "grad_norm": 51.080989837646484, "kl": 0.0, "learning_rate": 2.3213818372153883e-07, "logps/chosen": -257.92095947265625, "logps/rejected": -225.64663696289062, "loss": 0.3738, "rewards/chosen": 0.7672978639602661, "rewards/margins": 2.8424081802368164, "rewards/rejected": -2.0751101970672607, "step": 2047 }, { "epoch": 0.54, "grad_norm": 36.70692443847656, "kl": 0.0, "learning_rate": 2.3200732792462706e-07, "logps/chosen": -291.3275146484375, "logps/rejected": -155.7629852294922, "loss": 0.366, "rewards/chosen": 1.1201237440109253, "rewards/margins": 2.4310898780822754, "rewards/rejected": -1.3109660148620605, "step": 2048 }, { "epoch": 0.54, "grad_norm": 40.8702392578125, "kl": 0.0, "learning_rate": 2.3187647212771525e-07, "logps/chosen": -267.50341796875, "logps/rejected": -201.24693298339844, "loss": 0.3107, "rewards/chosen": 1.1985299587249756, "rewards/margins": 4.256721496582031, "rewards/rejected": -3.0581917762756348, "step": 2049 }, { "epoch": 0.54, "grad_norm": 51.903621673583984, "kl": 0.0, "learning_rate": 2.3174561633080345e-07, "logps/chosen": -294.9173278808594, "logps/rejected": -271.4747314453125, "loss": 0.2116, "rewards/chosen": 0.5645659565925598, "rewards/margins": 3.08552885055542, "rewards/rejected": -2.520962953567505, "step": 2050 }, { "epoch": 0.54, "grad_norm": 36.193016052246094, "kl": 0.0, "learning_rate": 2.3161476053389165e-07, "logps/chosen": -345.0677490234375, "logps/rejected": -302.47991943359375, "loss": 0.1589, "rewards/chosen": 2.930142641067505, "rewards/margins": 6.179523468017578, "rewards/rejected": -3.2493808269500732, "step": 2051 }, { "epoch": 0.54, "grad_norm": 35.836212158203125, "kl": 0.0, "learning_rate": 2.3148390473697984e-07, "logps/chosen": -274.1582336425781, "logps/rejected": -256.15203857421875, "loss": 0.357, "rewards/chosen": 0.9071843028068542, "rewards/margins": 2.8665997982025146, "rewards/rejected": -1.9594155550003052, "step": 2052 }, { "epoch": 0.54, "grad_norm": 33.03522872924805, "kl": 0.0, "learning_rate": 2.3135304894006804e-07, "logps/chosen": -148.79110717773438, "logps/rejected": -246.0092315673828, "loss": 0.2571, "rewards/chosen": 1.2804070711135864, "rewards/margins": 2.9049646854400635, "rewards/rejected": -1.624557614326477, "step": 2053 }, { "epoch": 0.54, "grad_norm": 26.965436935424805, "kl": 0.0, "learning_rate": 2.312221931431562e-07, "logps/chosen": -234.40638732910156, "logps/rejected": -242.86436462402344, "loss": 0.1664, "rewards/chosen": 0.8637860417366028, "rewards/margins": 4.613651275634766, "rewards/rejected": -3.7498650550842285, "step": 2054 }, { "epoch": 0.54, "grad_norm": 37.29470443725586, "kl": 0.0, "learning_rate": 2.3109133734624443e-07, "logps/chosen": -172.05210876464844, "logps/rejected": -238.47952270507812, "loss": 0.2839, "rewards/chosen": 0.9011194705963135, "rewards/margins": 3.0700299739837646, "rewards/rejected": -2.168910503387451, "step": 2055 }, { "epoch": 0.54, "grad_norm": 25.493192672729492, "kl": 0.0, "learning_rate": 2.3096048154933263e-07, "logps/chosen": -215.99545288085938, "logps/rejected": -243.0108184814453, "loss": 0.3314, "rewards/chosen": 0.6691614389419556, "rewards/margins": 4.748775959014893, "rewards/rejected": -4.079614639282227, "step": 2056 }, { "epoch": 0.54, "grad_norm": 39.543739318847656, "kl": 0.0, "learning_rate": 2.3082962575242082e-07, "logps/chosen": -262.2772521972656, "logps/rejected": -238.32432556152344, "loss": 0.4089, "rewards/chosen": -0.20746786892414093, "rewards/margins": 1.0184450149536133, "rewards/rejected": -1.2259129285812378, "step": 2057 }, { "epoch": 0.54, "grad_norm": 41.07046890258789, "kl": 0.0, "learning_rate": 2.3069876995550902e-07, "logps/chosen": -245.2674560546875, "logps/rejected": -271.6712646484375, "loss": 0.337, "rewards/chosen": 0.7155461311340332, "rewards/margins": 3.8527755737304688, "rewards/rejected": -3.1372294425964355, "step": 2058 }, { "epoch": 0.54, "grad_norm": 34.27461624145508, "kl": 0.0, "learning_rate": 2.3056791415859721e-07, "logps/chosen": -137.6658935546875, "logps/rejected": -287.7002868652344, "loss": 0.3097, "rewards/chosen": 0.8914886116981506, "rewards/margins": 4.4790730476379395, "rewards/rejected": -3.5875844955444336, "step": 2059 }, { "epoch": 0.54, "grad_norm": 34.30635452270508, "kl": 0.0, "learning_rate": 2.304370583616854e-07, "logps/chosen": -132.6312255859375, "logps/rejected": -329.7675476074219, "loss": 0.3, "rewards/chosen": 0.9626002311706543, "rewards/margins": 3.7751975059509277, "rewards/rejected": -2.8125972747802734, "step": 2060 }, { "epoch": 0.54, "grad_norm": 30.666933059692383, "kl": 0.0, "learning_rate": 2.3030620256477363e-07, "logps/chosen": -176.3401336669922, "logps/rejected": -278.4073486328125, "loss": 0.2786, "rewards/chosen": 0.10558772087097168, "rewards/margins": 2.623535394668579, "rewards/rejected": -2.5179476737976074, "step": 2061 }, { "epoch": 0.54, "grad_norm": 27.88566780090332, "kl": 0.0, "learning_rate": 2.301753467678618e-07, "logps/chosen": -157.7092742919922, "logps/rejected": -206.13185119628906, "loss": 0.3859, "rewards/chosen": 1.1426632404327393, "rewards/margins": 2.981151580810547, "rewards/rejected": -1.8384883403778076, "step": 2062 }, { "epoch": 0.54, "grad_norm": 30.36982536315918, "kl": 0.0, "learning_rate": 2.3004449097095e-07, "logps/chosen": -191.57711791992188, "logps/rejected": -152.2157440185547, "loss": 0.2586, "rewards/chosen": 1.6766765117645264, "rewards/margins": 3.6210598945617676, "rewards/rejected": -1.9443833827972412, "step": 2063 }, { "epoch": 0.54, "grad_norm": 38.566322326660156, "kl": 0.0, "learning_rate": 2.299136351740382e-07, "logps/chosen": -198.1544189453125, "logps/rejected": -227.01773071289062, "loss": 0.3586, "rewards/chosen": 0.406218022108078, "rewards/margins": 3.210958957672119, "rewards/rejected": -2.8047409057617188, "step": 2064 }, { "epoch": 0.54, "grad_norm": 36.628318786621094, "kl": 0.0, "learning_rate": 2.297827793771264e-07, "logps/chosen": -145.6483154296875, "logps/rejected": -219.84780883789062, "loss": 0.3084, "rewards/chosen": 0.7460767030715942, "rewards/margins": 3.5765395164489746, "rewards/rejected": -2.830462694168091, "step": 2065 }, { "epoch": 0.54, "grad_norm": 23.9617919921875, "kl": 0.0, "learning_rate": 2.296519235802146e-07, "logps/chosen": -118.81352996826172, "logps/rejected": -207.17298889160156, "loss": 0.3238, "rewards/chosen": 0.7397387623786926, "rewards/margins": 2.863344192504883, "rewards/rejected": -2.123605489730835, "step": 2066 }, { "epoch": 0.54, "grad_norm": 40.05659866333008, "kl": 0.0, "learning_rate": 2.295210677833028e-07, "logps/chosen": -155.1406707763672, "logps/rejected": -207.32810974121094, "loss": 0.2911, "rewards/chosen": 0.2956763803958893, "rewards/margins": 2.1852457523345947, "rewards/rejected": -1.8895692825317383, "step": 2067 }, { "epoch": 0.54, "grad_norm": 38.00597381591797, "kl": 0.0, "learning_rate": 2.29390211986391e-07, "logps/chosen": -222.1738739013672, "logps/rejected": -213.74107360839844, "loss": 0.2449, "rewards/chosen": 1.4015142917633057, "rewards/margins": 3.836189031600952, "rewards/rejected": -2.4346747398376465, "step": 2068 }, { "epoch": 0.54, "grad_norm": 39.29631805419922, "kl": 0.0, "learning_rate": 2.292593561894792e-07, "logps/chosen": -289.01568603515625, "logps/rejected": -258.3611755371094, "loss": 0.3935, "rewards/chosen": -0.3644639253616333, "rewards/margins": 3.4178175926208496, "rewards/rejected": -3.7822813987731934, "step": 2069 }, { "epoch": 0.54, "grad_norm": 32.35689163208008, "kl": 0.0, "learning_rate": 2.2912850039256737e-07, "logps/chosen": -187.1671142578125, "logps/rejected": -373.15692138671875, "loss": 0.3169, "rewards/chosen": 0.5017662644386292, "rewards/margins": 3.3714005947113037, "rewards/rejected": -2.8696343898773193, "step": 2070 }, { "epoch": 0.54, "grad_norm": 32.022483825683594, "kl": 0.0, "learning_rate": 2.2899764459565557e-07, "logps/chosen": -270.5025634765625, "logps/rejected": -257.8646240234375, "loss": 0.3022, "rewards/chosen": 0.6621522903442383, "rewards/margins": 4.498281478881836, "rewards/rejected": -3.8361291885375977, "step": 2071 }, { "epoch": 0.54, "grad_norm": 28.791711807250977, "kl": 0.0, "learning_rate": 2.2886678879874376e-07, "logps/chosen": -190.7532501220703, "logps/rejected": -294.8187561035156, "loss": 0.3575, "rewards/chosen": -1.2482961416244507, "rewards/margins": 0.8909574747085571, "rewards/rejected": -2.139253616333008, "step": 2072 }, { "epoch": 0.54, "grad_norm": 34.729156494140625, "kl": 0.0, "learning_rate": 2.2873593300183196e-07, "logps/chosen": -209.54147338867188, "logps/rejected": -211.24008178710938, "loss": 0.3454, "rewards/chosen": 2.8879120349884033, "rewards/margins": 4.048269748687744, "rewards/rejected": -1.1603578329086304, "step": 2073 }, { "epoch": 0.54, "grad_norm": 34.23164367675781, "kl": 0.0, "learning_rate": 2.2860507720492018e-07, "logps/chosen": -121.15304565429688, "logps/rejected": -223.7058868408203, "loss": 0.282, "rewards/chosen": 0.3577978014945984, "rewards/margins": 2.298954486846924, "rewards/rejected": -1.9411567449569702, "step": 2074 }, { "epoch": 0.54, "grad_norm": 30.46665382385254, "kl": 0.0, "learning_rate": 2.2847422140800838e-07, "logps/chosen": -169.19219970703125, "logps/rejected": -232.3114013671875, "loss": 0.3508, "rewards/chosen": 0.5962676405906677, "rewards/margins": 2.8509838581085205, "rewards/rejected": -2.254716157913208, "step": 2075 }, { "epoch": 0.54, "grad_norm": 24.444530487060547, "kl": 0.0, "learning_rate": 2.2834336561109657e-07, "logps/chosen": -166.1984100341797, "logps/rejected": -266.15338134765625, "loss": 0.1984, "rewards/chosen": 1.1371859312057495, "rewards/margins": 5.206028938293457, "rewards/rejected": -4.068842887878418, "step": 2076 }, { "epoch": 0.54, "grad_norm": 43.3023796081543, "kl": 0.0, "learning_rate": 2.2821250981418474e-07, "logps/chosen": -162.20846557617188, "logps/rejected": -226.7047882080078, "loss": 0.2923, "rewards/chosen": 0.9513958692550659, "rewards/margins": 2.763209581375122, "rewards/rejected": -1.8118137121200562, "step": 2077 }, { "epoch": 0.54, "grad_norm": 43.34999465942383, "kl": 0.0, "learning_rate": 2.2808165401727294e-07, "logps/chosen": -253.7343292236328, "logps/rejected": -234.84107971191406, "loss": 0.2351, "rewards/chosen": 0.7632766962051392, "rewards/margins": 4.0219950675964355, "rewards/rejected": -3.258718490600586, "step": 2078 }, { "epoch": 0.54, "grad_norm": 32.680931091308594, "kl": 0.0, "learning_rate": 2.2795079822036114e-07, "logps/chosen": -121.70050048828125, "logps/rejected": -294.01043701171875, "loss": 0.3483, "rewards/chosen": 1.3794186115264893, "rewards/margins": 3.346170663833618, "rewards/rejected": -1.966752052307129, "step": 2079 }, { "epoch": 0.54, "grad_norm": 33.10261917114258, "kl": 0.0, "learning_rate": 2.2781994242344936e-07, "logps/chosen": -243.3766632080078, "logps/rejected": -152.2445526123047, "loss": 0.2724, "rewards/chosen": 1.2106717824935913, "rewards/margins": 4.09446907043457, "rewards/rejected": -2.8837971687316895, "step": 2080 }, { "epoch": 0.54, "grad_norm": 35.59720230102539, "kl": 0.0, "learning_rate": 2.2768908662653755e-07, "logps/chosen": -259.7003173828125, "logps/rejected": -189.43997192382812, "loss": 0.255, "rewards/chosen": 0.9261613488197327, "rewards/margins": 3.8873603343963623, "rewards/rejected": -2.9611990451812744, "step": 2081 }, { "epoch": 0.54, "grad_norm": 37.25908660888672, "kl": 0.0, "learning_rate": 2.2755823082962575e-07, "logps/chosen": -278.11016845703125, "logps/rejected": -288.7530517578125, "loss": 0.2594, "rewards/chosen": 2.6176414489746094, "rewards/margins": 7.874884605407715, "rewards/rejected": -5.2572431564331055, "step": 2082 }, { "epoch": 0.55, "grad_norm": 42.3132438659668, "kl": 0.0, "learning_rate": 2.2742737503271395e-07, "logps/chosen": -223.02183532714844, "logps/rejected": -247.94920349121094, "loss": 0.2972, "rewards/chosen": 0.5573528409004211, "rewards/margins": 4.124536991119385, "rewards/rejected": -3.5671842098236084, "step": 2083 }, { "epoch": 0.55, "grad_norm": 43.50983810424805, "kl": 0.0, "learning_rate": 2.2729651923580214e-07, "logps/chosen": -209.89193725585938, "logps/rejected": -259.04034423828125, "loss": 0.3302, "rewards/chosen": -0.0854308009147644, "rewards/margins": 4.501287937164307, "rewards/rejected": -4.586718559265137, "step": 2084 }, { "epoch": 0.55, "grad_norm": 34.85049057006836, "kl": 0.0, "learning_rate": 2.271656634388903e-07, "logps/chosen": -175.64346313476562, "logps/rejected": -311.0813293457031, "loss": 0.1309, "rewards/chosen": 0.5914903283119202, "rewards/margins": 6.049227714538574, "rewards/rejected": -5.457737445831299, "step": 2085 }, { "epoch": 0.55, "grad_norm": 32.070823669433594, "kl": 0.0, "learning_rate": 2.270348076419785e-07, "logps/chosen": -309.7897033691406, "logps/rejected": -279.0724792480469, "loss": 0.2345, "rewards/chosen": -0.2845461070537567, "rewards/margins": 2.9219210147857666, "rewards/rejected": -3.2064671516418457, "step": 2086 }, { "epoch": 0.55, "grad_norm": 36.27039337158203, "kl": 0.0, "learning_rate": 2.2690395184506673e-07, "logps/chosen": -227.92718505859375, "logps/rejected": -302.98028564453125, "loss": 0.3127, "rewards/chosen": 2.268153190612793, "rewards/margins": 4.073305130004883, "rewards/rejected": -1.8051517009735107, "step": 2087 }, { "epoch": 0.55, "grad_norm": 28.964229583740234, "kl": 0.0, "learning_rate": 2.2677309604815493e-07, "logps/chosen": -165.77523803710938, "logps/rejected": -276.9595031738281, "loss": 0.2121, "rewards/chosen": 1.7321010828018188, "rewards/margins": 5.551499843597412, "rewards/rejected": -3.819398880004883, "step": 2088 }, { "epoch": 0.55, "grad_norm": 41.056583404541016, "kl": 0.0, "learning_rate": 2.2664224025124312e-07, "logps/chosen": -261.00238037109375, "logps/rejected": -283.1644287109375, "loss": 0.3032, "rewards/chosen": -0.3032938838005066, "rewards/margins": 3.9948885440826416, "rewards/rejected": -4.298182487487793, "step": 2089 }, { "epoch": 0.55, "grad_norm": 38.75579071044922, "kl": 0.0, "learning_rate": 2.2651138445433132e-07, "logps/chosen": -139.2954864501953, "logps/rejected": -168.7962646484375, "loss": 0.3067, "rewards/chosen": 0.5684501528739929, "rewards/margins": 3.4893176555633545, "rewards/rejected": -2.920867443084717, "step": 2090 }, { "epoch": 0.55, "grad_norm": 30.218616485595703, "kl": 0.0, "learning_rate": 2.2638052865741952e-07, "logps/chosen": -153.24710083007812, "logps/rejected": -190.56497192382812, "loss": 0.2842, "rewards/chosen": 1.887859582901001, "rewards/margins": 4.847748756408691, "rewards/rejected": -2.9598894119262695, "step": 2091 }, { "epoch": 0.55, "grad_norm": 37.584842681884766, "kl": 0.0, "learning_rate": 2.262496728605077e-07, "logps/chosen": -269.6733703613281, "logps/rejected": -246.76669311523438, "loss": 0.274, "rewards/chosen": 0.8811774253845215, "rewards/margins": 4.343387603759766, "rewards/rejected": -3.462209939956665, "step": 2092 }, { "epoch": 0.55, "grad_norm": 31.532167434692383, "kl": 0.0, "learning_rate": 2.261188170635959e-07, "logps/chosen": -225.5703887939453, "logps/rejected": -302.420654296875, "loss": 0.2572, "rewards/chosen": -0.5330899357795715, "rewards/margins": 3.7925503253936768, "rewards/rejected": -4.3256402015686035, "step": 2093 }, { "epoch": 0.55, "grad_norm": 36.89503479003906, "kl": 0.0, "learning_rate": 2.259879612666841e-07, "logps/chosen": -272.6938781738281, "logps/rejected": -233.2865753173828, "loss": 0.2564, "rewards/chosen": 0.8622837066650391, "rewards/margins": 3.173910617828369, "rewards/rejected": -2.31162691116333, "step": 2094 }, { "epoch": 0.55, "grad_norm": 42.40085983276367, "kl": 0.0, "learning_rate": 2.258571054697723e-07, "logps/chosen": -177.05694580078125, "logps/rejected": -227.2252197265625, "loss": 0.3312, "rewards/chosen": 1.544360637664795, "rewards/margins": 2.9416725635528564, "rewards/rejected": -1.3973119258880615, "step": 2095 }, { "epoch": 0.55, "grad_norm": 38.93367004394531, "kl": 0.0, "learning_rate": 2.257262496728605e-07, "logps/chosen": -229.6371307373047, "logps/rejected": -266.69879150390625, "loss": 0.3313, "rewards/chosen": 1.1398550271987915, "rewards/margins": 5.1608171463012695, "rewards/rejected": -4.020962238311768, "step": 2096 }, { "epoch": 0.55, "grad_norm": 34.740570068359375, "kl": 0.0, "learning_rate": 2.255953938759487e-07, "logps/chosen": -147.47230529785156, "logps/rejected": -291.76287841796875, "loss": 0.2119, "rewards/chosen": 1.6616230010986328, "rewards/margins": 5.7488112449646, "rewards/rejected": -4.087188243865967, "step": 2097 }, { "epoch": 0.55, "grad_norm": 39.187652587890625, "kl": 0.0, "learning_rate": 2.254645380790369e-07, "logps/chosen": -196.1621856689453, "logps/rejected": -310.6141357421875, "loss": 0.2301, "rewards/chosen": 1.0132759809494019, "rewards/margins": 4.632660865783691, "rewards/rejected": -3.619384765625, "step": 2098 }, { "epoch": 0.55, "grad_norm": 28.46122932434082, "kl": 0.0, "learning_rate": 2.253336822821251e-07, "logps/chosen": -231.9829559326172, "logps/rejected": -218.917236328125, "loss": 0.2882, "rewards/chosen": -0.4349358379840851, "rewards/margins": 3.075749158859253, "rewards/rejected": -3.5106849670410156, "step": 2099 }, { "epoch": 0.55, "grad_norm": 28.59282875061035, "kl": 0.0, "learning_rate": 2.252028264852133e-07, "logps/chosen": -183.28744506835938, "logps/rejected": -218.506591796875, "loss": 0.2559, "rewards/chosen": 1.6624376773834229, "rewards/margins": 3.909499168395996, "rewards/rejected": -2.2470614910125732, "step": 2100 }, { "epoch": 0.55, "grad_norm": 37.74734878540039, "kl": 0.0, "learning_rate": 2.2507197068830148e-07, "logps/chosen": -339.4798889160156, "logps/rejected": -272.19970703125, "loss": 0.2443, "rewards/chosen": 2.859189033508301, "rewards/margins": 6.35294246673584, "rewards/rejected": -3.49375319480896, "step": 2101 }, { "epoch": 0.55, "grad_norm": 30.180173873901367, "kl": 0.0, "learning_rate": 2.2494111489138967e-07, "logps/chosen": -261.21405029296875, "logps/rejected": -268.70416259765625, "loss": 0.1482, "rewards/chosen": 1.0338104963302612, "rewards/margins": 5.0800395011901855, "rewards/rejected": -4.046228885650635, "step": 2102 }, { "epoch": 0.55, "grad_norm": 31.450355529785156, "kl": 0.0, "learning_rate": 2.2481025909447787e-07, "logps/chosen": -139.55465698242188, "logps/rejected": -239.27542114257812, "loss": 0.2741, "rewards/chosen": 0.44167831540107727, "rewards/margins": 3.0485122203826904, "rewards/rejected": -2.6068339347839355, "step": 2103 }, { "epoch": 0.55, "grad_norm": 47.74028396606445, "kl": 0.0, "learning_rate": 2.2467940329756606e-07, "logps/chosen": -274.1092529296875, "logps/rejected": -224.17535400390625, "loss": 0.3558, "rewards/chosen": -0.06706207990646362, "rewards/margins": 2.3114514350891113, "rewards/rejected": -2.3785135746002197, "step": 2104 }, { "epoch": 0.55, "grad_norm": 37.560523986816406, "kl": 0.0, "learning_rate": 2.2454854750065426e-07, "logps/chosen": -214.400146484375, "logps/rejected": -277.4588317871094, "loss": 0.1941, "rewards/chosen": 0.6747554540634155, "rewards/margins": 4.1736907958984375, "rewards/rejected": -3.4989354610443115, "step": 2105 }, { "epoch": 0.55, "grad_norm": 31.483280181884766, "kl": 0.0, "learning_rate": 2.2441769170374248e-07, "logps/chosen": -216.14453125, "logps/rejected": -244.1451416015625, "loss": 0.3242, "rewards/chosen": 0.3833763301372528, "rewards/margins": 4.8582916259765625, "rewards/rejected": -4.474915504455566, "step": 2106 }, { "epoch": 0.55, "grad_norm": 26.778085708618164, "kl": 0.0, "learning_rate": 2.2428683590683068e-07, "logps/chosen": -176.0457305908203, "logps/rejected": -272.55145263671875, "loss": 0.1509, "rewards/chosen": 1.2023561000823975, "rewards/margins": 4.90726375579834, "rewards/rejected": -3.7049076557159424, "step": 2107 }, { "epoch": 0.55, "grad_norm": 39.008323669433594, "kl": 0.0, "learning_rate": 2.2415598010991885e-07, "logps/chosen": -220.84750366210938, "logps/rejected": -272.07684326171875, "loss": 0.3226, "rewards/chosen": 1.5920367240905762, "rewards/margins": 4.589495658874512, "rewards/rejected": -2.9974586963653564, "step": 2108 }, { "epoch": 0.55, "grad_norm": 34.33102798461914, "kl": 0.0, "learning_rate": 2.2402512431300705e-07, "logps/chosen": -156.10195922851562, "logps/rejected": -184.01315307617188, "loss": 0.3066, "rewards/chosen": 1.0926826000213623, "rewards/margins": 3.315559148788452, "rewards/rejected": -2.22287654876709, "step": 2109 }, { "epoch": 0.55, "grad_norm": 35.68468475341797, "kl": 0.0, "learning_rate": 2.2389426851609524e-07, "logps/chosen": -184.97727966308594, "logps/rejected": -323.71539306640625, "loss": 0.2343, "rewards/chosen": 0.557456910610199, "rewards/margins": 4.741086006164551, "rewards/rejected": -4.183629035949707, "step": 2110 }, { "epoch": 0.55, "grad_norm": 40.77687072753906, "kl": 0.0, "learning_rate": 2.2376341271918344e-07, "logps/chosen": -243.48583984375, "logps/rejected": -239.84727478027344, "loss": 0.3424, "rewards/chosen": -0.6909189224243164, "rewards/margins": 1.2094755172729492, "rewards/rejected": -1.9003944396972656, "step": 2111 }, { "epoch": 0.55, "grad_norm": 33.72917556762695, "kl": 0.0, "learning_rate": 2.2363255692227166e-07, "logps/chosen": -245.06240844726562, "logps/rejected": -277.0432434082031, "loss": 0.2041, "rewards/chosen": 0.7709704637527466, "rewards/margins": 4.738353729248047, "rewards/rejected": -3.96738338470459, "step": 2112 }, { "epoch": 0.55, "grad_norm": 25.89661979675293, "kl": 0.0, "learning_rate": 2.2350170112535986e-07, "logps/chosen": -227.06411743164062, "logps/rejected": -211.87493896484375, "loss": 0.2515, "rewards/chosen": 2.6791181564331055, "rewards/margins": 5.6696014404296875, "rewards/rejected": -2.990483045578003, "step": 2113 }, { "epoch": 0.55, "grad_norm": 39.84913635253906, "kl": 0.0, "learning_rate": 2.2337084532844805e-07, "logps/chosen": -160.59805297851562, "logps/rejected": -188.2767333984375, "loss": 0.3441, "rewards/chosen": -0.2964557111263275, "rewards/margins": 1.1534799337387085, "rewards/rejected": -1.4499356746673584, "step": 2114 }, { "epoch": 0.55, "grad_norm": 36.11937713623047, "kl": 0.0, "learning_rate": 2.2323998953153625e-07, "logps/chosen": -204.82492065429688, "logps/rejected": -214.0093536376953, "loss": 0.2517, "rewards/chosen": 1.5596061944961548, "rewards/margins": 4.491985321044922, "rewards/rejected": -2.9323790073394775, "step": 2115 }, { "epoch": 0.55, "grad_norm": 39.927921295166016, "kl": 0.0, "learning_rate": 2.2310913373462442e-07, "logps/chosen": -169.34048461914062, "logps/rejected": -200.87672424316406, "loss": 0.3675, "rewards/chosen": -0.863287091255188, "rewards/margins": 0.387457013130188, "rewards/rejected": -1.250744104385376, "step": 2116 }, { "epoch": 0.55, "grad_norm": 24.469409942626953, "kl": 0.0, "learning_rate": 2.2297827793771261e-07, "logps/chosen": -157.0042724609375, "logps/rejected": -232.30807495117188, "loss": 0.2429, "rewards/chosen": -0.11154678463935852, "rewards/margins": 5.050269603729248, "rewards/rejected": -5.161816596984863, "step": 2117 }, { "epoch": 0.55, "grad_norm": 38.49171447753906, "kl": 0.0, "learning_rate": 2.228474221408008e-07, "logps/chosen": -289.55157470703125, "logps/rejected": -282.3967590332031, "loss": 0.2361, "rewards/chosen": 1.7181639671325684, "rewards/margins": 4.490333557128906, "rewards/rejected": -2.772169589996338, "step": 2118 }, { "epoch": 0.55, "grad_norm": 26.931310653686523, "kl": 0.0, "learning_rate": 2.2271656634388903e-07, "logps/chosen": -276.5685119628906, "logps/rejected": -287.1419677734375, "loss": 0.2312, "rewards/chosen": 1.2769911289215088, "rewards/margins": 5.967591285705566, "rewards/rejected": -4.690600395202637, "step": 2119 }, { "epoch": 0.55, "grad_norm": 42.65892791748047, "kl": 0.0, "learning_rate": 2.2258571054697723e-07, "logps/chosen": -200.8690643310547, "logps/rejected": -269.5424499511719, "loss": 0.2979, "rewards/chosen": -0.08438437432050705, "rewards/margins": 4.158094882965088, "rewards/rejected": -4.24247932434082, "step": 2120 }, { "epoch": 0.56, "grad_norm": 31.81760025024414, "kl": 0.0, "learning_rate": 2.2245485475006542e-07, "logps/chosen": -218.39447021484375, "logps/rejected": -310.91448974609375, "loss": 0.273, "rewards/chosen": 0.7956982254981995, "rewards/margins": 4.268186092376709, "rewards/rejected": -3.472487688064575, "step": 2121 }, { "epoch": 0.56, "grad_norm": 27.217666625976562, "kl": 0.0, "learning_rate": 2.2232399895315362e-07, "logps/chosen": -173.05572509765625, "logps/rejected": -205.1007080078125, "loss": 0.3612, "rewards/chosen": 0.7939292788505554, "rewards/margins": 3.3084661960601807, "rewards/rejected": -2.5145368576049805, "step": 2122 }, { "epoch": 0.56, "grad_norm": 39.9664306640625, "kl": 0.0, "learning_rate": 2.2219314315624182e-07, "logps/chosen": -266.86346435546875, "logps/rejected": -286.49920654296875, "loss": 0.3312, "rewards/chosen": 0.009727433323860168, "rewards/margins": 2.185347080230713, "rewards/rejected": -2.175619602203369, "step": 2123 }, { "epoch": 0.56, "grad_norm": 33.432987213134766, "kl": 0.0, "learning_rate": 2.2206228735933e-07, "logps/chosen": -192.0679931640625, "logps/rejected": -248.40536499023438, "loss": 0.1883, "rewards/chosen": 0.7708672881126404, "rewards/margins": 3.6535582542419434, "rewards/rejected": -2.882690906524658, "step": 2124 }, { "epoch": 0.56, "grad_norm": 37.950958251953125, "kl": 0.0, "learning_rate": 2.219314315624182e-07, "logps/chosen": -255.85194396972656, "logps/rejected": -309.26995849609375, "loss": 0.3021, "rewards/chosen": 1.773488998413086, "rewards/margins": 3.9557957649230957, "rewards/rejected": -2.1823067665100098, "step": 2125 }, { "epoch": 0.56, "grad_norm": 39.088714599609375, "kl": 0.0, "learning_rate": 2.218005757655064e-07, "logps/chosen": -227.0677490234375, "logps/rejected": -164.189697265625, "loss": 0.3448, "rewards/chosen": 0.35238975286483765, "rewards/margins": 2.7368478775024414, "rewards/rejected": -2.384458065032959, "step": 2126 }, { "epoch": 0.56, "grad_norm": 40.480316162109375, "kl": 0.0, "learning_rate": 2.216697199685946e-07, "logps/chosen": -213.88058471679688, "logps/rejected": -249.59548950195312, "loss": 0.4055, "rewards/chosen": 1.3990250825881958, "rewards/margins": 2.4990382194519043, "rewards/rejected": -1.100013017654419, "step": 2127 }, { "epoch": 0.56, "grad_norm": 38.71848678588867, "kl": 0.0, "learning_rate": 2.215388641716828e-07, "logps/chosen": -307.7563781738281, "logps/rejected": -276.81884765625, "loss": 0.2757, "rewards/chosen": -0.06617091596126556, "rewards/margins": 3.0361528396606445, "rewards/rejected": -3.1023237705230713, "step": 2128 }, { "epoch": 0.56, "grad_norm": 33.18585205078125, "kl": 0.0, "learning_rate": 2.21408008374771e-07, "logps/chosen": -220.02284240722656, "logps/rejected": -183.52706909179688, "loss": 0.1851, "rewards/chosen": 0.7266839146614075, "rewards/margins": 3.259298324584961, "rewards/rejected": -2.5326144695281982, "step": 2129 }, { "epoch": 0.56, "grad_norm": 32.663631439208984, "kl": 0.0, "learning_rate": 2.212771525778592e-07, "logps/chosen": -164.1979217529297, "logps/rejected": -256.0262145996094, "loss": 0.1787, "rewards/chosen": 0.20368297398090363, "rewards/margins": 4.717043399810791, "rewards/rejected": -4.513360500335693, "step": 2130 }, { "epoch": 0.56, "grad_norm": 28.846590042114258, "kl": 0.0, "learning_rate": 2.2114629678094736e-07, "logps/chosen": -181.72128295898438, "logps/rejected": -220.4137420654297, "loss": 0.2067, "rewards/chosen": 2.2399544715881348, "rewards/margins": 6.073826789855957, "rewards/rejected": -3.8338725566864014, "step": 2131 }, { "epoch": 0.56, "grad_norm": 36.972412109375, "kl": 0.0, "learning_rate": 2.2101544098403558e-07, "logps/chosen": -114.09268951416016, "logps/rejected": -245.62338256835938, "loss": 0.3456, "rewards/chosen": 0.64730304479599, "rewards/margins": 3.1397600173950195, "rewards/rejected": -2.4924569129943848, "step": 2132 }, { "epoch": 0.56, "grad_norm": 35.26288604736328, "kl": 0.0, "learning_rate": 2.2088458518712378e-07, "logps/chosen": -257.6791687011719, "logps/rejected": -192.79925537109375, "loss": 0.3397, "rewards/chosen": -0.32979992032051086, "rewards/margins": 1.9963352680206299, "rewards/rejected": -2.3261351585388184, "step": 2133 }, { "epoch": 0.56, "grad_norm": 36.57280349731445, "kl": 0.0, "learning_rate": 2.2075372939021197e-07, "logps/chosen": -187.70989990234375, "logps/rejected": -201.96804809570312, "loss": 0.3463, "rewards/chosen": 0.014570653438568115, "rewards/margins": 3.578090190887451, "rewards/rejected": -3.5635194778442383, "step": 2134 }, { "epoch": 0.56, "grad_norm": 27.505809783935547, "kl": 0.0, "learning_rate": 2.2062287359330017e-07, "logps/chosen": -225.48959350585938, "logps/rejected": -231.11915588378906, "loss": 0.2318, "rewards/chosen": 1.2916991710662842, "rewards/margins": 5.916680335998535, "rewards/rejected": -4.62498140335083, "step": 2135 }, { "epoch": 0.56, "grad_norm": 30.881328582763672, "kl": 0.0, "learning_rate": 2.2049201779638837e-07, "logps/chosen": -192.57763671875, "logps/rejected": -132.3449249267578, "loss": 0.309, "rewards/chosen": 0.8033540844917297, "rewards/margins": 3.7982852458953857, "rewards/rejected": -2.994931221008301, "step": 2136 }, { "epoch": 0.56, "grad_norm": 30.890676498413086, "kl": 0.0, "learning_rate": 2.2036116199947656e-07, "logps/chosen": -285.72076416015625, "logps/rejected": -222.05982971191406, "loss": 0.2203, "rewards/chosen": 2.0986266136169434, "rewards/margins": 6.601768970489502, "rewards/rejected": -4.503142356872559, "step": 2137 }, { "epoch": 0.56, "grad_norm": 38.768924713134766, "kl": 0.0, "learning_rate": 2.2023030620256479e-07, "logps/chosen": -226.85989379882812, "logps/rejected": -310.0096740722656, "loss": 0.2693, "rewards/chosen": -0.08474735170602798, "rewards/margins": 4.027078628540039, "rewards/rejected": -4.111825942993164, "step": 2138 }, { "epoch": 0.56, "grad_norm": 34.64961242675781, "kl": 0.0, "learning_rate": 2.2009945040565295e-07, "logps/chosen": -316.72369384765625, "logps/rejected": -335.4961242675781, "loss": 0.2341, "rewards/chosen": 1.2335299253463745, "rewards/margins": 3.577603816986084, "rewards/rejected": -2.344074010848999, "step": 2139 }, { "epoch": 0.56, "grad_norm": 31.96160316467285, "kl": 0.0, "learning_rate": 2.1996859460874115e-07, "logps/chosen": -184.9946746826172, "logps/rejected": -189.07225036621094, "loss": 0.1965, "rewards/chosen": 2.3256113529205322, "rewards/margins": 5.768891334533691, "rewards/rejected": -3.44327974319458, "step": 2140 }, { "epoch": 0.56, "grad_norm": 32.98160171508789, "kl": 0.0, "learning_rate": 2.1983773881182935e-07, "logps/chosen": -187.4897003173828, "logps/rejected": -235.82472229003906, "loss": 0.2849, "rewards/chosen": 0.5114994049072266, "rewards/margins": 3.3172054290771484, "rewards/rejected": -2.805706024169922, "step": 2141 }, { "epoch": 0.56, "grad_norm": 25.43107795715332, "kl": 0.0, "learning_rate": 2.1970688301491754e-07, "logps/chosen": -217.7528839111328, "logps/rejected": -294.7716979980469, "loss": 0.1704, "rewards/chosen": 1.8277029991149902, "rewards/margins": 5.8614821434021, "rewards/rejected": -4.033779144287109, "step": 2142 }, { "epoch": 0.56, "grad_norm": 32.24444580078125, "kl": 0.0, "learning_rate": 2.1957602721800574e-07, "logps/chosen": -240.55447387695312, "logps/rejected": -229.03414916992188, "loss": 0.3044, "rewards/chosen": -0.8243075609207153, "rewards/margins": 2.479674816131592, "rewards/rejected": -3.3039822578430176, "step": 2143 }, { "epoch": 0.56, "grad_norm": 39.09797668457031, "kl": 0.0, "learning_rate": 2.1944517142109396e-07, "logps/chosen": -180.61624145507812, "logps/rejected": -211.71670532226562, "loss": 0.3454, "rewards/chosen": 0.7784052491188049, "rewards/margins": 2.5519487857818604, "rewards/rejected": -1.7735434770584106, "step": 2144 }, { "epoch": 0.56, "grad_norm": 22.509733200073242, "kl": 0.0, "learning_rate": 2.1931431562418216e-07, "logps/chosen": -154.87033081054688, "logps/rejected": -232.0869903564453, "loss": 0.3044, "rewards/chosen": 1.8187867403030396, "rewards/margins": 5.667130470275879, "rewards/rejected": -3.848343849182129, "step": 2145 }, { "epoch": 0.56, "grad_norm": 40.67682647705078, "kl": 0.0, "learning_rate": 2.1918345982727035e-07, "logps/chosen": -254.24398803710938, "logps/rejected": -272.7815246582031, "loss": 0.2777, "rewards/chosen": 2.3531463146209717, "rewards/margins": 5.830312728881836, "rewards/rejected": -3.4771666526794434, "step": 2146 }, { "epoch": 0.56, "grad_norm": 30.30360984802246, "kl": 0.0, "learning_rate": 2.1905260403035852e-07, "logps/chosen": -165.32546997070312, "logps/rejected": -269.3461608886719, "loss": 0.2567, "rewards/chosen": 1.1869643926620483, "rewards/margins": 4.445183753967285, "rewards/rejected": -3.2582192420959473, "step": 2147 }, { "epoch": 0.56, "grad_norm": 42.13438415527344, "kl": 0.0, "learning_rate": 2.1892174823344672e-07, "logps/chosen": -197.6481170654297, "logps/rejected": -216.67141723632812, "loss": 0.3744, "rewards/chosen": 0.4272662401199341, "rewards/margins": 3.3712263107299805, "rewards/rejected": -2.943960189819336, "step": 2148 }, { "epoch": 0.56, "grad_norm": 37.454105377197266, "kl": 0.0, "learning_rate": 2.1879089243653492e-07, "logps/chosen": -222.81515502929688, "logps/rejected": -316.2408447265625, "loss": 0.3504, "rewards/chosen": -0.2285047173500061, "rewards/margins": 2.9904603958129883, "rewards/rejected": -3.2189650535583496, "step": 2149 }, { "epoch": 0.56, "grad_norm": 42.05679702758789, "kl": 0.0, "learning_rate": 2.186600366396231e-07, "logps/chosen": -171.63548278808594, "logps/rejected": -273.6712341308594, "loss": 0.2923, "rewards/chosen": 1.511362075805664, "rewards/margins": 4.281908988952637, "rewards/rejected": -2.7705469131469727, "step": 2150 }, { "epoch": 0.56, "grad_norm": 37.90392303466797, "kl": 0.0, "learning_rate": 2.1852918084271133e-07, "logps/chosen": -242.9505615234375, "logps/rejected": -183.59271240234375, "loss": 0.2654, "rewards/chosen": 0.4527307450771332, "rewards/margins": 2.6310439109802246, "rewards/rejected": -2.1783132553100586, "step": 2151 }, { "epoch": 0.56, "grad_norm": 33.639251708984375, "kl": 0.0, "learning_rate": 2.1839832504579953e-07, "logps/chosen": -163.69761657714844, "logps/rejected": -191.52780151367188, "loss": 0.2613, "rewards/chosen": 1.0045702457427979, "rewards/margins": 3.827319860458374, "rewards/rejected": -2.822749614715576, "step": 2152 }, { "epoch": 0.56, "grad_norm": 33.028106689453125, "kl": 0.0, "learning_rate": 2.1826746924888773e-07, "logps/chosen": -192.94677734375, "logps/rejected": -217.50164794921875, "loss": 0.339, "rewards/chosen": 0.38689547777175903, "rewards/margins": 3.0231199264526367, "rewards/rejected": -2.6362245082855225, "step": 2153 }, { "epoch": 0.56, "grad_norm": 27.515506744384766, "kl": 0.0, "learning_rate": 2.1813661345197592e-07, "logps/chosen": -169.6422119140625, "logps/rejected": -232.17977905273438, "loss": 0.3014, "rewards/chosen": 0.4662325084209442, "rewards/margins": 4.453876495361328, "rewards/rejected": -3.9876441955566406, "step": 2154 }, { "epoch": 0.56, "grad_norm": 34.339027404785156, "kl": 0.0, "learning_rate": 2.180057576550641e-07, "logps/chosen": -213.93386840820312, "logps/rejected": -202.92160034179688, "loss": 0.3042, "rewards/chosen": 2.6573381423950195, "rewards/margins": 5.09763240814209, "rewards/rejected": -2.4402945041656494, "step": 2155 }, { "epoch": 0.56, "grad_norm": 28.567813873291016, "kl": 0.0, "learning_rate": 2.178749018581523e-07, "logps/chosen": -205.0366973876953, "logps/rejected": -156.337646484375, "loss": 0.1926, "rewards/chosen": 1.1674195528030396, "rewards/margins": 3.3237528800964355, "rewards/rejected": -2.1563334465026855, "step": 2156 }, { "epoch": 0.56, "grad_norm": 34.4302864074707, "kl": 0.0, "learning_rate": 2.177440460612405e-07, "logps/chosen": -169.4579315185547, "logps/rejected": -209.5000457763672, "loss": 0.2903, "rewards/chosen": -0.6242586374282837, "rewards/margins": 4.042919635772705, "rewards/rejected": -4.667178153991699, "step": 2157 }, { "epoch": 0.56, "grad_norm": 30.366474151611328, "kl": 0.0, "learning_rate": 2.176131902643287e-07, "logps/chosen": -273.88360595703125, "logps/rejected": -211.7207794189453, "loss": 0.2764, "rewards/chosen": 1.9383443593978882, "rewards/margins": 4.192324161529541, "rewards/rejected": -2.2539799213409424, "step": 2158 }, { "epoch": 0.57, "grad_norm": 32.51573181152344, "kl": 0.0, "learning_rate": 2.174823344674169e-07, "logps/chosen": -298.5272216796875, "logps/rejected": -213.78758239746094, "loss": 0.2315, "rewards/chosen": 2.326655626296997, "rewards/margins": 5.482968330383301, "rewards/rejected": -3.1563124656677246, "step": 2159 }, { "epoch": 0.57, "grad_norm": 21.78364372253418, "kl": 0.0, "learning_rate": 2.173514786705051e-07, "logps/chosen": -234.16297912597656, "logps/rejected": -159.34295654296875, "loss": 0.2376, "rewards/chosen": 0.8480652570724487, "rewards/margins": 6.228734016418457, "rewards/rejected": -5.380668640136719, "step": 2160 }, { "epoch": 0.57, "grad_norm": 34.532020568847656, "kl": 0.0, "learning_rate": 2.172206228735933e-07, "logps/chosen": -224.65281677246094, "logps/rejected": -256.1437683105469, "loss": 0.216, "rewards/chosen": 2.745820999145508, "rewards/margins": 6.271589279174805, "rewards/rejected": -3.525768280029297, "step": 2161 }, { "epoch": 0.57, "grad_norm": 40.12411117553711, "kl": 0.0, "learning_rate": 2.1708976707668146e-07, "logps/chosen": -147.53533935546875, "logps/rejected": -218.46353149414062, "loss": 0.2874, "rewards/chosen": 0.935888946056366, "rewards/margins": 2.4985463619232178, "rewards/rejected": -1.5626574754714966, "step": 2162 }, { "epoch": 0.57, "grad_norm": 33.51218795776367, "kl": 0.0, "learning_rate": 2.1695891127976966e-07, "logps/chosen": -185.3155059814453, "logps/rejected": -369.5777893066406, "loss": 0.1757, "rewards/chosen": 1.2122989892959595, "rewards/margins": 4.9565558433532715, "rewards/rejected": -3.7442567348480225, "step": 2163 }, { "epoch": 0.57, "grad_norm": 39.717586517333984, "kl": 0.0, "learning_rate": 2.1682805548285788e-07, "logps/chosen": -229.6589813232422, "logps/rejected": -243.6190948486328, "loss": 0.2637, "rewards/chosen": 0.4361046850681305, "rewards/margins": 4.181896209716797, "rewards/rejected": -3.745791435241699, "step": 2164 }, { "epoch": 0.57, "grad_norm": 41.20941162109375, "kl": 0.0, "learning_rate": 2.1669719968594608e-07, "logps/chosen": -254.28562927246094, "logps/rejected": -211.9898681640625, "loss": 0.2955, "rewards/chosen": 1.7442975044250488, "rewards/margins": 4.571514129638672, "rewards/rejected": -2.827216386795044, "step": 2165 }, { "epoch": 0.57, "grad_norm": 178.1888885498047, "kl": 0.0, "learning_rate": 2.1656634388903428e-07, "logps/chosen": -139.78665161132812, "logps/rejected": -312.6597595214844, "loss": 0.3272, "rewards/chosen": 1.3665469884872437, "rewards/margins": 4.133112907409668, "rewards/rejected": -2.7665657997131348, "step": 2166 }, { "epoch": 0.57, "grad_norm": 33.31022644042969, "kl": 0.0, "learning_rate": 2.1643548809212247e-07, "logps/chosen": -225.217529296875, "logps/rejected": -217.964111328125, "loss": 0.287, "rewards/chosen": -0.30057814717292786, "rewards/margins": 2.33773136138916, "rewards/rejected": -2.6383094787597656, "step": 2167 }, { "epoch": 0.57, "grad_norm": 27.65751075744629, "kl": 0.0, "learning_rate": 2.1630463229521067e-07, "logps/chosen": -194.6341552734375, "logps/rejected": -216.58169555664062, "loss": 0.2897, "rewards/chosen": 1.8614850044250488, "rewards/margins": 5.929368019104004, "rewards/rejected": -4.067883014678955, "step": 2168 }, { "epoch": 0.57, "grad_norm": 35.12147903442383, "kl": 0.0, "learning_rate": 2.1617377649829886e-07, "logps/chosen": -152.90708923339844, "logps/rejected": -152.86331176757812, "loss": 0.3569, "rewards/chosen": -0.22007495164871216, "rewards/margins": 1.8580245971679688, "rewards/rejected": -2.078099489212036, "step": 2169 }, { "epoch": 0.57, "grad_norm": 34.59441375732422, "kl": 0.0, "learning_rate": 2.1604292070138706e-07, "logps/chosen": -160.6201171875, "logps/rejected": -378.0250244140625, "loss": 0.2663, "rewards/chosen": 2.978060245513916, "rewards/margins": 10.298988342285156, "rewards/rejected": -7.320927619934082, "step": 2170 }, { "epoch": 0.57, "grad_norm": 27.22705078125, "kl": 0.0, "learning_rate": 2.1591206490447526e-07, "logps/chosen": -199.74655151367188, "logps/rejected": -290.22235107421875, "loss": 0.1602, "rewards/chosen": 1.652679681777954, "rewards/margins": 6.4878082275390625, "rewards/rejected": -4.8351287841796875, "step": 2171 }, { "epoch": 0.57, "grad_norm": 31.888912200927734, "kl": 0.0, "learning_rate": 2.1578120910756345e-07, "logps/chosen": -136.07327270507812, "logps/rejected": -275.01251220703125, "loss": 0.2782, "rewards/chosen": 0.604078471660614, "rewards/margins": 6.338095188140869, "rewards/rejected": -5.7340168952941895, "step": 2172 }, { "epoch": 0.57, "grad_norm": 32.569190979003906, "kl": 0.0, "learning_rate": 2.1565035331065165e-07, "logps/chosen": -200.0724639892578, "logps/rejected": -308.7223205566406, "loss": 0.2741, "rewards/chosen": 0.7700318098068237, "rewards/margins": 5.290283679962158, "rewards/rejected": -4.520251750946045, "step": 2173 }, { "epoch": 0.57, "grad_norm": 29.394424438476562, "kl": 0.0, "learning_rate": 2.1551949751373984e-07, "logps/chosen": -261.6470031738281, "logps/rejected": -275.6170654296875, "loss": 0.3277, "rewards/chosen": -1.3419867753982544, "rewards/margins": 2.240203857421875, "rewards/rejected": -3.58219051361084, "step": 2174 }, { "epoch": 0.57, "grad_norm": 28.1448974609375, "kl": 0.0, "learning_rate": 2.1538864171682804e-07, "logps/chosen": -219.25546264648438, "logps/rejected": -211.60226440429688, "loss": 0.2081, "rewards/chosen": 1.6857521533966064, "rewards/margins": 4.913581371307373, "rewards/rejected": -3.2278292179107666, "step": 2175 }, { "epoch": 0.57, "grad_norm": 32.16535568237305, "kl": 0.0, "learning_rate": 2.1525778591991626e-07, "logps/chosen": -209.88577270507812, "logps/rejected": -337.0541687011719, "loss": 0.274, "rewards/chosen": 0.017229488119482994, "rewards/margins": 3.5651438236236572, "rewards/rejected": -3.5479142665863037, "step": 2176 }, { "epoch": 0.57, "grad_norm": 31.019987106323242, "kl": 0.0, "learning_rate": 2.1512693012300446e-07, "logps/chosen": -179.34695434570312, "logps/rejected": -221.03477478027344, "loss": 0.3261, "rewards/chosen": 1.2182724475860596, "rewards/margins": 5.014984607696533, "rewards/rejected": -3.7967121601104736, "step": 2177 }, { "epoch": 0.57, "grad_norm": 28.094982147216797, "kl": 0.0, "learning_rate": 2.1499607432609263e-07, "logps/chosen": -180.26943969726562, "logps/rejected": -301.4582214355469, "loss": 0.2488, "rewards/chosen": 1.3258321285247803, "rewards/margins": 5.93510627746582, "rewards/rejected": -4.609274387359619, "step": 2178 }, { "epoch": 0.57, "grad_norm": 30.352922439575195, "kl": 0.0, "learning_rate": 2.1486521852918082e-07, "logps/chosen": -268.167724609375, "logps/rejected": -183.42550659179688, "loss": 0.2504, "rewards/chosen": 1.3395893573760986, "rewards/margins": 4.956480026245117, "rewards/rejected": -3.6168904304504395, "step": 2179 }, { "epoch": 0.57, "grad_norm": 36.48234558105469, "kl": 0.0, "learning_rate": 2.1473436273226902e-07, "logps/chosen": -190.11546325683594, "logps/rejected": -230.5013427734375, "loss": 0.2733, "rewards/chosen": 0.42728573083877563, "rewards/margins": 3.968291759490967, "rewards/rejected": -3.541006088256836, "step": 2180 }, { "epoch": 0.57, "grad_norm": 31.570514678955078, "kl": 0.0, "learning_rate": 2.1460350693535722e-07, "logps/chosen": -185.64596557617188, "logps/rejected": -236.3667755126953, "loss": 0.2253, "rewards/chosen": 2.1138195991516113, "rewards/margins": 5.250305652618408, "rewards/rejected": -3.136486053466797, "step": 2181 }, { "epoch": 0.57, "grad_norm": 38.71629333496094, "kl": 0.0, "learning_rate": 2.144726511384454e-07, "logps/chosen": -242.28211975097656, "logps/rejected": -348.03997802734375, "loss": 0.3644, "rewards/chosen": 0.37175941467285156, "rewards/margins": 7.486160755157471, "rewards/rejected": -7.114401340484619, "step": 2182 }, { "epoch": 0.57, "grad_norm": 43.70787811279297, "kl": 0.0, "learning_rate": 2.1434179534153364e-07, "logps/chosen": -220.95281982421875, "logps/rejected": -279.92919921875, "loss": 0.2103, "rewards/chosen": 1.4428505897521973, "rewards/margins": 4.366604804992676, "rewards/rejected": -2.9237544536590576, "step": 2183 }, { "epoch": 0.57, "grad_norm": 37.34977340698242, "kl": 0.0, "learning_rate": 2.1421093954462183e-07, "logps/chosen": -203.84178161621094, "logps/rejected": -306.4324035644531, "loss": 0.2319, "rewards/chosen": 1.7929118871688843, "rewards/margins": 4.468847274780273, "rewards/rejected": -2.6759352684020996, "step": 2184 }, { "epoch": 0.57, "grad_norm": 37.7314453125, "kl": 0.0, "learning_rate": 2.1408008374771003e-07, "logps/chosen": -202.56298828125, "logps/rejected": -220.33523559570312, "loss": 0.2206, "rewards/chosen": 0.2738282084465027, "rewards/margins": 4.876184463500977, "rewards/rejected": -4.602356433868408, "step": 2185 }, { "epoch": 0.57, "grad_norm": 34.27370834350586, "kl": 0.0, "learning_rate": 2.139492279507982e-07, "logps/chosen": -282.90570068359375, "logps/rejected": -265.1341857910156, "loss": 0.1865, "rewards/chosen": 1.703295111656189, "rewards/margins": 6.923501491546631, "rewards/rejected": -5.220206260681152, "step": 2186 }, { "epoch": 0.57, "grad_norm": 52.453433990478516, "kl": 0.0, "learning_rate": 2.138183721538864e-07, "logps/chosen": -167.38584899902344, "logps/rejected": -235.28759765625, "loss": 0.3206, "rewards/chosen": -0.025245457887649536, "rewards/margins": 3.191519021987915, "rewards/rejected": -3.216764450073242, "step": 2187 }, { "epoch": 0.57, "grad_norm": 28.034582138061523, "kl": 0.0, "learning_rate": 2.136875163569746e-07, "logps/chosen": -178.97332763671875, "logps/rejected": -241.8062744140625, "loss": 0.3562, "rewards/chosen": 0.7335941791534424, "rewards/margins": 5.105332374572754, "rewards/rejected": -4.371738433837891, "step": 2188 }, { "epoch": 0.57, "grad_norm": 75.6514892578125, "kl": 0.0, "learning_rate": 2.135566605600628e-07, "logps/chosen": -157.3307647705078, "logps/rejected": -202.5512237548828, "loss": 0.3532, "rewards/chosen": 0.16009896993637085, "rewards/margins": 2.7601494789123535, "rewards/rejected": -2.600050449371338, "step": 2189 }, { "epoch": 0.57, "grad_norm": 26.94536590576172, "kl": 0.0, "learning_rate": 2.13425804763151e-07, "logps/chosen": -181.12144470214844, "logps/rejected": -358.65509033203125, "loss": 0.1763, "rewards/chosen": 1.8456650972366333, "rewards/margins": 6.388722896575928, "rewards/rejected": -4.543057918548584, "step": 2190 }, { "epoch": 0.57, "grad_norm": 25.052820205688477, "kl": 0.0, "learning_rate": 2.132949489662392e-07, "logps/chosen": -138.30601501464844, "logps/rejected": -200.51475524902344, "loss": 0.2248, "rewards/chosen": 1.732397437095642, "rewards/margins": 4.560910224914551, "rewards/rejected": -2.828512668609619, "step": 2191 }, { "epoch": 0.57, "grad_norm": 32.18205261230469, "kl": 0.0, "learning_rate": 2.131640931693274e-07, "logps/chosen": -245.55789184570312, "logps/rejected": -193.5570526123047, "loss": 0.3338, "rewards/chosen": 0.15273427963256836, "rewards/margins": 2.8544297218322754, "rewards/rejected": -2.701695442199707, "step": 2192 }, { "epoch": 0.57, "grad_norm": 22.67527961730957, "kl": 0.0, "learning_rate": 2.1303323737241557e-07, "logps/chosen": -179.4833984375, "logps/rejected": -201.1123046875, "loss": 0.1737, "rewards/chosen": 1.5341203212738037, "rewards/margins": 4.5851240158081055, "rewards/rejected": -3.0510034561157227, "step": 2193 }, { "epoch": 0.57, "grad_norm": 40.20901870727539, "kl": 0.0, "learning_rate": 2.1290238157550377e-07, "logps/chosen": -282.07647705078125, "logps/rejected": -189.77468872070312, "loss": 0.279, "rewards/chosen": 0.2572477459907532, "rewards/margins": 2.514106273651123, "rewards/rejected": -2.2568585872650146, "step": 2194 }, { "epoch": 0.57, "grad_norm": 36.39990997314453, "kl": 0.0, "learning_rate": 2.12771525778592e-07, "logps/chosen": -169.90203857421875, "logps/rejected": -300.26422119140625, "loss": 0.2349, "rewards/chosen": 0.17301256954669952, "rewards/margins": 3.594301462173462, "rewards/rejected": -3.4212889671325684, "step": 2195 }, { "epoch": 0.57, "grad_norm": 31.77029037475586, "kl": 0.0, "learning_rate": 2.1264066998168018e-07, "logps/chosen": -203.70204162597656, "logps/rejected": -237.15872192382812, "loss": 0.3168, "rewards/chosen": 1.2523151636123657, "rewards/margins": 4.519637584686279, "rewards/rejected": -3.267322301864624, "step": 2196 }, { "epoch": 0.57, "grad_norm": 30.283166885375977, "kl": 0.0, "learning_rate": 2.1250981418476838e-07, "logps/chosen": -234.54457092285156, "logps/rejected": -201.38150024414062, "loss": 0.2492, "rewards/chosen": 0.1242559552192688, "rewards/margins": 3.404097318649292, "rewards/rejected": -3.279841423034668, "step": 2197 }, { "epoch": 0.58, "grad_norm": 28.874860763549805, "kl": 0.0, "learning_rate": 2.1237895838785658e-07, "logps/chosen": -146.4224853515625, "logps/rejected": -255.65232849121094, "loss": 0.2861, "rewards/chosen": 0.9600822925567627, "rewards/margins": 4.900790214538574, "rewards/rejected": -3.9407076835632324, "step": 2198 }, { "epoch": 0.58, "grad_norm": 26.083675384521484, "kl": 0.0, "learning_rate": 2.1224810259094477e-07, "logps/chosen": -209.79185485839844, "logps/rejected": -204.64401245117188, "loss": 0.2222, "rewards/chosen": 1.7219187021255493, "rewards/margins": 3.7728919982910156, "rewards/rejected": -2.050973415374756, "step": 2199 }, { "epoch": 0.58, "grad_norm": 34.4384765625, "kl": 0.0, "learning_rate": 2.1211724679403297e-07, "logps/chosen": -203.7540740966797, "logps/rejected": -280.116943359375, "loss": 0.2361, "rewards/chosen": -0.42424073815345764, "rewards/margins": 3.4924893379211426, "rewards/rejected": -3.9167301654815674, "step": 2200 }, { "epoch": 0.58, "grad_norm": 35.34817123413086, "kl": 0.0, "learning_rate": 2.1198639099712114e-07, "logps/chosen": -266.0620422363281, "logps/rejected": -350.2867736816406, "loss": 0.2504, "rewards/chosen": 0.5245301127433777, "rewards/margins": 4.969491004943848, "rewards/rejected": -4.444961071014404, "step": 2201 }, { "epoch": 0.58, "grad_norm": 38.560787200927734, "kl": 0.0, "learning_rate": 2.1185553520020936e-07, "logps/chosen": -145.8126678466797, "logps/rejected": -275.6151428222656, "loss": 0.3191, "rewards/chosen": 0.6639471054077148, "rewards/margins": 3.947199583053589, "rewards/rejected": -3.283252477645874, "step": 2202 }, { "epoch": 0.58, "grad_norm": 23.77583885192871, "kl": 0.0, "learning_rate": 2.1172467940329756e-07, "logps/chosen": -166.97357177734375, "logps/rejected": -223.1553955078125, "loss": 0.1399, "rewards/chosen": 1.8763564825057983, "rewards/margins": 6.32912540435791, "rewards/rejected": -4.452768802642822, "step": 2203 }, { "epoch": 0.58, "grad_norm": 36.36481475830078, "kl": 0.0, "learning_rate": 2.1159382360638575e-07, "logps/chosen": -203.16331481933594, "logps/rejected": -351.1559753417969, "loss": 0.2323, "rewards/chosen": -0.09685803204774857, "rewards/margins": 5.4982428550720215, "rewards/rejected": -5.5951008796691895, "step": 2204 }, { "epoch": 0.58, "grad_norm": 34.4608268737793, "kl": 0.0, "learning_rate": 2.1146296780947395e-07, "logps/chosen": -220.57008361816406, "logps/rejected": -289.4773864746094, "loss": 0.2413, "rewards/chosen": 1.1138266324996948, "rewards/margins": 4.565395355224609, "rewards/rejected": -3.451568603515625, "step": 2205 }, { "epoch": 0.58, "grad_norm": 35.523155212402344, "kl": 0.0, "learning_rate": 2.1133211201256215e-07, "logps/chosen": -156.62518310546875, "logps/rejected": -215.77374267578125, "loss": 0.2555, "rewards/chosen": 1.4320130348205566, "rewards/margins": 3.965759754180908, "rewards/rejected": -2.5337467193603516, "step": 2206 }, { "epoch": 0.58, "grad_norm": 42.77104187011719, "kl": 0.0, "learning_rate": 2.1120125621565034e-07, "logps/chosen": -197.8491668701172, "logps/rejected": -234.12548828125, "loss": 0.2988, "rewards/chosen": 0.7398046851158142, "rewards/margins": 2.2821764945983887, "rewards/rejected": -1.5423718690872192, "step": 2207 }, { "epoch": 0.58, "grad_norm": 37.442527770996094, "kl": 0.0, "learning_rate": 2.1107040041873856e-07, "logps/chosen": -237.7860870361328, "logps/rejected": -215.8834228515625, "loss": 0.2311, "rewards/chosen": 1.64774751663208, "rewards/margins": 5.702475070953369, "rewards/rejected": -4.054727554321289, "step": 2208 }, { "epoch": 0.58, "grad_norm": 30.224994659423828, "kl": 0.0, "learning_rate": 2.1093954462182673e-07, "logps/chosen": -165.15650939941406, "logps/rejected": -310.42462158203125, "loss": 0.2603, "rewards/chosen": 1.1862459182739258, "rewards/margins": 5.600722312927246, "rewards/rejected": -4.41447639465332, "step": 2209 }, { "epoch": 0.58, "grad_norm": 38.65983963012695, "kl": 0.0, "learning_rate": 2.1080868882491493e-07, "logps/chosen": -196.13844299316406, "logps/rejected": -230.19146728515625, "loss": 0.2714, "rewards/chosen": 2.045654773712158, "rewards/margins": 4.847431182861328, "rewards/rejected": -2.801776647567749, "step": 2210 }, { "epoch": 0.58, "grad_norm": 33.665855407714844, "kl": 0.0, "learning_rate": 2.1067783302800313e-07, "logps/chosen": -227.00308227539062, "logps/rejected": -192.12783813476562, "loss": 0.2827, "rewards/chosen": 1.3021548986434937, "rewards/margins": 4.660031318664551, "rewards/rejected": -3.3578763008117676, "step": 2211 }, { "epoch": 0.58, "grad_norm": 34.888275146484375, "kl": 0.0, "learning_rate": 2.1054697723109132e-07, "logps/chosen": -205.8463592529297, "logps/rejected": -176.98809814453125, "loss": 0.2552, "rewards/chosen": 0.521612286567688, "rewards/margins": 2.937107563018799, "rewards/rejected": -2.4154953956604004, "step": 2212 }, { "epoch": 0.58, "grad_norm": 35.640281677246094, "kl": 0.0, "learning_rate": 2.1041612143417952e-07, "logps/chosen": -234.514892578125, "logps/rejected": -269.9208679199219, "loss": 0.2382, "rewards/chosen": 1.1150953769683838, "rewards/margins": 4.122707366943359, "rewards/rejected": -3.0076119899749756, "step": 2213 }, { "epoch": 0.58, "grad_norm": 35.11532211303711, "kl": 0.0, "learning_rate": 2.1028526563726774e-07, "logps/chosen": -192.67202758789062, "logps/rejected": -212.2769775390625, "loss": 0.2851, "rewards/chosen": -0.08396945893764496, "rewards/margins": 3.1026346683502197, "rewards/rejected": -3.1866040229797363, "step": 2214 }, { "epoch": 0.58, "grad_norm": 42.883541107177734, "kl": 0.0, "learning_rate": 2.1015440984035594e-07, "logps/chosen": -241.93801879882812, "logps/rejected": -286.9732971191406, "loss": 0.2737, "rewards/chosen": 1.6893227100372314, "rewards/margins": 5.431354522705078, "rewards/rejected": -3.7420315742492676, "step": 2215 }, { "epoch": 0.58, "grad_norm": 34.18726348876953, "kl": 0.0, "learning_rate": 2.1002355404344413e-07, "logps/chosen": -153.22897338867188, "logps/rejected": -150.74411010742188, "loss": 0.3609, "rewards/chosen": 1.0502197742462158, "rewards/margins": 2.500394821166992, "rewards/rejected": -1.4501750469207764, "step": 2216 }, { "epoch": 0.58, "grad_norm": 38.212318420410156, "kl": 0.0, "learning_rate": 2.098926982465323e-07, "logps/chosen": -224.86199951171875, "logps/rejected": -308.0025329589844, "loss": 0.1965, "rewards/chosen": 0.44962361454963684, "rewards/margins": 4.539242744445801, "rewards/rejected": -4.089619159698486, "step": 2217 }, { "epoch": 0.58, "grad_norm": 32.787559509277344, "kl": 0.0, "learning_rate": 2.097618424496205e-07, "logps/chosen": -277.6907043457031, "logps/rejected": -224.82205200195312, "loss": 0.2155, "rewards/chosen": 0.506578803062439, "rewards/margins": 3.952627182006836, "rewards/rejected": -3.4460482597351074, "step": 2218 }, { "epoch": 0.58, "grad_norm": 31.50032615661621, "kl": 0.0, "learning_rate": 2.096309866527087e-07, "logps/chosen": -219.48257446289062, "logps/rejected": -213.10658264160156, "loss": 0.3219, "rewards/chosen": 0.9474495649337769, "rewards/margins": 2.4071855545043945, "rewards/rejected": -1.4597361087799072, "step": 2219 }, { "epoch": 0.58, "grad_norm": 36.32145309448242, "kl": 0.0, "learning_rate": 2.095001308557969e-07, "logps/chosen": -225.48365783691406, "logps/rejected": -273.248291015625, "loss": 0.225, "rewards/chosen": 0.33812087774276733, "rewards/margins": 3.649228811264038, "rewards/rejected": -3.311107873916626, "step": 2220 }, { "epoch": 0.58, "grad_norm": 38.5352668762207, "kl": 0.0, "learning_rate": 2.0936927505888511e-07, "logps/chosen": -219.2425537109375, "logps/rejected": -274.7395935058594, "loss": 0.2791, "rewards/chosen": 1.2792561054229736, "rewards/margins": 5.127982139587402, "rewards/rejected": -3.8487257957458496, "step": 2221 }, { "epoch": 0.58, "grad_norm": 41.40312576293945, "kl": 0.0, "learning_rate": 2.092384192619733e-07, "logps/chosen": -229.9542694091797, "logps/rejected": -304.9787902832031, "loss": 0.2319, "rewards/chosen": 0.703386664390564, "rewards/margins": 4.285565376281738, "rewards/rejected": -3.582178831100464, "step": 2222 }, { "epoch": 0.58, "grad_norm": 34.01308059692383, "kl": 0.0, "learning_rate": 2.091075634650615e-07, "logps/chosen": -156.47169494628906, "logps/rejected": -347.48370361328125, "loss": 0.2302, "rewards/chosen": 1.8110793828964233, "rewards/margins": 3.668882369995117, "rewards/rejected": -1.8578031063079834, "step": 2223 }, { "epoch": 0.58, "grad_norm": 35.65562057495117, "kl": 0.0, "learning_rate": 2.0897670766814968e-07, "logps/chosen": -155.8760528564453, "logps/rejected": -218.9156036376953, "loss": 0.2815, "rewards/chosen": 0.4598958194255829, "rewards/margins": 3.565948963165283, "rewards/rejected": -3.106053113937378, "step": 2224 }, { "epoch": 0.58, "grad_norm": 35.85246658325195, "kl": 0.0, "learning_rate": 2.0884585187123787e-07, "logps/chosen": -196.15066528320312, "logps/rejected": -293.7716064453125, "loss": 0.2994, "rewards/chosen": 0.30416417121887207, "rewards/margins": 3.709951162338257, "rewards/rejected": -3.4057869911193848, "step": 2225 }, { "epoch": 0.58, "grad_norm": 31.45633316040039, "kl": 0.0, "learning_rate": 2.0871499607432607e-07, "logps/chosen": -161.6018829345703, "logps/rejected": -231.9564971923828, "loss": 0.204, "rewards/chosen": 1.5219181776046753, "rewards/margins": 5.138897895812988, "rewards/rejected": -3.6169795989990234, "step": 2226 }, { "epoch": 0.58, "grad_norm": 27.005146026611328, "kl": 0.0, "learning_rate": 2.085841402774143e-07, "logps/chosen": -170.5829620361328, "logps/rejected": -275.69659423828125, "loss": 0.2103, "rewards/chosen": 2.6731762886047363, "rewards/margins": 7.106757640838623, "rewards/rejected": -4.433581352233887, "step": 2227 }, { "epoch": 0.58, "grad_norm": 26.06833839416504, "kl": 0.0, "learning_rate": 2.0845328448050249e-07, "logps/chosen": -215.9841766357422, "logps/rejected": -279.5541076660156, "loss": 0.1581, "rewards/chosen": 1.9819355010986328, "rewards/margins": 6.945269584655762, "rewards/rejected": -4.963334083557129, "step": 2228 }, { "epoch": 0.58, "grad_norm": 35.79971694946289, "kl": 0.0, "learning_rate": 2.0832242868359068e-07, "logps/chosen": -252.36244201660156, "logps/rejected": -285.28302001953125, "loss": 0.2275, "rewards/chosen": 1.6426401138305664, "rewards/margins": 4.779829502105713, "rewards/rejected": -3.1371893882751465, "step": 2229 }, { "epoch": 0.58, "grad_norm": 50.33610534667969, "kl": 0.0, "learning_rate": 2.0819157288667888e-07, "logps/chosen": -287.94281005859375, "logps/rejected": -286.266845703125, "loss": 0.2573, "rewards/chosen": -0.2679663300514221, "rewards/margins": 2.9511818885803223, "rewards/rejected": -3.2191481590270996, "step": 2230 }, { "epoch": 0.58, "grad_norm": 27.209035873413086, "kl": 0.0, "learning_rate": 2.0806071708976707e-07, "logps/chosen": -179.55047607421875, "logps/rejected": -257.8847961425781, "loss": 0.1428, "rewards/chosen": 2.861096143722534, "rewards/margins": 6.1882853507995605, "rewards/rejected": -3.3271892070770264, "step": 2231 }, { "epoch": 0.58, "grad_norm": 26.88276481628418, "kl": 0.0, "learning_rate": 2.0792986129285524e-07, "logps/chosen": -169.83990478515625, "logps/rejected": -234.80967712402344, "loss": 0.2285, "rewards/chosen": 1.3872361183166504, "rewards/margins": 7.181230545043945, "rewards/rejected": -5.793994426727295, "step": 2232 }, { "epoch": 0.58, "grad_norm": 35.116703033447266, "kl": 0.0, "learning_rate": 2.0779900549594344e-07, "logps/chosen": -237.92538452148438, "logps/rejected": -313.69866943359375, "loss": 0.2332, "rewards/chosen": -0.5640676021575928, "rewards/margins": 2.326479911804199, "rewards/rejected": -2.890547513961792, "step": 2233 }, { "epoch": 0.58, "grad_norm": 32.42366409301758, "kl": 0.0, "learning_rate": 2.0766814969903166e-07, "logps/chosen": -123.78561401367188, "logps/rejected": -258.0790710449219, "loss": 0.1687, "rewards/chosen": 0.20091545581817627, "rewards/margins": 4.391180038452148, "rewards/rejected": -4.190264701843262, "step": 2234 }, { "epoch": 0.58, "grad_norm": 38.087501525878906, "kl": 0.0, "learning_rate": 2.0753729390211986e-07, "logps/chosen": -235.76138305664062, "logps/rejected": -244.61502075195312, "loss": 0.3038, "rewards/chosen": 0.21202373504638672, "rewards/margins": 3.9693703651428223, "rewards/rejected": -3.7573466300964355, "step": 2235 }, { "epoch": 0.59, "grad_norm": 34.855560302734375, "kl": 0.0, "learning_rate": 2.0740643810520806e-07, "logps/chosen": -259.66522216796875, "logps/rejected": -245.32571411132812, "loss": 0.2637, "rewards/chosen": 0.4846804141998291, "rewards/margins": 4.76972770690918, "rewards/rejected": -4.2850470542907715, "step": 2236 }, { "epoch": 0.59, "grad_norm": 40.834163665771484, "kl": 0.0, "learning_rate": 2.0727558230829625e-07, "logps/chosen": -293.44512939453125, "logps/rejected": -210.43310546875, "loss": 0.3469, "rewards/chosen": 2.0567681789398193, "rewards/margins": 3.828779935836792, "rewards/rejected": -1.7720117568969727, "step": 2237 }, { "epoch": 0.59, "grad_norm": 32.50837707519531, "kl": 0.0, "learning_rate": 2.0714472651138445e-07, "logps/chosen": -228.298828125, "logps/rejected": -189.87283325195312, "loss": 0.2022, "rewards/chosen": 2.2769954204559326, "rewards/margins": 5.409682273864746, "rewards/rejected": -3.1326866149902344, "step": 2238 }, { "epoch": 0.59, "grad_norm": 34.75202178955078, "kl": 0.0, "learning_rate": 2.0701387071447264e-07, "logps/chosen": -229.86451721191406, "logps/rejected": -253.40927124023438, "loss": 0.3198, "rewards/chosen": 0.4948378801345825, "rewards/margins": 3.6758928298950195, "rewards/rejected": -3.1810548305511475, "step": 2239 }, { "epoch": 0.59, "grad_norm": 36.177547454833984, "kl": 0.0, "learning_rate": 2.0688301491756084e-07, "logps/chosen": -188.50875854492188, "logps/rejected": -309.8052978515625, "loss": 0.282, "rewards/chosen": 0.8006499409675598, "rewards/margins": 4.804440975189209, "rewards/rejected": -4.003790855407715, "step": 2240 }, { "epoch": 0.59, "grad_norm": 27.196758270263672, "kl": 0.0, "learning_rate": 2.0675215912064904e-07, "logps/chosen": -162.847900390625, "logps/rejected": -280.6136474609375, "loss": 0.3561, "rewards/chosen": 0.6857805252075195, "rewards/margins": 5.029175758361816, "rewards/rejected": -4.343395233154297, "step": 2241 }, { "epoch": 0.59, "grad_norm": 40.71586608886719, "kl": 0.0, "learning_rate": 2.0662130332373723e-07, "logps/chosen": -257.04296875, "logps/rejected": -187.94483947753906, "loss": 0.2425, "rewards/chosen": 1.0367754697799683, "rewards/margins": 4.446749687194824, "rewards/rejected": -3.4099740982055664, "step": 2242 }, { "epoch": 0.59, "grad_norm": 32.89750289916992, "kl": 0.0, "learning_rate": 2.0649044752682543e-07, "logps/chosen": -270.7828063964844, "logps/rejected": -268.9897155761719, "loss": 0.1506, "rewards/chosen": 2.786309242248535, "rewards/margins": 7.044668674468994, "rewards/rejected": -4.258359432220459, "step": 2243 }, { "epoch": 0.59, "grad_norm": 36.8209228515625, "kl": 0.0, "learning_rate": 2.0635959172991362e-07, "logps/chosen": -205.2930908203125, "logps/rejected": -207.3105010986328, "loss": 0.2875, "rewards/chosen": 1.5179622173309326, "rewards/margins": 3.5570497512817383, "rewards/rejected": -2.0390875339508057, "step": 2244 }, { "epoch": 0.59, "grad_norm": 36.65214157104492, "kl": 0.0, "learning_rate": 2.0622873593300182e-07, "logps/chosen": -158.63861083984375, "logps/rejected": -217.1534881591797, "loss": 0.3151, "rewards/chosen": 0.019611716270446777, "rewards/margins": 2.943315029144287, "rewards/rejected": -2.92370343208313, "step": 2245 }, { "epoch": 0.59, "grad_norm": 40.02116394042969, "kl": 0.0, "learning_rate": 2.0609788013609004e-07, "logps/chosen": -249.88970947265625, "logps/rejected": -151.15518188476562, "loss": 0.3755, "rewards/chosen": 0.8100465536117554, "rewards/margins": 2.0503296852111816, "rewards/rejected": -1.2402830123901367, "step": 2246 }, { "epoch": 0.59, "grad_norm": 35.16876983642578, "kl": 0.0, "learning_rate": 2.059670243391782e-07, "logps/chosen": -226.23460388183594, "logps/rejected": -245.07582092285156, "loss": 0.2016, "rewards/chosen": 1.5717494487762451, "rewards/margins": 5.3078742027282715, "rewards/rejected": -3.7361247539520264, "step": 2247 }, { "epoch": 0.59, "grad_norm": 30.50265121459961, "kl": 0.0, "learning_rate": 2.058361685422664e-07, "logps/chosen": -155.54959106445312, "logps/rejected": -251.41653442382812, "loss": 0.2158, "rewards/chosen": 1.5379118919372559, "rewards/margins": 5.863900184631348, "rewards/rejected": -4.325988292694092, "step": 2248 }, { "epoch": 0.59, "grad_norm": 45.763301849365234, "kl": 0.0, "learning_rate": 2.057053127453546e-07, "logps/chosen": -204.8267059326172, "logps/rejected": -216.93017578125, "loss": 0.2251, "rewards/chosen": 1.2222058773040771, "rewards/margins": 4.166966438293457, "rewards/rejected": -2.944760322570801, "step": 2249 }, { "epoch": 0.59, "grad_norm": 35.689453125, "kl": 0.0, "learning_rate": 2.055744569484428e-07, "logps/chosen": -167.92544555664062, "logps/rejected": -189.067626953125, "loss": 0.2259, "rewards/chosen": 0.4054737687110901, "rewards/margins": 4.531358242034912, "rewards/rejected": -4.125884532928467, "step": 2250 }, { "epoch": 0.59, "grad_norm": 31.52212905883789, "kl": 0.0, "learning_rate": 2.05443601151531e-07, "logps/chosen": -227.6565704345703, "logps/rejected": -263.6648864746094, "loss": 0.1781, "rewards/chosen": 1.7398179769515991, "rewards/margins": 4.553890228271484, "rewards/rejected": -2.8140721321105957, "step": 2251 }, { "epoch": 0.59, "grad_norm": 26.91353988647461, "kl": 0.0, "learning_rate": 2.053127453546192e-07, "logps/chosen": -145.8267059326172, "logps/rejected": -270.70635986328125, "loss": 0.2029, "rewards/chosen": 1.10588800907135, "rewards/margins": 4.696566104888916, "rewards/rejected": -3.5906779766082764, "step": 2252 }, { "epoch": 0.59, "grad_norm": 33.017311096191406, "kl": 0.0, "learning_rate": 2.0518188955770742e-07, "logps/chosen": -204.77932739257812, "logps/rejected": -308.2210388183594, "loss": 0.2999, "rewards/chosen": 1.018227219581604, "rewards/margins": 4.7906084060668945, "rewards/rejected": -3.77238130569458, "step": 2253 }, { "epoch": 0.59, "grad_norm": 28.2861385345459, "kl": 0.0, "learning_rate": 2.050510337607956e-07, "logps/chosen": -233.28012084960938, "logps/rejected": -170.07479858398438, "loss": 0.4263, "rewards/chosen": -0.36268797516822815, "rewards/margins": 2.6240553855895996, "rewards/rejected": -2.986743450164795, "step": 2254 }, { "epoch": 0.59, "grad_norm": 31.228620529174805, "kl": 0.0, "learning_rate": 2.0492017796388378e-07, "logps/chosen": -257.2540588378906, "logps/rejected": -267.53515625, "loss": 0.1912, "rewards/chosen": 0.64061039686203, "rewards/margins": 3.424734592437744, "rewards/rejected": -2.7841241359710693, "step": 2255 }, { "epoch": 0.59, "grad_norm": 35.21196365356445, "kl": 0.0, "learning_rate": 2.0478932216697198e-07, "logps/chosen": -213.58702087402344, "logps/rejected": -329.76788330078125, "loss": 0.2916, "rewards/chosen": 0.6111934185028076, "rewards/margins": 4.495480060577393, "rewards/rejected": -3.884286642074585, "step": 2256 }, { "epoch": 0.59, "grad_norm": 31.973970413208008, "kl": 0.0, "learning_rate": 2.0465846637006017e-07, "logps/chosen": -239.40626525878906, "logps/rejected": -191.99514770507812, "loss": 0.2833, "rewards/chosen": -0.32032090425491333, "rewards/margins": 3.3377461433410645, "rewards/rejected": -3.658066987991333, "step": 2257 }, { "epoch": 0.59, "grad_norm": 31.78082275390625, "kl": 0.0, "learning_rate": 2.0452761057314837e-07, "logps/chosen": -224.99295043945312, "logps/rejected": -208.2244873046875, "loss": 0.4009, "rewards/chosen": -0.342149019241333, "rewards/margins": 2.974942922592163, "rewards/rejected": -3.317091941833496, "step": 2258 }, { "epoch": 0.59, "grad_norm": 40.125362396240234, "kl": 0.0, "learning_rate": 2.043967547762366e-07, "logps/chosen": -142.40603637695312, "logps/rejected": -232.1035614013672, "loss": 0.3002, "rewards/chosen": 0.41182827949523926, "rewards/margins": 5.9469099044799805, "rewards/rejected": -5.53508186340332, "step": 2259 }, { "epoch": 0.59, "grad_norm": 31.44727325439453, "kl": 0.0, "learning_rate": 2.042658989793248e-07, "logps/chosen": -238.10951232910156, "logps/rejected": -203.90293884277344, "loss": 0.2593, "rewards/chosen": 0.28709545731544495, "rewards/margins": 2.862297296524048, "rewards/rejected": -2.5752017498016357, "step": 2260 }, { "epoch": 0.59, "grad_norm": 40.94384002685547, "kl": 0.0, "learning_rate": 2.0413504318241298e-07, "logps/chosen": -169.60580444335938, "logps/rejected": -287.79888916015625, "loss": 0.4053, "rewards/chosen": -0.45726338028907776, "rewards/margins": 2.6300411224365234, "rewards/rejected": -3.0873045921325684, "step": 2261 }, { "epoch": 0.59, "grad_norm": 29.562114715576172, "kl": 0.0, "learning_rate": 2.0400418738550118e-07, "logps/chosen": -239.27877807617188, "logps/rejected": -378.2849426269531, "loss": 0.2454, "rewards/chosen": 0.4851676821708679, "rewards/margins": 4.73510217666626, "rewards/rejected": -4.249934673309326, "step": 2262 }, { "epoch": 0.59, "grad_norm": 47.51166915893555, "kl": 0.0, "learning_rate": 2.0387333158858935e-07, "logps/chosen": -213.38778686523438, "logps/rejected": -257.33331298828125, "loss": 0.3026, "rewards/chosen": 0.39499297738075256, "rewards/margins": 3.191692590713501, "rewards/rejected": -2.7966995239257812, "step": 2263 }, { "epoch": 0.59, "grad_norm": 32.65146255493164, "kl": 0.0, "learning_rate": 2.0374247579167755e-07, "logps/chosen": -215.3774871826172, "logps/rejected": -231.47328186035156, "loss": 0.3043, "rewards/chosen": 0.690306544303894, "rewards/margins": 2.42452335357666, "rewards/rejected": -1.7342166900634766, "step": 2264 }, { "epoch": 0.59, "grad_norm": 30.549049377441406, "kl": 0.0, "learning_rate": 2.0361161999476574e-07, "logps/chosen": -220.37583923339844, "logps/rejected": -260.5372009277344, "loss": 0.1832, "rewards/chosen": -0.4377664029598236, "rewards/margins": 4.060056686401367, "rewards/rejected": -4.497823238372803, "step": 2265 }, { "epoch": 0.59, "grad_norm": 29.487773895263672, "kl": 0.0, "learning_rate": 2.0348076419785396e-07, "logps/chosen": -219.22384643554688, "logps/rejected": -278.33453369140625, "loss": 0.2964, "rewards/chosen": 1.3252283334732056, "rewards/margins": 5.023862361907959, "rewards/rejected": -3.698634147644043, "step": 2266 }, { "epoch": 0.59, "grad_norm": 31.48894500732422, "kl": 0.0, "learning_rate": 2.0334990840094216e-07, "logps/chosen": -219.73208618164062, "logps/rejected": -219.90045166015625, "loss": 0.1944, "rewards/chosen": 1.8473029136657715, "rewards/margins": 6.5499396324157715, "rewards/rejected": -4.70263671875, "step": 2267 }, { "epoch": 0.59, "grad_norm": 23.981107711791992, "kl": 0.0, "learning_rate": 2.0321905260403036e-07, "logps/chosen": -133.35617065429688, "logps/rejected": -190.30067443847656, "loss": 0.33, "rewards/chosen": -0.8549145460128784, "rewards/margins": 3.315229892730713, "rewards/rejected": -4.170144557952881, "step": 2268 }, { "epoch": 0.59, "grad_norm": 38.23117446899414, "kl": 0.0, "learning_rate": 2.0308819680711855e-07, "logps/chosen": -198.3517303466797, "logps/rejected": -327.96453857421875, "loss": 0.2805, "rewards/chosen": 0.14711612462997437, "rewards/margins": 4.454745769500732, "rewards/rejected": -4.307629585266113, "step": 2269 }, { "epoch": 0.59, "grad_norm": 34.60736846923828, "kl": 0.0, "learning_rate": 2.0295734101020675e-07, "logps/chosen": -182.74720764160156, "logps/rejected": -207.50497436523438, "loss": 0.2981, "rewards/chosen": 0.09414029121398926, "rewards/margins": 3.436382293701172, "rewards/rejected": -3.3422420024871826, "step": 2270 }, { "epoch": 0.59, "grad_norm": 42.404842376708984, "kl": 0.0, "learning_rate": 2.0282648521329492e-07, "logps/chosen": -246.59584045410156, "logps/rejected": -236.692626953125, "loss": 0.2859, "rewards/chosen": 0.38779035210609436, "rewards/margins": 3.6365997791290283, "rewards/rejected": -3.248809337615967, "step": 2271 }, { "epoch": 0.59, "grad_norm": 32.34096145629883, "kl": 0.0, "learning_rate": 2.0269562941638314e-07, "logps/chosen": -184.75596618652344, "logps/rejected": -225.25875854492188, "loss": 0.2885, "rewards/chosen": 0.9209032654762268, "rewards/margins": 3.5063042640686035, "rewards/rejected": -2.5854010581970215, "step": 2272 }, { "epoch": 0.59, "grad_norm": 20.64876365661621, "kl": 0.0, "learning_rate": 2.0256477361947134e-07, "logps/chosen": -146.1011505126953, "logps/rejected": -244.64822387695312, "loss": 0.1834, "rewards/chosen": 2.905881643295288, "rewards/margins": 6.3737945556640625, "rewards/rejected": -3.4679126739501953, "step": 2273 }, { "epoch": 0.6, "grad_norm": 33.63536071777344, "kl": 0.0, "learning_rate": 2.0243391782255953e-07, "logps/chosen": -235.8003692626953, "logps/rejected": -336.51385498046875, "loss": 0.2576, "rewards/chosen": 0.5231467485427856, "rewards/margins": 7.412076950073242, "rewards/rejected": -6.888930320739746, "step": 2274 }, { "epoch": 0.6, "grad_norm": 34.075950622558594, "kl": 0.0, "learning_rate": 2.0230306202564773e-07, "logps/chosen": -211.6470489501953, "logps/rejected": -185.71539306640625, "loss": 0.3364, "rewards/chosen": 1.6632918119430542, "rewards/margins": 5.770527362823486, "rewards/rejected": -4.107235431671143, "step": 2275 }, { "epoch": 0.6, "grad_norm": 25.605819702148438, "kl": 0.0, "learning_rate": 2.0217220622873593e-07, "logps/chosen": -247.4192352294922, "logps/rejected": -231.5600128173828, "loss": 0.2677, "rewards/chosen": -0.2584896385669708, "rewards/margins": 4.03928804397583, "rewards/rejected": -4.2977776527404785, "step": 2276 }, { "epoch": 0.6, "grad_norm": 42.00531768798828, "kl": 0.0, "learning_rate": 2.0204135043182412e-07, "logps/chosen": -300.18609619140625, "logps/rejected": -263.7876281738281, "loss": 0.2941, "rewards/chosen": 1.2072327136993408, "rewards/margins": 2.713994026184082, "rewards/rejected": -1.5067613124847412, "step": 2277 }, { "epoch": 0.6, "grad_norm": 31.236480712890625, "kl": 0.0, "learning_rate": 2.019104946349123e-07, "logps/chosen": -231.35574340820312, "logps/rejected": -279.8404541015625, "loss": 0.2957, "rewards/chosen": 0.4912111163139343, "rewards/margins": 6.064578056335449, "rewards/rejected": -5.573367118835449, "step": 2278 }, { "epoch": 0.6, "grad_norm": 30.456981658935547, "kl": 0.0, "learning_rate": 2.0177963883800051e-07, "logps/chosen": -208.600830078125, "logps/rejected": -197.56190490722656, "loss": 0.1894, "rewards/chosen": 1.867599368095398, "rewards/margins": 5.389187812805176, "rewards/rejected": -3.5215883255004883, "step": 2279 }, { "epoch": 0.6, "grad_norm": 36.0406494140625, "kl": 0.0, "learning_rate": 2.016487830410887e-07, "logps/chosen": -174.555908203125, "logps/rejected": -345.35638427734375, "loss": 0.2134, "rewards/chosen": -0.6918312907218933, "rewards/margins": 5.157019138336182, "rewards/rejected": -5.848850250244141, "step": 2280 }, { "epoch": 0.6, "grad_norm": 52.155052185058594, "kl": 0.0, "learning_rate": 2.015179272441769e-07, "logps/chosen": -164.63284301757812, "logps/rejected": -272.5175476074219, "loss": 0.279, "rewards/chosen": 0.6520876884460449, "rewards/margins": 5.025417327880859, "rewards/rejected": -4.3733296394348145, "step": 2281 }, { "epoch": 0.6, "grad_norm": 46.86626052856445, "kl": 0.0, "learning_rate": 2.013870714472651e-07, "logps/chosen": -207.02198791503906, "logps/rejected": -338.878662109375, "loss": 0.2458, "rewards/chosen": 1.6476094722747803, "rewards/margins": 5.5855560302734375, "rewards/rejected": -3.937946319580078, "step": 2282 }, { "epoch": 0.6, "grad_norm": 35.65882110595703, "kl": 0.0, "learning_rate": 2.012562156503533e-07, "logps/chosen": -238.85012817382812, "logps/rejected": -276.264404296875, "loss": 0.3135, "rewards/chosen": 0.41609618067741394, "rewards/margins": 4.50074577331543, "rewards/rejected": -4.084649562835693, "step": 2283 }, { "epoch": 0.6, "grad_norm": 40.120540618896484, "kl": 0.0, "learning_rate": 2.011253598534415e-07, "logps/chosen": -206.02651977539062, "logps/rejected": -254.01666259765625, "loss": 0.2503, "rewards/chosen": 1.6650097370147705, "rewards/margins": 4.307703971862793, "rewards/rejected": -2.6426944732666016, "step": 2284 }, { "epoch": 0.6, "grad_norm": 28.992889404296875, "kl": 0.0, "learning_rate": 2.0099450405652972e-07, "logps/chosen": -196.9144287109375, "logps/rejected": -263.31854248046875, "loss": 0.291, "rewards/chosen": 1.4108424186706543, "rewards/margins": 3.198729991912842, "rewards/rejected": -1.787887454032898, "step": 2285 }, { "epoch": 0.6, "grad_norm": 29.407325744628906, "kl": 0.0, "learning_rate": 2.0086364825961789e-07, "logps/chosen": -188.19052124023438, "logps/rejected": -263.819580078125, "loss": 0.3178, "rewards/chosen": 1.2455238103866577, "rewards/margins": 5.1608991622924805, "rewards/rejected": -3.915375232696533, "step": 2286 }, { "epoch": 0.6, "grad_norm": 29.475479125976562, "kl": 0.0, "learning_rate": 2.0073279246270608e-07, "logps/chosen": -224.15408325195312, "logps/rejected": -250.49986267089844, "loss": 0.2832, "rewards/chosen": 1.654449462890625, "rewards/margins": 3.692563056945801, "rewards/rejected": -2.038113594055176, "step": 2287 }, { "epoch": 0.6, "grad_norm": 26.579605102539062, "kl": 0.0, "learning_rate": 2.0060193666579428e-07, "logps/chosen": -244.87388610839844, "logps/rejected": -297.46636962890625, "loss": 0.2238, "rewards/chosen": 1.6434001922607422, "rewards/margins": 6.1101579666137695, "rewards/rejected": -4.466757774353027, "step": 2288 }, { "epoch": 0.6, "grad_norm": 33.45051574707031, "kl": 0.0, "learning_rate": 2.0047108086888247e-07, "logps/chosen": -131.8313751220703, "logps/rejected": -243.05029296875, "loss": 0.2313, "rewards/chosen": -0.7130619287490845, "rewards/margins": 2.145439624786377, "rewards/rejected": -2.858501434326172, "step": 2289 }, { "epoch": 0.6, "grad_norm": 38.82516098022461, "kl": 0.0, "learning_rate": 2.0034022507197067e-07, "logps/chosen": -207.3368377685547, "logps/rejected": -229.88768005371094, "loss": 0.2461, "rewards/chosen": 1.289283037185669, "rewards/margins": 5.446189880371094, "rewards/rejected": -4.156907081604004, "step": 2290 }, { "epoch": 0.6, "grad_norm": 34.82200622558594, "kl": 0.0, "learning_rate": 2.002093692750589e-07, "logps/chosen": -95.67045593261719, "logps/rejected": -228.51051330566406, "loss": 0.2927, "rewards/chosen": 0.8008228540420532, "rewards/margins": 4.647127151489258, "rewards/rejected": -3.846304178237915, "step": 2291 }, { "epoch": 0.6, "grad_norm": 39.77212905883789, "kl": 0.0, "learning_rate": 2.000785134781471e-07, "logps/chosen": -177.02243041992188, "logps/rejected": -266.8466491699219, "loss": 0.3203, "rewards/chosen": 0.5170644521713257, "rewards/margins": 3.983919143676758, "rewards/rejected": -3.4668545722961426, "step": 2292 }, { "epoch": 0.6, "grad_norm": 31.12018394470215, "kl": 0.0, "learning_rate": 1.9994765768123529e-07, "logps/chosen": -187.1675567626953, "logps/rejected": -226.5849151611328, "loss": 0.2171, "rewards/chosen": 1.33591628074646, "rewards/margins": 5.126616477966309, "rewards/rejected": -3.7907004356384277, "step": 2293 }, { "epoch": 0.6, "grad_norm": 64.28606414794922, "kl": 0.0, "learning_rate": 1.9981680188432345e-07, "logps/chosen": -211.73049926757812, "logps/rejected": -318.67529296875, "loss": 0.256, "rewards/chosen": 0.35816943645477295, "rewards/margins": 4.0642313957214355, "rewards/rejected": -3.706062078475952, "step": 2294 }, { "epoch": 0.6, "grad_norm": 38.446563720703125, "kl": 0.0, "learning_rate": 1.9968594608741165e-07, "logps/chosen": -170.37831115722656, "logps/rejected": -272.9222412109375, "loss": 0.2997, "rewards/chosen": -0.15325644612312317, "rewards/margins": 1.2989284992218018, "rewards/rejected": -1.4521849155426025, "step": 2295 }, { "epoch": 0.6, "grad_norm": 40.77598571777344, "kl": 0.0, "learning_rate": 1.9955509029049985e-07, "logps/chosen": -204.7364501953125, "logps/rejected": -204.11572265625, "loss": 0.2454, "rewards/chosen": 1.5838898420333862, "rewards/margins": 3.9927096366882324, "rewards/rejected": -2.4088196754455566, "step": 2296 }, { "epoch": 0.6, "grad_norm": 34.579124450683594, "kl": 0.0, "learning_rate": 1.9942423449358804e-07, "logps/chosen": -203.11920166015625, "logps/rejected": -258.780029296875, "loss": 0.3125, "rewards/chosen": 1.4499505758285522, "rewards/margins": 5.605946063995361, "rewards/rejected": -4.1559953689575195, "step": 2297 }, { "epoch": 0.6, "grad_norm": 31.835281372070312, "kl": 0.0, "learning_rate": 1.9929337869667627e-07, "logps/chosen": -191.6736297607422, "logps/rejected": -239.4393768310547, "loss": 0.2598, "rewards/chosen": 1.3314719200134277, "rewards/margins": 5.094294548034668, "rewards/rejected": -3.762822389602661, "step": 2298 }, { "epoch": 0.6, "grad_norm": 32.93226623535156, "kl": 0.0, "learning_rate": 1.9916252289976446e-07, "logps/chosen": -257.39154052734375, "logps/rejected": -298.38360595703125, "loss": 0.2827, "rewards/chosen": 2.54595685005188, "rewards/margins": 6.505262851715088, "rewards/rejected": -3.959306001663208, "step": 2299 }, { "epoch": 0.6, "grad_norm": 35.539306640625, "kl": 0.0, "learning_rate": 1.9903166710285266e-07, "logps/chosen": -237.4660186767578, "logps/rejected": -189.1026153564453, "loss": 0.2667, "rewards/chosen": 1.5426915884017944, "rewards/margins": 4.236473083496094, "rewards/rejected": -2.6937813758850098, "step": 2300 }, { "epoch": 0.6, "grad_norm": 26.92068099975586, "kl": 0.0, "learning_rate": 1.9890081130594085e-07, "logps/chosen": -242.7715301513672, "logps/rejected": -251.5889892578125, "loss": 0.2824, "rewards/chosen": -0.13105101883411407, "rewards/margins": 3.447944164276123, "rewards/rejected": -3.5789952278137207, "step": 2301 }, { "epoch": 0.6, "grad_norm": 29.411447525024414, "kl": 0.0, "learning_rate": 1.9876995550902902e-07, "logps/chosen": -185.21484375, "logps/rejected": -173.87359619140625, "loss": 0.2309, "rewards/chosen": 0.9786520004272461, "rewards/margins": 3.331249237060547, "rewards/rejected": -2.352597236633301, "step": 2302 }, { "epoch": 0.6, "grad_norm": 37.42856979370117, "kl": 0.0, "learning_rate": 1.9863909971211722e-07, "logps/chosen": -266.0807800292969, "logps/rejected": -315.8362731933594, "loss": 0.1693, "rewards/chosen": 0.9435846209526062, "rewards/margins": 5.589478969573975, "rewards/rejected": -4.645894527435303, "step": 2303 }, { "epoch": 0.6, "grad_norm": 32.103275299072266, "kl": 0.0, "learning_rate": 1.9850824391520544e-07, "logps/chosen": -232.0700225830078, "logps/rejected": -273.20611572265625, "loss": 0.2743, "rewards/chosen": -1.274770736694336, "rewards/margins": 3.538522720336914, "rewards/rejected": -4.81329345703125, "step": 2304 }, { "epoch": 0.6, "grad_norm": 41.43397521972656, "kl": 0.0, "learning_rate": 1.9837738811829364e-07, "logps/chosen": -148.38497924804688, "logps/rejected": -239.9488067626953, "loss": 0.3287, "rewards/chosen": 0.009937132708728313, "rewards/margins": 2.5099294185638428, "rewards/rejected": -2.4999923706054688, "step": 2305 }, { "epoch": 0.6, "grad_norm": 36.7087287902832, "kl": 0.0, "learning_rate": 1.9824653232138183e-07, "logps/chosen": -225.000732421875, "logps/rejected": -190.12754821777344, "loss": 0.3282, "rewards/chosen": -0.3766823410987854, "rewards/margins": 1.5658700466156006, "rewards/rejected": -1.9425524473190308, "step": 2306 }, { "epoch": 0.6, "grad_norm": 31.22111701965332, "kl": 0.0, "learning_rate": 1.9811567652447003e-07, "logps/chosen": -167.56187438964844, "logps/rejected": -208.11669921875, "loss": 0.3685, "rewards/chosen": 0.5481584668159485, "rewards/margins": 1.6451292037963867, "rewards/rejected": -1.096970796585083, "step": 2307 }, { "epoch": 0.6, "grad_norm": 49.5782585144043, "kl": 0.0, "learning_rate": 1.9798482072755823e-07, "logps/chosen": -189.76531982421875, "logps/rejected": -211.35142517089844, "loss": 0.3392, "rewards/chosen": 1.0844168663024902, "rewards/margins": 3.5620598793029785, "rewards/rejected": -2.4776430130004883, "step": 2308 }, { "epoch": 0.6, "grad_norm": 38.009700775146484, "kl": 0.0, "learning_rate": 1.978539649306464e-07, "logps/chosen": -301.82708740234375, "logps/rejected": -276.3048400878906, "loss": 0.2941, "rewards/chosen": 0.7866557240486145, "rewards/margins": 4.300216197967529, "rewards/rejected": -3.5135605335235596, "step": 2309 }, { "epoch": 0.6, "grad_norm": 38.04002380371094, "kl": 0.0, "learning_rate": 1.9772310913373462e-07, "logps/chosen": -222.57528686523438, "logps/rejected": -237.14932250976562, "loss": 0.272, "rewards/chosen": 0.6795270442962646, "rewards/margins": 4.367680072784424, "rewards/rejected": -3.688153028488159, "step": 2310 }, { "epoch": 0.6, "grad_norm": 34.9122428894043, "kl": 0.0, "learning_rate": 1.9759225333682281e-07, "logps/chosen": -175.81056213378906, "logps/rejected": -248.84698486328125, "loss": 0.1641, "rewards/chosen": 1.5451974868774414, "rewards/margins": 5.348091125488281, "rewards/rejected": -3.802893877029419, "step": 2311 }, { "epoch": 0.61, "grad_norm": 42.78825378417969, "kl": 0.0, "learning_rate": 1.97461397539911e-07, "logps/chosen": -278.53704833984375, "logps/rejected": -235.3387451171875, "loss": 0.2333, "rewards/chosen": 1.9742026329040527, "rewards/margins": 4.3761515617370605, "rewards/rejected": -2.401948928833008, "step": 2312 }, { "epoch": 0.61, "grad_norm": 33.957889556884766, "kl": 0.0, "learning_rate": 1.973305417429992e-07, "logps/chosen": -300.9579162597656, "logps/rejected": -196.88919067382812, "loss": 0.2687, "rewards/chosen": 0.5445224642753601, "rewards/margins": 4.595375061035156, "rewards/rejected": -4.0508527755737305, "step": 2313 }, { "epoch": 0.61, "grad_norm": 31.26093864440918, "kl": 0.0, "learning_rate": 1.971996859460874e-07, "logps/chosen": -226.60665893554688, "logps/rejected": -192.66175842285156, "loss": 0.2809, "rewards/chosen": 1.3789070844650269, "rewards/margins": 4.590993881225586, "rewards/rejected": -3.2120869159698486, "step": 2314 }, { "epoch": 0.61, "grad_norm": 33.976322174072266, "kl": 0.0, "learning_rate": 1.970688301491756e-07, "logps/chosen": -286.7447509765625, "logps/rejected": -288.9273681640625, "loss": 0.3294, "rewards/chosen": 0.5929345488548279, "rewards/margins": 3.9306373596191406, "rewards/rejected": -3.337702751159668, "step": 2315 }, { "epoch": 0.61, "grad_norm": 31.508800506591797, "kl": 0.0, "learning_rate": 1.969379743522638e-07, "logps/chosen": -205.0119171142578, "logps/rejected": -276.708984375, "loss": 0.1806, "rewards/chosen": 1.152121901512146, "rewards/margins": 4.042749404907227, "rewards/rejected": -2.89062762260437, "step": 2316 }, { "epoch": 0.61, "grad_norm": 32.762855529785156, "kl": 0.0, "learning_rate": 1.96807118555352e-07, "logps/chosen": -133.92018127441406, "logps/rejected": -246.65771484375, "loss": 0.2742, "rewards/chosen": 0.08014148473739624, "rewards/margins": 2.6316933631896973, "rewards/rejected": -2.5515518188476562, "step": 2317 }, { "epoch": 0.61, "grad_norm": 26.48649787902832, "kl": 0.0, "learning_rate": 1.966762627584402e-07, "logps/chosen": -232.65948486328125, "logps/rejected": -216.8899383544922, "loss": 0.3232, "rewards/chosen": 0.629813551902771, "rewards/margins": 3.5861639976501465, "rewards/rejected": -2.956350564956665, "step": 2318 }, { "epoch": 0.61, "grad_norm": 39.5434684753418, "kl": 0.0, "learning_rate": 1.9654540696152838e-07, "logps/chosen": -326.10894775390625, "logps/rejected": -340.63677978515625, "loss": 0.2668, "rewards/chosen": 1.9403249025344849, "rewards/margins": 4.059950828552246, "rewards/rejected": -2.1196258068084717, "step": 2319 }, { "epoch": 0.61, "grad_norm": 29.929338455200195, "kl": 0.0, "learning_rate": 1.9641455116461658e-07, "logps/chosen": -156.9414520263672, "logps/rejected": -169.06515502929688, "loss": 0.193, "rewards/chosen": -0.8299055695533752, "rewards/margins": 2.2383029460906982, "rewards/rejected": -3.0682084560394287, "step": 2320 }, { "epoch": 0.61, "grad_norm": 36.10740661621094, "kl": 0.0, "learning_rate": 1.9628369536770478e-07, "logps/chosen": -198.9758758544922, "logps/rejected": -279.46466064453125, "loss": 0.2819, "rewards/chosen": 1.3619959354400635, "rewards/margins": 5.403970718383789, "rewards/rejected": -4.0419745445251465, "step": 2321 }, { "epoch": 0.61, "grad_norm": 39.26609420776367, "kl": 0.0, "learning_rate": 1.9615283957079297e-07, "logps/chosen": -182.7283477783203, "logps/rejected": -257.07550048828125, "loss": 0.3292, "rewards/chosen": 0.010081393644213676, "rewards/margins": 3.4077489376068115, "rewards/rejected": -3.397667646408081, "step": 2322 }, { "epoch": 0.61, "grad_norm": 30.399250030517578, "kl": 0.0, "learning_rate": 1.960219837738812e-07, "logps/chosen": -184.43582153320312, "logps/rejected": -172.64540100097656, "loss": 0.2158, "rewards/chosen": 1.5585291385650635, "rewards/margins": 6.423967361450195, "rewards/rejected": -4.865437984466553, "step": 2323 }, { "epoch": 0.61, "grad_norm": 42.39558029174805, "kl": 0.0, "learning_rate": 1.958911279769694e-07, "logps/chosen": -204.50466918945312, "logps/rejected": -185.65188598632812, "loss": 0.2282, "rewards/chosen": 0.8809568881988525, "rewards/margins": 5.840044975280762, "rewards/rejected": -4.95908784866333, "step": 2324 }, { "epoch": 0.61, "grad_norm": 28.4328670501709, "kl": 0.0, "learning_rate": 1.9576027218005756e-07, "logps/chosen": -290.41998291015625, "logps/rejected": -194.86659240722656, "loss": 0.1874, "rewards/chosen": 1.8371068239212036, "rewards/margins": 5.469941139221191, "rewards/rejected": -3.6328341960906982, "step": 2325 }, { "epoch": 0.61, "grad_norm": 33.32115936279297, "kl": 0.0, "learning_rate": 1.9562941638314576e-07, "logps/chosen": -214.08786010742188, "logps/rejected": -268.7986755371094, "loss": 0.2692, "rewards/chosen": 1.712082862854004, "rewards/margins": 5.668825149536133, "rewards/rejected": -3.95674204826355, "step": 2326 }, { "epoch": 0.61, "grad_norm": 37.174095153808594, "kl": 0.0, "learning_rate": 1.9549856058623395e-07, "logps/chosen": -190.068603515625, "logps/rejected": -241.15679931640625, "loss": 0.3069, "rewards/chosen": 1.4767121076583862, "rewards/margins": 3.502028465270996, "rewards/rejected": -2.0253164768218994, "step": 2327 }, { "epoch": 0.61, "grad_norm": 40.89213562011719, "kl": 0.0, "learning_rate": 1.9536770478932215e-07, "logps/chosen": -181.3578338623047, "logps/rejected": -280.31915283203125, "loss": 0.3081, "rewards/chosen": 1.517583966255188, "rewards/margins": 3.3675737380981445, "rewards/rejected": -1.849989652633667, "step": 2328 }, { "epoch": 0.61, "grad_norm": 32.44668960571289, "kl": 0.0, "learning_rate": 1.9523684899241037e-07, "logps/chosen": -149.7053985595703, "logps/rejected": -227.4987335205078, "loss": 0.2448, "rewards/chosen": -0.6433849334716797, "rewards/margins": 3.7083725929260254, "rewards/rejected": -4.351757526397705, "step": 2329 }, { "epoch": 0.61, "grad_norm": 23.76544189453125, "kl": 0.0, "learning_rate": 1.9510599319549857e-07, "logps/chosen": -140.5263671875, "logps/rejected": -253.57730102539062, "loss": 0.2735, "rewards/chosen": -0.26378029584884644, "rewards/margins": 4.153634548187256, "rewards/rejected": -4.417414665222168, "step": 2330 }, { "epoch": 0.61, "grad_norm": 28.82638168334961, "kl": 0.0, "learning_rate": 1.9497513739858676e-07, "logps/chosen": -227.6795654296875, "logps/rejected": -303.05035400390625, "loss": 0.2781, "rewards/chosen": 0.5589046478271484, "rewards/margins": 5.917707443237305, "rewards/rejected": -5.358802795410156, "step": 2331 }, { "epoch": 0.61, "grad_norm": 29.29484748840332, "kl": 0.0, "learning_rate": 1.9484428160167496e-07, "logps/chosen": -186.6299285888672, "logps/rejected": -297.8755798339844, "loss": 0.2199, "rewards/chosen": 1.4472681283950806, "rewards/margins": 4.518843650817871, "rewards/rejected": -3.07157564163208, "step": 2332 }, { "epoch": 0.61, "grad_norm": 40.52328872680664, "kl": 0.0, "learning_rate": 1.9471342580476313e-07, "logps/chosen": -143.46986389160156, "logps/rejected": -132.3037872314453, "loss": 0.4259, "rewards/chosen": -0.21857896447181702, "rewards/margins": 2.3188986778259277, "rewards/rejected": -2.537477731704712, "step": 2333 }, { "epoch": 0.61, "grad_norm": 27.894329071044922, "kl": 0.0, "learning_rate": 1.9458257000785133e-07, "logps/chosen": -209.2640380859375, "logps/rejected": -304.1653137207031, "loss": 0.1426, "rewards/chosen": 2.2198264598846436, "rewards/margins": 6.847384452819824, "rewards/rejected": -4.62755823135376, "step": 2334 }, { "epoch": 0.61, "grad_norm": 38.58939743041992, "kl": 0.0, "learning_rate": 1.9445171421093952e-07, "logps/chosen": -235.97927856445312, "logps/rejected": -215.62770080566406, "loss": 0.3428, "rewards/chosen": -0.37354040145874023, "rewards/margins": 2.582803964614868, "rewards/rejected": -2.9563443660736084, "step": 2335 }, { "epoch": 0.61, "grad_norm": 55.73698806762695, "kl": 0.0, "learning_rate": 1.9432085841402774e-07, "logps/chosen": -202.73712158203125, "logps/rejected": -323.6961669921875, "loss": 0.333, "rewards/chosen": 0.4086439907550812, "rewards/margins": 3.40960431098938, "rewards/rejected": -3.000960350036621, "step": 2336 }, { "epoch": 0.61, "grad_norm": 38.34394836425781, "kl": 0.0, "learning_rate": 1.9419000261711594e-07, "logps/chosen": -262.01995849609375, "logps/rejected": -272.8934020996094, "loss": 0.3725, "rewards/chosen": 0.051617540419101715, "rewards/margins": 2.554993152618408, "rewards/rejected": -2.50337553024292, "step": 2337 }, { "epoch": 0.61, "grad_norm": 40.03268814086914, "kl": 0.0, "learning_rate": 1.9405914682020414e-07, "logps/chosen": -312.5582275390625, "logps/rejected": -213.3430938720703, "loss": 0.25, "rewards/chosen": 2.550088405609131, "rewards/margins": 6.379768371582031, "rewards/rejected": -3.8296799659729004, "step": 2338 }, { "epoch": 0.61, "grad_norm": 30.763914108276367, "kl": 0.0, "learning_rate": 1.9392829102329233e-07, "logps/chosen": -256.4914245605469, "logps/rejected": -216.35769653320312, "loss": 0.3354, "rewards/chosen": 0.5054255127906799, "rewards/margins": 4.105841636657715, "rewards/rejected": -3.6004161834716797, "step": 2339 }, { "epoch": 0.61, "grad_norm": 21.9720458984375, "kl": 0.0, "learning_rate": 1.937974352263805e-07, "logps/chosen": -256.197509765625, "logps/rejected": -244.96725463867188, "loss": 0.3228, "rewards/chosen": 0.6852515339851379, "rewards/margins": 5.482264518737793, "rewards/rejected": -4.797012805938721, "step": 2340 }, { "epoch": 0.61, "grad_norm": 29.677480697631836, "kl": 0.0, "learning_rate": 1.936665794294687e-07, "logps/chosen": -220.20875549316406, "logps/rejected": -222.16822814941406, "loss": 0.3136, "rewards/chosen": -1.0607540607452393, "rewards/margins": 1.4132604598999023, "rewards/rejected": -2.4740145206451416, "step": 2341 }, { "epoch": 0.61, "grad_norm": 39.20149612426758, "kl": 0.0, "learning_rate": 1.9353572363255692e-07, "logps/chosen": -238.39891052246094, "logps/rejected": -147.9998779296875, "loss": 0.1857, "rewards/chosen": 2.199713945388794, "rewards/margins": 4.197799205780029, "rewards/rejected": -1.9980851411819458, "step": 2342 }, { "epoch": 0.61, "grad_norm": 42.02245330810547, "kl": 0.0, "learning_rate": 1.9340486783564512e-07, "logps/chosen": -139.3223419189453, "logps/rejected": -246.05770874023438, "loss": 0.3031, "rewards/chosen": 0.6252093315124512, "rewards/margins": 3.0687246322631836, "rewards/rejected": -2.4435153007507324, "step": 2343 }, { "epoch": 0.61, "grad_norm": 31.7598876953125, "kl": 0.0, "learning_rate": 1.932740120387333e-07, "logps/chosen": -222.23658752441406, "logps/rejected": -234.6144256591797, "loss": 0.2148, "rewards/chosen": 2.6373138427734375, "rewards/margins": 5.807960510253906, "rewards/rejected": -3.1706466674804688, "step": 2344 }, { "epoch": 0.61, "grad_norm": 42.98470687866211, "kl": 0.0, "learning_rate": 1.931431562418215e-07, "logps/chosen": -241.89788818359375, "logps/rejected": -238.154296875, "loss": 0.32, "rewards/chosen": 1.266499638557434, "rewards/margins": 4.807114601135254, "rewards/rejected": -3.5406150817871094, "step": 2345 }, { "epoch": 0.61, "grad_norm": 43.5479850769043, "kl": 0.0, "learning_rate": 1.930123004449097e-07, "logps/chosen": -292.238525390625, "logps/rejected": -243.96377563476562, "loss": 0.2991, "rewards/chosen": 0.4111224114894867, "rewards/margins": 4.473801612854004, "rewards/rejected": -4.062679290771484, "step": 2346 }, { "epoch": 0.61, "grad_norm": 49.00059127807617, "kl": 0.0, "learning_rate": 1.928814446479979e-07, "logps/chosen": -239.2394256591797, "logps/rejected": -258.6951904296875, "loss": 0.4388, "rewards/chosen": -0.20010191202163696, "rewards/margins": 1.1232011318206787, "rewards/rejected": -1.323302984237671, "step": 2347 }, { "epoch": 0.61, "grad_norm": 39.21022033691406, "kl": 0.0, "learning_rate": 1.9275058885108607e-07, "logps/chosen": -255.04443359375, "logps/rejected": -244.91729736328125, "loss": 0.2715, "rewards/chosen": -0.001400096109136939, "rewards/margins": 3.116028070449829, "rewards/rejected": -3.1174280643463135, "step": 2348 }, { "epoch": 0.61, "grad_norm": 34.833255767822266, "kl": 0.0, "learning_rate": 1.926197330541743e-07, "logps/chosen": -194.5554962158203, "logps/rejected": -260.3280334472656, "loss": 0.2837, "rewards/chosen": 0.05182616040110588, "rewards/margins": 2.837132215499878, "rewards/rejected": -2.785305976867676, "step": 2349 }, { "epoch": 0.62, "grad_norm": 31.625768661499023, "kl": 0.0, "learning_rate": 1.924888772572625e-07, "logps/chosen": -214.2799072265625, "logps/rejected": -145.29727172851562, "loss": 0.2255, "rewards/chosen": 2.1377511024475098, "rewards/margins": 4.877354621887207, "rewards/rejected": -2.7396037578582764, "step": 2350 }, { "epoch": 0.62, "grad_norm": 32.17449188232422, "kl": 0.0, "learning_rate": 1.9235802146035069e-07, "logps/chosen": -182.0697479248047, "logps/rejected": -301.1997375488281, "loss": 0.2758, "rewards/chosen": 1.7245681285858154, "rewards/margins": 5.422351837158203, "rewards/rejected": -3.6977834701538086, "step": 2351 }, { "epoch": 0.62, "grad_norm": 36.291282653808594, "kl": 0.0, "learning_rate": 1.9222716566343888e-07, "logps/chosen": -220.62863159179688, "logps/rejected": -197.6534423828125, "loss": 0.3495, "rewards/chosen": -0.28744202852249146, "rewards/margins": 2.591951608657837, "rewards/rejected": -2.8793935775756836, "step": 2352 }, { "epoch": 0.62, "grad_norm": 43.120182037353516, "kl": 0.0, "learning_rate": 1.9209630986652708e-07, "logps/chosen": -189.58387756347656, "logps/rejected": -183.0449981689453, "loss": 0.274, "rewards/chosen": 0.5014609694480896, "rewards/margins": 2.78602933883667, "rewards/rejected": -2.2845683097839355, "step": 2353 }, { "epoch": 0.62, "grad_norm": 31.61993980407715, "kl": 0.0, "learning_rate": 1.9196545406961527e-07, "logps/chosen": -245.66635131835938, "logps/rejected": -229.27239990234375, "loss": 0.3288, "rewards/chosen": -0.1258794069290161, "rewards/margins": 2.4228243827819824, "rewards/rejected": -2.548703670501709, "step": 2354 }, { "epoch": 0.62, "grad_norm": 29.71607208251953, "kl": 0.0, "learning_rate": 1.918345982727035e-07, "logps/chosen": -240.70870971679688, "logps/rejected": -260.2348937988281, "loss": 0.1747, "rewards/chosen": 2.110532522201538, "rewards/margins": 5.318500518798828, "rewards/rejected": -3.207967758178711, "step": 2355 }, { "epoch": 0.62, "grad_norm": 37.52629470825195, "kl": 0.0, "learning_rate": 1.9170374247579167e-07, "logps/chosen": -156.80447387695312, "logps/rejected": -227.19284057617188, "loss": 0.2688, "rewards/chosen": 2.4135518074035645, "rewards/margins": 4.432103157043457, "rewards/rejected": -2.0185515880584717, "step": 2356 }, { "epoch": 0.62, "grad_norm": 39.853370666503906, "kl": 0.0, "learning_rate": 1.9157288667887986e-07, "logps/chosen": -221.11697387695312, "logps/rejected": -348.988037109375, "loss": 0.3974, "rewards/chosen": 0.023872777819633484, "rewards/margins": 6.337624549865723, "rewards/rejected": -6.313751697540283, "step": 2357 }, { "epoch": 0.62, "grad_norm": 29.161619186401367, "kl": 0.0, "learning_rate": 1.9144203088196806e-07, "logps/chosen": -187.28756713867188, "logps/rejected": -215.73773193359375, "loss": 0.2739, "rewards/chosen": 0.8985105156898499, "rewards/margins": 3.8181779384613037, "rewards/rejected": -2.9196674823760986, "step": 2358 }, { "epoch": 0.62, "grad_norm": 32.86288833618164, "kl": 0.0, "learning_rate": 1.9131117508505625e-07, "logps/chosen": -171.5975341796875, "logps/rejected": -362.61248779296875, "loss": 0.2672, "rewards/chosen": 0.9393047094345093, "rewards/margins": 5.1951680183410645, "rewards/rejected": -4.255863189697266, "step": 2359 }, { "epoch": 0.62, "grad_norm": 29.304479598999023, "kl": 0.0, "learning_rate": 1.9118031928814445e-07, "logps/chosen": -253.9071044921875, "logps/rejected": -265.0699157714844, "loss": 0.2201, "rewards/chosen": 2.477958917617798, "rewards/margins": 6.703524589538574, "rewards/rejected": -4.225565433502197, "step": 2360 }, { "epoch": 0.62, "grad_norm": 34.67782974243164, "kl": 0.0, "learning_rate": 1.9104946349123267e-07, "logps/chosen": -157.28024291992188, "logps/rejected": -303.87591552734375, "loss": 0.1655, "rewards/chosen": 1.717661738395691, "rewards/margins": 5.473419666290283, "rewards/rejected": -3.7557578086853027, "step": 2361 }, { "epoch": 0.62, "grad_norm": 33.45250701904297, "kl": 0.0, "learning_rate": 1.9091860769432087e-07, "logps/chosen": -164.96530151367188, "logps/rejected": -325.362060546875, "loss": 0.2493, "rewards/chosen": -1.0385844707489014, "rewards/margins": 4.012018203735352, "rewards/rejected": -5.050602912902832, "step": 2362 }, { "epoch": 0.62, "grad_norm": 40.05767059326172, "kl": 0.0, "learning_rate": 1.9078775189740904e-07, "logps/chosen": -182.37466430664062, "logps/rejected": -162.07620239257812, "loss": 0.1577, "rewards/chosen": 1.3069759607315063, "rewards/margins": 4.243577480316162, "rewards/rejected": -2.9366016387939453, "step": 2363 }, { "epoch": 0.62, "grad_norm": 41.3237419128418, "kl": 0.0, "learning_rate": 1.9065689610049723e-07, "logps/chosen": -214.30227661132812, "logps/rejected": -252.76242065429688, "loss": 0.284, "rewards/chosen": -0.3962777853012085, "rewards/margins": 2.5071358680725098, "rewards/rejected": -2.9034135341644287, "step": 2364 }, { "epoch": 0.62, "grad_norm": 35.928016662597656, "kl": 0.0, "learning_rate": 1.9052604030358543e-07, "logps/chosen": -230.25433349609375, "logps/rejected": -209.69033813476562, "loss": 0.2953, "rewards/chosen": 0.2936524748802185, "rewards/margins": 3.8067970275878906, "rewards/rejected": -3.5131444931030273, "step": 2365 }, { "epoch": 0.62, "grad_norm": 35.94745635986328, "kl": 0.0, "learning_rate": 1.9039518450667363e-07, "logps/chosen": -222.4689178466797, "logps/rejected": -242.89163208007812, "loss": 0.3081, "rewards/chosen": -0.4748765826225281, "rewards/margins": 2.3422157764434814, "rewards/rejected": -2.8170924186706543, "step": 2366 }, { "epoch": 0.62, "grad_norm": 55.70354461669922, "kl": 0.0, "learning_rate": 1.9026432870976182e-07, "logps/chosen": -207.79037475585938, "logps/rejected": -231.1091766357422, "loss": 0.2639, "rewards/chosen": 1.4316012859344482, "rewards/margins": 4.312504768371582, "rewards/rejected": -2.880903482437134, "step": 2367 }, { "epoch": 0.62, "grad_norm": 31.606653213500977, "kl": 0.0, "learning_rate": 1.9013347291285005e-07, "logps/chosen": -212.349609375, "logps/rejected": -259.9991760253906, "loss": 0.3397, "rewards/chosen": 0.9540568590164185, "rewards/margins": 6.031365394592285, "rewards/rejected": -5.077308654785156, "step": 2368 }, { "epoch": 0.62, "grad_norm": 36.44898223876953, "kl": 0.0, "learning_rate": 1.9000261711593824e-07, "logps/chosen": -194.91915893554688, "logps/rejected": -281.0176696777344, "loss": 0.2525, "rewards/chosen": 0.3901931047439575, "rewards/margins": 4.855154037475586, "rewards/rejected": -4.464961051940918, "step": 2369 }, { "epoch": 0.62, "grad_norm": 30.430397033691406, "kl": 0.0, "learning_rate": 1.8987176131902644e-07, "logps/chosen": -239.862548828125, "logps/rejected": -260.174072265625, "loss": 0.1879, "rewards/chosen": 0.952376663684845, "rewards/margins": 5.6407036781311035, "rewards/rejected": -4.688326835632324, "step": 2370 }, { "epoch": 0.62, "grad_norm": 25.95115852355957, "kl": 0.0, "learning_rate": 1.897409055221146e-07, "logps/chosen": -217.9676513671875, "logps/rejected": -193.67478942871094, "loss": 0.2545, "rewards/chosen": 0.6721733212471008, "rewards/margins": 4.830235958099365, "rewards/rejected": -4.15806245803833, "step": 2371 }, { "epoch": 0.62, "grad_norm": 31.143552780151367, "kl": 0.0, "learning_rate": 1.896100497252028e-07, "logps/chosen": -183.70237731933594, "logps/rejected": -233.95692443847656, "loss": 0.3316, "rewards/chosen": 0.9070913195610046, "rewards/margins": 4.197821140289307, "rewards/rejected": -3.2907299995422363, "step": 2372 }, { "epoch": 0.62, "grad_norm": 28.717832565307617, "kl": 0.0, "learning_rate": 1.89479193928291e-07, "logps/chosen": -247.36865234375, "logps/rejected": -263.091552734375, "loss": 0.2654, "rewards/chosen": -0.25779542326927185, "rewards/margins": 3.495422124862671, "rewards/rejected": -3.7532174587249756, "step": 2373 }, { "epoch": 0.62, "grad_norm": 46.38393783569336, "kl": 0.0, "learning_rate": 1.8934833813137922e-07, "logps/chosen": -251.53179931640625, "logps/rejected": -275.49102783203125, "loss": 0.2089, "rewards/chosen": 3.581019878387451, "rewards/margins": 6.682999610900879, "rewards/rejected": -3.1019797325134277, "step": 2374 }, { "epoch": 0.62, "grad_norm": 32.04011535644531, "kl": 0.0, "learning_rate": 1.8921748233446742e-07, "logps/chosen": -226.80967712402344, "logps/rejected": -185.3697967529297, "loss": 0.1354, "rewards/chosen": 1.043463110923767, "rewards/margins": 4.468041896820068, "rewards/rejected": -3.424578905105591, "step": 2375 }, { "epoch": 0.62, "grad_norm": 33.520713806152344, "kl": 0.0, "learning_rate": 1.8908662653755561e-07, "logps/chosen": -224.3209686279297, "logps/rejected": -259.0567321777344, "loss": 0.2562, "rewards/chosen": 2.0344178676605225, "rewards/margins": 4.964014053344727, "rewards/rejected": -2.929595947265625, "step": 2376 }, { "epoch": 0.62, "grad_norm": 32.75332260131836, "kl": 0.0, "learning_rate": 1.889557707406438e-07, "logps/chosen": -229.57943725585938, "logps/rejected": -235.8605499267578, "loss": 0.3635, "rewards/chosen": 0.24739819765090942, "rewards/margins": 4.021427631378174, "rewards/rejected": -3.77402925491333, "step": 2377 }, { "epoch": 0.62, "grad_norm": 33.14479064941406, "kl": 0.0, "learning_rate": 1.88824914943732e-07, "logps/chosen": -258.2164306640625, "logps/rejected": -145.81407165527344, "loss": 0.3409, "rewards/chosen": -0.3117099404335022, "rewards/margins": 2.2764155864715576, "rewards/rejected": -2.588125467300415, "step": 2378 }, { "epoch": 0.62, "grad_norm": 28.57996368408203, "kl": 0.0, "learning_rate": 1.8869405914682018e-07, "logps/chosen": -163.08482360839844, "logps/rejected": -152.67636108398438, "loss": 0.2951, "rewards/chosen": 1.1140191555023193, "rewards/margins": 2.7893271446228027, "rewards/rejected": -1.6753078699111938, "step": 2379 }, { "epoch": 0.62, "grad_norm": 26.788040161132812, "kl": 0.0, "learning_rate": 1.8856320334990837e-07, "logps/chosen": -130.69322204589844, "logps/rejected": -254.1563720703125, "loss": 0.2663, "rewards/chosen": 1.4976850748062134, "rewards/margins": 4.847748756408691, "rewards/rejected": -3.3500638008117676, "step": 2380 }, { "epoch": 0.62, "grad_norm": 36.504066467285156, "kl": 0.0, "learning_rate": 1.884323475529966e-07, "logps/chosen": -198.76889038085938, "logps/rejected": -234.64707946777344, "loss": 0.401, "rewards/chosen": -0.3146289885044098, "rewards/margins": 2.4038166999816895, "rewards/rejected": -2.7184457778930664, "step": 2381 }, { "epoch": 0.62, "grad_norm": 30.19637680053711, "kl": 0.0, "learning_rate": 1.883014917560848e-07, "logps/chosen": -167.89947509765625, "logps/rejected": -297.1753234863281, "loss": 0.2258, "rewards/chosen": 2.146338701248169, "rewards/margins": 7.123197555541992, "rewards/rejected": -4.976859092712402, "step": 2382 }, { "epoch": 0.62, "grad_norm": 46.778839111328125, "kl": 0.0, "learning_rate": 1.8817063595917299e-07, "logps/chosen": -275.8388366699219, "logps/rejected": -222.3790283203125, "loss": 0.4312, "rewards/chosen": -0.8422994613647461, "rewards/margins": 1.4927501678466797, "rewards/rejected": -2.335049629211426, "step": 2383 }, { "epoch": 0.62, "grad_norm": 38.08612823486328, "kl": 0.0, "learning_rate": 1.8803978016226118e-07, "logps/chosen": -189.39913940429688, "logps/rejected": -221.18472290039062, "loss": 0.2462, "rewards/chosen": 0.9700742959976196, "rewards/margins": 3.71815824508667, "rewards/rejected": -2.74808406829834, "step": 2384 }, { "epoch": 0.62, "grad_norm": 31.00204849243164, "kl": 0.0, "learning_rate": 1.8790892436534938e-07, "logps/chosen": -201.84481811523438, "logps/rejected": -230.4281005859375, "loss": 0.2608, "rewards/chosen": 0.6287637948989868, "rewards/margins": 3.8814697265625, "rewards/rejected": -3.2527058124542236, "step": 2385 }, { "epoch": 0.62, "grad_norm": 29.59605598449707, "kl": 0.0, "learning_rate": 1.8777806856843757e-07, "logps/chosen": -182.8221435546875, "logps/rejected": -246.32264709472656, "loss": 0.2239, "rewards/chosen": 0.729421854019165, "rewards/margins": 4.723428726196289, "rewards/rejected": -3.994007110595703, "step": 2386 }, { "epoch": 0.62, "grad_norm": 38.360904693603516, "kl": 0.0, "learning_rate": 1.8764721277152577e-07, "logps/chosen": -209.9322509765625, "logps/rejected": -353.68414306640625, "loss": 0.2591, "rewards/chosen": 1.7438795566558838, "rewards/margins": 4.995150566101074, "rewards/rejected": -3.2512707710266113, "step": 2387 }, { "epoch": 0.62, "grad_norm": 36.978267669677734, "kl": 0.0, "learning_rate": 1.8751635697461397e-07, "logps/chosen": -184.61764526367188, "logps/rejected": -162.25514221191406, "loss": 0.2072, "rewards/chosen": 0.8075557351112366, "rewards/margins": 3.699028253555298, "rewards/rejected": -2.891472578048706, "step": 2388 }, { "epoch": 0.63, "grad_norm": 36.411163330078125, "kl": 0.0, "learning_rate": 1.8738550117770216e-07, "logps/chosen": -247.070068359375, "logps/rejected": -230.36900329589844, "loss": 0.2015, "rewards/chosen": -0.4802358150482178, "rewards/margins": 3.238145589828491, "rewards/rejected": -3.718381404876709, "step": 2389 }, { "epoch": 0.63, "grad_norm": 30.9532470703125, "kl": 0.0, "learning_rate": 1.8725464538079036e-07, "logps/chosen": -161.55776977539062, "logps/rejected": -247.14659118652344, "loss": 0.2947, "rewards/chosen": -0.4317225217819214, "rewards/margins": 2.7858991622924805, "rewards/rejected": -3.2176215648651123, "step": 2390 }, { "epoch": 0.63, "grad_norm": 33.81348419189453, "kl": 0.0, "learning_rate": 1.8712378958387856e-07, "logps/chosen": -246.77601623535156, "logps/rejected": -257.72235107421875, "loss": 0.2854, "rewards/chosen": 0.6340773105621338, "rewards/margins": 4.2819366455078125, "rewards/rejected": -3.647859573364258, "step": 2391 }, { "epoch": 0.63, "grad_norm": 33.270442962646484, "kl": 0.0, "learning_rate": 1.8699293378696675e-07, "logps/chosen": -202.92318725585938, "logps/rejected": -237.6372528076172, "loss": 0.3072, "rewards/chosen": 1.4428868293762207, "rewards/margins": 4.175738334655762, "rewards/rejected": -2.732851505279541, "step": 2392 }, { "epoch": 0.63, "grad_norm": 35.90946578979492, "kl": 0.0, "learning_rate": 1.8686207799005497e-07, "logps/chosen": -205.4209442138672, "logps/rejected": -257.3418273925781, "loss": 0.313, "rewards/chosen": -0.1432461142539978, "rewards/margins": 4.7647905349731445, "rewards/rejected": -4.908036708831787, "step": 2393 }, { "epoch": 0.63, "grad_norm": 33.007659912109375, "kl": 0.0, "learning_rate": 1.8673122219314314e-07, "logps/chosen": -165.77957153320312, "logps/rejected": -215.4703369140625, "loss": 0.2844, "rewards/chosen": 0.5170528888702393, "rewards/margins": 4.497333526611328, "rewards/rejected": -3.980280637741089, "step": 2394 }, { "epoch": 0.63, "grad_norm": 33.04104232788086, "kl": 0.0, "learning_rate": 1.8660036639623134e-07, "logps/chosen": -170.15045166015625, "logps/rejected": -236.4322967529297, "loss": 0.2423, "rewards/chosen": 1.5568426847457886, "rewards/margins": 3.240248680114746, "rewards/rejected": -1.683405876159668, "step": 2395 }, { "epoch": 0.63, "grad_norm": 40.87158203125, "kl": 0.0, "learning_rate": 1.8646951059931954e-07, "logps/chosen": -224.96380615234375, "logps/rejected": -285.54058837890625, "loss": 0.2544, "rewards/chosen": 1.9070959091186523, "rewards/margins": 6.817129135131836, "rewards/rejected": -4.910033226013184, "step": 2396 }, { "epoch": 0.63, "grad_norm": 39.35676956176758, "kl": 0.0, "learning_rate": 1.8633865480240773e-07, "logps/chosen": -186.65476989746094, "logps/rejected": -186.18894958496094, "loss": 0.3623, "rewards/chosen": -0.24426405131816864, "rewards/margins": 2.902918815612793, "rewards/rejected": -3.1471829414367676, "step": 2397 }, { "epoch": 0.63, "grad_norm": 34.04732131958008, "kl": 0.0, "learning_rate": 1.8620779900549593e-07, "logps/chosen": -216.03396606445312, "logps/rejected": -287.70501708984375, "loss": 0.2585, "rewards/chosen": 1.3595733642578125, "rewards/margins": 5.254693984985352, "rewards/rejected": -3.895120620727539, "step": 2398 }, { "epoch": 0.63, "grad_norm": 36.18494415283203, "kl": 0.0, "learning_rate": 1.8607694320858412e-07, "logps/chosen": -215.787841796875, "logps/rejected": -389.7153015136719, "loss": 0.1888, "rewards/chosen": 1.4879820346832275, "rewards/margins": 4.753214359283447, "rewards/rejected": -3.2652323246002197, "step": 2399 }, { "epoch": 0.63, "grad_norm": 49.94510269165039, "kl": 0.0, "learning_rate": 1.8594608741167235e-07, "logps/chosen": -261.67913818359375, "logps/rejected": -219.42575073242188, "loss": 0.2057, "rewards/chosen": 0.8924288153648376, "rewards/margins": 4.58126974105835, "rewards/rejected": -3.6888411045074463, "step": 2400 }, { "epoch": 0.63, "grad_norm": 34.9915771484375, "kl": 0.0, "learning_rate": 1.8581523161476054e-07, "logps/chosen": -157.85699462890625, "logps/rejected": -282.4374084472656, "loss": 0.2325, "rewards/chosen": 2.0740764141082764, "rewards/margins": 4.795737266540527, "rewards/rejected": -2.72166109085083, "step": 2401 }, { "epoch": 0.63, "grad_norm": 36.97850036621094, "kl": 0.0, "learning_rate": 1.856843758178487e-07, "logps/chosen": -219.75802612304688, "logps/rejected": -340.8087158203125, "loss": 0.2289, "rewards/chosen": 2.2684707641601562, "rewards/margins": 7.455198287963867, "rewards/rejected": -5.186727523803711, "step": 2402 }, { "epoch": 0.63, "grad_norm": 35.18425750732422, "kl": 0.0, "learning_rate": 1.855535200209369e-07, "logps/chosen": -159.93328857421875, "logps/rejected": -339.95556640625, "loss": 0.2479, "rewards/chosen": 1.4932546615600586, "rewards/margins": 3.469346523284912, "rewards/rejected": -1.976091980934143, "step": 2403 }, { "epoch": 0.63, "grad_norm": 38.96127700805664, "kl": 0.0, "learning_rate": 1.854226642240251e-07, "logps/chosen": -216.40310668945312, "logps/rejected": -195.5192108154297, "loss": 0.2884, "rewards/chosen": -0.16616499423980713, "rewards/margins": 3.938969135284424, "rewards/rejected": -4.105134010314941, "step": 2404 }, { "epoch": 0.63, "grad_norm": 38.0742073059082, "kl": 0.0, "learning_rate": 1.852918084271133e-07, "logps/chosen": -173.3180694580078, "logps/rejected": -259.1162414550781, "loss": 0.2673, "rewards/chosen": 1.2408561706542969, "rewards/margins": 4.919090270996094, "rewards/rejected": -3.678234338760376, "step": 2405 }, { "epoch": 0.63, "grad_norm": 30.632402420043945, "kl": 0.0, "learning_rate": 1.8516095263020152e-07, "logps/chosen": -143.20333862304688, "logps/rejected": -234.39422607421875, "loss": 0.2918, "rewards/chosen": 1.0636051893234253, "rewards/margins": 3.586050033569336, "rewards/rejected": -2.522444725036621, "step": 2406 }, { "epoch": 0.63, "grad_norm": 38.62371063232422, "kl": 0.0, "learning_rate": 1.8503009683328972e-07, "logps/chosen": -229.66151428222656, "logps/rejected": -292.41717529296875, "loss": 0.3, "rewards/chosen": 0.04666091501712799, "rewards/margins": 4.005421161651611, "rewards/rejected": -3.9587600231170654, "step": 2407 }, { "epoch": 0.63, "grad_norm": 41.32472610473633, "kl": 0.0, "learning_rate": 1.8489924103637792e-07, "logps/chosen": -249.702880859375, "logps/rejected": -257.2342529296875, "loss": 0.3374, "rewards/chosen": 1.1685850620269775, "rewards/margins": 2.980180501937866, "rewards/rejected": -1.8115954399108887, "step": 2408 }, { "epoch": 0.63, "grad_norm": 27.124591827392578, "kl": 0.0, "learning_rate": 1.847683852394661e-07, "logps/chosen": -172.0544891357422, "logps/rejected": -246.92007446289062, "loss": 0.2379, "rewards/chosen": 0.1499977856874466, "rewards/margins": 4.02953577041626, "rewards/rejected": -3.87953782081604, "step": 2409 }, { "epoch": 0.63, "grad_norm": 41.78574752807617, "kl": 0.0, "learning_rate": 1.8463752944255428e-07, "logps/chosen": -175.81947326660156, "logps/rejected": -271.74114990234375, "loss": 0.2285, "rewards/chosen": 1.0703699588775635, "rewards/margins": 4.834184646606445, "rewards/rejected": -3.7638144493103027, "step": 2410 }, { "epoch": 0.63, "grad_norm": 39.91765594482422, "kl": 0.0, "learning_rate": 1.8450667364564248e-07, "logps/chosen": -276.7518005371094, "logps/rejected": -347.8869323730469, "loss": 0.3181, "rewards/chosen": -0.08891824632883072, "rewards/margins": 5.355866432189941, "rewards/rejected": -5.444784641265869, "step": 2411 }, { "epoch": 0.63, "grad_norm": 40.989402770996094, "kl": 0.0, "learning_rate": 1.8437581784873067e-07, "logps/chosen": -221.4810791015625, "logps/rejected": -204.2392578125, "loss": 0.2914, "rewards/chosen": 1.4958677291870117, "rewards/margins": 3.875377893447876, "rewards/rejected": -2.3795101642608643, "step": 2412 }, { "epoch": 0.63, "grad_norm": 40.61765670776367, "kl": 0.0, "learning_rate": 1.842449620518189e-07, "logps/chosen": -157.64474487304688, "logps/rejected": -282.7855529785156, "loss": 0.3276, "rewards/chosen": 0.4709343910217285, "rewards/margins": 3.0344228744506836, "rewards/rejected": -2.563488483428955, "step": 2413 }, { "epoch": 0.63, "grad_norm": 32.019100189208984, "kl": 0.0, "learning_rate": 1.841141062549071e-07, "logps/chosen": -152.17665100097656, "logps/rejected": -321.3105163574219, "loss": 0.2842, "rewards/chosen": 0.6752780079841614, "rewards/margins": 5.736945629119873, "rewards/rejected": -5.061667442321777, "step": 2414 }, { "epoch": 0.63, "grad_norm": 26.227691650390625, "kl": 0.0, "learning_rate": 1.839832504579953e-07, "logps/chosen": -166.89146423339844, "logps/rejected": -273.63555908203125, "loss": 0.2115, "rewards/chosen": 0.6281822919845581, "rewards/margins": 2.977193832397461, "rewards/rejected": -2.3490116596221924, "step": 2415 }, { "epoch": 0.63, "grad_norm": 28.663330078125, "kl": 0.0, "learning_rate": 1.8385239466108348e-07, "logps/chosen": -117.30267333984375, "logps/rejected": -310.23828125, "loss": 0.2346, "rewards/chosen": 0.3416542410850525, "rewards/margins": 4.763482570648193, "rewards/rejected": -4.421828269958496, "step": 2416 }, { "epoch": 0.63, "grad_norm": 28.140478134155273, "kl": 0.0, "learning_rate": 1.8372153886417168e-07, "logps/chosen": -248.0314178466797, "logps/rejected": -195.76657104492188, "loss": 0.2903, "rewards/chosen": 1.8965684175491333, "rewards/margins": 5.802505970001221, "rewards/rejected": -3.905937671661377, "step": 2417 }, { "epoch": 0.63, "grad_norm": 40.49986267089844, "kl": 0.0, "learning_rate": 1.8359068306725985e-07, "logps/chosen": -138.3631134033203, "logps/rejected": -231.3643341064453, "loss": 0.3318, "rewards/chosen": -0.4930739998817444, "rewards/margins": 2.0822386741638184, "rewards/rejected": -2.575312614440918, "step": 2418 }, { "epoch": 0.63, "grad_norm": 28.37915802001953, "kl": 0.0, "learning_rate": 1.8345982727034807e-07, "logps/chosen": -122.53243255615234, "logps/rejected": -233.7292938232422, "loss": 0.2877, "rewards/chosen": -0.30030661821365356, "rewards/margins": 4.429220676422119, "rewards/rejected": -4.729527473449707, "step": 2419 }, { "epoch": 0.63, "grad_norm": 27.743270874023438, "kl": 0.0, "learning_rate": 1.8332897147343627e-07, "logps/chosen": -193.67543029785156, "logps/rejected": -227.80419921875, "loss": 0.2714, "rewards/chosen": 0.15854603052139282, "rewards/margins": 3.921008348464966, "rewards/rejected": -3.7624623775482178, "step": 2420 }, { "epoch": 0.63, "grad_norm": 48.460201263427734, "kl": 0.0, "learning_rate": 1.8319811567652446e-07, "logps/chosen": -185.3589324951172, "logps/rejected": -280.7593078613281, "loss": 0.3378, "rewards/chosen": 0.38898900151252747, "rewards/margins": 3.3332223892211914, "rewards/rejected": -2.9442334175109863, "step": 2421 }, { "epoch": 0.63, "grad_norm": 39.76223373413086, "kl": 0.0, "learning_rate": 1.8306725987961266e-07, "logps/chosen": -206.49517822265625, "logps/rejected": -231.34706115722656, "loss": 0.1478, "rewards/chosen": 2.267397880554199, "rewards/margins": 6.553323745727539, "rewards/rejected": -4.28592586517334, "step": 2422 }, { "epoch": 0.63, "grad_norm": 35.084251403808594, "kl": 0.0, "learning_rate": 1.8293640408270086e-07, "logps/chosen": -233.82870483398438, "logps/rejected": -215.7780303955078, "loss": 0.2572, "rewards/chosen": 0.8703588247299194, "rewards/margins": 4.745369911193848, "rewards/rejected": -3.8750109672546387, "step": 2423 }, { "epoch": 0.63, "grad_norm": 26.757802963256836, "kl": 0.0, "learning_rate": 1.8280554828578905e-07, "logps/chosen": -208.91114807128906, "logps/rejected": -294.6512451171875, "loss": 0.3139, "rewards/chosen": 0.3779858350753784, "rewards/margins": 3.4969449043273926, "rewards/rejected": -3.1189589500427246, "step": 2424 }, { "epoch": 0.63, "grad_norm": 35.949974060058594, "kl": 0.0, "learning_rate": 1.8267469248887722e-07, "logps/chosen": -210.23870849609375, "logps/rejected": -213.9252471923828, "loss": 0.3204, "rewards/chosen": -0.17984256148338318, "rewards/margins": 2.2258670330047607, "rewards/rejected": -2.4057095050811768, "step": 2425 }, { "epoch": 0.63, "grad_norm": 26.635404586791992, "kl": 0.0, "learning_rate": 1.8254383669196544e-07, "logps/chosen": -216.50100708007812, "logps/rejected": -290.6695556640625, "loss": 0.2065, "rewards/chosen": 2.7978515625, "rewards/margins": 6.623014450073242, "rewards/rejected": -3.825162887573242, "step": 2426 }, { "epoch": 0.64, "grad_norm": 37.63100051879883, "kl": 0.0, "learning_rate": 1.8241298089505364e-07, "logps/chosen": -206.00582885742188, "logps/rejected": -234.72286987304688, "loss": 0.3039, "rewards/chosen": 0.5800890922546387, "rewards/margins": 2.977942705154419, "rewards/rejected": -2.3978536128997803, "step": 2427 }, { "epoch": 0.64, "grad_norm": 42.57049560546875, "kl": 0.0, "learning_rate": 1.8228212509814184e-07, "logps/chosen": -288.19244384765625, "logps/rejected": -245.7040252685547, "loss": 0.4022, "rewards/chosen": -0.9926659464836121, "rewards/margins": 1.974844217300415, "rewards/rejected": -2.967510223388672, "step": 2428 }, { "epoch": 0.64, "grad_norm": 29.48101234436035, "kl": 0.0, "learning_rate": 1.8215126930123003e-07, "logps/chosen": -152.51112365722656, "logps/rejected": -317.3865966796875, "loss": 0.2226, "rewards/chosen": 0.774302065372467, "rewards/margins": 4.343541622161865, "rewards/rejected": -3.569239616394043, "step": 2429 }, { "epoch": 0.64, "grad_norm": 32.72949981689453, "kl": 0.0, "learning_rate": 1.8202041350431823e-07, "logps/chosen": -181.32928466796875, "logps/rejected": -258.7019958496094, "loss": 0.2056, "rewards/chosen": 0.38289085030555725, "rewards/margins": 5.136065483093262, "rewards/rejected": -4.753174781799316, "step": 2430 }, { "epoch": 0.64, "grad_norm": 38.77192687988281, "kl": 0.0, "learning_rate": 1.8188955770740643e-07, "logps/chosen": -197.99661254882812, "logps/rejected": -297.813232421875, "loss": 0.2575, "rewards/chosen": 0.8634415864944458, "rewards/margins": 3.4837145805358887, "rewards/rejected": -2.6202731132507324, "step": 2431 }, { "epoch": 0.64, "grad_norm": 37.21419906616211, "kl": 0.0, "learning_rate": 1.8175870191049465e-07, "logps/chosen": -219.07395935058594, "logps/rejected": -278.3205871582031, "loss": 0.1869, "rewards/chosen": 0.7768957018852234, "rewards/margins": 5.239342212677002, "rewards/rejected": -4.462446689605713, "step": 2432 }, { "epoch": 0.64, "grad_norm": 34.17312240600586, "kl": 0.0, "learning_rate": 1.8162784611358282e-07, "logps/chosen": -174.0838623046875, "logps/rejected": -198.52992248535156, "loss": 0.3237, "rewards/chosen": 0.5632308721542358, "rewards/margins": 4.041640758514404, "rewards/rejected": -3.478409767150879, "step": 2433 }, { "epoch": 0.64, "grad_norm": 34.17102813720703, "kl": 0.0, "learning_rate": 1.8149699031667101e-07, "logps/chosen": -182.22317504882812, "logps/rejected": -242.95675659179688, "loss": 0.3105, "rewards/chosen": 0.08409518748521805, "rewards/margins": 3.777181625366211, "rewards/rejected": -3.6930863857269287, "step": 2434 }, { "epoch": 0.64, "grad_norm": 37.4450798034668, "kl": 0.0, "learning_rate": 1.813661345197592e-07, "logps/chosen": -234.27056884765625, "logps/rejected": -240.46636962890625, "loss": 0.1962, "rewards/chosen": 1.776934027671814, "rewards/margins": 5.846645355224609, "rewards/rejected": -4.069711208343506, "step": 2435 }, { "epoch": 0.64, "grad_norm": 30.845748901367188, "kl": 0.0, "learning_rate": 1.812352787228474e-07, "logps/chosen": -252.45223999023438, "logps/rejected": -358.770751953125, "loss": 0.1712, "rewards/chosen": -0.10393345355987549, "rewards/margins": 5.5656962394714355, "rewards/rejected": -5.6696295738220215, "step": 2436 }, { "epoch": 0.64, "grad_norm": 41.58126449584961, "kl": 0.0, "learning_rate": 1.811044229259356e-07, "logps/chosen": -227.92178344726562, "logps/rejected": -236.83309936523438, "loss": 0.3241, "rewards/chosen": 1.2366797924041748, "rewards/margins": 3.234030246734619, "rewards/rejected": -1.9973505735397339, "step": 2437 }, { "epoch": 0.64, "grad_norm": 34.29671859741211, "kl": 0.0, "learning_rate": 1.8097356712902382e-07, "logps/chosen": -194.61155700683594, "logps/rejected": -224.98800659179688, "loss": 0.283, "rewards/chosen": 0.47941941022872925, "rewards/margins": 2.887995958328247, "rewards/rejected": -2.408576488494873, "step": 2438 }, { "epoch": 0.64, "grad_norm": 28.590755462646484, "kl": 0.0, "learning_rate": 1.8084271133211202e-07, "logps/chosen": -145.79449462890625, "logps/rejected": -300.8880615234375, "loss": 0.1904, "rewards/chosen": 0.6898126006126404, "rewards/margins": 5.064650058746338, "rewards/rejected": -4.374837398529053, "step": 2439 }, { "epoch": 0.64, "grad_norm": 21.487871170043945, "kl": 0.0, "learning_rate": 1.8071185553520022e-07, "logps/chosen": -158.74505615234375, "logps/rejected": -275.7796630859375, "loss": 0.2573, "rewards/chosen": 1.552902102470398, "rewards/margins": 5.477572917938232, "rewards/rejected": -3.924670696258545, "step": 2440 }, { "epoch": 0.64, "grad_norm": 28.447002410888672, "kl": 0.0, "learning_rate": 1.8058099973828839e-07, "logps/chosen": -211.07456970214844, "logps/rejected": -278.2138671875, "loss": 0.258, "rewards/chosen": -0.24363432824611664, "rewards/margins": 3.8275575637817383, "rewards/rejected": -4.071191787719727, "step": 2441 }, { "epoch": 0.64, "grad_norm": 34.62162780761719, "kl": 0.0, "learning_rate": 1.8045014394137658e-07, "logps/chosen": -261.4964294433594, "logps/rejected": -281.09320068359375, "loss": 0.3759, "rewards/chosen": 0.2970547080039978, "rewards/margins": 2.0743181705474854, "rewards/rejected": -1.7772635221481323, "step": 2442 }, { "epoch": 0.64, "grad_norm": 37.94488525390625, "kl": 0.0, "learning_rate": 1.8031928814446478e-07, "logps/chosen": -223.45570373535156, "logps/rejected": -238.4624481201172, "loss": 0.2795, "rewards/chosen": 0.44729822874069214, "rewards/margins": 4.174413204193115, "rewards/rejected": -3.7271151542663574, "step": 2443 }, { "epoch": 0.64, "grad_norm": 42.214988708496094, "kl": 0.0, "learning_rate": 1.8018843234755297e-07, "logps/chosen": -194.6533203125, "logps/rejected": -261.42962646484375, "loss": 0.2167, "rewards/chosen": 0.04782336950302124, "rewards/margins": 3.3527185916900635, "rewards/rejected": -3.3048951625823975, "step": 2444 }, { "epoch": 0.64, "grad_norm": 34.41872024536133, "kl": 0.0, "learning_rate": 1.800575765506412e-07, "logps/chosen": -208.5053253173828, "logps/rejected": -261.6292724609375, "loss": 0.2264, "rewards/chosen": 1.414298176765442, "rewards/margins": 6.264079570770264, "rewards/rejected": -4.849781513214111, "step": 2445 }, { "epoch": 0.64, "grad_norm": 55.12093734741211, "kl": 0.0, "learning_rate": 1.799267207537294e-07, "logps/chosen": -213.4249725341797, "logps/rejected": -304.13330078125, "loss": 0.3324, "rewards/chosen": 0.5824081301689148, "rewards/margins": 4.4119977951049805, "rewards/rejected": -3.82958984375, "step": 2446 }, { "epoch": 0.64, "grad_norm": 42.6986198425293, "kl": 0.0, "learning_rate": 1.797958649568176e-07, "logps/chosen": -176.6316680908203, "logps/rejected": -293.6900634765625, "loss": 0.3693, "rewards/chosen": -0.32846707105636597, "rewards/margins": 4.005442142486572, "rewards/rejected": -4.333909034729004, "step": 2447 }, { "epoch": 0.64, "grad_norm": 40.18217468261719, "kl": 0.0, "learning_rate": 1.7966500915990579e-07, "logps/chosen": -183.23350524902344, "logps/rejected": -217.79002380371094, "loss": 0.2887, "rewards/chosen": 1.0397711992263794, "rewards/margins": 2.5541157722473145, "rewards/rejected": -1.5143444538116455, "step": 2448 }, { "epoch": 0.64, "grad_norm": 33.37748336791992, "kl": 0.0, "learning_rate": 1.7953415336299396e-07, "logps/chosen": -202.826416015625, "logps/rejected": -217.74087524414062, "loss": 0.2367, "rewards/chosen": 0.40020081400871277, "rewards/margins": 4.570520401000977, "rewards/rejected": -4.170319557189941, "step": 2449 }, { "epoch": 0.64, "grad_norm": 29.754947662353516, "kl": 0.0, "learning_rate": 1.7940329756608215e-07, "logps/chosen": -222.77371215820312, "logps/rejected": -223.1396026611328, "loss": 0.2676, "rewards/chosen": 1.1596344709396362, "rewards/margins": 4.159872055053711, "rewards/rejected": -3.000237464904785, "step": 2450 }, { "epoch": 0.64, "grad_norm": 39.329864501953125, "kl": 0.0, "learning_rate": 1.7927244176917037e-07, "logps/chosen": -255.2774658203125, "logps/rejected": -217.47598266601562, "loss": 0.2728, "rewards/chosen": 0.08437132835388184, "rewards/margins": 3.42891001701355, "rewards/rejected": -3.344538688659668, "step": 2451 }, { "epoch": 0.64, "grad_norm": 35.733943939208984, "kl": 0.0, "learning_rate": 1.7914158597225857e-07, "logps/chosen": -193.04127502441406, "logps/rejected": -307.47894287109375, "loss": 0.2573, "rewards/chosen": 1.3631714582443237, "rewards/margins": 4.002033710479736, "rewards/rejected": -2.638862371444702, "step": 2452 }, { "epoch": 0.64, "grad_norm": 28.73012924194336, "kl": 0.0, "learning_rate": 1.7901073017534677e-07, "logps/chosen": -175.82322692871094, "logps/rejected": -294.29327392578125, "loss": 0.2468, "rewards/chosen": 1.1468980312347412, "rewards/margins": 4.692342281341553, "rewards/rejected": -3.5454442501068115, "step": 2453 }, { "epoch": 0.64, "grad_norm": 35.15346145629883, "kl": 0.0, "learning_rate": 1.7887987437843496e-07, "logps/chosen": -269.71636962890625, "logps/rejected": -274.54364013671875, "loss": 0.4064, "rewards/chosen": 0.32359778881073, "rewards/margins": 2.4833879470825195, "rewards/rejected": -2.1597900390625, "step": 2454 }, { "epoch": 0.64, "grad_norm": 35.98922348022461, "kl": 0.0, "learning_rate": 1.7874901858152316e-07, "logps/chosen": -180.52163696289062, "logps/rejected": -253.1143798828125, "loss": 0.298, "rewards/chosen": 1.0234401226043701, "rewards/margins": 4.086539268493652, "rewards/rejected": -3.063098907470703, "step": 2455 }, { "epoch": 0.64, "grad_norm": 28.62803077697754, "kl": 0.0, "learning_rate": 1.7861816278461133e-07, "logps/chosen": -206.5957489013672, "logps/rejected": -294.3093566894531, "loss": 0.2218, "rewards/chosen": 0.06527487188577652, "rewards/margins": 4.730937480926514, "rewards/rejected": -4.66566276550293, "step": 2456 }, { "epoch": 0.64, "grad_norm": 48.36724853515625, "kl": 0.0, "learning_rate": 1.7848730698769955e-07, "logps/chosen": -129.61965942382812, "logps/rejected": -219.6337890625, "loss": 0.2765, "rewards/chosen": 0.6784979104995728, "rewards/margins": 3.8180155754089355, "rewards/rejected": -3.1395177841186523, "step": 2457 }, { "epoch": 0.64, "grad_norm": 31.1170654296875, "kl": 0.0, "learning_rate": 1.7835645119078775e-07, "logps/chosen": -196.87185668945312, "logps/rejected": -322.5999450683594, "loss": 0.2401, "rewards/chosen": 1.9326436519622803, "rewards/margins": 6.90676212310791, "rewards/rejected": -4.974118709564209, "step": 2458 }, { "epoch": 0.64, "grad_norm": 24.22307586669922, "kl": 0.0, "learning_rate": 1.7822559539387594e-07, "logps/chosen": -134.24618530273438, "logps/rejected": -140.83575439453125, "loss": 0.1632, "rewards/chosen": 2.179842948913574, "rewards/margins": 5.916825294494629, "rewards/rejected": -3.7369823455810547, "step": 2459 }, { "epoch": 0.64, "grad_norm": 38.252418518066406, "kl": 0.0, "learning_rate": 1.7809473959696414e-07, "logps/chosen": -129.58192443847656, "logps/rejected": -221.4130859375, "loss": 0.2781, "rewards/chosen": 0.37688159942626953, "rewards/margins": 2.2952418327331543, "rewards/rejected": -1.9183603525161743, "step": 2460 }, { "epoch": 0.64, "grad_norm": 32.61762237548828, "kl": 0.0, "learning_rate": 1.7796388380005233e-07, "logps/chosen": -251.0649871826172, "logps/rejected": -177.43023681640625, "loss": 0.2294, "rewards/chosen": 0.15406060218811035, "rewards/margins": 4.70060920715332, "rewards/rejected": -4.546548843383789, "step": 2461 }, { "epoch": 0.64, "grad_norm": 32.96717071533203, "kl": 0.0, "learning_rate": 1.7783302800314053e-07, "logps/chosen": -162.8365478515625, "logps/rejected": -236.65098571777344, "loss": 0.2856, "rewards/chosen": 1.1021099090576172, "rewards/margins": 4.090110778808594, "rewards/rejected": -2.9880011081695557, "step": 2462 }, { "epoch": 0.64, "grad_norm": 27.149503707885742, "kl": 0.0, "learning_rate": 1.7770217220622873e-07, "logps/chosen": -171.068359375, "logps/rejected": -306.48956298828125, "loss": 0.2813, "rewards/chosen": 1.013263463973999, "rewards/margins": 4.866916656494141, "rewards/rejected": -3.8536529541015625, "step": 2463 }, { "epoch": 0.64, "grad_norm": 44.923675537109375, "kl": 0.0, "learning_rate": 1.7757131640931692e-07, "logps/chosen": -206.68359375, "logps/rejected": -244.8902587890625, "loss": 0.2559, "rewards/chosen": 0.11413121223449707, "rewards/margins": 4.262955665588379, "rewards/rejected": -4.148824214935303, "step": 2464 }, { "epoch": 0.65, "grad_norm": 41.188018798828125, "kl": 0.0, "learning_rate": 1.7744046061240512e-07, "logps/chosen": -297.6365051269531, "logps/rejected": -292.2042236328125, "loss": 0.2326, "rewards/chosen": 1.7737928628921509, "rewards/margins": 5.583432197570801, "rewards/rejected": -3.8096394538879395, "step": 2465 }, { "epoch": 0.65, "grad_norm": 26.103349685668945, "kl": 0.0, "learning_rate": 1.7730960481549332e-07, "logps/chosen": -168.4176483154297, "logps/rejected": -240.8828125, "loss": 0.144, "rewards/chosen": 2.443394184112549, "rewards/margins": 5.812577247619629, "rewards/rejected": -3.369182825088501, "step": 2466 }, { "epoch": 0.65, "grad_norm": 33.39318084716797, "kl": 0.0, "learning_rate": 1.771787490185815e-07, "logps/chosen": -160.09738159179688, "logps/rejected": -280.89251708984375, "loss": 0.2235, "rewards/chosen": 1.8487533330917358, "rewards/margins": 4.299698829650879, "rewards/rejected": -2.4509453773498535, "step": 2467 }, { "epoch": 0.65, "grad_norm": 23.734865188598633, "kl": 0.0, "learning_rate": 1.770478932216697e-07, "logps/chosen": -162.56219482421875, "logps/rejected": -170.80076599121094, "loss": 0.1597, "rewards/chosen": -0.08058365434408188, "rewards/margins": 4.102053642272949, "rewards/rejected": -4.1826372146606445, "step": 2468 }, { "epoch": 0.65, "grad_norm": 33.352291107177734, "kl": 0.0, "learning_rate": 1.769170374247579e-07, "logps/chosen": -230.69261169433594, "logps/rejected": -318.5282287597656, "loss": 0.3542, "rewards/chosen": -0.887917160987854, "rewards/margins": 4.116541862487793, "rewards/rejected": -5.004458904266357, "step": 2469 }, { "epoch": 0.65, "grad_norm": 28.36016273498535, "kl": 0.0, "learning_rate": 1.7678618162784613e-07, "logps/chosen": -192.17990112304688, "logps/rejected": -252.4993133544922, "loss": 0.132, "rewards/chosen": 2.809781789779663, "rewards/margins": 7.519709587097168, "rewards/rejected": -4.709927558898926, "step": 2470 }, { "epoch": 0.65, "grad_norm": 34.00200271606445, "kl": 0.0, "learning_rate": 1.7665532583093432e-07, "logps/chosen": -243.95375061035156, "logps/rejected": -255.2686309814453, "loss": 0.2298, "rewards/chosen": 2.4145703315734863, "rewards/margins": 6.422689914703369, "rewards/rejected": -4.008119583129883, "step": 2471 }, { "epoch": 0.65, "grad_norm": 35.79818344116211, "kl": 0.0, "learning_rate": 1.765244700340225e-07, "logps/chosen": -201.85919189453125, "logps/rejected": -209.87356567382812, "loss": 0.2992, "rewards/chosen": 1.0443308353424072, "rewards/margins": 4.464914798736572, "rewards/rejected": -3.420583963394165, "step": 2472 }, { "epoch": 0.65, "grad_norm": 28.738393783569336, "kl": 0.0, "learning_rate": 1.763936142371107e-07, "logps/chosen": -298.4902648925781, "logps/rejected": -262.4606628417969, "loss": 0.1664, "rewards/chosen": 0.7948918342590332, "rewards/margins": 4.708080291748047, "rewards/rejected": -3.9131884574890137, "step": 2473 }, { "epoch": 0.65, "grad_norm": 32.60791015625, "kl": 0.0, "learning_rate": 1.7626275844019888e-07, "logps/chosen": -233.0884552001953, "logps/rejected": -306.63031005859375, "loss": 0.2184, "rewards/chosen": 3.0960965156555176, "rewards/margins": 7.467236518859863, "rewards/rejected": -4.371140003204346, "step": 2474 }, { "epoch": 0.65, "grad_norm": 28.54564094543457, "kl": 0.0, "learning_rate": 1.7613190264328708e-07, "logps/chosen": -171.89976501464844, "logps/rejected": -187.40951538085938, "loss": 0.2721, "rewards/chosen": -0.18746140599250793, "rewards/margins": 3.1394734382629395, "rewards/rejected": -3.326934814453125, "step": 2475 }, { "epoch": 0.65, "grad_norm": 34.77114486694336, "kl": 0.0, "learning_rate": 1.760010468463753e-07, "logps/chosen": -176.38687133789062, "logps/rejected": -217.43106079101562, "loss": 0.2527, "rewards/chosen": 1.2474067211151123, "rewards/margins": 4.102912902832031, "rewards/rejected": -2.855506420135498, "step": 2476 }, { "epoch": 0.65, "grad_norm": 33.361473083496094, "kl": 0.0, "learning_rate": 1.758701910494635e-07, "logps/chosen": -143.1798858642578, "logps/rejected": -371.7455749511719, "loss": 0.2283, "rewards/chosen": -0.4656195044517517, "rewards/margins": 5.626425743103027, "rewards/rejected": -6.092045307159424, "step": 2477 }, { "epoch": 0.65, "grad_norm": 29.22950553894043, "kl": 0.0, "learning_rate": 1.757393352525517e-07, "logps/chosen": -229.2440948486328, "logps/rejected": -299.21514892578125, "loss": 0.2769, "rewards/chosen": 0.769917905330658, "rewards/margins": 2.953812599182129, "rewards/rejected": -2.183894634246826, "step": 2478 }, { "epoch": 0.65, "grad_norm": 44.46550750732422, "kl": 0.0, "learning_rate": 1.756084794556399e-07, "logps/chosen": -235.3984375, "logps/rejected": -244.00491333007812, "loss": 0.2266, "rewards/chosen": 0.822199285030365, "rewards/margins": 3.435243606567383, "rewards/rejected": -2.613044261932373, "step": 2479 }, { "epoch": 0.65, "grad_norm": 30.26697540283203, "kl": 0.0, "learning_rate": 1.7547762365872806e-07, "logps/chosen": -173.37026977539062, "logps/rejected": -240.79135131835938, "loss": 0.235, "rewards/chosen": 1.0367459058761597, "rewards/margins": 4.59039831161499, "rewards/rejected": -3.55365252494812, "step": 2480 }, { "epoch": 0.65, "grad_norm": 36.04526138305664, "kl": 0.0, "learning_rate": 1.7534676786181626e-07, "logps/chosen": -201.38333129882812, "logps/rejected": -310.193359375, "loss": 0.304, "rewards/chosen": -0.028808683156967163, "rewards/margins": 3.178330898284912, "rewards/rejected": -3.207139492034912, "step": 2481 }, { "epoch": 0.65, "grad_norm": 34.90766906738281, "kl": 0.0, "learning_rate": 1.7521591206490445e-07, "logps/chosen": -199.36154174804688, "logps/rejected": -197.4955596923828, "loss": 0.22, "rewards/chosen": 0.07011854648590088, "rewards/margins": 3.276404857635498, "rewards/rejected": -3.2062861919403076, "step": 2482 }, { "epoch": 0.65, "grad_norm": 35.02695846557617, "kl": 0.0, "learning_rate": 1.7508505626799268e-07, "logps/chosen": -215.74803161621094, "logps/rejected": -266.92626953125, "loss": 0.3153, "rewards/chosen": 0.9781593084335327, "rewards/margins": 5.768883228302002, "rewards/rejected": -4.79072380065918, "step": 2483 }, { "epoch": 0.65, "grad_norm": 34.68666076660156, "kl": 0.0, "learning_rate": 1.7495420047108087e-07, "logps/chosen": -166.11175537109375, "logps/rejected": -273.3475646972656, "loss": 0.2436, "rewards/chosen": 1.2479914426803589, "rewards/margins": 5.1875176429748535, "rewards/rejected": -3.939526081085205, "step": 2484 }, { "epoch": 0.65, "grad_norm": 27.92112159729004, "kl": 0.0, "learning_rate": 1.7482334467416907e-07, "logps/chosen": -256.979248046875, "logps/rejected": -195.372802734375, "loss": 0.2592, "rewards/chosen": 1.7819782495498657, "rewards/margins": 4.243241786956787, "rewards/rejected": -2.461263418197632, "step": 2485 }, { "epoch": 0.65, "grad_norm": 38.856407165527344, "kl": 0.0, "learning_rate": 1.7469248887725726e-07, "logps/chosen": -220.26260375976562, "logps/rejected": -132.40969848632812, "loss": 0.2858, "rewards/chosen": 1.6704671382904053, "rewards/margins": 3.005314588546753, "rewards/rejected": -1.3348474502563477, "step": 2486 }, { "epoch": 0.65, "grad_norm": 33.75629425048828, "kl": 0.0, "learning_rate": 1.7456163308034543e-07, "logps/chosen": -224.50924682617188, "logps/rejected": -201.2100830078125, "loss": 0.2443, "rewards/chosen": 0.600041925907135, "rewards/margins": 2.671865463256836, "rewards/rejected": -2.0718235969543457, "step": 2487 }, { "epoch": 0.65, "grad_norm": 32.12745666503906, "kl": 0.0, "learning_rate": 1.7443077728343363e-07, "logps/chosen": -196.9830780029297, "logps/rejected": -138.11795043945312, "loss": 0.2377, "rewards/chosen": 1.765749216079712, "rewards/margins": 4.971625328063965, "rewards/rejected": -3.205876350402832, "step": 2488 }, { "epoch": 0.65, "grad_norm": 36.79436111450195, "kl": 0.0, "learning_rate": 1.7429992148652185e-07, "logps/chosen": -257.759765625, "logps/rejected": -156.55078125, "loss": 0.2281, "rewards/chosen": 1.8479835987091064, "rewards/margins": 5.976274490356445, "rewards/rejected": -4.128291130065918, "step": 2489 }, { "epoch": 0.65, "grad_norm": 38.34791564941406, "kl": 0.0, "learning_rate": 1.7416906568961005e-07, "logps/chosen": -242.24095153808594, "logps/rejected": -281.4273681640625, "loss": 0.3033, "rewards/chosen": -1.0780397653579712, "rewards/margins": 3.3613080978393555, "rewards/rejected": -4.439347743988037, "step": 2490 }, { "epoch": 0.65, "grad_norm": 34.285850524902344, "kl": 0.0, "learning_rate": 1.7403820989269824e-07, "logps/chosen": -167.8081512451172, "logps/rejected": -238.1160430908203, "loss": 0.2076, "rewards/chosen": 2.113269805908203, "rewards/margins": 5.478918075561523, "rewards/rejected": -3.3656485080718994, "step": 2491 }, { "epoch": 0.65, "grad_norm": 36.4212532043457, "kl": 0.0, "learning_rate": 1.7390735409578644e-07, "logps/chosen": -284.6696472167969, "logps/rejected": -324.4261779785156, "loss": 0.2279, "rewards/chosen": -0.020696640014648438, "rewards/margins": 3.7279841899871826, "rewards/rejected": -3.748680830001831, "step": 2492 }, { "epoch": 0.65, "grad_norm": 32.45363998413086, "kl": 0.0, "learning_rate": 1.7377649829887464e-07, "logps/chosen": -295.4963073730469, "logps/rejected": -249.78189086914062, "loss": 0.2526, "rewards/chosen": 2.3719584941864014, "rewards/margins": 4.654071807861328, "rewards/rejected": -2.282113552093506, "step": 2493 }, { "epoch": 0.65, "grad_norm": 29.20089340209961, "kl": 0.0, "learning_rate": 1.7364564250196283e-07, "logps/chosen": -192.12564086914062, "logps/rejected": -224.99407958984375, "loss": 0.2861, "rewards/chosen": 1.1605472564697266, "rewards/margins": 4.133330821990967, "rewards/rejected": -2.9727835655212402, "step": 2494 }, { "epoch": 0.65, "grad_norm": 39.42447280883789, "kl": 0.0, "learning_rate": 1.73514786705051e-07, "logps/chosen": -224.35662841796875, "logps/rejected": -237.2237548828125, "loss": 0.2572, "rewards/chosen": 1.054343581199646, "rewards/margins": 3.775313377380371, "rewards/rejected": -2.7209696769714355, "step": 2495 }, { "epoch": 0.65, "grad_norm": 39.16334915161133, "kl": 0.0, "learning_rate": 1.7338393090813922e-07, "logps/chosen": -214.0321807861328, "logps/rejected": -207.50270080566406, "loss": 0.232, "rewards/chosen": 1.2089825868606567, "rewards/margins": 4.443968296051025, "rewards/rejected": -3.234985828399658, "step": 2496 }, { "epoch": 0.65, "grad_norm": 25.793285369873047, "kl": 0.0, "learning_rate": 1.7325307511122742e-07, "logps/chosen": -179.11578369140625, "logps/rejected": -253.93060302734375, "loss": 0.2831, "rewards/chosen": 1.198627233505249, "rewards/margins": 5.378981590270996, "rewards/rejected": -4.180354118347168, "step": 2497 }, { "epoch": 0.65, "grad_norm": 35.483436584472656, "kl": 0.0, "learning_rate": 1.7312221931431562e-07, "logps/chosen": -181.34971618652344, "logps/rejected": -267.29290771484375, "loss": 0.3095, "rewards/chosen": 0.8412249088287354, "rewards/margins": 5.656091690063477, "rewards/rejected": -4.814866542816162, "step": 2498 }, { "epoch": 0.65, "grad_norm": 26.45248031616211, "kl": 0.0, "learning_rate": 1.729913635174038e-07, "logps/chosen": -268.8468017578125, "logps/rejected": -223.29476928710938, "loss": 0.2184, "rewards/chosen": 1.8355284929275513, "rewards/margins": 8.121630668640137, "rewards/rejected": -6.286102294921875, "step": 2499 }, { "epoch": 0.65, "grad_norm": 27.72164535522461, "kl": 0.0, "learning_rate": 1.72860507720492e-07, "logps/chosen": -250.75514221191406, "logps/rejected": -251.33322143554688, "loss": 0.1799, "rewards/chosen": 2.802277088165283, "rewards/margins": 7.107457637786865, "rewards/rejected": -4.305180549621582, "step": 2500 }, { "epoch": 0.65, "grad_norm": 33.076168060302734, "kl": 0.0, "learning_rate": 1.727296519235802e-07, "logps/chosen": -170.14682006835938, "logps/rejected": -306.0361328125, "loss": 0.1722, "rewards/chosen": 1.6673815250396729, "rewards/margins": 5.51771879196167, "rewards/rejected": -3.850337266921997, "step": 2501 }, { "epoch": 0.65, "grad_norm": 34.463134765625, "kl": 0.0, "learning_rate": 1.7259879612666843e-07, "logps/chosen": -258.2651672363281, "logps/rejected": -253.67453002929688, "loss": 0.2684, "rewards/chosen": 1.771608591079712, "rewards/margins": 5.85136604309082, "rewards/rejected": -4.0797576904296875, "step": 2502 }, { "epoch": 0.66, "grad_norm": 41.39519119262695, "kl": 0.0, "learning_rate": 1.724679403297566e-07, "logps/chosen": -150.8826904296875, "logps/rejected": -281.825439453125, "loss": 0.3189, "rewards/chosen": 0.4526558518409729, "rewards/margins": 2.457512855529785, "rewards/rejected": -2.004857063293457, "step": 2503 }, { "epoch": 0.66, "grad_norm": 33.52042770385742, "kl": 0.0, "learning_rate": 1.723370845328448e-07, "logps/chosen": -270.5859680175781, "logps/rejected": -232.13134765625, "loss": 0.225, "rewards/chosen": 1.8501927852630615, "rewards/margins": 5.286325454711914, "rewards/rejected": -3.4361324310302734, "step": 2504 }, { "epoch": 0.66, "grad_norm": 26.857891082763672, "kl": 0.0, "learning_rate": 1.72206228735933e-07, "logps/chosen": -235.99998474121094, "logps/rejected": -252.3543701171875, "loss": 0.2047, "rewards/chosen": 1.4618678092956543, "rewards/margins": 5.876453876495361, "rewards/rejected": -4.414586067199707, "step": 2505 }, { "epoch": 0.66, "grad_norm": 29.943302154541016, "kl": 0.0, "learning_rate": 1.7207537293902119e-07, "logps/chosen": -158.45936584472656, "logps/rejected": -222.40798950195312, "loss": 0.2241, "rewards/chosen": 1.1429415941238403, "rewards/margins": 5.468343734741211, "rewards/rejected": -4.32540225982666, "step": 2506 }, { "epoch": 0.66, "grad_norm": 32.61026382446289, "kl": 0.0, "learning_rate": 1.7194451714210938e-07, "logps/chosen": -225.8323516845703, "logps/rejected": -226.42227172851562, "loss": 0.2229, "rewards/chosen": 2.261629104614258, "rewards/margins": 6.06764030456543, "rewards/rejected": -3.8060109615325928, "step": 2507 }, { "epoch": 0.66, "grad_norm": 29.26123046875, "kl": 0.0, "learning_rate": 1.718136613451976e-07, "logps/chosen": -226.59225463867188, "logps/rejected": -286.61572265625, "loss": 0.2764, "rewards/chosen": 1.3310816287994385, "rewards/margins": 4.778102397918701, "rewards/rejected": -3.4470207691192627, "step": 2508 }, { "epoch": 0.66, "grad_norm": 37.21846389770508, "kl": 0.0, "learning_rate": 1.716828055482858e-07, "logps/chosen": -153.80140686035156, "logps/rejected": -322.50732421875, "loss": 0.1747, "rewards/chosen": 0.6639166474342346, "rewards/margins": 4.689450740814209, "rewards/rejected": -4.025534152984619, "step": 2509 }, { "epoch": 0.66, "grad_norm": 30.148347854614258, "kl": 0.0, "learning_rate": 1.7155194975137397e-07, "logps/chosen": -226.34432983398438, "logps/rejected": -206.22552490234375, "loss": 0.3551, "rewards/chosen": -0.16970336437225342, "rewards/margins": 2.9069418907165527, "rewards/rejected": -3.0766453742980957, "step": 2510 }, { "epoch": 0.66, "grad_norm": 34.247314453125, "kl": 0.0, "learning_rate": 1.7142109395446217e-07, "logps/chosen": -198.5338897705078, "logps/rejected": -207.5816650390625, "loss": 0.1954, "rewards/chosen": 1.4360510110855103, "rewards/margins": 4.661987781524658, "rewards/rejected": -3.2259366512298584, "step": 2511 }, { "epoch": 0.66, "grad_norm": 22.530338287353516, "kl": 0.0, "learning_rate": 1.7129023815755036e-07, "logps/chosen": -194.3182830810547, "logps/rejected": -249.9308624267578, "loss": 0.1564, "rewards/chosen": 1.923820972442627, "rewards/margins": 5.429113864898682, "rewards/rejected": -3.5052928924560547, "step": 2512 }, { "epoch": 0.66, "grad_norm": 36.972007751464844, "kl": 0.0, "learning_rate": 1.7115938236063856e-07, "logps/chosen": -200.99462890625, "logps/rejected": -229.53919982910156, "loss": 0.2399, "rewards/chosen": 1.6839921474456787, "rewards/margins": 4.945981025695801, "rewards/rejected": -3.261988878250122, "step": 2513 }, { "epoch": 0.66, "grad_norm": 36.51010513305664, "kl": 0.0, "learning_rate": 1.7102852656372675e-07, "logps/chosen": -220.27752685546875, "logps/rejected": -185.32997131347656, "loss": 0.3269, "rewards/chosen": 2.3345282077789307, "rewards/margins": 4.075137138366699, "rewards/rejected": -1.7406089305877686, "step": 2514 }, { "epoch": 0.66, "grad_norm": 45.91500473022461, "kl": 0.0, "learning_rate": 1.7089767076681498e-07, "logps/chosen": -243.1271514892578, "logps/rejected": -201.50389099121094, "loss": 0.1832, "rewards/chosen": 1.141270637512207, "rewards/margins": 4.884117126464844, "rewards/rejected": -3.742846727371216, "step": 2515 }, { "epoch": 0.66, "grad_norm": 31.586618423461914, "kl": 0.0, "learning_rate": 1.7076681496990317e-07, "logps/chosen": -146.20590209960938, "logps/rejected": -183.73988342285156, "loss": 0.3548, "rewards/chosen": 0.2781957685947418, "rewards/margins": 2.8900160789489746, "rewards/rejected": -2.6118202209472656, "step": 2516 }, { "epoch": 0.66, "grad_norm": 31.6408748626709, "kl": 0.0, "learning_rate": 1.7063595917299137e-07, "logps/chosen": -184.8021697998047, "logps/rejected": -232.1508026123047, "loss": 0.1541, "rewards/chosen": 1.8986157178878784, "rewards/margins": 5.86688232421875, "rewards/rejected": -3.968266725540161, "step": 2517 }, { "epoch": 0.66, "grad_norm": 47.6021842956543, "kl": 0.0, "learning_rate": 1.7050510337607954e-07, "logps/chosen": -169.6454315185547, "logps/rejected": -179.72286987304688, "loss": 0.2734, "rewards/chosen": 0.23273229598999023, "rewards/margins": 2.5509696006774902, "rewards/rejected": -2.3182373046875, "step": 2518 }, { "epoch": 0.66, "grad_norm": 37.73974609375, "kl": 0.0, "learning_rate": 1.7037424757916773e-07, "logps/chosen": -152.7825927734375, "logps/rejected": -184.64952087402344, "loss": 0.2823, "rewards/chosen": -0.130534827709198, "rewards/margins": 4.2096405029296875, "rewards/rejected": -4.340175151824951, "step": 2519 }, { "epoch": 0.66, "grad_norm": 31.94325065612793, "kl": 0.0, "learning_rate": 1.7024339178225593e-07, "logps/chosen": -158.05471801757812, "logps/rejected": -187.04931640625, "loss": 0.2188, "rewards/chosen": 1.0949746370315552, "rewards/margins": 4.778814315795898, "rewards/rejected": -3.6838395595550537, "step": 2520 }, { "epoch": 0.66, "grad_norm": 35.16641616821289, "kl": 0.0, "learning_rate": 1.7011253598534415e-07, "logps/chosen": -161.69552612304688, "logps/rejected": -265.4871826171875, "loss": 0.2512, "rewards/chosen": 0.9285306930541992, "rewards/margins": 3.704392433166504, "rewards/rejected": -2.7758617401123047, "step": 2521 }, { "epoch": 0.66, "grad_norm": 38.41954803466797, "kl": 0.0, "learning_rate": 1.6998168018843235e-07, "logps/chosen": -232.1246795654297, "logps/rejected": -205.31787109375, "loss": 0.2639, "rewards/chosen": 0.07170384377241135, "rewards/margins": 4.2816596031188965, "rewards/rejected": -4.20995569229126, "step": 2522 }, { "epoch": 0.66, "grad_norm": 47.72578430175781, "kl": 0.0, "learning_rate": 1.6985082439152055e-07, "logps/chosen": -162.1151123046875, "logps/rejected": -244.63079833984375, "loss": 0.2459, "rewards/chosen": -0.6484707593917847, "rewards/margins": 2.6753692626953125, "rewards/rejected": -3.3238401412963867, "step": 2523 }, { "epoch": 0.66, "grad_norm": 29.99666976928711, "kl": 0.0, "learning_rate": 1.6971996859460874e-07, "logps/chosen": -216.69342041015625, "logps/rejected": -247.39427185058594, "loss": 0.2399, "rewards/chosen": 0.34248360991477966, "rewards/margins": 3.2656843662261963, "rewards/rejected": -2.923200845718384, "step": 2524 }, { "epoch": 0.66, "grad_norm": 36.366554260253906, "kl": 0.0, "learning_rate": 1.6958911279769694e-07, "logps/chosen": -240.5819091796875, "logps/rejected": -249.13943481445312, "loss": 0.3091, "rewards/chosen": 0.6309940218925476, "rewards/margins": 4.097060203552246, "rewards/rejected": -3.4660661220550537, "step": 2525 }, { "epoch": 0.66, "grad_norm": 35.71635055541992, "kl": 0.0, "learning_rate": 1.694582570007851e-07, "logps/chosen": -219.22613525390625, "logps/rejected": -270.96051025390625, "loss": 0.2344, "rewards/chosen": 0.7651382088661194, "rewards/margins": 4.163715839385986, "rewards/rejected": -3.3985774517059326, "step": 2526 }, { "epoch": 0.66, "grad_norm": 26.018346786499023, "kl": 0.0, "learning_rate": 1.693274012038733e-07, "logps/chosen": -124.76805114746094, "logps/rejected": -178.9542999267578, "loss": 0.2418, "rewards/chosen": 0.2460649609565735, "rewards/margins": 3.744946241378784, "rewards/rejected": -3.4988813400268555, "step": 2527 }, { "epoch": 0.66, "grad_norm": 37.38301467895508, "kl": 0.0, "learning_rate": 1.6919654540696153e-07, "logps/chosen": -150.87771606445312, "logps/rejected": -201.72645568847656, "loss": 0.2214, "rewards/chosen": 1.6785235404968262, "rewards/margins": 7.901946067810059, "rewards/rejected": -6.223422527313232, "step": 2528 }, { "epoch": 0.66, "grad_norm": 30.10106086730957, "kl": 0.0, "learning_rate": 1.6906568961004972e-07, "logps/chosen": -193.71878051757812, "logps/rejected": -209.6848602294922, "loss": 0.2651, "rewards/chosen": 0.5997923016548157, "rewards/margins": 4.252300262451172, "rewards/rejected": -3.652507781982422, "step": 2529 }, { "epoch": 0.66, "grad_norm": 35.383975982666016, "kl": 0.0, "learning_rate": 1.6893483381313792e-07, "logps/chosen": -145.73793029785156, "logps/rejected": -252.19183349609375, "loss": 0.3688, "rewards/chosen": 0.0910300612449646, "rewards/margins": 4.4787726402282715, "rewards/rejected": -4.387742519378662, "step": 2530 }, { "epoch": 0.66, "grad_norm": 35.255428314208984, "kl": 0.0, "learning_rate": 1.6880397801622611e-07, "logps/chosen": -172.89564514160156, "logps/rejected": -302.4369812011719, "loss": 0.2492, "rewards/chosen": 1.2666151523590088, "rewards/margins": 8.008228302001953, "rewards/rejected": -6.741613388061523, "step": 2531 }, { "epoch": 0.66, "grad_norm": 45.66690444946289, "kl": 0.0, "learning_rate": 1.686731222193143e-07, "logps/chosen": -236.4287872314453, "logps/rejected": -223.3417510986328, "loss": 0.3459, "rewards/chosen": 0.06413912773132324, "rewards/margins": 2.366434335708618, "rewards/rejected": -2.302295207977295, "step": 2532 }, { "epoch": 0.66, "grad_norm": 29.879467010498047, "kl": 0.0, "learning_rate": 1.685422664224025e-07, "logps/chosen": -199.5425262451172, "logps/rejected": -373.2349548339844, "loss": 0.203, "rewards/chosen": 2.3080244064331055, "rewards/margins": 7.140763759613037, "rewards/rejected": -4.832739353179932, "step": 2533 }, { "epoch": 0.66, "grad_norm": 21.92876625061035, "kl": 0.0, "learning_rate": 1.684114106254907e-07, "logps/chosen": -236.30528259277344, "logps/rejected": -233.36373901367188, "loss": 0.2341, "rewards/chosen": -1.7516714334487915, "rewards/margins": 1.928197979927063, "rewards/rejected": -3.6798694133758545, "step": 2534 }, { "epoch": 0.66, "grad_norm": 35.66194534301758, "kl": 0.0, "learning_rate": 1.682805548285789e-07, "logps/chosen": -211.97149658203125, "logps/rejected": -355.55926513671875, "loss": 0.2266, "rewards/chosen": 1.2822884321212769, "rewards/margins": 4.822649955749512, "rewards/rejected": -3.5403616428375244, "step": 2535 }, { "epoch": 0.66, "grad_norm": 29.88081169128418, "kl": 0.0, "learning_rate": 1.681496990316671e-07, "logps/chosen": -209.38343811035156, "logps/rejected": -249.86630249023438, "loss": 0.2414, "rewards/chosen": 2.7169368267059326, "rewards/margins": 7.931248664855957, "rewards/rejected": -5.214311599731445, "step": 2536 }, { "epoch": 0.66, "grad_norm": 39.38168716430664, "kl": 0.0, "learning_rate": 1.680188432347553e-07, "logps/chosen": -214.57864379882812, "logps/rejected": -201.05377197265625, "loss": 0.3847, "rewards/chosen": 0.13023439049720764, "rewards/margins": 3.047044038772583, "rewards/rejected": -2.916809558868408, "step": 2537 }, { "epoch": 0.66, "grad_norm": 31.947851181030273, "kl": 0.0, "learning_rate": 1.678879874378435e-07, "logps/chosen": -157.2117156982422, "logps/rejected": -217.1915283203125, "loss": 0.2563, "rewards/chosen": 0.8770288228988647, "rewards/margins": 5.18677282333374, "rewards/rejected": -4.309743881225586, "step": 2538 }, { "epoch": 0.66, "grad_norm": 32.08303451538086, "kl": 0.0, "learning_rate": 1.6775713164093168e-07, "logps/chosen": -242.77227783203125, "logps/rejected": -256.195556640625, "loss": 0.2824, "rewards/chosen": 2.168449640274048, "rewards/margins": 5.70610237121582, "rewards/rejected": -3.5376524925231934, "step": 2539 }, { "epoch": 0.66, "grad_norm": 48.098262786865234, "kl": 0.0, "learning_rate": 1.676262758440199e-07, "logps/chosen": -190.1537628173828, "logps/rejected": -293.48931884765625, "loss": 0.2169, "rewards/chosen": 0.08483177423477173, "rewards/margins": 3.067207098007202, "rewards/rejected": -2.982375383377075, "step": 2540 }, { "epoch": 0.67, "grad_norm": 35.13059997558594, "kl": 0.0, "learning_rate": 1.6749542004710808e-07, "logps/chosen": -190.3247833251953, "logps/rejected": -254.23439025878906, "loss": 0.2753, "rewards/chosen": 0.970902144908905, "rewards/margins": 4.8270134925842285, "rewards/rejected": -3.8561112880706787, "step": 2541 }, { "epoch": 0.67, "grad_norm": 27.28483009338379, "kl": 0.0, "learning_rate": 1.6736456425019627e-07, "logps/chosen": -222.0068817138672, "logps/rejected": -290.8199462890625, "loss": 0.1891, "rewards/chosen": 1.633012056350708, "rewards/margins": 6.94444465637207, "rewards/rejected": -5.311432838439941, "step": 2542 }, { "epoch": 0.67, "grad_norm": 39.392120361328125, "kl": 0.0, "learning_rate": 1.6723370845328447e-07, "logps/chosen": -203.91192626953125, "logps/rejected": -183.90980529785156, "loss": 0.2754, "rewards/chosen": 2.2773795127868652, "rewards/margins": 5.241265773773193, "rewards/rejected": -2.963886260986328, "step": 2543 }, { "epoch": 0.67, "grad_norm": 28.039636611938477, "kl": 0.0, "learning_rate": 1.6710285265637266e-07, "logps/chosen": -313.6973571777344, "logps/rejected": -302.34881591796875, "loss": 0.0985, "rewards/chosen": 1.6423184871673584, "rewards/margins": 5.527065753936768, "rewards/rejected": -3.884747266769409, "step": 2544 }, { "epoch": 0.67, "grad_norm": 43.43989944458008, "kl": 0.0, "learning_rate": 1.6697199685946086e-07, "logps/chosen": -250.87657165527344, "logps/rejected": -170.1529541015625, "loss": 0.2919, "rewards/chosen": -0.765656590461731, "rewards/margins": 2.5168333053588867, "rewards/rejected": -3.282489776611328, "step": 2545 }, { "epoch": 0.67, "grad_norm": 26.912771224975586, "kl": 0.0, "learning_rate": 1.6684114106254906e-07, "logps/chosen": -274.5841064453125, "logps/rejected": -181.53070068359375, "loss": 0.3491, "rewards/chosen": 1.5585336685180664, "rewards/margins": 3.7967350482940674, "rewards/rejected": -2.238201379776001, "step": 2546 }, { "epoch": 0.67, "grad_norm": 42.92966842651367, "kl": 0.0, "learning_rate": 1.6671028526563728e-07, "logps/chosen": -223.40972900390625, "logps/rejected": -178.333984375, "loss": 0.364, "rewards/chosen": 0.08250631392002106, "rewards/margins": 1.972010850906372, "rewards/rejected": -1.8895045518875122, "step": 2547 }, { "epoch": 0.67, "grad_norm": 34.5250129699707, "kl": 0.0, "learning_rate": 1.6657942946872547e-07, "logps/chosen": -157.47816467285156, "logps/rejected": -239.72552490234375, "loss": 0.2466, "rewards/chosen": 1.272930383682251, "rewards/margins": 4.885026931762695, "rewards/rejected": -3.6120967864990234, "step": 2548 }, { "epoch": 0.67, "grad_norm": 32.43971633911133, "kl": 0.0, "learning_rate": 1.6644857367181364e-07, "logps/chosen": -226.4892578125, "logps/rejected": -253.7250213623047, "loss": 0.1761, "rewards/chosen": 1.7298102378845215, "rewards/margins": 5.25880765914917, "rewards/rejected": -3.5289974212646484, "step": 2549 }, { "epoch": 0.67, "grad_norm": 24.741518020629883, "kl": 0.0, "learning_rate": 1.6631771787490184e-07, "logps/chosen": -209.94854736328125, "logps/rejected": -198.7462615966797, "loss": 0.2187, "rewards/chosen": 1.6599093675613403, "rewards/margins": 4.376770973205566, "rewards/rejected": -2.7168617248535156, "step": 2550 }, { "epoch": 0.67, "grad_norm": 33.30052185058594, "kl": 0.0, "learning_rate": 1.6618686207799004e-07, "logps/chosen": -180.7981414794922, "logps/rejected": -295.49761962890625, "loss": 0.2962, "rewards/chosen": 0.1354203224182129, "rewards/margins": 4.54762601852417, "rewards/rejected": -4.412205696105957, "step": 2551 }, { "epoch": 0.67, "grad_norm": 27.4842472076416, "kl": 0.0, "learning_rate": 1.6605600628107823e-07, "logps/chosen": -331.968505859375, "logps/rejected": -256.1702575683594, "loss": 0.2373, "rewards/chosen": -2.0421226024627686, "rewards/margins": 1.523503065109253, "rewards/rejected": -3.5656256675720215, "step": 2552 }, { "epoch": 0.67, "grad_norm": 40.02251434326172, "kl": 0.0, "learning_rate": 1.6592515048416645e-07, "logps/chosen": -176.6609344482422, "logps/rejected": -343.7615966796875, "loss": 0.1817, "rewards/chosen": 1.9425089359283447, "rewards/margins": 6.32541561126709, "rewards/rejected": -4.382906913757324, "step": 2553 }, { "epoch": 0.67, "grad_norm": 33.022159576416016, "kl": 0.0, "learning_rate": 1.6579429468725465e-07, "logps/chosen": -184.307861328125, "logps/rejected": -344.2357482910156, "loss": 0.1872, "rewards/chosen": 0.2234984189271927, "rewards/margins": 4.028815746307373, "rewards/rejected": -3.8053174018859863, "step": 2554 }, { "epoch": 0.67, "grad_norm": 39.761287689208984, "kl": 0.0, "learning_rate": 1.6566343889034285e-07, "logps/chosen": -253.70285034179688, "logps/rejected": -258.4842834472656, "loss": 0.3053, "rewards/chosen": -0.629117488861084, "rewards/margins": 2.948265552520752, "rewards/rejected": -3.577383041381836, "step": 2555 }, { "epoch": 0.67, "grad_norm": 35.11906814575195, "kl": 0.0, "learning_rate": 1.6553258309343104e-07, "logps/chosen": -160.90133666992188, "logps/rejected": -142.0111846923828, "loss": 0.2437, "rewards/chosen": 2.033764123916626, "rewards/margins": 4.612457275390625, "rewards/rejected": -2.578693389892578, "step": 2556 }, { "epoch": 0.67, "grad_norm": 28.91179847717285, "kl": 0.0, "learning_rate": 1.654017272965192e-07, "logps/chosen": -211.7546844482422, "logps/rejected": -228.7609100341797, "loss": 0.2662, "rewards/chosen": 1.1779052019119263, "rewards/margins": 4.679661750793457, "rewards/rejected": -3.5017566680908203, "step": 2557 }, { "epoch": 0.67, "grad_norm": 25.67728042602539, "kl": 0.0, "learning_rate": 1.652708714996074e-07, "logps/chosen": -167.1860809326172, "logps/rejected": -192.19161987304688, "loss": 0.2026, "rewards/chosen": 1.4385385513305664, "rewards/margins": 5.501163959503174, "rewards/rejected": -4.062625408172607, "step": 2558 }, { "epoch": 0.67, "grad_norm": 37.68383026123047, "kl": 0.0, "learning_rate": 1.651400157026956e-07, "logps/chosen": -148.10342407226562, "logps/rejected": -275.0115966796875, "loss": 0.2775, "rewards/chosen": 0.9385942220687866, "rewards/margins": 5.033987522125244, "rewards/rejected": -4.095393180847168, "step": 2559 }, { "epoch": 0.67, "grad_norm": 26.639482498168945, "kl": 0.0, "learning_rate": 1.6500915990578383e-07, "logps/chosen": -196.66302490234375, "logps/rejected": -277.0069274902344, "loss": 0.2187, "rewards/chosen": 2.4432859420776367, "rewards/margins": 7.975201606750488, "rewards/rejected": -5.531915664672852, "step": 2560 }, { "epoch": 0.67, "grad_norm": 32.87889099121094, "kl": 0.0, "learning_rate": 1.6487830410887202e-07, "logps/chosen": -273.00091552734375, "logps/rejected": -261.990478515625, "loss": 0.3054, "rewards/chosen": 0.26404502987861633, "rewards/margins": 5.634684085845947, "rewards/rejected": -5.370638847351074, "step": 2561 }, { "epoch": 0.67, "grad_norm": 31.613813400268555, "kl": 0.0, "learning_rate": 1.6474744831196022e-07, "logps/chosen": -134.63540649414062, "logps/rejected": -189.412353515625, "loss": 0.3389, "rewards/chosen": 0.022171571850776672, "rewards/margins": 4.27623987197876, "rewards/rejected": -4.254068374633789, "step": 2562 }, { "epoch": 0.67, "grad_norm": 29.528295516967773, "kl": 0.0, "learning_rate": 1.6461659251504842e-07, "logps/chosen": -224.38973999023438, "logps/rejected": -231.9743194580078, "loss": 0.1835, "rewards/chosen": 1.7728891372680664, "rewards/margins": 7.36733341217041, "rewards/rejected": -5.594444274902344, "step": 2563 }, { "epoch": 0.67, "grad_norm": 38.98188781738281, "kl": 0.0, "learning_rate": 1.644857367181366e-07, "logps/chosen": -207.80917358398438, "logps/rejected": -355.8811340332031, "loss": 0.2485, "rewards/chosen": 1.5398192405700684, "rewards/margins": 4.257233142852783, "rewards/rejected": -2.717413902282715, "step": 2564 }, { "epoch": 0.67, "grad_norm": 33.208133697509766, "kl": 0.0, "learning_rate": 1.6435488092122478e-07, "logps/chosen": -210.67318725585938, "logps/rejected": -217.0673370361328, "loss": 0.2497, "rewards/chosen": 2.5629193782806396, "rewards/margins": 5.6290388107299805, "rewards/rejected": -3.06611967086792, "step": 2565 }, { "epoch": 0.67, "grad_norm": 32.00192642211914, "kl": 0.0, "learning_rate": 1.64224025124313e-07, "logps/chosen": -335.8350524902344, "logps/rejected": -221.9557647705078, "loss": 0.1103, "rewards/chosen": -1.1904784440994263, "rewards/margins": 3.179619789123535, "rewards/rejected": -4.370098114013672, "step": 2566 }, { "epoch": 0.67, "grad_norm": 39.68660354614258, "kl": 0.0, "learning_rate": 1.640931693274012e-07, "logps/chosen": -246.44900512695312, "logps/rejected": -313.3238830566406, "loss": 0.189, "rewards/chosen": 2.0520215034484863, "rewards/margins": 6.506097793579102, "rewards/rejected": -4.454076290130615, "step": 2567 }, { "epoch": 0.67, "grad_norm": 31.22844886779785, "kl": 0.0, "learning_rate": 1.639623135304894e-07, "logps/chosen": -166.51016235351562, "logps/rejected": -294.8640441894531, "loss": 0.1643, "rewards/chosen": 0.9799101948738098, "rewards/margins": 5.509171485900879, "rewards/rejected": -4.529261112213135, "step": 2568 }, { "epoch": 0.67, "grad_norm": 26.876453399658203, "kl": 0.0, "learning_rate": 1.638314577335776e-07, "logps/chosen": -216.63499450683594, "logps/rejected": -244.08470153808594, "loss": 0.2215, "rewards/chosen": -0.8786706924438477, "rewards/margins": 2.4429192543029785, "rewards/rejected": -3.321589946746826, "step": 2569 }, { "epoch": 0.67, "grad_norm": 28.603649139404297, "kl": 0.0, "learning_rate": 1.637006019366658e-07, "logps/chosen": -251.87803649902344, "logps/rejected": -275.10626220703125, "loss": 0.2048, "rewards/chosen": 3.599595785140991, "rewards/margins": 9.818099021911621, "rewards/rejected": -6.218502998352051, "step": 2570 }, { "epoch": 0.67, "grad_norm": 34.09560012817383, "kl": 0.0, "learning_rate": 1.6356974613975398e-07, "logps/chosen": -141.74624633789062, "logps/rejected": -207.2333984375, "loss": 0.2704, "rewards/chosen": -0.39835241436958313, "rewards/margins": 3.2094037532806396, "rewards/rejected": -3.6077561378479004, "step": 2571 }, { "epoch": 0.67, "grad_norm": 31.071380615234375, "kl": 0.0, "learning_rate": 1.6343889034284218e-07, "logps/chosen": -215.2388153076172, "logps/rejected": -381.1431579589844, "loss": 0.2196, "rewards/chosen": 0.882358729839325, "rewards/margins": 7.919827938079834, "rewards/rejected": -7.037469387054443, "step": 2572 }, { "epoch": 0.67, "grad_norm": 36.05377960205078, "kl": 0.0, "learning_rate": 1.6330803454593038e-07, "logps/chosen": -234.13755798339844, "logps/rejected": -321.06182861328125, "loss": 0.3183, "rewards/chosen": 0.21939189732074738, "rewards/margins": 3.5455498695373535, "rewards/rejected": -3.326158046722412, "step": 2573 }, { "epoch": 0.67, "grad_norm": 49.232383728027344, "kl": 0.0, "learning_rate": 1.6317717874901857e-07, "logps/chosen": -191.3748779296875, "logps/rejected": -362.3619384765625, "loss": 0.3358, "rewards/chosen": -0.1097564697265625, "rewards/margins": 3.5779199600219727, "rewards/rejected": -3.687676429748535, "step": 2574 }, { "epoch": 0.67, "grad_norm": 41.3016242980957, "kl": 0.0, "learning_rate": 1.6304632295210677e-07, "logps/chosen": -172.50607299804688, "logps/rejected": -286.0248718261719, "loss": 0.2457, "rewards/chosen": 1.2075005769729614, "rewards/margins": 4.418184757232666, "rewards/rejected": -3.210684299468994, "step": 2575 }, { "epoch": 0.67, "grad_norm": 37.03321838378906, "kl": 0.0, "learning_rate": 1.6291546715519496e-07, "logps/chosen": -270.3417053222656, "logps/rejected": -413.8136291503906, "loss": 0.23, "rewards/chosen": 0.908319890499115, "rewards/margins": 5.533775329589844, "rewards/rejected": -4.625455379486084, "step": 2576 }, { "epoch": 0.67, "grad_norm": 34.594757080078125, "kl": 0.0, "learning_rate": 1.6278461135828316e-07, "logps/chosen": -163.30946350097656, "logps/rejected": -162.75482177734375, "loss": 0.291, "rewards/chosen": 0.06622552871704102, "rewards/margins": 2.4234416484832764, "rewards/rejected": -2.3572161197662354, "step": 2577 }, { "epoch": 0.67, "grad_norm": 48.532493591308594, "kl": 0.0, "learning_rate": 1.6265375556137136e-07, "logps/chosen": -275.835205078125, "logps/rejected": -230.15435791015625, "loss": 0.1816, "rewards/chosen": 2.8731188774108887, "rewards/margins": 6.611090660095215, "rewards/rejected": -3.737971544265747, "step": 2578 }, { "epoch": 0.67, "grad_norm": 33.894386291503906, "kl": 0.0, "learning_rate": 1.6252289976445958e-07, "logps/chosen": -237.61642456054688, "logps/rejected": -250.01815795898438, "loss": 0.1913, "rewards/chosen": 0.8094295859336853, "rewards/margins": 5.128325939178467, "rewards/rejected": -4.318896293640137, "step": 2579 }, { "epoch": 0.68, "grad_norm": 27.747053146362305, "kl": 0.0, "learning_rate": 1.6239204396754775e-07, "logps/chosen": -268.0554504394531, "logps/rejected": -167.73800659179688, "loss": 0.2013, "rewards/chosen": 1.5180522203445435, "rewards/margins": 5.381387710571289, "rewards/rejected": -3.863335371017456, "step": 2580 }, { "epoch": 0.68, "grad_norm": 37.375667572021484, "kl": 0.0, "learning_rate": 1.6226118817063595e-07, "logps/chosen": -260.16192626953125, "logps/rejected": -177.43836975097656, "loss": 0.2694, "rewards/chosen": 1.000507116317749, "rewards/margins": 3.943662166595459, "rewards/rejected": -2.94315505027771, "step": 2581 }, { "epoch": 0.68, "grad_norm": 30.776037216186523, "kl": 0.0, "learning_rate": 1.6213033237372414e-07, "logps/chosen": -226.7903289794922, "logps/rejected": -201.4454345703125, "loss": 0.1559, "rewards/chosen": 3.192276954650879, "rewards/margins": 6.419692039489746, "rewards/rejected": -3.227415084838867, "step": 2582 }, { "epoch": 0.68, "grad_norm": 25.35350227355957, "kl": 0.0, "learning_rate": 1.6199947657681234e-07, "logps/chosen": -237.74191284179688, "logps/rejected": -241.92535400390625, "loss": 0.2614, "rewards/chosen": 1.408836007118225, "rewards/margins": 4.510784149169922, "rewards/rejected": -3.1019480228424072, "step": 2583 }, { "epoch": 0.68, "grad_norm": 38.679447174072266, "kl": 0.0, "learning_rate": 1.6186862077990053e-07, "logps/chosen": -168.99832153320312, "logps/rejected": -262.581787109375, "loss": 0.2806, "rewards/chosen": 2.509021759033203, "rewards/margins": 6.563523292541504, "rewards/rejected": -4.054501533508301, "step": 2584 }, { "epoch": 0.68, "grad_norm": 36.33589553833008, "kl": 0.0, "learning_rate": 1.6173776498298876e-07, "logps/chosen": -243.29920959472656, "logps/rejected": -210.0607452392578, "loss": 0.2599, "rewards/chosen": 3.176084518432617, "rewards/margins": 6.009356498718262, "rewards/rejected": -2.8332722187042236, "step": 2585 }, { "epoch": 0.68, "grad_norm": 34.518402099609375, "kl": 0.0, "learning_rate": 1.6160690918607695e-07, "logps/chosen": -144.8197021484375, "logps/rejected": -247.53363037109375, "loss": 0.2127, "rewards/chosen": 1.2055890560150146, "rewards/margins": 5.221860885620117, "rewards/rejected": -4.016272068023682, "step": 2586 }, { "epoch": 0.68, "grad_norm": 31.37455177307129, "kl": 0.0, "learning_rate": 1.6147605338916515e-07, "logps/chosen": -194.68289184570312, "logps/rejected": -296.93902587890625, "loss": 0.2881, "rewards/chosen": 0.40831291675567627, "rewards/margins": 4.4625091552734375, "rewards/rejected": -4.054196357727051, "step": 2587 }, { "epoch": 0.68, "grad_norm": 36.10368347167969, "kl": 0.0, "learning_rate": 1.6134519759225332e-07, "logps/chosen": -248.6439208984375, "logps/rejected": -207.64151000976562, "loss": 0.2397, "rewards/chosen": 0.784537136554718, "rewards/margins": 3.2910349369049072, "rewards/rejected": -2.506497859954834, "step": 2588 }, { "epoch": 0.68, "grad_norm": 27.241235733032227, "kl": 0.0, "learning_rate": 1.6121434179534151e-07, "logps/chosen": -140.28517150878906, "logps/rejected": -276.34014892578125, "loss": 0.1638, "rewards/chosen": 2.030177354812622, "rewards/margins": 5.352236747741699, "rewards/rejected": -3.322059392929077, "step": 2589 }, { "epoch": 0.68, "grad_norm": 35.7438850402832, "kl": 0.0, "learning_rate": 1.610834859984297e-07, "logps/chosen": -234.2539520263672, "logps/rejected": -221.09268188476562, "loss": 0.2338, "rewards/chosen": 1.7499068975448608, "rewards/margins": 4.797221660614014, "rewards/rejected": -3.0473148822784424, "step": 2590 }, { "epoch": 0.68, "grad_norm": 22.870019912719727, "kl": 0.0, "learning_rate": 1.609526302015179e-07, "logps/chosen": -158.86672973632812, "logps/rejected": -288.99896240234375, "loss": 0.2452, "rewards/chosen": 0.37331318855285645, "rewards/margins": 5.675410270690918, "rewards/rejected": -5.302096843719482, "step": 2591 }, { "epoch": 0.68, "grad_norm": 40.48781204223633, "kl": 0.0, "learning_rate": 1.6082177440460613e-07, "logps/chosen": -237.75875854492188, "logps/rejected": -227.9136199951172, "loss": 0.1677, "rewards/chosen": 1.2197824716567993, "rewards/margins": 4.607675075531006, "rewards/rejected": -3.387892723083496, "step": 2592 }, { "epoch": 0.68, "grad_norm": 25.184553146362305, "kl": 0.0, "learning_rate": 1.6069091860769432e-07, "logps/chosen": -175.31613159179688, "logps/rejected": -275.30609130859375, "loss": 0.2006, "rewards/chosen": 0.3677568733692169, "rewards/margins": 5.3581461906433105, "rewards/rejected": -4.990389347076416, "step": 2593 }, { "epoch": 0.68, "grad_norm": 41.90897750854492, "kl": 0.0, "learning_rate": 1.6056006281078252e-07, "logps/chosen": -166.27017211914062, "logps/rejected": -291.6557922363281, "loss": 0.2619, "rewards/chosen": 1.7553913593292236, "rewards/margins": 5.372653961181641, "rewards/rejected": -3.617262363433838, "step": 2594 }, { "epoch": 0.68, "grad_norm": 23.900545120239258, "kl": 0.0, "learning_rate": 1.6042920701387072e-07, "logps/chosen": -310.2008972167969, "logps/rejected": -202.36691284179688, "loss": 0.2446, "rewards/chosen": -0.24510131776332855, "rewards/margins": 4.654469013214111, "rewards/rejected": -4.899570465087891, "step": 2595 }, { "epoch": 0.68, "grad_norm": 37.62279510498047, "kl": 0.0, "learning_rate": 1.6029835121695889e-07, "logps/chosen": -155.13211059570312, "logps/rejected": -285.37933349609375, "loss": 0.2217, "rewards/chosen": 1.5080240964889526, "rewards/margins": 5.727861404418945, "rewards/rejected": -4.219837188720703, "step": 2596 }, { "epoch": 0.68, "grad_norm": 36.81294250488281, "kl": 0.0, "learning_rate": 1.6016749542004708e-07, "logps/chosen": -228.24301147460938, "logps/rejected": -280.6625061035156, "loss": 0.1871, "rewards/chosen": 1.3690900802612305, "rewards/margins": 4.551156520843506, "rewards/rejected": -3.1820664405822754, "step": 2597 }, { "epoch": 0.68, "grad_norm": 39.81789779663086, "kl": 0.0, "learning_rate": 1.600366396231353e-07, "logps/chosen": -219.61534118652344, "logps/rejected": -181.93167114257812, "loss": 0.2771, "rewards/chosen": 1.7412033081054688, "rewards/margins": 5.313830375671387, "rewards/rejected": -3.572626829147339, "step": 2598 }, { "epoch": 0.68, "grad_norm": 32.26911926269531, "kl": 0.0, "learning_rate": 1.599057838262235e-07, "logps/chosen": -163.3314971923828, "logps/rejected": -243.11912536621094, "loss": 0.2877, "rewards/chosen": -0.1219571903347969, "rewards/margins": 3.6402747631073, "rewards/rejected": -3.7622320652008057, "step": 2599 }, { "epoch": 0.68, "grad_norm": 37.65869140625, "kl": 0.0, "learning_rate": 1.597749280293117e-07, "logps/chosen": -149.3471221923828, "logps/rejected": -297.2940673828125, "loss": 0.288, "rewards/chosen": 0.8982479572296143, "rewards/margins": 7.234429359436035, "rewards/rejected": -6.336181640625, "step": 2600 }, { "epoch": 0.68, "grad_norm": 31.088722229003906, "kl": 0.0, "learning_rate": 1.596440722323999e-07, "logps/chosen": -233.47364807128906, "logps/rejected": -185.42413330078125, "loss": 0.1748, "rewards/chosen": 2.3092989921569824, "rewards/margins": 6.058829307556152, "rewards/rejected": -3.749530076980591, "step": 2601 }, { "epoch": 0.68, "grad_norm": 20.580787658691406, "kl": 0.0, "learning_rate": 1.595132164354881e-07, "logps/chosen": -184.43515014648438, "logps/rejected": -319.0971984863281, "loss": 0.208, "rewards/chosen": 0.845647931098938, "rewards/margins": 9.530956268310547, "rewards/rejected": -8.685308456420898, "step": 2602 }, { "epoch": 0.68, "grad_norm": 36.49890899658203, "kl": 0.0, "learning_rate": 1.5938236063857626e-07, "logps/chosen": -248.2190704345703, "logps/rejected": -357.3635559082031, "loss": 0.1562, "rewards/chosen": 2.4234509468078613, "rewards/margins": 6.921143054962158, "rewards/rejected": -4.497692108154297, "step": 2603 }, { "epoch": 0.68, "grad_norm": 47.470741271972656, "kl": 0.0, "learning_rate": 1.5925150484166448e-07, "logps/chosen": -169.32373046875, "logps/rejected": -272.4686584472656, "loss": 0.3708, "rewards/chosen": -0.13372045755386353, "rewards/margins": 4.087759017944336, "rewards/rejected": -4.221479415893555, "step": 2604 }, { "epoch": 0.68, "grad_norm": 29.60095977783203, "kl": 0.0, "learning_rate": 1.5912064904475268e-07, "logps/chosen": -144.54171752929688, "logps/rejected": -233.62033081054688, "loss": 0.2275, "rewards/chosen": 0.8344153165817261, "rewards/margins": 4.808291435241699, "rewards/rejected": -3.9738759994506836, "step": 2605 }, { "epoch": 0.68, "grad_norm": 20.64869499206543, "kl": 0.0, "learning_rate": 1.5898979324784087e-07, "logps/chosen": -214.33297729492188, "logps/rejected": -262.1623229980469, "loss": 0.1705, "rewards/chosen": 1.7195442914962769, "rewards/margins": 5.088170528411865, "rewards/rejected": -3.368626117706299, "step": 2606 }, { "epoch": 0.68, "grad_norm": 30.235107421875, "kl": 0.0, "learning_rate": 1.5885893745092907e-07, "logps/chosen": -168.021484375, "logps/rejected": -231.19189453125, "loss": 0.372, "rewards/chosen": -0.15469840168952942, "rewards/margins": 3.0709803104400635, "rewards/rejected": -3.2256786823272705, "step": 2607 }, { "epoch": 0.68, "grad_norm": 34.14283752441406, "kl": 0.0, "learning_rate": 1.5872808165401727e-07, "logps/chosen": -210.44888305664062, "logps/rejected": -228.640380859375, "loss": 0.3293, "rewards/chosen": -0.6430841684341431, "rewards/margins": 2.8537302017211914, "rewards/rejected": -3.496814250946045, "step": 2608 }, { "epoch": 0.68, "grad_norm": 40.17918395996094, "kl": 0.0, "learning_rate": 1.5859722585710546e-07, "logps/chosen": -175.11221313476562, "logps/rejected": -246.70181274414062, "loss": 0.3029, "rewards/chosen": 1.0205662250518799, "rewards/margins": 5.190764427185059, "rewards/rejected": -4.1701979637146, "step": 2609 }, { "epoch": 0.68, "grad_norm": 41.95241928100586, "kl": 0.0, "learning_rate": 1.5846637006019366e-07, "logps/chosen": -269.70556640625, "logps/rejected": -218.40017700195312, "loss": 0.2962, "rewards/chosen": -1.2158753871917725, "rewards/margins": 1.4317800998687744, "rewards/rejected": -2.647655487060547, "step": 2610 }, { "epoch": 0.68, "grad_norm": 32.46884536743164, "kl": 0.0, "learning_rate": 1.5833551426328185e-07, "logps/chosen": -240.30992126464844, "logps/rejected": -278.72393798828125, "loss": 0.2791, "rewards/chosen": 1.6434956789016724, "rewards/margins": 5.217041492462158, "rewards/rejected": -3.5735456943511963, "step": 2611 }, { "epoch": 0.68, "grad_norm": 29.526588439941406, "kl": 0.0, "learning_rate": 1.5820465846637005e-07, "logps/chosen": -217.5753631591797, "logps/rejected": -239.61083984375, "loss": 0.2835, "rewards/chosen": 1.0819295644760132, "rewards/margins": 4.064085006713867, "rewards/rejected": -2.9821553230285645, "step": 2612 }, { "epoch": 0.68, "grad_norm": 36.759010314941406, "kl": 0.0, "learning_rate": 1.5807380266945825e-07, "logps/chosen": -196.2762908935547, "logps/rejected": -238.6885528564453, "loss": 0.1709, "rewards/chosen": 0.7026401162147522, "rewards/margins": 5.314055442810059, "rewards/rejected": -4.611415386199951, "step": 2613 }, { "epoch": 0.68, "grad_norm": 32.18585205078125, "kl": 0.0, "learning_rate": 1.5794294687254644e-07, "logps/chosen": -212.0025634765625, "logps/rejected": -346.5420227050781, "loss": 0.1871, "rewards/chosen": 1.7895128726959229, "rewards/margins": 6.3749494552612305, "rewards/rejected": -4.5854363441467285, "step": 2614 }, { "epoch": 0.68, "grad_norm": 29.172779083251953, "kl": 0.0, "learning_rate": 1.5781209107563464e-07, "logps/chosen": -184.76547241210938, "logps/rejected": -235.42982482910156, "loss": 0.0766, "rewards/chosen": 1.806889533996582, "rewards/margins": 5.578388690948486, "rewards/rejected": -3.7714991569519043, "step": 2615 }, { "epoch": 0.68, "grad_norm": 39.95838928222656, "kl": 0.0, "learning_rate": 1.5768123527872283e-07, "logps/chosen": -181.52728271484375, "logps/rejected": -280.96826171875, "loss": 0.2783, "rewards/chosen": 0.32978546619415283, "rewards/margins": 6.202417850494385, "rewards/rejected": -5.8726325035095215, "step": 2616 }, { "epoch": 0.68, "grad_norm": 30.65798568725586, "kl": 0.0, "learning_rate": 1.5755037948181106e-07, "logps/chosen": -198.12869262695312, "logps/rejected": -301.61383056640625, "loss": 0.1814, "rewards/chosen": 1.700964331626892, "rewards/margins": 7.295632839202881, "rewards/rejected": -5.594668388366699, "step": 2617 }, { "epoch": 0.69, "grad_norm": 32.177223205566406, "kl": 0.0, "learning_rate": 1.5741952368489925e-07, "logps/chosen": -161.6401824951172, "logps/rejected": -255.524658203125, "loss": 0.2605, "rewards/chosen": 1.3057931661605835, "rewards/margins": 6.553887844085693, "rewards/rejected": -5.24809455871582, "step": 2618 }, { "epoch": 0.69, "grad_norm": 29.845308303833008, "kl": 0.0, "learning_rate": 1.5728866788798742e-07, "logps/chosen": -239.44155883789062, "logps/rejected": -179.19857788085938, "loss": 0.2494, "rewards/chosen": 0.5105099678039551, "rewards/margins": 4.343420028686523, "rewards/rejected": -3.8329102993011475, "step": 2619 }, { "epoch": 0.69, "grad_norm": 34.138816833496094, "kl": 0.0, "learning_rate": 1.5715781209107562e-07, "logps/chosen": -133.4228515625, "logps/rejected": -297.18719482421875, "loss": 0.2533, "rewards/chosen": 0.6324558258056641, "rewards/margins": 5.259095668792725, "rewards/rejected": -4.6266398429870605, "step": 2620 }, { "epoch": 0.69, "grad_norm": 35.212257385253906, "kl": 0.0, "learning_rate": 1.5702695629416382e-07, "logps/chosen": -200.92498779296875, "logps/rejected": -161.45701599121094, "loss": 0.276, "rewards/chosen": 0.06087028980255127, "rewards/margins": 2.053701877593994, "rewards/rejected": -1.9928315877914429, "step": 2621 }, { "epoch": 0.69, "grad_norm": 53.46183776855469, "kl": 0.0, "learning_rate": 1.56896100497252e-07, "logps/chosen": -210.4583740234375, "logps/rejected": -235.5002899169922, "loss": 0.2802, "rewards/chosen": 2.0013036727905273, "rewards/margins": 3.4605746269226074, "rewards/rejected": -1.4592708349227905, "step": 2622 }, { "epoch": 0.69, "grad_norm": 27.320358276367188, "kl": 0.0, "learning_rate": 1.5676524470034023e-07, "logps/chosen": -238.22268676757812, "logps/rejected": -362.32037353515625, "loss": 0.1861, "rewards/chosen": 0.02560839056968689, "rewards/margins": 5.871598720550537, "rewards/rejected": -5.845990180969238, "step": 2623 }, { "epoch": 0.69, "grad_norm": 44.43696975708008, "kl": 0.0, "learning_rate": 1.5663438890342843e-07, "logps/chosen": -276.4560241699219, "logps/rejected": -305.7402648925781, "loss": 0.22, "rewards/chosen": 2.26029109954834, "rewards/margins": 6.064087867736816, "rewards/rejected": -3.8037965297698975, "step": 2624 }, { "epoch": 0.69, "grad_norm": 23.91146469116211, "kl": 0.0, "learning_rate": 1.5650353310651663e-07, "logps/chosen": -157.71304321289062, "logps/rejected": -204.7120361328125, "loss": 0.2469, "rewards/chosen": -0.3723863363265991, "rewards/margins": 3.3025412559509277, "rewards/rejected": -3.6749274730682373, "step": 2625 }, { "epoch": 0.69, "grad_norm": 26.316261291503906, "kl": 0.0, "learning_rate": 1.563726773096048e-07, "logps/chosen": -206.56991577148438, "logps/rejected": -180.08187866210938, "loss": 0.3156, "rewards/chosen": 0.9054687023162842, "rewards/margins": 4.859877586364746, "rewards/rejected": -3.954408645629883, "step": 2626 }, { "epoch": 0.69, "grad_norm": 37.809608459472656, "kl": 0.0, "learning_rate": 1.56241821512693e-07, "logps/chosen": -208.02249145507812, "logps/rejected": -250.53970336914062, "loss": 0.3625, "rewards/chosen": -0.296889066696167, "rewards/margins": 3.3697378635406494, "rewards/rejected": -3.6666269302368164, "step": 2627 }, { "epoch": 0.69, "grad_norm": 40.240020751953125, "kl": 0.0, "learning_rate": 1.561109657157812e-07, "logps/chosen": -247.34298706054688, "logps/rejected": -228.52392578125, "loss": 0.355, "rewards/chosen": -0.22601445019245148, "rewards/margins": 2.2114017009735107, "rewards/rejected": -2.4374160766601562, "step": 2628 }, { "epoch": 0.69, "grad_norm": 43.382259368896484, "kl": 0.0, "learning_rate": 1.5598010991886938e-07, "logps/chosen": -261.98870849609375, "logps/rejected": -233.58889770507812, "loss": 0.3239, "rewards/chosen": -0.3544365167617798, "rewards/margins": 3.8448009490966797, "rewards/rejected": -4.19923734664917, "step": 2629 }, { "epoch": 0.69, "grad_norm": 28.072172164916992, "kl": 0.0, "learning_rate": 1.558492541219576e-07, "logps/chosen": -210.15908813476562, "logps/rejected": -259.5641784667969, "loss": 0.1628, "rewards/chosen": 1.8099766969680786, "rewards/margins": 5.892375469207764, "rewards/rejected": -4.082398891448975, "step": 2630 }, { "epoch": 0.69, "grad_norm": 33.46635818481445, "kl": 0.0, "learning_rate": 1.557183983250458e-07, "logps/chosen": -228.41885375976562, "logps/rejected": -255.07586669921875, "loss": 0.3827, "rewards/chosen": -0.8277987241744995, "rewards/margins": 1.9675222635269165, "rewards/rejected": -2.795320987701416, "step": 2631 }, { "epoch": 0.69, "grad_norm": 31.32046890258789, "kl": 0.0, "learning_rate": 1.55587542528134e-07, "logps/chosen": -147.96084594726562, "logps/rejected": -237.70318603515625, "loss": 0.2428, "rewards/chosen": 1.3135221004486084, "rewards/margins": 4.7350921630859375, "rewards/rejected": -3.421570301055908, "step": 2632 }, { "epoch": 0.69, "grad_norm": 36.38935852050781, "kl": 0.0, "learning_rate": 1.554566867312222e-07, "logps/chosen": -178.5509490966797, "logps/rejected": -218.0067596435547, "loss": 0.2722, "rewards/chosen": 0.268542617559433, "rewards/margins": 3.8187830448150635, "rewards/rejected": -3.5502405166625977, "step": 2633 }, { "epoch": 0.69, "grad_norm": 33.196720123291016, "kl": 0.0, "learning_rate": 1.5532583093431036e-07, "logps/chosen": -196.33807373046875, "logps/rejected": -196.2073516845703, "loss": 0.2131, "rewards/chosen": 1.0521554946899414, "rewards/margins": 7.267307281494141, "rewards/rejected": -6.215151786804199, "step": 2634 }, { "epoch": 0.69, "grad_norm": 36.032859802246094, "kl": 0.0, "learning_rate": 1.5519497513739856e-07, "logps/chosen": -218.05908203125, "logps/rejected": -240.1728515625, "loss": 0.275, "rewards/chosen": 0.8040695786476135, "rewards/margins": 5.664813041687012, "rewards/rejected": -4.860743522644043, "step": 2635 }, { "epoch": 0.69, "grad_norm": 40.46497344970703, "kl": 0.0, "learning_rate": 1.5506411934048678e-07, "logps/chosen": -168.18862915039062, "logps/rejected": -292.72064208984375, "loss": 0.3087, "rewards/chosen": -0.045253679156303406, "rewards/margins": 6.116161346435547, "rewards/rejected": -6.161415100097656, "step": 2636 }, { "epoch": 0.69, "grad_norm": 28.5998477935791, "kl": 0.0, "learning_rate": 1.5493326354357498e-07, "logps/chosen": -244.7726287841797, "logps/rejected": -213.48959350585938, "loss": 0.222, "rewards/chosen": 1.3389453887939453, "rewards/margins": 3.876892328262329, "rewards/rejected": -2.537946939468384, "step": 2637 }, { "epoch": 0.69, "grad_norm": 26.24286460876465, "kl": 0.0, "learning_rate": 1.5480240774666318e-07, "logps/chosen": -385.7197265625, "logps/rejected": -240.03048706054688, "loss": 0.1706, "rewards/chosen": 0.9546328186988831, "rewards/margins": 4.774143218994141, "rewards/rejected": -3.8195104598999023, "step": 2638 }, { "epoch": 0.69, "grad_norm": 31.864839553833008, "kl": 0.0, "learning_rate": 1.5467155194975137e-07, "logps/chosen": -146.63380432128906, "logps/rejected": -181.90550231933594, "loss": 0.3093, "rewards/chosen": 0.41834282875061035, "rewards/margins": 3.2443583011627197, "rewards/rejected": -2.8260154724121094, "step": 2639 }, { "epoch": 0.69, "grad_norm": 38.77482986450195, "kl": 0.0, "learning_rate": 1.5454069615283957e-07, "logps/chosen": -245.33843994140625, "logps/rejected": -288.5142822265625, "loss": 0.289, "rewards/chosen": 0.6637973785400391, "rewards/margins": 3.466867446899414, "rewards/rejected": -2.803070068359375, "step": 2640 }, { "epoch": 0.69, "grad_norm": 36.03498840332031, "kl": 0.0, "learning_rate": 1.5440984035592776e-07, "logps/chosen": -124.09053802490234, "logps/rejected": -223.87887573242188, "loss": 0.2097, "rewards/chosen": 0.37351658940315247, "rewards/margins": 4.57509183883667, "rewards/rejected": -4.20157527923584, "step": 2641 }, { "epoch": 0.69, "grad_norm": 28.98439598083496, "kl": 0.0, "learning_rate": 1.5427898455901593e-07, "logps/chosen": -198.59564208984375, "logps/rejected": -307.36444091796875, "loss": 0.1569, "rewards/chosen": 1.2171781063079834, "rewards/margins": 6.403229713439941, "rewards/rejected": -5.186051368713379, "step": 2642 }, { "epoch": 0.69, "grad_norm": 34.85798645019531, "kl": 0.0, "learning_rate": 1.5414812876210416e-07, "logps/chosen": -274.24334716796875, "logps/rejected": -247.5282745361328, "loss": 0.1454, "rewards/chosen": -0.12386541068553925, "rewards/margins": 3.7028324604034424, "rewards/rejected": -3.826697826385498, "step": 2643 }, { "epoch": 0.69, "grad_norm": 40.29619216918945, "kl": 0.0, "learning_rate": 1.5401727296519235e-07, "logps/chosen": -299.2466125488281, "logps/rejected": -177.40074157714844, "loss": 0.2911, "rewards/chosen": 0.5023276209831238, "rewards/margins": 3.8164069652557373, "rewards/rejected": -3.3140792846679688, "step": 2644 }, { "epoch": 0.69, "grad_norm": 39.41469192504883, "kl": 0.0, "learning_rate": 1.5388641716828055e-07, "logps/chosen": -159.89877319335938, "logps/rejected": -367.87188720703125, "loss": 0.3222, "rewards/chosen": 0.471483051776886, "rewards/margins": 4.570820331573486, "rewards/rejected": -4.099337100982666, "step": 2645 }, { "epoch": 0.69, "grad_norm": 33.267940521240234, "kl": 0.0, "learning_rate": 1.5375556137136874e-07, "logps/chosen": -288.19708251953125, "logps/rejected": -221.0328369140625, "loss": 0.2419, "rewards/chosen": 1.0454717874526978, "rewards/margins": 3.774181842803955, "rewards/rejected": -2.728710174560547, "step": 2646 }, { "epoch": 0.69, "grad_norm": 38.31817626953125, "kl": 0.0, "learning_rate": 1.5362470557445694e-07, "logps/chosen": -248.33908081054688, "logps/rejected": -239.07632446289062, "loss": 0.2773, "rewards/chosen": 0.7482012510299683, "rewards/margins": 5.982780456542969, "rewards/rejected": -5.234579086303711, "step": 2647 }, { "epoch": 0.69, "grad_norm": 51.11371612548828, "kl": 0.0, "learning_rate": 1.5349384977754514e-07, "logps/chosen": -251.4349822998047, "logps/rejected": -238.06494140625, "loss": 0.2741, "rewards/chosen": -0.5241379737854004, "rewards/margins": 3.8754677772521973, "rewards/rejected": -4.399605751037598, "step": 2648 }, { "epoch": 0.69, "grad_norm": 33.129005432128906, "kl": 0.0, "learning_rate": 1.5336299398063336e-07, "logps/chosen": -205.00674438476562, "logps/rejected": -331.13824462890625, "loss": 0.2413, "rewards/chosen": 0.2049345076084137, "rewards/margins": 5.713682174682617, "rewards/rejected": -5.508747577667236, "step": 2649 }, { "epoch": 0.69, "grad_norm": 35.91727066040039, "kl": 0.0, "learning_rate": 1.5323213818372153e-07, "logps/chosen": -228.20111083984375, "logps/rejected": -253.9140625, "loss": 0.3428, "rewards/chosen": 2.9546375274658203, "rewards/margins": 4.23104190826416, "rewards/rejected": -1.2764043807983398, "step": 2650 }, { "epoch": 0.69, "grad_norm": 26.117427825927734, "kl": 0.0, "learning_rate": 1.5310128238680972e-07, "logps/chosen": -181.5155029296875, "logps/rejected": -233.0501251220703, "loss": 0.2755, "rewards/chosen": 0.12543119490146637, "rewards/margins": 2.9917426109313965, "rewards/rejected": -2.8663113117218018, "step": 2651 }, { "epoch": 0.69, "grad_norm": 48.09539031982422, "kl": 0.0, "learning_rate": 1.5297042658989792e-07, "logps/chosen": -203.1796875, "logps/rejected": -259.39404296875, "loss": 0.2379, "rewards/chosen": 1.2114871740341187, "rewards/margins": 6.3016886711120605, "rewards/rejected": -5.090201377868652, "step": 2652 }, { "epoch": 0.69, "grad_norm": 35.25298309326172, "kl": 0.0, "learning_rate": 1.5283957079298612e-07, "logps/chosen": -290.0359191894531, "logps/rejected": -214.7886505126953, "loss": 0.3032, "rewards/chosen": 1.8502638339996338, "rewards/margins": 5.052051544189453, "rewards/rejected": -3.2017874717712402, "step": 2653 }, { "epoch": 0.69, "grad_norm": 25.84156608581543, "kl": 0.0, "learning_rate": 1.527087149960743e-07, "logps/chosen": -171.43955993652344, "logps/rejected": -305.144287109375, "loss": 0.1473, "rewards/chosen": 1.7509996891021729, "rewards/margins": 6.030915260314941, "rewards/rejected": -4.2799153327941895, "step": 2654 }, { "epoch": 0.69, "grad_norm": 34.59351348876953, "kl": 0.0, "learning_rate": 1.5257785919916254e-07, "logps/chosen": -209.79238891601562, "logps/rejected": -225.7039337158203, "loss": 0.4036, "rewards/chosen": 2.4418153762817383, "rewards/margins": 3.5955920219421387, "rewards/rejected": -1.1537765264511108, "step": 2655 }, { "epoch": 0.7, "grad_norm": 31.137544631958008, "kl": 0.0, "learning_rate": 1.5244700340225073e-07, "logps/chosen": -244.44252014160156, "logps/rejected": -289.1747131347656, "loss": 0.223, "rewards/chosen": 0.9428373575210571, "rewards/margins": 5.725986003875732, "rewards/rejected": -4.783148765563965, "step": 2656 }, { "epoch": 0.7, "grad_norm": 42.90130615234375, "kl": 0.0, "learning_rate": 1.523161476053389e-07, "logps/chosen": -200.8882598876953, "logps/rejected": -211.6605224609375, "loss": 0.2389, "rewards/chosen": 0.7012113332748413, "rewards/margins": 4.256584167480469, "rewards/rejected": -3.555372953414917, "step": 2657 }, { "epoch": 0.7, "grad_norm": 26.240468978881836, "kl": 0.0, "learning_rate": 1.521852918084271e-07, "logps/chosen": -179.78411865234375, "logps/rejected": -234.9866943359375, "loss": 0.2692, "rewards/chosen": 1.508581519126892, "rewards/margins": 5.079866409301758, "rewards/rejected": -3.571284770965576, "step": 2658 }, { "epoch": 0.7, "grad_norm": 54.163883209228516, "kl": 0.0, "learning_rate": 1.520544360115153e-07, "logps/chosen": -178.21804809570312, "logps/rejected": -310.228759765625, "loss": 0.3183, "rewards/chosen": 0.48193949460983276, "rewards/margins": 5.572971820831299, "rewards/rejected": -5.0910325050354, "step": 2659 }, { "epoch": 0.7, "grad_norm": 33.376529693603516, "kl": 0.0, "learning_rate": 1.519235802146035e-07, "logps/chosen": -278.4017028808594, "logps/rejected": -241.92855834960938, "loss": 0.1961, "rewards/chosen": 0.4909915030002594, "rewards/margins": 4.12484073638916, "rewards/rejected": -3.6338493824005127, "step": 2660 }, { "epoch": 0.7, "grad_norm": 35.16926956176758, "kl": 0.0, "learning_rate": 1.5179272441769169e-07, "logps/chosen": -161.02484130859375, "logps/rejected": -265.98419189453125, "loss": 0.295, "rewards/chosen": 0.2557518482208252, "rewards/margins": 5.179582595825195, "rewards/rejected": -4.923830509185791, "step": 2661 }, { "epoch": 0.7, "grad_norm": 37.866119384765625, "kl": 0.0, "learning_rate": 1.516618686207799e-07, "logps/chosen": -188.62120056152344, "logps/rejected": -188.76953125, "loss": 0.2359, "rewards/chosen": 1.9292501211166382, "rewards/margins": 5.487701416015625, "rewards/rejected": -3.5584514141082764, "step": 2662 }, { "epoch": 0.7, "grad_norm": 33.47230529785156, "kl": 0.0, "learning_rate": 1.515310128238681e-07, "logps/chosen": -198.44082641601562, "logps/rejected": -214.91188049316406, "loss": 0.3577, "rewards/chosen": 0.3523358404636383, "rewards/margins": 3.134798765182495, "rewards/rejected": -2.7824628353118896, "step": 2663 }, { "epoch": 0.7, "grad_norm": 38.74263381958008, "kl": 0.0, "learning_rate": 1.514001570269563e-07, "logps/chosen": -285.26641845703125, "logps/rejected": -270.3790283203125, "loss": 0.2876, "rewards/chosen": 0.34473711252212524, "rewards/margins": 4.277539253234863, "rewards/rejected": -3.9328019618988037, "step": 2664 }, { "epoch": 0.7, "grad_norm": 30.61463165283203, "kl": 0.0, "learning_rate": 1.5126930123004447e-07, "logps/chosen": -160.3408966064453, "logps/rejected": -221.6944580078125, "loss": 0.1917, "rewards/chosen": 2.274693012237549, "rewards/margins": 5.692990303039551, "rewards/rejected": -3.418297290802002, "step": 2665 }, { "epoch": 0.7, "grad_norm": 42.397430419921875, "kl": 0.0, "learning_rate": 1.5113844543313267e-07, "logps/chosen": -295.8493957519531, "logps/rejected": -198.1302490234375, "loss": 0.3157, "rewards/chosen": 0.8204801678657532, "rewards/margins": 3.688586473464966, "rewards/rejected": -2.8681063652038574, "step": 2666 }, { "epoch": 0.7, "grad_norm": 38.239810943603516, "kl": 0.0, "learning_rate": 1.5100758963622086e-07, "logps/chosen": -212.99639892578125, "logps/rejected": -250.43011474609375, "loss": 0.2838, "rewards/chosen": 1.9497802257537842, "rewards/margins": 4.861065864562988, "rewards/rejected": -2.911285400390625, "step": 2667 }, { "epoch": 0.7, "grad_norm": 22.39212989807129, "kl": 0.0, "learning_rate": 1.5087673383930908e-07, "logps/chosen": -181.47560119628906, "logps/rejected": -157.10704040527344, "loss": 0.2478, "rewards/chosen": 1.5009618997573853, "rewards/margins": 4.634808540344238, "rewards/rejected": -3.1338467597961426, "step": 2668 }, { "epoch": 0.7, "grad_norm": 39.13432312011719, "kl": 0.0, "learning_rate": 1.5074587804239728e-07, "logps/chosen": -229.26280212402344, "logps/rejected": -272.8870849609375, "loss": 0.3134, "rewards/chosen": 1.0489386320114136, "rewards/margins": 4.14993143081665, "rewards/rejected": -3.1009929180145264, "step": 2669 }, { "epoch": 0.7, "grad_norm": 36.7012825012207, "kl": 0.0, "learning_rate": 1.5061502224548548e-07, "logps/chosen": -220.99365234375, "logps/rejected": -224.8020782470703, "loss": 0.3269, "rewards/chosen": -0.3122532069683075, "rewards/margins": 2.7056050300598145, "rewards/rejected": -3.0178582668304443, "step": 2670 }, { "epoch": 0.7, "grad_norm": 35.51311111450195, "kl": 0.0, "learning_rate": 1.5048416644857367e-07, "logps/chosen": -175.61502075195312, "logps/rejected": -212.26882934570312, "loss": 0.245, "rewards/chosen": 1.2118319272994995, "rewards/margins": 5.409755706787109, "rewards/rejected": -4.19792366027832, "step": 2671 }, { "epoch": 0.7, "grad_norm": 32.9407958984375, "kl": 0.0, "learning_rate": 1.5035331065166187e-07, "logps/chosen": -168.10598754882812, "logps/rejected": -314.8375549316406, "loss": 0.2223, "rewards/chosen": 0.9820523858070374, "rewards/margins": 4.9999566078186035, "rewards/rejected": -4.017904281616211, "step": 2672 }, { "epoch": 0.7, "grad_norm": 40.48832702636719, "kl": 0.0, "learning_rate": 1.5022245485475004e-07, "logps/chosen": -150.69357299804688, "logps/rejected": -287.1880798339844, "loss": 0.2422, "rewards/chosen": 1.9085931777954102, "rewards/margins": 5.372089862823486, "rewards/rejected": -3.463496685028076, "step": 2673 }, { "epoch": 0.7, "grad_norm": 40.512306213378906, "kl": 0.0, "learning_rate": 1.5009159905783823e-07, "logps/chosen": -301.79364013671875, "logps/rejected": -183.44329833984375, "loss": 0.315, "rewards/chosen": 0.24387967586517334, "rewards/margins": 2.798271656036377, "rewards/rejected": -2.554391860961914, "step": 2674 }, { "epoch": 0.7, "grad_norm": 30.446664810180664, "kl": 0.0, "learning_rate": 1.4996074326092646e-07, "logps/chosen": -195.90579223632812, "logps/rejected": -343.2037658691406, "loss": 0.2241, "rewards/chosen": 1.6417574882507324, "rewards/margins": 5.597146987915039, "rewards/rejected": -3.9553897380828857, "step": 2675 }, { "epoch": 0.7, "grad_norm": 32.61503219604492, "kl": 0.0, "learning_rate": 1.4982988746401465e-07, "logps/chosen": -158.60813903808594, "logps/rejected": -193.80609130859375, "loss": 0.231, "rewards/chosen": 2.2817306518554688, "rewards/margins": 5.334161281585693, "rewards/rejected": -3.0524306297302246, "step": 2676 }, { "epoch": 0.7, "grad_norm": 36.24722671508789, "kl": 0.0, "learning_rate": 1.4969903166710285e-07, "logps/chosen": -179.80506896972656, "logps/rejected": -192.8569793701172, "loss": 0.3059, "rewards/chosen": 2.079206943511963, "rewards/margins": 3.3447813987731934, "rewards/rejected": -1.265574336051941, "step": 2677 }, { "epoch": 0.7, "grad_norm": 34.13661193847656, "kl": 0.0, "learning_rate": 1.4956817587019105e-07, "logps/chosen": -181.9069061279297, "logps/rejected": -220.1363525390625, "loss": 0.2766, "rewards/chosen": 1.1834523677825928, "rewards/margins": 4.73173713684082, "rewards/rejected": -3.5482850074768066, "step": 2678 }, { "epoch": 0.7, "grad_norm": 27.697574615478516, "kl": 0.0, "learning_rate": 1.4943732007327924e-07, "logps/chosen": -224.85931396484375, "logps/rejected": -242.00808715820312, "loss": 0.1453, "rewards/chosen": 2.999886989593506, "rewards/margins": 7.181798934936523, "rewards/rejected": -4.181911945343018, "step": 2679 }, { "epoch": 0.7, "grad_norm": 35.0987548828125, "kl": 0.0, "learning_rate": 1.4930646427636744e-07, "logps/chosen": -271.865478515625, "logps/rejected": -132.74562072753906, "loss": 0.1511, "rewards/chosen": 3.920344829559326, "rewards/margins": 5.227973461151123, "rewards/rejected": -1.3076287508010864, "step": 2680 }, { "epoch": 0.7, "grad_norm": 39.40233612060547, "kl": 0.0, "learning_rate": 1.4917560847945563e-07, "logps/chosen": -176.2115020751953, "logps/rejected": -169.6930389404297, "loss": 0.2693, "rewards/chosen": -0.39483752846717834, "rewards/margins": 2.6290271282196045, "rewards/rejected": -3.02386474609375, "step": 2681 }, { "epoch": 0.7, "grad_norm": 34.06243896484375, "kl": 0.0, "learning_rate": 1.4904475268254383e-07, "logps/chosen": -154.3968505859375, "logps/rejected": -251.61129760742188, "loss": 0.2755, "rewards/chosen": -0.5720101594924927, "rewards/margins": 4.821446418762207, "rewards/rejected": -5.39345645904541, "step": 2682 }, { "epoch": 0.7, "grad_norm": 49.528343200683594, "kl": 0.0, "learning_rate": 1.4891389688563203e-07, "logps/chosen": -202.0147705078125, "logps/rejected": -214.54263305664062, "loss": 0.3572, "rewards/chosen": 0.9688149690628052, "rewards/margins": 3.14668607711792, "rewards/rejected": -2.177870988845825, "step": 2683 }, { "epoch": 0.7, "grad_norm": 39.2160530090332, "kl": 0.0, "learning_rate": 1.4878304108872022e-07, "logps/chosen": -317.7562561035156, "logps/rejected": -237.9340057373047, "loss": 0.1387, "rewards/chosen": 2.1960606575012207, "rewards/margins": 5.66136360168457, "rewards/rejected": -3.4653029441833496, "step": 2684 }, { "epoch": 0.7, "grad_norm": 33.726722717285156, "kl": 0.0, "learning_rate": 1.4865218529180842e-07, "logps/chosen": -241.3274688720703, "logps/rejected": -260.200439453125, "loss": 0.2198, "rewards/chosen": 2.3069839477539062, "rewards/margins": 7.066941261291504, "rewards/rejected": -4.759957313537598, "step": 2685 }, { "epoch": 0.7, "grad_norm": 37.164852142333984, "kl": 0.0, "learning_rate": 1.4852132949489661e-07, "logps/chosen": -303.6715087890625, "logps/rejected": -298.21490478515625, "loss": 0.2933, "rewards/chosen": -1.2913599014282227, "rewards/margins": 3.334641933441162, "rewards/rejected": -4.626001834869385, "step": 2686 }, { "epoch": 0.7, "grad_norm": 39.393348693847656, "kl": 0.0, "learning_rate": 1.4839047369798484e-07, "logps/chosen": -285.0570983886719, "logps/rejected": -248.0108642578125, "loss": 0.1947, "rewards/chosen": 2.4950828552246094, "rewards/margins": 5.850523948669434, "rewards/rejected": -3.355441093444824, "step": 2687 }, { "epoch": 0.7, "grad_norm": 30.06007194519043, "kl": 0.0, "learning_rate": 1.48259617901073e-07, "logps/chosen": -205.47360229492188, "logps/rejected": -374.58258056640625, "loss": 0.2137, "rewards/chosen": 1.308136224746704, "rewards/margins": 5.134586334228516, "rewards/rejected": -3.8264501094818115, "step": 2688 }, { "epoch": 0.7, "grad_norm": 36.91188430786133, "kl": 0.0, "learning_rate": 1.481287621041612e-07, "logps/chosen": -172.80679321289062, "logps/rejected": -306.2414245605469, "loss": 0.2751, "rewards/chosen": 0.7240972518920898, "rewards/margins": 4.706175804138184, "rewards/rejected": -3.9820785522460938, "step": 2689 }, { "epoch": 0.7, "grad_norm": 33.07395553588867, "kl": 0.0, "learning_rate": 1.479979063072494e-07, "logps/chosen": -173.76805114746094, "logps/rejected": -237.40435791015625, "loss": 0.2752, "rewards/chosen": 0.2675023376941681, "rewards/margins": 3.5950818061828613, "rewards/rejected": -3.3275794982910156, "step": 2690 }, { "epoch": 0.7, "grad_norm": 28.920312881469727, "kl": 0.0, "learning_rate": 1.478670505103376e-07, "logps/chosen": -187.97975158691406, "logps/rejected": -165.814697265625, "loss": 0.2615, "rewards/chosen": 1.113723635673523, "rewards/margins": 3.297226905822754, "rewards/rejected": -2.1835031509399414, "step": 2691 }, { "epoch": 0.7, "grad_norm": 38.51026916503906, "kl": 0.0, "learning_rate": 1.477361947134258e-07, "logps/chosen": -188.7342987060547, "logps/rejected": -285.8819274902344, "loss": 0.2037, "rewards/chosen": 1.7209330797195435, "rewards/margins": 6.286745548248291, "rewards/rejected": -4.565812587738037, "step": 2692 }, { "epoch": 0.7, "grad_norm": 38.706119537353516, "kl": 0.0, "learning_rate": 1.47605338916514e-07, "logps/chosen": -148.42007446289062, "logps/rejected": -273.3832092285156, "loss": 0.2485, "rewards/chosen": 0.41066497564315796, "rewards/margins": 2.5059401988983154, "rewards/rejected": -2.0952751636505127, "step": 2693 }, { "epoch": 0.71, "grad_norm": 29.851346969604492, "kl": 0.0, "learning_rate": 1.474744831196022e-07, "logps/chosen": -230.69467163085938, "logps/rejected": -298.66607666015625, "loss": 0.2614, "rewards/chosen": 0.36818838119506836, "rewards/margins": 4.815489292144775, "rewards/rejected": -4.447300910949707, "step": 2694 }, { "epoch": 0.71, "grad_norm": 39.34650802612305, "kl": 0.0, "learning_rate": 1.473436273226904e-07, "logps/chosen": -172.72879028320312, "logps/rejected": -300.1294250488281, "loss": 0.2825, "rewards/chosen": 0.9292474389076233, "rewards/margins": 4.453124523162842, "rewards/rejected": -3.5238771438598633, "step": 2695 }, { "epoch": 0.71, "grad_norm": 37.44293212890625, "kl": 0.0, "learning_rate": 1.4721277152577858e-07, "logps/chosen": -181.06982421875, "logps/rejected": -196.63552856445312, "loss": 0.3068, "rewards/chosen": 1.38923180103302, "rewards/margins": 3.2653074264526367, "rewards/rejected": -1.8760757446289062, "step": 2696 }, { "epoch": 0.71, "grad_norm": 29.282588958740234, "kl": 0.0, "learning_rate": 1.4708191572886677e-07, "logps/chosen": -151.28732299804688, "logps/rejected": -230.4415740966797, "loss": 0.2568, "rewards/chosen": 2.138603448867798, "rewards/margins": 6.064770698547363, "rewards/rejected": -3.9261670112609863, "step": 2697 }, { "epoch": 0.71, "grad_norm": 35.645111083984375, "kl": 0.0, "learning_rate": 1.4695105993195497e-07, "logps/chosen": -247.11302185058594, "logps/rejected": -190.36375427246094, "loss": 0.1182, "rewards/chosen": 1.9163931608200073, "rewards/margins": 5.842294692993164, "rewards/rejected": -3.925901412963867, "step": 2698 }, { "epoch": 0.71, "grad_norm": 27.345233917236328, "kl": 0.0, "learning_rate": 1.4682020413504316e-07, "logps/chosen": -240.89288330078125, "logps/rejected": -336.945068359375, "loss": 0.256, "rewards/chosen": 1.7607756853103638, "rewards/margins": 8.707010269165039, "rewards/rejected": -6.946234703063965, "step": 2699 }, { "epoch": 0.71, "grad_norm": 31.545499801635742, "kl": 0.0, "learning_rate": 1.4668934833813139e-07, "logps/chosen": -188.96682739257812, "logps/rejected": -277.4720458984375, "loss": 0.2019, "rewards/chosen": 0.2540876865386963, "rewards/margins": 4.599085807800293, "rewards/rejected": -4.344997882843018, "step": 2700 }, { "epoch": 0.71, "grad_norm": 33.34171676635742, "kl": 0.0, "learning_rate": 1.4655849254121958e-07, "logps/chosen": -175.2567901611328, "logps/rejected": -243.83938598632812, "loss": 0.1399, "rewards/chosen": 2.2294211387634277, "rewards/margins": 5.678004264831543, "rewards/rejected": -3.4485833644866943, "step": 2701 }, { "epoch": 0.71, "grad_norm": 36.91253662109375, "kl": 0.0, "learning_rate": 1.4642763674430778e-07, "logps/chosen": -269.1799011230469, "logps/rejected": -342.16302490234375, "loss": 0.2629, "rewards/chosen": 2.4199092388153076, "rewards/margins": 8.765036582946777, "rewards/rejected": -6.345127582550049, "step": 2702 }, { "epoch": 0.71, "grad_norm": 41.61262130737305, "kl": 0.0, "learning_rate": 1.4629678094739597e-07, "logps/chosen": -185.5560302734375, "logps/rejected": -290.873046875, "loss": 0.2414, "rewards/chosen": 1.2345657348632812, "rewards/margins": 4.153214454650879, "rewards/rejected": -2.9186487197875977, "step": 2703 }, { "epoch": 0.71, "grad_norm": 34.835086822509766, "kl": 0.0, "learning_rate": 1.4616592515048414e-07, "logps/chosen": -244.6498260498047, "logps/rejected": -348.86383056640625, "loss": 0.2069, "rewards/chosen": 3.809274435043335, "rewards/margins": 7.847353935241699, "rewards/rejected": -4.038079738616943, "step": 2704 }, { "epoch": 0.71, "grad_norm": 43.22990036010742, "kl": 0.0, "learning_rate": 1.4603506935357234e-07, "logps/chosen": -250.3634033203125, "logps/rejected": -218.6263427734375, "loss": 0.3321, "rewards/chosen": 1.0411466360092163, "rewards/margins": 2.7422757148742676, "rewards/rejected": -1.7011290788650513, "step": 2705 }, { "epoch": 0.71, "grad_norm": 34.81473159790039, "kl": 0.0, "learning_rate": 1.4590421355666054e-07, "logps/chosen": -167.74928283691406, "logps/rejected": -185.5796356201172, "loss": 0.2666, "rewards/chosen": 1.0022720098495483, "rewards/margins": 3.729177474975586, "rewards/rejected": -2.726905345916748, "step": 2706 }, { "epoch": 0.71, "grad_norm": 40.64683151245117, "kl": 0.0, "learning_rate": 1.4577335775974876e-07, "logps/chosen": -161.60569763183594, "logps/rejected": -221.24998474121094, "loss": 0.2768, "rewards/chosen": -0.024179527536034584, "rewards/margins": 2.4233968257904053, "rewards/rejected": -2.4475762844085693, "step": 2707 }, { "epoch": 0.71, "grad_norm": 29.897737503051758, "kl": 0.0, "learning_rate": 1.4564250196283695e-07, "logps/chosen": -264.284912109375, "logps/rejected": -192.35458374023438, "loss": 0.2565, "rewards/chosen": 1.6027510166168213, "rewards/margins": 4.857088088989258, "rewards/rejected": -3.2543368339538574, "step": 2708 }, { "epoch": 0.71, "grad_norm": 27.95981216430664, "kl": 0.0, "learning_rate": 1.4551164616592515e-07, "logps/chosen": -206.34573364257812, "logps/rejected": -274.0844421386719, "loss": 0.2179, "rewards/chosen": 0.7812842130661011, "rewards/margins": 4.885187149047852, "rewards/rejected": -4.103902816772461, "step": 2709 }, { "epoch": 0.71, "grad_norm": 32.9197998046875, "kl": 0.0, "learning_rate": 1.4538079036901335e-07, "logps/chosen": -214.70269775390625, "logps/rejected": -313.8812255859375, "loss": 0.3233, "rewards/chosen": -0.612155020236969, "rewards/margins": 6.03891134262085, "rewards/rejected": -6.651066303253174, "step": 2710 }, { "epoch": 0.71, "grad_norm": 39.657840728759766, "kl": 0.0, "learning_rate": 1.4524993457210154e-07, "logps/chosen": -226.01513671875, "logps/rejected": -229.1612548828125, "loss": 0.3154, "rewards/chosen": 0.7531558871269226, "rewards/margins": 2.399531126022339, "rewards/rejected": -1.6463751792907715, "step": 2711 }, { "epoch": 0.71, "grad_norm": 32.31519317626953, "kl": 0.0, "learning_rate": 1.451190787751897e-07, "logps/chosen": -236.01914978027344, "logps/rejected": -264.3641052246094, "loss": 0.21, "rewards/chosen": 0.8073936700820923, "rewards/margins": 4.921087741851807, "rewards/rejected": -4.113694190979004, "step": 2712 }, { "epoch": 0.71, "grad_norm": 39.790504455566406, "kl": 0.0, "learning_rate": 1.4498822297827794e-07, "logps/chosen": -220.0768280029297, "logps/rejected": -199.35006713867188, "loss": 0.1974, "rewards/chosen": 1.070143699645996, "rewards/margins": 3.531202793121338, "rewards/rejected": -2.461059093475342, "step": 2713 }, { "epoch": 0.71, "grad_norm": 39.84419250488281, "kl": 0.0, "learning_rate": 1.4485736718136613e-07, "logps/chosen": -122.71448516845703, "logps/rejected": -245.1878662109375, "loss": 0.2674, "rewards/chosen": -0.22302575409412384, "rewards/margins": 4.754169940948486, "rewards/rejected": -4.977195739746094, "step": 2714 }, { "epoch": 0.71, "grad_norm": 32.12276077270508, "kl": 0.0, "learning_rate": 1.4472651138445433e-07, "logps/chosen": -229.18992614746094, "logps/rejected": -229.65902709960938, "loss": 0.3738, "rewards/chosen": -0.2686086595058441, "rewards/margins": 3.344529628753662, "rewards/rejected": -3.613138198852539, "step": 2715 }, { "epoch": 0.71, "grad_norm": 35.63005447387695, "kl": 0.0, "learning_rate": 1.4459565558754252e-07, "logps/chosen": -240.97824096679688, "logps/rejected": -382.0249938964844, "loss": 0.2113, "rewards/chosen": 1.1434698104858398, "rewards/margins": 10.897199630737305, "rewards/rejected": -9.753729820251465, "step": 2716 }, { "epoch": 0.71, "grad_norm": 31.236305236816406, "kl": 0.0, "learning_rate": 1.4446479979063072e-07, "logps/chosen": -232.86383056640625, "logps/rejected": -166.2635498046875, "loss": 0.3384, "rewards/chosen": 0.37403538823127747, "rewards/margins": 2.4673333168029785, "rewards/rejected": -2.0932979583740234, "step": 2717 }, { "epoch": 0.71, "grad_norm": 30.102787017822266, "kl": 0.0, "learning_rate": 1.4433394399371892e-07, "logps/chosen": -209.07101440429688, "logps/rejected": -200.48077392578125, "loss": 0.2709, "rewards/chosen": 1.7002995014190674, "rewards/margins": 7.061903953552246, "rewards/rejected": -5.361604690551758, "step": 2718 }, { "epoch": 0.71, "grad_norm": 46.64667892456055, "kl": 0.0, "learning_rate": 1.442030881968071e-07, "logps/chosen": -135.42672729492188, "logps/rejected": -240.09103393554688, "loss": 0.2792, "rewards/chosen": 0.5562674403190613, "rewards/margins": 4.204452991485596, "rewards/rejected": -3.6481854915618896, "step": 2719 }, { "epoch": 0.71, "grad_norm": 32.72495651245117, "kl": 0.0, "learning_rate": 1.440722323998953e-07, "logps/chosen": -228.77261352539062, "logps/rejected": -401.26251220703125, "loss": 0.2102, "rewards/chosen": 0.26309388875961304, "rewards/margins": 4.260984897613525, "rewards/rejected": -3.9978909492492676, "step": 2720 }, { "epoch": 0.71, "grad_norm": 25.724273681640625, "kl": 0.0, "learning_rate": 1.439413766029835e-07, "logps/chosen": -179.09849548339844, "logps/rejected": -202.8656005859375, "loss": 0.2205, "rewards/chosen": 0.9190692901611328, "rewards/margins": 5.547544002532959, "rewards/rejected": -4.628474712371826, "step": 2721 }, { "epoch": 0.71, "grad_norm": 33.738853454589844, "kl": 0.0, "learning_rate": 1.438105208060717e-07, "logps/chosen": -255.45628356933594, "logps/rejected": -245.0790252685547, "loss": 0.1962, "rewards/chosen": 3.40286922454834, "rewards/margins": 6.362302780151367, "rewards/rejected": -2.9594333171844482, "step": 2722 }, { "epoch": 0.71, "grad_norm": 47.03477096557617, "kl": 0.0, "learning_rate": 1.436796650091599e-07, "logps/chosen": -235.11585998535156, "logps/rejected": -235.00161743164062, "loss": 0.2361, "rewards/chosen": 1.440369725227356, "rewards/margins": 5.251692295074463, "rewards/rejected": -3.8113226890563965, "step": 2723 }, { "epoch": 0.71, "grad_norm": 34.62791442871094, "kl": 0.0, "learning_rate": 1.435488092122481e-07, "logps/chosen": -244.5506591796875, "logps/rejected": -347.0370178222656, "loss": 0.3516, "rewards/chosen": 0.8647078275680542, "rewards/margins": 4.863985538482666, "rewards/rejected": -3.9992778301239014, "step": 2724 }, { "epoch": 0.71, "grad_norm": 30.0848445892334, "kl": 0.0, "learning_rate": 1.434179534153363e-07, "logps/chosen": -207.02052307128906, "logps/rejected": -293.9341125488281, "loss": 0.1265, "rewards/chosen": 2.3349344730377197, "rewards/margins": 6.6972551345825195, "rewards/rejected": -4.362320899963379, "step": 2725 }, { "epoch": 0.71, "grad_norm": 26.7783260345459, "kl": 0.0, "learning_rate": 1.432870976184245e-07, "logps/chosen": -170.4520721435547, "logps/rejected": -281.2052917480469, "loss": 0.1864, "rewards/chosen": 1.0393160581588745, "rewards/margins": 5.177910804748535, "rewards/rejected": -4.138594627380371, "step": 2726 }, { "epoch": 0.71, "grad_norm": 32.45759201049805, "kl": 0.0, "learning_rate": 1.4315624182151268e-07, "logps/chosen": -229.599365234375, "logps/rejected": -188.02590942382812, "loss": 0.2791, "rewards/chosen": 2.7275326251983643, "rewards/margins": 5.223902702331543, "rewards/rejected": -2.4963700771331787, "step": 2727 }, { "epoch": 0.71, "grad_norm": 25.652467727661133, "kl": 0.0, "learning_rate": 1.4302538602460088e-07, "logps/chosen": -242.80453491210938, "logps/rejected": -230.92465209960938, "loss": 0.2005, "rewards/chosen": 1.264347791671753, "rewards/margins": 6.404541969299316, "rewards/rejected": -5.140194416046143, "step": 2728 }, { "epoch": 0.71, "grad_norm": 43.789573669433594, "kl": 0.0, "learning_rate": 1.4289453022768907e-07, "logps/chosen": -203.66488647460938, "logps/rejected": -180.29945373535156, "loss": 0.2973, "rewards/chosen": 1.972968578338623, "rewards/margins": 4.173624038696289, "rewards/rejected": -2.200655460357666, "step": 2729 }, { "epoch": 0.71, "grad_norm": 40.94158172607422, "kl": 0.0, "learning_rate": 1.4276367443077727e-07, "logps/chosen": -259.09027099609375, "logps/rejected": -186.64659118652344, "loss": 0.3076, "rewards/chosen": 0.5558538436889648, "rewards/margins": 3.275819778442383, "rewards/rejected": -2.719965934753418, "step": 2730 }, { "epoch": 0.71, "grad_norm": 31.99633026123047, "kl": 0.0, "learning_rate": 1.4263281863386546e-07, "logps/chosen": -157.29074096679688, "logps/rejected": -279.3443603515625, "loss": 0.145, "rewards/chosen": 0.8477217555046082, "rewards/margins": 5.280526638031006, "rewards/rejected": -4.432805061340332, "step": 2731 }, { "epoch": 0.71, "grad_norm": 27.111766815185547, "kl": 0.0, "learning_rate": 1.425019628369537e-07, "logps/chosen": -153.20616149902344, "logps/rejected": -160.79698181152344, "loss": 0.1781, "rewards/chosen": 3.4651429653167725, "rewards/margins": 6.490252494812012, "rewards/rejected": -3.0251095294952393, "step": 2732 }, { "epoch": 0.72, "grad_norm": 33.77357864379883, "kl": 0.0, "learning_rate": 1.4237110704004188e-07, "logps/chosen": -252.4764404296875, "logps/rejected": -168.84027099609375, "loss": 0.3839, "rewards/chosen": -0.3884561359882355, "rewards/margins": 2.14807391166687, "rewards/rejected": -2.536530017852783, "step": 2733 }, { "epoch": 0.72, "grad_norm": 32.69058609008789, "kl": 0.0, "learning_rate": 1.4224025124313008e-07, "logps/chosen": -198.02716064453125, "logps/rejected": -187.51708984375, "loss": 0.2079, "rewards/chosen": 1.7017436027526855, "rewards/margins": 3.9817001819610596, "rewards/rejected": -2.279956579208374, "step": 2734 }, { "epoch": 0.72, "grad_norm": 25.28070640563965, "kl": 0.0, "learning_rate": 1.4210939544621825e-07, "logps/chosen": -182.21871948242188, "logps/rejected": -251.4492950439453, "loss": 0.1658, "rewards/chosen": 0.9423468708992004, "rewards/margins": 4.9712605476379395, "rewards/rejected": -4.028913497924805, "step": 2735 }, { "epoch": 0.72, "grad_norm": 28.65892791748047, "kl": 0.0, "learning_rate": 1.4197853964930645e-07, "logps/chosen": -171.02603149414062, "logps/rejected": -222.0957489013672, "loss": 0.3054, "rewards/chosen": -0.5188900232315063, "rewards/margins": 1.9518908262252808, "rewards/rejected": -2.470780849456787, "step": 2736 }, { "epoch": 0.72, "grad_norm": 28.866334915161133, "kl": 0.0, "learning_rate": 1.4184768385239464e-07, "logps/chosen": -207.8139190673828, "logps/rejected": -366.18408203125, "loss": 0.2341, "rewards/chosen": 1.4074229001998901, "rewards/margins": 5.109437465667725, "rewards/rejected": -3.702014446258545, "step": 2737 }, { "epoch": 0.72, "grad_norm": 26.634572982788086, "kl": 0.0, "learning_rate": 1.4171682805548286e-07, "logps/chosen": -260.7043762207031, "logps/rejected": -213.9114990234375, "loss": 0.2407, "rewards/chosen": -0.022909751161932945, "rewards/margins": 4.45982027053833, "rewards/rejected": -4.482729911804199, "step": 2738 }, { "epoch": 0.72, "grad_norm": 32.87309646606445, "kl": 0.0, "learning_rate": 1.4158597225857106e-07, "logps/chosen": -147.40817260742188, "logps/rejected": -252.726806640625, "loss": 0.2776, "rewards/chosen": 1.218986988067627, "rewards/margins": 3.1168408393859863, "rewards/rejected": -1.8978537321090698, "step": 2739 }, { "epoch": 0.72, "grad_norm": 21.100400924682617, "kl": 0.0, "learning_rate": 1.4145511646165926e-07, "logps/chosen": -334.18707275390625, "logps/rejected": -321.79534912109375, "loss": 0.2434, "rewards/chosen": 0.3896353840827942, "rewards/margins": 4.801957607269287, "rewards/rejected": -4.412322044372559, "step": 2740 }, { "epoch": 0.72, "grad_norm": 43.7648811340332, "kl": 0.0, "learning_rate": 1.4132426066474745e-07, "logps/chosen": -217.05458068847656, "logps/rejected": -253.01095581054688, "loss": 0.3203, "rewards/chosen": 0.4657108783721924, "rewards/margins": 3.0607008934020996, "rewards/rejected": -2.5949900150299072, "step": 2741 }, { "epoch": 0.72, "grad_norm": 26.86048698425293, "kl": 0.0, "learning_rate": 1.4119340486783562e-07, "logps/chosen": -216.07846069335938, "logps/rejected": -217.1433563232422, "loss": 0.1815, "rewards/chosen": 0.9621667265892029, "rewards/margins": 4.179025173187256, "rewards/rejected": -3.2168586254119873, "step": 2742 }, { "epoch": 0.72, "grad_norm": 34.33148193359375, "kl": 0.0, "learning_rate": 1.4106254907092382e-07, "logps/chosen": -208.30294799804688, "logps/rejected": -191.87002563476562, "loss": 0.3044, "rewards/chosen": 0.46693187952041626, "rewards/margins": 4.43991231918335, "rewards/rejected": -3.972980499267578, "step": 2743 }, { "epoch": 0.72, "grad_norm": 36.13833236694336, "kl": 0.0, "learning_rate": 1.4093169327401201e-07, "logps/chosen": -193.1121368408203, "logps/rejected": -216.50210571289062, "loss": 0.3074, "rewards/chosen": 0.9344757795333862, "rewards/margins": 3.430565357208252, "rewards/rejected": -2.496089458465576, "step": 2744 }, { "epoch": 0.72, "grad_norm": 31.421340942382812, "kl": 0.0, "learning_rate": 1.4080083747710024e-07, "logps/chosen": -174.89791870117188, "logps/rejected": -237.71141052246094, "loss": 0.3358, "rewards/chosen": 0.9983955025672913, "rewards/margins": 4.591882705688477, "rewards/rejected": -3.59348726272583, "step": 2745 }, { "epoch": 0.72, "grad_norm": 48.589500427246094, "kl": 0.0, "learning_rate": 1.4066998168018843e-07, "logps/chosen": -248.13365173339844, "logps/rejected": -260.446533203125, "loss": 0.3376, "rewards/chosen": 0.1320873200893402, "rewards/margins": 3.9734907150268555, "rewards/rejected": -3.8414034843444824, "step": 2746 }, { "epoch": 0.72, "grad_norm": 35.76465606689453, "kl": 0.0, "learning_rate": 1.4053912588327663e-07, "logps/chosen": -162.7195587158203, "logps/rejected": -221.4072723388672, "loss": 0.2968, "rewards/chosen": 1.0338228940963745, "rewards/margins": 3.8434853553771973, "rewards/rejected": -2.809662342071533, "step": 2747 }, { "epoch": 0.72, "grad_norm": 37.44093322753906, "kl": 0.0, "learning_rate": 1.4040827008636483e-07, "logps/chosen": -219.79983520507812, "logps/rejected": -173.9461212158203, "loss": 0.3516, "rewards/chosen": 0.971840500831604, "rewards/margins": 5.520664215087891, "rewards/rejected": -4.548823833465576, "step": 2748 }, { "epoch": 0.72, "grad_norm": 35.50620651245117, "kl": 0.0, "learning_rate": 1.4027741428945302e-07, "logps/chosen": -127.27174377441406, "logps/rejected": -257.6485595703125, "loss": 0.2437, "rewards/chosen": 0.9993495941162109, "rewards/margins": 4.665413856506348, "rewards/rejected": -3.666064500808716, "step": 2749 }, { "epoch": 0.72, "grad_norm": 36.798675537109375, "kl": 0.0, "learning_rate": 1.401465584925412e-07, "logps/chosen": -230.24996948242188, "logps/rejected": -275.0447082519531, "loss": 0.2929, "rewards/chosen": 2.6916356086730957, "rewards/margins": 5.939266204833984, "rewards/rejected": -3.2476305961608887, "step": 2750 }, { "epoch": 0.72, "grad_norm": 34.59363555908203, "kl": 0.0, "learning_rate": 1.4001570269562941e-07, "logps/chosen": -182.05906677246094, "logps/rejected": -231.069091796875, "loss": 0.2712, "rewards/chosen": 1.328806757926941, "rewards/margins": 4.8035783767700195, "rewards/rejected": -3.474771738052368, "step": 2751 }, { "epoch": 0.72, "grad_norm": 35.60285949707031, "kl": 0.0, "learning_rate": 1.398848468987176e-07, "logps/chosen": -207.7548065185547, "logps/rejected": -264.8133544921875, "loss": 0.2353, "rewards/chosen": 0.33422619104385376, "rewards/margins": 3.725780963897705, "rewards/rejected": -3.391554832458496, "step": 2752 }, { "epoch": 0.72, "grad_norm": 41.25855255126953, "kl": 0.0, "learning_rate": 1.397539911018058e-07, "logps/chosen": -221.4940948486328, "logps/rejected": -160.0779571533203, "loss": 0.2543, "rewards/chosen": 0.46627089381217957, "rewards/margins": 2.0166687965393066, "rewards/rejected": -1.5503979921340942, "step": 2753 }, { "epoch": 0.72, "grad_norm": 29.777326583862305, "kl": 0.0, "learning_rate": 1.39623135304894e-07, "logps/chosen": -112.76001739501953, "logps/rejected": -175.68617248535156, "loss": 0.2654, "rewards/chosen": 2.5609025955200195, "rewards/margins": 4.485593318939209, "rewards/rejected": -1.9246906042099, "step": 2754 }, { "epoch": 0.72, "grad_norm": 31.612537384033203, "kl": 0.0, "learning_rate": 1.394922795079822e-07, "logps/chosen": -228.182373046875, "logps/rejected": -271.0409851074219, "loss": 0.2301, "rewards/chosen": 0.5148449540138245, "rewards/margins": 3.6974213123321533, "rewards/rejected": -3.1825764179229736, "step": 2755 }, { "epoch": 0.72, "grad_norm": 28.019922256469727, "kl": 0.0, "learning_rate": 1.393614237110704e-07, "logps/chosen": -129.9990997314453, "logps/rejected": -211.51722717285156, "loss": 0.2257, "rewards/chosen": 0.5919703245162964, "rewards/margins": 3.691455364227295, "rewards/rejected": -3.099485158920288, "step": 2756 }, { "epoch": 0.72, "grad_norm": 33.64453887939453, "kl": 0.0, "learning_rate": 1.3923056791415862e-07, "logps/chosen": -292.7642822265625, "logps/rejected": -293.0862731933594, "loss": 0.2164, "rewards/chosen": 0.4971492290496826, "rewards/margins": 4.758174896240234, "rewards/rejected": -4.261025905609131, "step": 2757 }, { "epoch": 0.72, "grad_norm": 33.85685348510742, "kl": 0.0, "learning_rate": 1.3909971211724679e-07, "logps/chosen": -134.65257263183594, "logps/rejected": -253.585693359375, "loss": 0.2708, "rewards/chosen": 1.0100104808807373, "rewards/margins": 3.8374264240264893, "rewards/rejected": -2.827415943145752, "step": 2758 }, { "epoch": 0.72, "grad_norm": 31.849849700927734, "kl": 0.0, "learning_rate": 1.3896885632033498e-07, "logps/chosen": -227.2148895263672, "logps/rejected": -259.6429138183594, "loss": 0.2142, "rewards/chosen": 2.4732985496520996, "rewards/margins": 5.716508865356445, "rewards/rejected": -3.2432103157043457, "step": 2759 }, { "epoch": 0.72, "grad_norm": 43.46824264526367, "kl": 0.0, "learning_rate": 1.3883800052342318e-07, "logps/chosen": -159.94631958007812, "logps/rejected": -315.32855224609375, "loss": 0.2885, "rewards/chosen": -0.026400430127978325, "rewards/margins": 3.871565103530884, "rewards/rejected": -3.897965431213379, "step": 2760 }, { "epoch": 0.72, "grad_norm": 33.06340789794922, "kl": 0.0, "learning_rate": 1.3870714472651137e-07, "logps/chosen": -186.7371063232422, "logps/rejected": -278.01531982421875, "loss": 0.1404, "rewards/chosen": 2.455404758453369, "rewards/margins": 7.5706915855407715, "rewards/rejected": -5.115286827087402, "step": 2761 }, { "epoch": 0.72, "grad_norm": 34.241844177246094, "kl": 0.0, "learning_rate": 1.3857628892959957e-07, "logps/chosen": -138.23428344726562, "logps/rejected": -227.390625, "loss": 0.3009, "rewards/chosen": -0.31029558181762695, "rewards/margins": 2.701427459716797, "rewards/rejected": -3.011723041534424, "step": 2762 }, { "epoch": 0.72, "grad_norm": 33.2817268371582, "kl": 0.0, "learning_rate": 1.3844543313268777e-07, "logps/chosen": -150.46156311035156, "logps/rejected": -204.14328002929688, "loss": 0.2398, "rewards/chosen": 0.7092301845550537, "rewards/margins": 3.1490092277526855, "rewards/rejected": -2.439779043197632, "step": 2763 }, { "epoch": 0.72, "grad_norm": 41.400272369384766, "kl": 0.0, "learning_rate": 1.38314577335776e-07, "logps/chosen": -237.2517852783203, "logps/rejected": -275.0649719238281, "loss": 0.3358, "rewards/chosen": -1.6412899494171143, "rewards/margins": 0.6008193492889404, "rewards/rejected": -2.2421092987060547, "step": 2764 }, { "epoch": 0.72, "grad_norm": 37.423851013183594, "kl": 0.0, "learning_rate": 1.3818372153886419e-07, "logps/chosen": -294.60052490234375, "logps/rejected": -282.4591064453125, "loss": 0.3369, "rewards/chosen": -0.7444419264793396, "rewards/margins": 2.0405898094177246, "rewards/rejected": -2.785031795501709, "step": 2765 }, { "epoch": 0.72, "grad_norm": 30.79787826538086, "kl": 0.0, "learning_rate": 1.3805286574195235e-07, "logps/chosen": -246.27394104003906, "logps/rejected": -155.23995971679688, "loss": 0.2902, "rewards/chosen": 0.21204765141010284, "rewards/margins": 3.1054985523223877, "rewards/rejected": -2.893450975418091, "step": 2766 }, { "epoch": 0.72, "grad_norm": 27.49995994567871, "kl": 0.0, "learning_rate": 1.3792200994504055e-07, "logps/chosen": -182.4592742919922, "logps/rejected": -204.7582244873047, "loss": 0.242, "rewards/chosen": 1.772039771080017, "rewards/margins": 4.657109260559082, "rewards/rejected": -2.8850696086883545, "step": 2767 }, { "epoch": 0.72, "grad_norm": 29.28543472290039, "kl": 0.0, "learning_rate": 1.3779115414812875e-07, "logps/chosen": -198.22662353515625, "logps/rejected": -167.86358642578125, "loss": 0.2965, "rewards/chosen": 1.0747374296188354, "rewards/margins": 4.275830268859863, "rewards/rejected": -3.2010929584503174, "step": 2768 }, { "epoch": 0.72, "grad_norm": 37.219173431396484, "kl": 0.0, "learning_rate": 1.3766029835121694e-07, "logps/chosen": -245.44570922851562, "logps/rejected": -235.1141357421875, "loss": 0.1944, "rewards/chosen": 2.7740979194641113, "rewards/margins": 5.648436546325684, "rewards/rejected": -2.8743388652801514, "step": 2769 }, { "epoch": 0.72, "grad_norm": 34.19056701660156, "kl": 0.0, "learning_rate": 1.3752944255430517e-07, "logps/chosen": -313.8372497558594, "logps/rejected": -224.35574340820312, "loss": 0.1986, "rewards/chosen": 1.6918792724609375, "rewards/margins": 4.29853630065918, "rewards/rejected": -2.606657028198242, "step": 2770 }, { "epoch": 0.73, "grad_norm": 32.03276824951172, "kl": 0.0, "learning_rate": 1.3739858675739336e-07, "logps/chosen": -176.02622985839844, "logps/rejected": -199.72654724121094, "loss": 0.3817, "rewards/chosen": -0.32908573746681213, "rewards/margins": 1.3851908445358276, "rewards/rejected": -1.7142765522003174, "step": 2771 }, { "epoch": 0.73, "grad_norm": 34.00090789794922, "kl": 0.0, "learning_rate": 1.3726773096048156e-07, "logps/chosen": -252.97853088378906, "logps/rejected": -260.77093505859375, "loss": 0.2482, "rewards/chosen": 1.4872760772705078, "rewards/margins": 5.681765556335449, "rewards/rejected": -4.194489479064941, "step": 2772 }, { "epoch": 0.73, "grad_norm": 34.82801818847656, "kl": 0.0, "learning_rate": 1.3713687516356973e-07, "logps/chosen": -259.81781005859375, "logps/rejected": -212.13023376464844, "loss": 0.2776, "rewards/chosen": 1.3029401302337646, "rewards/margins": 4.064812660217285, "rewards/rejected": -2.7618727684020996, "step": 2773 }, { "epoch": 0.73, "grad_norm": 30.624717712402344, "kl": 0.0, "learning_rate": 1.3700601936665792e-07, "logps/chosen": -237.0125732421875, "logps/rejected": -277.92010498046875, "loss": 0.2677, "rewards/chosen": 2.0963778495788574, "rewards/margins": 5.673805236816406, "rewards/rejected": -3.577427387237549, "step": 2774 }, { "epoch": 0.73, "grad_norm": 39.34622573852539, "kl": 0.0, "learning_rate": 1.3687516356974612e-07, "logps/chosen": -189.960205078125, "logps/rejected": -252.06198120117188, "loss": 0.2544, "rewards/chosen": 1.7920467853546143, "rewards/margins": 3.5940065383911133, "rewards/rejected": -1.801959753036499, "step": 2775 }, { "epoch": 0.73, "grad_norm": 42.03234100341797, "kl": 0.0, "learning_rate": 1.3674430777283432e-07, "logps/chosen": -265.41485595703125, "logps/rejected": -206.2362823486328, "loss": 0.3364, "rewards/chosen": 0.413890540599823, "rewards/margins": 3.1098921298980713, "rewards/rejected": -2.6960015296936035, "step": 2776 }, { "epoch": 0.73, "grad_norm": 31.57137680053711, "kl": 0.0, "learning_rate": 1.3661345197592254e-07, "logps/chosen": -189.6539764404297, "logps/rejected": -242.28109741210938, "loss": 0.2578, "rewards/chosen": 1.3802999258041382, "rewards/margins": 4.1886820793151855, "rewards/rejected": -2.808382034301758, "step": 2777 }, { "epoch": 0.73, "grad_norm": 29.047449111938477, "kl": 0.0, "learning_rate": 1.3648259617901073e-07, "logps/chosen": -260.63482666015625, "logps/rejected": -221.86618041992188, "loss": 0.3031, "rewards/chosen": -0.33293989300727844, "rewards/margins": 2.5053622722625732, "rewards/rejected": -2.8383021354675293, "step": 2778 }, { "epoch": 0.73, "grad_norm": 26.14084243774414, "kl": 0.0, "learning_rate": 1.3635174038209893e-07, "logps/chosen": -132.5585479736328, "logps/rejected": -174.09848022460938, "loss": 0.2469, "rewards/chosen": 0.8879119157791138, "rewards/margins": 3.6658644676208496, "rewards/rejected": -2.7779526710510254, "step": 2779 }, { "epoch": 0.73, "grad_norm": 41.65951156616211, "kl": 0.0, "learning_rate": 1.3622088458518713e-07, "logps/chosen": -217.3573455810547, "logps/rejected": -331.310546875, "loss": 0.2999, "rewards/chosen": 1.362498164176941, "rewards/margins": 3.556079864501953, "rewards/rejected": -2.1935815811157227, "step": 2780 }, { "epoch": 0.73, "grad_norm": 37.427066802978516, "kl": 0.0, "learning_rate": 1.360900287882753e-07, "logps/chosen": -218.29087829589844, "logps/rejected": -271.3686218261719, "loss": 0.3319, "rewards/chosen": -0.09768380224704742, "rewards/margins": 2.889094829559326, "rewards/rejected": -2.986778736114502, "step": 2781 }, { "epoch": 0.73, "grad_norm": 39.8854866027832, "kl": 0.0, "learning_rate": 1.359591729913635e-07, "logps/chosen": -164.79013061523438, "logps/rejected": -248.52499389648438, "loss": 0.297, "rewards/chosen": -0.26507285237312317, "rewards/margins": 3.319533109664917, "rewards/rejected": -3.5846059322357178, "step": 2782 }, { "epoch": 0.73, "grad_norm": 30.36602783203125, "kl": 0.0, "learning_rate": 1.3582831719445171e-07, "logps/chosen": -215.8096160888672, "logps/rejected": -179.60113525390625, "loss": 0.2619, "rewards/chosen": -0.6932493448257446, "rewards/margins": 1.766062617301941, "rewards/rejected": -2.4593119621276855, "step": 2783 }, { "epoch": 0.73, "grad_norm": 34.83140182495117, "kl": 0.0, "learning_rate": 1.356974613975399e-07, "logps/chosen": -269.39556884765625, "logps/rejected": -259.3585205078125, "loss": 0.2726, "rewards/chosen": 1.2403514385223389, "rewards/margins": 4.321313858032227, "rewards/rejected": -3.0809621810913086, "step": 2784 }, { "epoch": 0.73, "grad_norm": 35.393802642822266, "kl": 0.0, "learning_rate": 1.355666056006281e-07, "logps/chosen": -228.4115447998047, "logps/rejected": -283.6307373046875, "loss": 0.272, "rewards/chosen": 1.7523528337478638, "rewards/margins": 5.948849678039551, "rewards/rejected": -4.196496963500977, "step": 2785 }, { "epoch": 0.73, "grad_norm": 26.124181747436523, "kl": 0.0, "learning_rate": 1.354357498037163e-07, "logps/chosen": -196.4839324951172, "logps/rejected": -228.32936096191406, "loss": 0.2173, "rewards/chosen": 0.9325391054153442, "rewards/margins": 4.881499767303467, "rewards/rejected": -3.948960781097412, "step": 2786 }, { "epoch": 0.73, "grad_norm": 38.89645767211914, "kl": 0.0, "learning_rate": 1.353048940068045e-07, "logps/chosen": -178.40771484375, "logps/rejected": -188.349853515625, "loss": 0.4389, "rewards/chosen": -0.047868043184280396, "rewards/margins": 1.500084400177002, "rewards/rejected": -1.54795241355896, "step": 2787 }, { "epoch": 0.73, "grad_norm": 34.04124450683594, "kl": 0.0, "learning_rate": 1.351740382098927e-07, "logps/chosen": -254.98538208007812, "logps/rejected": -298.51654052734375, "loss": 0.3969, "rewards/chosen": 0.29132279753685, "rewards/margins": 4.159118175506592, "rewards/rejected": -3.86779522895813, "step": 2788 }, { "epoch": 0.73, "grad_norm": 27.67196273803711, "kl": 0.0, "learning_rate": 1.3504318241298086e-07, "logps/chosen": -174.4413299560547, "logps/rejected": -330.2989807128906, "loss": 0.18, "rewards/chosen": 0.7002738118171692, "rewards/margins": 5.744529724121094, "rewards/rejected": -5.04425573348999, "step": 2789 }, { "epoch": 0.73, "grad_norm": 33.7601203918457, "kl": 0.0, "learning_rate": 1.349123266160691e-07, "logps/chosen": -149.75143432617188, "logps/rejected": -281.0748596191406, "loss": 0.2179, "rewards/chosen": 1.7364554405212402, "rewards/margins": 6.038804531097412, "rewards/rejected": -4.302349090576172, "step": 2790 }, { "epoch": 0.73, "grad_norm": 34.44429016113281, "kl": 0.0, "learning_rate": 1.3478147081915728e-07, "logps/chosen": -235.973876953125, "logps/rejected": -297.8359375, "loss": 0.2328, "rewards/chosen": 1.891502857208252, "rewards/margins": 6.574864387512207, "rewards/rejected": -4.683361530303955, "step": 2791 }, { "epoch": 0.73, "grad_norm": 37.90352249145508, "kl": 0.0, "learning_rate": 1.3465061502224548e-07, "logps/chosen": -108.64974212646484, "logps/rejected": -247.4210205078125, "loss": 0.2774, "rewards/chosen": 1.4293962717056274, "rewards/margins": 3.7838568687438965, "rewards/rejected": -2.3544607162475586, "step": 2792 }, { "epoch": 0.73, "grad_norm": 34.55895233154297, "kl": 0.0, "learning_rate": 1.3451975922533368e-07, "logps/chosen": -239.15586853027344, "logps/rejected": -230.9355010986328, "loss": 0.3273, "rewards/chosen": 0.25174659490585327, "rewards/margins": 3.4415252208709717, "rewards/rejected": -3.1897785663604736, "step": 2793 }, { "epoch": 0.73, "grad_norm": 39.81635284423828, "kl": 0.0, "learning_rate": 1.3438890342842187e-07, "logps/chosen": -253.4542236328125, "logps/rejected": -341.01483154296875, "loss": 0.2496, "rewards/chosen": 0.5031195878982544, "rewards/margins": 4.771719932556152, "rewards/rejected": -4.2686004638671875, "step": 2794 }, { "epoch": 0.73, "grad_norm": 29.70501136779785, "kl": 0.0, "learning_rate": 1.3425804763151007e-07, "logps/chosen": -154.07347106933594, "logps/rejected": -212.02188110351562, "loss": 0.2204, "rewards/chosen": 0.0629243478178978, "rewards/margins": 2.979095220565796, "rewards/rejected": -2.916170835494995, "step": 2795 }, { "epoch": 0.73, "grad_norm": 25.16206932067871, "kl": 0.0, "learning_rate": 1.341271918345983e-07, "logps/chosen": -204.69512939453125, "logps/rejected": -185.43121337890625, "loss": 0.1135, "rewards/chosen": 2.616105318069458, "rewards/margins": 5.921466827392578, "rewards/rejected": -3.305361747741699, "step": 2796 }, { "epoch": 0.73, "grad_norm": 35.79527282714844, "kl": 0.0, "learning_rate": 1.3399633603768646e-07, "logps/chosen": -273.466552734375, "logps/rejected": -260.4999084472656, "loss": 0.2636, "rewards/chosen": -0.1472821831703186, "rewards/margins": 4.861036777496338, "rewards/rejected": -5.008318901062012, "step": 2797 }, { "epoch": 0.73, "grad_norm": 32.936561584472656, "kl": 0.0, "learning_rate": 1.3386548024077466e-07, "logps/chosen": -256.4706115722656, "logps/rejected": -284.8575744628906, "loss": 0.2715, "rewards/chosen": 0.46489885449409485, "rewards/margins": 3.55414080619812, "rewards/rejected": -3.0892419815063477, "step": 2798 }, { "epoch": 0.73, "grad_norm": 29.936857223510742, "kl": 0.0, "learning_rate": 1.3373462444386285e-07, "logps/chosen": -218.6918487548828, "logps/rejected": -254.4846649169922, "loss": 0.2718, "rewards/chosen": 0.6963696479797363, "rewards/margins": 3.5298426151275635, "rewards/rejected": -2.833472967147827, "step": 2799 }, { "epoch": 0.73, "grad_norm": 40.529151916503906, "kl": 0.0, "learning_rate": 1.3360376864695105e-07, "logps/chosen": -198.93524169921875, "logps/rejected": -221.15386962890625, "loss": 0.2807, "rewards/chosen": 1.1004393100738525, "rewards/margins": 5.532849311828613, "rewards/rejected": -4.43241024017334, "step": 2800 }, { "epoch": 0.73, "grad_norm": 39.31686019897461, "kl": 0.0, "learning_rate": 1.3347291285003924e-07, "logps/chosen": -207.00869750976562, "logps/rejected": -250.30758666992188, "loss": 0.2649, "rewards/chosen": -0.03526926040649414, "rewards/margins": 3.3345630168914795, "rewards/rejected": -3.3698322772979736, "step": 2801 }, { "epoch": 0.73, "grad_norm": 26.114871978759766, "kl": 0.0, "learning_rate": 1.3334205705312747e-07, "logps/chosen": -204.65841674804688, "logps/rejected": -247.81874084472656, "loss": 0.2696, "rewards/chosen": 0.9131640195846558, "rewards/margins": 4.818316459655762, "rewards/rejected": -3.9051523208618164, "step": 2802 }, { "epoch": 0.73, "grad_norm": 35.78491973876953, "kl": 0.0, "learning_rate": 1.3321120125621566e-07, "logps/chosen": -251.0792999267578, "logps/rejected": -247.90907287597656, "loss": 0.2803, "rewards/chosen": 1.001746654510498, "rewards/margins": 4.692512512207031, "rewards/rejected": -3.6907660961151123, "step": 2803 }, { "epoch": 0.73, "grad_norm": 31.028039932250977, "kl": 0.0, "learning_rate": 1.3308034545930383e-07, "logps/chosen": -150.0222930908203, "logps/rejected": -263.18804931640625, "loss": 0.2219, "rewards/chosen": 0.88972407579422, "rewards/margins": 4.433566570281982, "rewards/rejected": -3.543842315673828, "step": 2804 }, { "epoch": 0.73, "grad_norm": 32.93028259277344, "kl": 0.0, "learning_rate": 1.3294948966239203e-07, "logps/chosen": -160.91116333007812, "logps/rejected": -246.55479431152344, "loss": 0.2802, "rewards/chosen": 0.5891302824020386, "rewards/margins": 3.9440999031066895, "rewards/rejected": -3.3549697399139404, "step": 2805 }, { "epoch": 0.73, "grad_norm": 33.47201919555664, "kl": 0.0, "learning_rate": 1.3281863386548022e-07, "logps/chosen": -284.26470947265625, "logps/rejected": -261.2138671875, "loss": 0.2462, "rewards/chosen": 0.9916735887527466, "rewards/margins": 5.291661739349365, "rewards/rejected": -4.299988269805908, "step": 2806 }, { "epoch": 0.73, "grad_norm": 31.195194244384766, "kl": 0.0, "learning_rate": 1.3268777806856842e-07, "logps/chosen": -223.95889282226562, "logps/rejected": -220.4429931640625, "loss": 0.2153, "rewards/chosen": 0.1629033088684082, "rewards/margins": 3.741335153579712, "rewards/rejected": -3.5784318447113037, "step": 2807 }, { "epoch": 0.73, "grad_norm": 36.86294174194336, "kl": 0.0, "learning_rate": 1.3255692227165662e-07, "logps/chosen": -288.4587097167969, "logps/rejected": -332.08062744140625, "loss": 0.1844, "rewards/chosen": 2.108771800994873, "rewards/margins": 7.597304344177246, "rewards/rejected": -5.488532543182373, "step": 2808 }, { "epoch": 0.74, "grad_norm": 27.21337890625, "kl": 0.0, "learning_rate": 1.3242606647474484e-07, "logps/chosen": -178.190185546875, "logps/rejected": -295.3204650878906, "loss": 0.1621, "rewards/chosen": 2.2621607780456543, "rewards/margins": 5.234269142150879, "rewards/rejected": -2.9721086025238037, "step": 2809 }, { "epoch": 0.74, "grad_norm": 32.42479705810547, "kl": 0.0, "learning_rate": 1.3229521067783304e-07, "logps/chosen": -267.2407531738281, "logps/rejected": -195.87710571289062, "loss": 0.2883, "rewards/chosen": 0.8129584789276123, "rewards/margins": 3.803115129470825, "rewards/rejected": -2.990156650543213, "step": 2810 }, { "epoch": 0.74, "grad_norm": 40.45661926269531, "kl": 0.0, "learning_rate": 1.3216435488092123e-07, "logps/chosen": -194.41912841796875, "logps/rejected": -258.4333801269531, "loss": 0.2662, "rewards/chosen": 0.8145712018013, "rewards/margins": 4.6983819007873535, "rewards/rejected": -3.8838107585906982, "step": 2811 }, { "epoch": 0.74, "grad_norm": 27.655622482299805, "kl": 0.0, "learning_rate": 1.320334990840094e-07, "logps/chosen": -263.7073059082031, "logps/rejected": -285.5018005371094, "loss": 0.2301, "rewards/chosen": 2.682894229888916, "rewards/margins": 6.3988237380981445, "rewards/rejected": -3.7159295082092285, "step": 2812 }, { "epoch": 0.74, "grad_norm": 42.45039367675781, "kl": 0.0, "learning_rate": 1.319026432870976e-07, "logps/chosen": -208.87033081054688, "logps/rejected": -189.85069274902344, "loss": 0.3343, "rewards/chosen": 0.5289901494979858, "rewards/margins": 3.121410369873047, "rewards/rejected": -2.5924201011657715, "step": 2813 }, { "epoch": 0.74, "grad_norm": 33.07746505737305, "kl": 0.0, "learning_rate": 1.317717874901858e-07, "logps/chosen": -325.9510803222656, "logps/rejected": -175.13290405273438, "loss": 0.2399, "rewards/chosen": 1.947821855545044, "rewards/margins": 6.143939971923828, "rewards/rejected": -4.196118354797363, "step": 2814 }, { "epoch": 0.74, "grad_norm": 31.211721420288086, "kl": 0.0, "learning_rate": 1.3164093169327402e-07, "logps/chosen": -134.55836486816406, "logps/rejected": -330.57147216796875, "loss": 0.2007, "rewards/chosen": 1.4923220872879028, "rewards/margins": 4.616800785064697, "rewards/rejected": -3.124478578567505, "step": 2815 }, { "epoch": 0.74, "grad_norm": 34.78199005126953, "kl": 0.0, "learning_rate": 1.315100758963622e-07, "logps/chosen": -298.7881774902344, "logps/rejected": -305.9605407714844, "loss": 0.2001, "rewards/chosen": 2.4990897178649902, "rewards/margins": 6.823459148406982, "rewards/rejected": -4.324369430541992, "step": 2816 }, { "epoch": 0.74, "grad_norm": 27.847217559814453, "kl": 0.0, "learning_rate": 1.313792200994504e-07, "logps/chosen": -208.53501892089844, "logps/rejected": -265.9464111328125, "loss": 0.172, "rewards/chosen": 2.3986852169036865, "rewards/margins": 6.262216091156006, "rewards/rejected": -3.8635308742523193, "step": 2817 }, { "epoch": 0.74, "grad_norm": 24.946470260620117, "kl": 0.0, "learning_rate": 1.312483643025386e-07, "logps/chosen": -153.75546264648438, "logps/rejected": -194.64955139160156, "loss": 0.2043, "rewards/chosen": 1.8664082288742065, "rewards/margins": 5.680118083953857, "rewards/rejected": -3.8137097358703613, "step": 2818 }, { "epoch": 0.74, "grad_norm": 34.00639343261719, "kl": 0.0, "learning_rate": 1.311175085056268e-07, "logps/chosen": -213.54026794433594, "logps/rejected": -204.0853271484375, "loss": 0.2822, "rewards/chosen": 1.1439380645751953, "rewards/margins": 3.6970744132995605, "rewards/rejected": -2.5531363487243652, "step": 2819 }, { "epoch": 0.74, "grad_norm": 34.658931732177734, "kl": 0.0, "learning_rate": 1.3098665270871497e-07, "logps/chosen": -280.839599609375, "logps/rejected": -277.06951904296875, "loss": 0.2256, "rewards/chosen": 1.5120201110839844, "rewards/margins": 5.279638290405273, "rewards/rejected": -3.76761794090271, "step": 2820 }, { "epoch": 0.74, "grad_norm": 33.02054977416992, "kl": 0.0, "learning_rate": 1.3085579691180317e-07, "logps/chosen": -209.41510009765625, "logps/rejected": -184.58187866210938, "loss": 0.1982, "rewards/chosen": 1.890760064125061, "rewards/margins": 5.372137069702148, "rewards/rejected": -3.481376886367798, "step": 2821 }, { "epoch": 0.74, "grad_norm": 29.64247703552246, "kl": 0.0, "learning_rate": 1.307249411148914e-07, "logps/chosen": -244.3789825439453, "logps/rejected": -236.93154907226562, "loss": 0.2085, "rewards/chosen": 2.0428965091705322, "rewards/margins": 5.614164352416992, "rewards/rejected": -3.571267604827881, "step": 2822 }, { "epoch": 0.74, "grad_norm": 39.05708312988281, "kl": 0.0, "learning_rate": 1.3059408531797958e-07, "logps/chosen": -322.265869140625, "logps/rejected": -282.5099182128906, "loss": 0.2222, "rewards/chosen": 0.1460273712873459, "rewards/margins": 3.9599175453186035, "rewards/rejected": -3.813890218734741, "step": 2823 }, { "epoch": 0.74, "grad_norm": 49.731666564941406, "kl": 0.0, "learning_rate": 1.3046322952106778e-07, "logps/chosen": -335.8645324707031, "logps/rejected": -237.4248504638672, "loss": 0.1767, "rewards/chosen": 0.3891473412513733, "rewards/margins": 3.2716710567474365, "rewards/rejected": -2.882523775100708, "step": 2824 }, { "epoch": 0.74, "grad_norm": 34.1038818359375, "kl": 0.0, "learning_rate": 1.3033237372415598e-07, "logps/chosen": -188.07669067382812, "logps/rejected": -259.12640380859375, "loss": 0.2208, "rewards/chosen": 2.7795515060424805, "rewards/margins": 5.237063884735107, "rewards/rejected": -2.457512378692627, "step": 2825 }, { "epoch": 0.74, "grad_norm": 38.49484634399414, "kl": 0.0, "learning_rate": 1.3020151792724417e-07, "logps/chosen": -215.10951232910156, "logps/rejected": -226.5205535888672, "loss": 0.2578, "rewards/chosen": 0.7708897590637207, "rewards/margins": 1.8423322439193726, "rewards/rejected": -1.0714424848556519, "step": 2826 }, { "epoch": 0.74, "grad_norm": 34.55825424194336, "kl": 0.0, "learning_rate": 1.3007066213033237e-07, "logps/chosen": -181.13351440429688, "logps/rejected": -340.60955810546875, "loss": 0.1785, "rewards/chosen": 0.8886696696281433, "rewards/margins": 5.78781795501709, "rewards/rejected": -4.899148464202881, "step": 2827 }, { "epoch": 0.74, "grad_norm": 51.14204025268555, "kl": 0.0, "learning_rate": 1.2993980633342057e-07, "logps/chosen": -184.99362182617188, "logps/rejected": -222.6909637451172, "loss": 0.2876, "rewards/chosen": 2.048811674118042, "rewards/margins": 4.70229959487915, "rewards/rejected": -2.6534879207611084, "step": 2828 }, { "epoch": 0.74, "grad_norm": 38.40785598754883, "kl": 0.0, "learning_rate": 1.2980895053650876e-07, "logps/chosen": -272.9462890625, "logps/rejected": -202.24850463867188, "loss": 0.2388, "rewards/chosen": 1.8347502946853638, "rewards/margins": 3.973668098449707, "rewards/rejected": -2.1389176845550537, "step": 2829 }, { "epoch": 0.74, "grad_norm": 32.16298294067383, "kl": 0.0, "learning_rate": 1.2967809473959696e-07, "logps/chosen": -195.91893005371094, "logps/rejected": -312.01727294921875, "loss": 0.2314, "rewards/chosen": 2.0726287364959717, "rewards/margins": 5.435080528259277, "rewards/rejected": -3.3624520301818848, "step": 2830 }, { "epoch": 0.74, "grad_norm": 33.658443450927734, "kl": 0.0, "learning_rate": 1.2954723894268515e-07, "logps/chosen": -183.74356079101562, "logps/rejected": -224.95396423339844, "loss": 0.1889, "rewards/chosen": 1.7647711038589478, "rewards/margins": 5.015651702880859, "rewards/rejected": -3.250880718231201, "step": 2831 }, { "epoch": 0.74, "grad_norm": 34.89668273925781, "kl": 0.0, "learning_rate": 1.2941638314577335e-07, "logps/chosen": -218.40185546875, "logps/rejected": -170.23614501953125, "loss": 0.2388, "rewards/chosen": 2.2100014686584473, "rewards/margins": 4.51767635345459, "rewards/rejected": -2.3076748847961426, "step": 2832 }, { "epoch": 0.74, "grad_norm": 32.520774841308594, "kl": 0.0, "learning_rate": 1.2928552734886155e-07, "logps/chosen": -300.9206848144531, "logps/rejected": -271.1351623535156, "loss": 0.279, "rewards/chosen": -0.4567300081253052, "rewards/margins": 4.208921909332275, "rewards/rejected": -4.665651798248291, "step": 2833 }, { "epoch": 0.74, "grad_norm": 38.00114059448242, "kl": 0.0, "learning_rate": 1.2915467155194977e-07, "logps/chosen": -155.7928924560547, "logps/rejected": -290.24359130859375, "loss": 0.2434, "rewards/chosen": 1.9330978393554688, "rewards/margins": 4.819847106933594, "rewards/rejected": -2.886749267578125, "step": 2834 }, { "epoch": 0.74, "grad_norm": 35.707950592041016, "kl": 0.0, "learning_rate": 1.2902381575503794e-07, "logps/chosen": -246.90481567382812, "logps/rejected": -271.83056640625, "loss": 0.2246, "rewards/chosen": 1.0874056816101074, "rewards/margins": 4.229583740234375, "rewards/rejected": -3.1421780586242676, "step": 2835 }, { "epoch": 0.74, "grad_norm": 35.23543167114258, "kl": 0.0, "learning_rate": 1.2889295995812613e-07, "logps/chosen": -158.85867309570312, "logps/rejected": -159.07968139648438, "loss": 0.2004, "rewards/chosen": 3.075439929962158, "rewards/margins": 6.421757698059082, "rewards/rejected": -3.346317768096924, "step": 2836 }, { "epoch": 0.74, "grad_norm": 36.72352600097656, "kl": 0.0, "learning_rate": 1.2876210416121433e-07, "logps/chosen": -207.18263244628906, "logps/rejected": -279.25860595703125, "loss": 0.1752, "rewards/chosen": 1.3812538385391235, "rewards/margins": 4.762180805206299, "rewards/rejected": -3.380927085876465, "step": 2837 }, { "epoch": 0.74, "grad_norm": 37.70661926269531, "kl": 0.0, "learning_rate": 1.2863124836430253e-07, "logps/chosen": -167.04473876953125, "logps/rejected": -321.56060791015625, "loss": 0.2574, "rewards/chosen": 2.6713783740997314, "rewards/margins": 5.929797172546387, "rewards/rejected": -3.2584190368652344, "step": 2838 }, { "epoch": 0.74, "grad_norm": 32.261695861816406, "kl": 0.0, "learning_rate": 1.2850039256739072e-07, "logps/chosen": -204.63113403320312, "logps/rejected": -239.1153564453125, "loss": 0.1671, "rewards/chosen": 1.1335726976394653, "rewards/margins": 5.587896347045898, "rewards/rejected": -4.454323768615723, "step": 2839 }, { "epoch": 0.74, "grad_norm": 30.68526268005371, "kl": 0.0, "learning_rate": 1.2836953677047892e-07, "logps/chosen": -189.3596954345703, "logps/rejected": -240.0169219970703, "loss": 0.2215, "rewards/chosen": 3.3140547275543213, "rewards/margins": 5.870977401733398, "rewards/rejected": -2.556922674179077, "step": 2840 }, { "epoch": 0.74, "grad_norm": 40.76029586791992, "kl": 0.0, "learning_rate": 1.2823868097356714e-07, "logps/chosen": -267.5643310546875, "logps/rejected": -378.8826599121094, "loss": 0.3176, "rewards/chosen": 0.5281890034675598, "rewards/margins": 5.403535842895508, "rewards/rejected": -4.875346660614014, "step": 2841 }, { "epoch": 0.74, "grad_norm": 31.456031799316406, "kl": 0.0, "learning_rate": 1.2810782517665534e-07, "logps/chosen": -191.51092529296875, "logps/rejected": -234.32913208007812, "loss": 0.2861, "rewards/chosen": 0.31234022974967957, "rewards/margins": 2.9550578594207764, "rewards/rejected": -2.6427175998687744, "step": 2842 }, { "epoch": 0.74, "grad_norm": 54.55989074707031, "kl": 0.0, "learning_rate": 1.279769693797435e-07, "logps/chosen": -226.59442138671875, "logps/rejected": -211.96656799316406, "loss": 0.3083, "rewards/chosen": 1.6426137685775757, "rewards/margins": 3.635915756225586, "rewards/rejected": -1.9933021068572998, "step": 2843 }, { "epoch": 0.74, "grad_norm": 30.520023345947266, "kl": 0.0, "learning_rate": 1.278461135828317e-07, "logps/chosen": -177.09706115722656, "logps/rejected": -190.50453186035156, "loss": 0.127, "rewards/chosen": 1.1707515716552734, "rewards/margins": 5.3002448081970215, "rewards/rejected": -4.129493236541748, "step": 2844 }, { "epoch": 0.74, "grad_norm": 43.148956298828125, "kl": 0.0, "learning_rate": 1.277152577859199e-07, "logps/chosen": -226.48245239257812, "logps/rejected": -201.1748046875, "loss": 0.3406, "rewards/chosen": -0.8820660710334778, "rewards/margins": 2.871274471282959, "rewards/rejected": -3.753340482711792, "step": 2845 }, { "epoch": 0.74, "grad_norm": 27.76190948486328, "kl": 0.0, "learning_rate": 1.275844019890081e-07, "logps/chosen": -201.8677978515625, "logps/rejected": -255.43519592285156, "loss": 0.1749, "rewards/chosen": 2.41351056098938, "rewards/margins": 7.772078514099121, "rewards/rejected": -5.358567714691162, "step": 2846 }, { "epoch": 0.75, "grad_norm": 25.458728790283203, "kl": 0.0, "learning_rate": 1.2745354619209632e-07, "logps/chosen": -207.4493408203125, "logps/rejected": -224.9928436279297, "loss": 0.1354, "rewards/chosen": 1.2176116704940796, "rewards/margins": 5.856996536254883, "rewards/rejected": -4.639384746551514, "step": 2847 }, { "epoch": 0.75, "grad_norm": 37.17304992675781, "kl": 0.0, "learning_rate": 1.2732269039518451e-07, "logps/chosen": -234.0641632080078, "logps/rejected": -266.6547546386719, "loss": 0.3049, "rewards/chosen": 1.5123035907745361, "rewards/margins": 3.934211254119873, "rewards/rejected": -2.421907663345337, "step": 2848 }, { "epoch": 0.75, "grad_norm": 31.614761352539062, "kl": 0.0, "learning_rate": 1.271918345982727e-07, "logps/chosen": -149.47503662109375, "logps/rejected": -312.2427978515625, "loss": 0.3124, "rewards/chosen": 0.877579391002655, "rewards/margins": 4.342172145843506, "rewards/rejected": -3.464592933654785, "step": 2849 }, { "epoch": 0.75, "grad_norm": 29.895217895507812, "kl": 0.0, "learning_rate": 1.270609788013609e-07, "logps/chosen": -204.3394012451172, "logps/rejected": -203.88108825683594, "loss": 0.2429, "rewards/chosen": 2.5909066200256348, "rewards/margins": 5.3905134201049805, "rewards/rejected": -2.799607038497925, "step": 2850 }, { "epoch": 0.75, "grad_norm": 29.323200225830078, "kl": 0.0, "learning_rate": 1.2693012300444908e-07, "logps/chosen": -200.34286499023438, "logps/rejected": -305.9087829589844, "loss": 0.3187, "rewards/chosen": 0.6447303295135498, "rewards/margins": 4.248477458953857, "rewards/rejected": -3.6037471294403076, "step": 2851 }, { "epoch": 0.75, "grad_norm": 33.479618072509766, "kl": 0.0, "learning_rate": 1.2679926720753727e-07, "logps/chosen": -211.47300720214844, "logps/rejected": -234.58995056152344, "loss": 0.1717, "rewards/chosen": 0.6005478501319885, "rewards/margins": 5.318211078643799, "rewards/rejected": -4.717663288116455, "step": 2852 }, { "epoch": 0.75, "grad_norm": 33.29733657836914, "kl": 0.0, "learning_rate": 1.2666841141062547e-07, "logps/chosen": -185.76760864257812, "logps/rejected": -336.5068359375, "loss": 0.2649, "rewards/chosen": 0.17454427480697632, "rewards/margins": 3.135378837585449, "rewards/rejected": -2.960834503173828, "step": 2853 }, { "epoch": 0.75, "grad_norm": 28.257957458496094, "kl": 0.0, "learning_rate": 1.265375556137137e-07, "logps/chosen": -188.02969360351562, "logps/rejected": -208.114501953125, "loss": 0.2783, "rewards/chosen": 2.182997465133667, "rewards/margins": 5.579323768615723, "rewards/rejected": -3.3963260650634766, "step": 2854 }, { "epoch": 0.75, "grad_norm": 40.46211624145508, "kl": 0.0, "learning_rate": 1.2640669981680189e-07, "logps/chosen": -247.18666076660156, "logps/rejected": -271.8265380859375, "loss": 0.1999, "rewards/chosen": 0.41464537382125854, "rewards/margins": 5.825222969055176, "rewards/rejected": -5.410577774047852, "step": 2855 }, { "epoch": 0.75, "grad_norm": 38.835689544677734, "kl": 0.0, "learning_rate": 1.2627584401989008e-07, "logps/chosen": -226.5570526123047, "logps/rejected": -229.08575439453125, "loss": 0.2518, "rewards/chosen": 2.2463645935058594, "rewards/margins": 6.170310020446777, "rewards/rejected": -3.923945188522339, "step": 2856 }, { "epoch": 0.75, "grad_norm": 37.920345306396484, "kl": 0.0, "learning_rate": 1.2614498822297828e-07, "logps/chosen": -209.99766540527344, "logps/rejected": -227.28494262695312, "loss": 0.2198, "rewards/chosen": 1.4239280223846436, "rewards/margins": 5.315568923950195, "rewards/rejected": -3.8916409015655518, "step": 2857 }, { "epoch": 0.75, "grad_norm": 35.52437973022461, "kl": 0.0, "learning_rate": 1.2601413242606647e-07, "logps/chosen": -215.59051513671875, "logps/rejected": -300.15167236328125, "loss": 0.1788, "rewards/chosen": 2.225846767425537, "rewards/margins": 6.136997699737549, "rewards/rejected": -3.9111509323120117, "step": 2858 }, { "epoch": 0.75, "grad_norm": 37.68877410888672, "kl": 0.0, "learning_rate": 1.2588327662915464e-07, "logps/chosen": -205.9502410888672, "logps/rejected": -235.28225708007812, "loss": 0.2043, "rewards/chosen": 1.3810704946517944, "rewards/margins": 5.162478446960449, "rewards/rejected": -3.7814078330993652, "step": 2859 }, { "epoch": 0.75, "grad_norm": 29.949377059936523, "kl": 0.0, "learning_rate": 1.2575242083224287e-07, "logps/chosen": -240.81326293945312, "logps/rejected": -176.24057006835938, "loss": 0.2927, "rewards/chosen": 0.8815681338310242, "rewards/margins": 4.728879928588867, "rewards/rejected": -3.8473119735717773, "step": 2860 }, { "epoch": 0.75, "grad_norm": 36.66761016845703, "kl": 0.0, "learning_rate": 1.2562156503533106e-07, "logps/chosen": -158.10072326660156, "logps/rejected": -300.7066345214844, "loss": 0.249, "rewards/chosen": 0.6434949636459351, "rewards/margins": 5.606198787689209, "rewards/rejected": -4.962703704833984, "step": 2861 }, { "epoch": 0.75, "grad_norm": 42.22517013549805, "kl": 0.0, "learning_rate": 1.2549070923841926e-07, "logps/chosen": -243.78887939453125, "logps/rejected": -282.7771911621094, "loss": 0.2351, "rewards/chosen": 2.329538106918335, "rewards/margins": 5.888676643371582, "rewards/rejected": -3.559138774871826, "step": 2862 }, { "epoch": 0.75, "grad_norm": 28.08859634399414, "kl": 0.0, "learning_rate": 1.2535985344150746e-07, "logps/chosen": -180.0941925048828, "logps/rejected": -300.3199157714844, "loss": 0.2466, "rewards/chosen": 0.9009974002838135, "rewards/margins": 4.991559982299805, "rewards/rejected": -4.09056282043457, "step": 2863 }, { "epoch": 0.75, "grad_norm": 36.271453857421875, "kl": 0.0, "learning_rate": 1.2522899764459565e-07, "logps/chosen": -142.3069305419922, "logps/rejected": -477.3985900878906, "loss": 0.2336, "rewards/chosen": 0.7708263397216797, "rewards/margins": 7.935967922210693, "rewards/rejected": -7.165141582489014, "step": 2864 }, { "epoch": 0.75, "grad_norm": 36.600616455078125, "kl": 0.0, "learning_rate": 1.2509814184768385e-07, "logps/chosen": -297.0329895019531, "logps/rejected": -275.8288879394531, "loss": 0.2147, "rewards/chosen": 0.1721043586730957, "rewards/margins": 5.66850471496582, "rewards/rejected": -5.496400356292725, "step": 2865 }, { "epoch": 0.75, "grad_norm": 32.472042083740234, "kl": 0.0, "learning_rate": 1.2496728605077204e-07, "logps/chosen": -197.33554077148438, "logps/rejected": -202.48606872558594, "loss": 0.3602, "rewards/chosen": 0.5126596689224243, "rewards/margins": 4.411322116851807, "rewards/rejected": -3.8986623287200928, "step": 2866 }, { "epoch": 0.75, "grad_norm": 31.626291275024414, "kl": 0.0, "learning_rate": 1.2483643025386024e-07, "logps/chosen": -220.61660766601562, "logps/rejected": -281.240966796875, "loss": 0.237, "rewards/chosen": 1.3547782897949219, "rewards/margins": 3.2674479484558105, "rewards/rejected": -1.9126697778701782, "step": 2867 }, { "epoch": 0.75, "grad_norm": 30.53521728515625, "kl": 0.0, "learning_rate": 1.2470557445694844e-07, "logps/chosen": -180.86444091796875, "logps/rejected": -281.6353454589844, "loss": 0.2105, "rewards/chosen": 1.173238754272461, "rewards/margins": 4.285660743713379, "rewards/rejected": -3.112421751022339, "step": 2868 }, { "epoch": 0.75, "grad_norm": 38.975013732910156, "kl": 0.0, "learning_rate": 1.2457471866003663e-07, "logps/chosen": -221.84617614746094, "logps/rejected": -238.2664794921875, "loss": 0.2226, "rewards/chosen": 1.4654902219772339, "rewards/margins": 5.618080139160156, "rewards/rejected": -4.152589797973633, "step": 2869 }, { "epoch": 0.75, "grad_norm": 31.062091827392578, "kl": 0.0, "learning_rate": 1.2444386286312483e-07, "logps/chosen": -240.03097534179688, "logps/rejected": -269.620361328125, "loss": 0.2435, "rewards/chosen": 1.7040454149246216, "rewards/margins": 4.696616172790527, "rewards/rejected": -2.9925708770751953, "step": 2870 }, { "epoch": 0.75, "grad_norm": 29.831459045410156, "kl": 0.0, "learning_rate": 1.2431300706621302e-07, "logps/chosen": -158.68234252929688, "logps/rejected": -241.193115234375, "loss": 0.2662, "rewards/chosen": 0.047480225563049316, "rewards/margins": 3.3005714416503906, "rewards/rejected": -3.253091335296631, "step": 2871 }, { "epoch": 0.75, "grad_norm": 26.44230079650879, "kl": 0.0, "learning_rate": 1.2418215126930122e-07, "logps/chosen": -345.5702819824219, "logps/rejected": -201.339111328125, "loss": 0.1851, "rewards/chosen": 1.2837327718734741, "rewards/margins": 5.676604747772217, "rewards/rejected": -4.392871856689453, "step": 2872 }, { "epoch": 0.75, "grad_norm": 33.63683319091797, "kl": 0.0, "learning_rate": 1.2405129547238942e-07, "logps/chosen": -177.061279296875, "logps/rejected": -248.8441925048828, "loss": 0.1536, "rewards/chosen": 4.102713108062744, "rewards/margins": 7.341584205627441, "rewards/rejected": -3.2388713359832764, "step": 2873 }, { "epoch": 0.75, "grad_norm": 31.185894012451172, "kl": 0.0, "learning_rate": 1.239204396754776e-07, "logps/chosen": -174.25146484375, "logps/rejected": -331.90045166015625, "loss": 0.1468, "rewards/chosen": 3.4060115814208984, "rewards/margins": 8.459066390991211, "rewards/rejected": -5.053055286407471, "step": 2874 }, { "epoch": 0.75, "grad_norm": 36.41456985473633, "kl": 0.0, "learning_rate": 1.237895838785658e-07, "logps/chosen": -257.15521240234375, "logps/rejected": -217.49343872070312, "loss": 0.2738, "rewards/chosen": 2.5418829917907715, "rewards/margins": 5.1777496337890625, "rewards/rejected": -2.635866403579712, "step": 2875 }, { "epoch": 0.75, "grad_norm": 27.071542739868164, "kl": 0.0, "learning_rate": 1.23658728081654e-07, "logps/chosen": -199.8568572998047, "logps/rejected": -237.8272247314453, "loss": 0.2854, "rewards/chosen": 0.3178449869155884, "rewards/margins": 3.073559284210205, "rewards/rejected": -2.7557144165039062, "step": 2876 }, { "epoch": 0.75, "grad_norm": 28.203113555908203, "kl": 0.0, "learning_rate": 1.235278722847422e-07, "logps/chosen": -184.3144989013672, "logps/rejected": -269.3959655761719, "loss": 0.2805, "rewards/chosen": 0.6483626961708069, "rewards/margins": 6.0424699783325195, "rewards/rejected": -5.394107341766357, "step": 2877 }, { "epoch": 0.75, "grad_norm": 35.722450256347656, "kl": 0.0, "learning_rate": 1.233970164878304e-07, "logps/chosen": -197.92510986328125, "logps/rejected": -298.2138671875, "loss": 0.2674, "rewards/chosen": 0.42415380477905273, "rewards/margins": 3.037757158279419, "rewards/rejected": -2.613603353500366, "step": 2878 }, { "epoch": 0.75, "grad_norm": 41.887447357177734, "kl": 0.0, "learning_rate": 1.2326616069091862e-07, "logps/chosen": -217.36782836914062, "logps/rejected": -303.54290771484375, "loss": 0.3861, "rewards/chosen": 0.8330056667327881, "rewards/margins": 3.9003896713256836, "rewards/rejected": -3.0673840045928955, "step": 2879 }, { "epoch": 0.75, "grad_norm": 25.675533294677734, "kl": 0.0, "learning_rate": 1.231353048940068e-07, "logps/chosen": -217.59751892089844, "logps/rejected": -293.5157470703125, "loss": 0.2278, "rewards/chosen": 0.7520068883895874, "rewards/margins": 6.41138219833374, "rewards/rejected": -5.659375190734863, "step": 2880 }, { "epoch": 0.75, "grad_norm": 33.77341842651367, "kl": 0.0, "learning_rate": 1.2300444909709498e-07, "logps/chosen": -190.04165649414062, "logps/rejected": -243.47940063476562, "loss": 0.4059, "rewards/chosen": -1.3758662939071655, "rewards/margins": 1.1594191789627075, "rewards/rejected": -2.535285472869873, "step": 2881 }, { "epoch": 0.75, "grad_norm": 37.45033645629883, "kl": 0.0, "learning_rate": 1.228735933001832e-07, "logps/chosen": -232.29136657714844, "logps/rejected": -252.78013610839844, "loss": 0.1841, "rewards/chosen": 1.7727702856063843, "rewards/margins": 3.7750887870788574, "rewards/rejected": -2.0023186206817627, "step": 2882 }, { "epoch": 0.75, "grad_norm": 36.58238983154297, "kl": 0.0, "learning_rate": 1.227427375032714e-07, "logps/chosen": -235.13485717773438, "logps/rejected": -155.39901733398438, "loss": 0.2151, "rewards/chosen": 1.5003623962402344, "rewards/margins": 4.468344688415527, "rewards/rejected": -2.967982053756714, "step": 2883 }, { "epoch": 0.75, "grad_norm": 34.22605895996094, "kl": 0.0, "learning_rate": 1.2261188170635957e-07, "logps/chosen": -243.13113403320312, "logps/rejected": -213.46397399902344, "loss": 0.1841, "rewards/chosen": 0.6716588139533997, "rewards/margins": 5.077381610870361, "rewards/rejected": -4.405722618103027, "step": 2884 }, { "epoch": 0.76, "grad_norm": 38.82898712158203, "kl": 0.0, "learning_rate": 1.224810259094478e-07, "logps/chosen": -212.1895294189453, "logps/rejected": -275.56085205078125, "loss": 0.2102, "rewards/chosen": 0.16195379197597504, "rewards/margins": 5.055579662322998, "rewards/rejected": -4.893625736236572, "step": 2885 }, { "epoch": 0.76, "grad_norm": 33.71118927001953, "kl": 0.0, "learning_rate": 1.22350170112536e-07, "logps/chosen": -222.42874145507812, "logps/rejected": -211.8098907470703, "loss": 0.2643, "rewards/chosen": 0.7175885438919067, "rewards/margins": 3.3388028144836426, "rewards/rejected": -2.6212143898010254, "step": 2886 }, { "epoch": 0.76, "grad_norm": 44.06155014038086, "kl": 0.0, "learning_rate": 1.222193143156242e-07, "logps/chosen": -192.49566650390625, "logps/rejected": -219.57589721679688, "loss": 0.3047, "rewards/chosen": 1.4475936889648438, "rewards/margins": 3.2824594974517822, "rewards/rejected": -1.8348658084869385, "step": 2887 }, { "epoch": 0.76, "grad_norm": 37.90978240966797, "kl": 0.0, "learning_rate": 1.2208845851871236e-07, "logps/chosen": -221.5460205078125, "logps/rejected": -296.1938171386719, "loss": 0.2267, "rewards/chosen": 1.5457605123519897, "rewards/margins": 4.726670742034912, "rewards/rejected": -3.180910348892212, "step": 2888 }, { "epoch": 0.76, "grad_norm": 38.740081787109375, "kl": 0.0, "learning_rate": 1.2195760272180058e-07, "logps/chosen": -215.4282989501953, "logps/rejected": -215.94064331054688, "loss": 0.2473, "rewards/chosen": 1.6079682111740112, "rewards/margins": 4.185258388519287, "rewards/rejected": -2.5772900581359863, "step": 2889 }, { "epoch": 0.76, "grad_norm": 28.836488723754883, "kl": 0.0, "learning_rate": 1.2182674692488878e-07, "logps/chosen": -234.37863159179688, "logps/rejected": -252.62863159179688, "loss": 0.2242, "rewards/chosen": 1.2684545516967773, "rewards/margins": 6.477774620056152, "rewards/rejected": -5.209320068359375, "step": 2890 }, { "epoch": 0.76, "grad_norm": 30.18337631225586, "kl": 0.0, "learning_rate": 1.2169589112797695e-07, "logps/chosen": -267.667724609375, "logps/rejected": -222.0542755126953, "loss": 0.2185, "rewards/chosen": 1.34689462184906, "rewards/margins": 5.4736127853393555, "rewards/rejected": -4.126718044281006, "step": 2891 }, { "epoch": 0.76, "grad_norm": 25.85747718811035, "kl": 0.0, "learning_rate": 1.2156503533106517e-07, "logps/chosen": -198.385498046875, "logps/rejected": -330.31707763671875, "loss": 0.1765, "rewards/chosen": 1.7326695919036865, "rewards/margins": 4.857114315032959, "rewards/rejected": -3.1244447231292725, "step": 2892 }, { "epoch": 0.76, "grad_norm": 51.8387451171875, "kl": 0.0, "learning_rate": 1.2143417953415336e-07, "logps/chosen": -192.14889526367188, "logps/rejected": -185.65231323242188, "loss": 0.2475, "rewards/chosen": 1.5236883163452148, "rewards/margins": 5.039030075073242, "rewards/rejected": -3.5153415203094482, "step": 2893 }, { "epoch": 0.76, "grad_norm": 32.10373306274414, "kl": 0.0, "learning_rate": 1.2130332373724156e-07, "logps/chosen": -174.0630340576172, "logps/rejected": -271.5389099121094, "loss": 0.1499, "rewards/chosen": 1.0889085531234741, "rewards/margins": 2.904127597808838, "rewards/rejected": -1.8152190446853638, "step": 2894 }, { "epoch": 0.76, "grad_norm": 28.912342071533203, "kl": 0.0, "learning_rate": 1.2117246794032976e-07, "logps/chosen": -206.34083557128906, "logps/rejected": -236.11744689941406, "loss": 0.1711, "rewards/chosen": 2.815239429473877, "rewards/margins": 6.5172834396362305, "rewards/rejected": -3.7020437717437744, "step": 2895 }, { "epoch": 0.76, "grad_norm": 35.956024169921875, "kl": 0.0, "learning_rate": 1.2104161214341795e-07, "logps/chosen": -265.2823181152344, "logps/rejected": -413.5455017089844, "loss": 0.1822, "rewards/chosen": 1.1553301811218262, "rewards/margins": 6.729024887084961, "rewards/rejected": -5.573694705963135, "step": 2896 }, { "epoch": 0.76, "grad_norm": 32.14839172363281, "kl": 0.0, "learning_rate": 1.2091075634650615e-07, "logps/chosen": -152.54571533203125, "logps/rejected": -204.89080810546875, "loss": 0.2508, "rewards/chosen": 1.2394222021102905, "rewards/margins": 3.0110769271850586, "rewards/rejected": -1.7716546058654785, "step": 2897 }, { "epoch": 0.76, "grad_norm": 21.592060089111328, "kl": 0.0, "learning_rate": 1.2077990054959434e-07, "logps/chosen": -286.0022277832031, "logps/rejected": -226.6644744873047, "loss": 0.3169, "rewards/chosen": -1.6726806163787842, "rewards/margins": 2.2264373302459717, "rewards/rejected": -3.899117946624756, "step": 2898 }, { "epoch": 0.76, "grad_norm": 30.0306339263916, "kl": 0.0, "learning_rate": 1.2064904475268254e-07, "logps/chosen": -206.56137084960938, "logps/rejected": -223.92355346679688, "loss": 0.2848, "rewards/chosen": 1.3688303232192993, "rewards/margins": 4.59629487991333, "rewards/rejected": -3.2274646759033203, "step": 2899 }, { "epoch": 0.76, "grad_norm": 26.28697395324707, "kl": 0.0, "learning_rate": 1.2051818895577074e-07, "logps/chosen": -227.15841674804688, "logps/rejected": -260.24835205078125, "loss": 0.308, "rewards/chosen": -0.8018272519111633, "rewards/margins": 3.403519868850708, "rewards/rejected": -4.205347061157227, "step": 2900 }, { "epoch": 0.76, "grad_norm": 32.87641143798828, "kl": 0.0, "learning_rate": 1.2038733315885893e-07, "logps/chosen": -242.5460205078125, "logps/rejected": -257.4617004394531, "loss": 0.2887, "rewards/chosen": -0.07051602005958557, "rewards/margins": 4.160338401794434, "rewards/rejected": -4.230854511260986, "step": 2901 }, { "epoch": 0.76, "grad_norm": 30.622848510742188, "kl": 0.0, "learning_rate": 1.2025647736194713e-07, "logps/chosen": -232.50624084472656, "logps/rejected": -188.72296142578125, "loss": 0.341, "rewards/chosen": 1.6381621360778809, "rewards/margins": 4.317827224731445, "rewards/rejected": -2.6796653270721436, "step": 2902 }, { "epoch": 0.76, "grad_norm": 35.67957305908203, "kl": 0.0, "learning_rate": 1.2012562156503533e-07, "logps/chosen": -291.6125793457031, "logps/rejected": -262.95806884765625, "loss": 0.2262, "rewards/chosen": 2.4084537029266357, "rewards/margins": 6.307066917419434, "rewards/rejected": -3.8986129760742188, "step": 2903 }, { "epoch": 0.76, "grad_norm": 28.24009132385254, "kl": 0.0, "learning_rate": 1.1999476576812352e-07, "logps/chosen": -223.4033203125, "logps/rejected": -267.7476806640625, "loss": 0.2482, "rewards/chosen": 0.842714786529541, "rewards/margins": 6.109996318817139, "rewards/rejected": -5.267281532287598, "step": 2904 }, { "epoch": 0.76, "grad_norm": 37.31360626220703, "kl": 0.0, "learning_rate": 1.1986390997121172e-07, "logps/chosen": -249.6298370361328, "logps/rejected": -187.60121154785156, "loss": 0.2571, "rewards/chosen": 0.9127798080444336, "rewards/margins": 4.006119251251221, "rewards/rejected": -3.093339443206787, "step": 2905 }, { "epoch": 0.76, "grad_norm": 25.03266143798828, "kl": 0.0, "learning_rate": 1.1973305417429991e-07, "logps/chosen": -209.19241333007812, "logps/rejected": -292.7968444824219, "loss": 0.1879, "rewards/chosen": 2.684803009033203, "rewards/margins": 6.696554660797119, "rewards/rejected": -4.011751651763916, "step": 2906 }, { "epoch": 0.76, "grad_norm": 41.646968841552734, "kl": 0.0, "learning_rate": 1.196021983773881e-07, "logps/chosen": -151.85186767578125, "logps/rejected": -266.0614013671875, "loss": 0.3084, "rewards/chosen": 0.06417274475097656, "rewards/margins": 4.529610633850098, "rewards/rejected": -4.465437889099121, "step": 2907 }, { "epoch": 0.76, "grad_norm": 31.28655242919922, "kl": 0.0, "learning_rate": 1.194713425804763e-07, "logps/chosen": -191.10202026367188, "logps/rejected": -179.45779418945312, "loss": 0.2224, "rewards/chosen": 0.6876000165939331, "rewards/margins": 3.735599994659424, "rewards/rejected": -3.0480000972747803, "step": 2908 }, { "epoch": 0.76, "grad_norm": 35.99723434448242, "kl": 0.0, "learning_rate": 1.193404867835645e-07, "logps/chosen": -178.0907745361328, "logps/rejected": -234.1808319091797, "loss": 0.2611, "rewards/chosen": -0.0443677119910717, "rewards/margins": 1.617915391921997, "rewards/rejected": -1.6622830629348755, "step": 2909 }, { "epoch": 0.76, "grad_norm": 38.04181671142578, "kl": 0.0, "learning_rate": 1.192096309866527e-07, "logps/chosen": -252.9052734375, "logps/rejected": -265.5356140136719, "loss": 0.3118, "rewards/chosen": -0.2618914544582367, "rewards/margins": 2.1319985389709473, "rewards/rejected": -2.393889904022217, "step": 2910 }, { "epoch": 0.76, "grad_norm": 33.879615783691406, "kl": 0.0, "learning_rate": 1.190787751897409e-07, "logps/chosen": -142.2236328125, "logps/rejected": -220.40982055664062, "loss": 0.2546, "rewards/chosen": 2.9108173847198486, "rewards/margins": 5.476387977600098, "rewards/rejected": -2.565570831298828, "step": 2911 }, { "epoch": 0.76, "grad_norm": 31.349456787109375, "kl": 0.0, "learning_rate": 1.1894791939282909e-07, "logps/chosen": -273.1512756347656, "logps/rejected": -231.77330017089844, "loss": 0.2502, "rewards/chosen": 1.1709834337234497, "rewards/margins": 4.816771507263184, "rewards/rejected": -3.6457881927490234, "step": 2912 }, { "epoch": 0.76, "grad_norm": 25.215503692626953, "kl": 0.0, "learning_rate": 1.188170635959173e-07, "logps/chosen": -206.77252197265625, "logps/rejected": -157.4561309814453, "loss": 0.2568, "rewards/chosen": 0.6348246335983276, "rewards/margins": 6.27030086517334, "rewards/rejected": -5.635476112365723, "step": 2913 }, { "epoch": 0.76, "grad_norm": 28.806495666503906, "kl": 0.0, "learning_rate": 1.186862077990055e-07, "logps/chosen": -170.64573669433594, "logps/rejected": -262.3679504394531, "loss": 0.2113, "rewards/chosen": 0.7701768279075623, "rewards/margins": 5.672473907470703, "rewards/rejected": -4.902297019958496, "step": 2914 }, { "epoch": 0.76, "grad_norm": 30.437780380249023, "kl": 0.0, "learning_rate": 1.1855535200209368e-07, "logps/chosen": -245.5428466796875, "logps/rejected": -245.26171875, "loss": 0.2124, "rewards/chosen": 0.24975238740444183, "rewards/margins": 3.772033929824829, "rewards/rejected": -3.5222816467285156, "step": 2915 }, { "epoch": 0.76, "grad_norm": 29.952688217163086, "kl": 0.0, "learning_rate": 1.1842449620518189e-07, "logps/chosen": -194.86053466796875, "logps/rejected": -286.28790283203125, "loss": 0.1902, "rewards/chosen": 2.382014036178589, "rewards/margins": 6.134878158569336, "rewards/rejected": -3.752864122390747, "step": 2916 }, { "epoch": 0.76, "grad_norm": 33.65608596801758, "kl": 0.0, "learning_rate": 1.1829364040827008e-07, "logps/chosen": -221.7244415283203, "logps/rejected": -244.54493713378906, "loss": 0.1968, "rewards/chosen": 1.4137190580368042, "rewards/margins": 5.684474468231201, "rewards/rejected": -4.270755290985107, "step": 2917 }, { "epoch": 0.76, "grad_norm": 25.30941390991211, "kl": 0.0, "learning_rate": 1.1816278461135827e-07, "logps/chosen": -181.33401489257812, "logps/rejected": -193.39193725585938, "loss": 0.2273, "rewards/chosen": 1.3112400770187378, "rewards/margins": 5.295674800872803, "rewards/rejected": -3.9844348430633545, "step": 2918 }, { "epoch": 0.76, "grad_norm": 34.57612991333008, "kl": 0.0, "learning_rate": 1.1803192881444648e-07, "logps/chosen": -223.0674285888672, "logps/rejected": -245.7630615234375, "loss": 0.235, "rewards/chosen": 0.19638967514038086, "rewards/margins": 3.93575382232666, "rewards/rejected": -3.7393641471862793, "step": 2919 }, { "epoch": 0.76, "grad_norm": 24.53961181640625, "kl": 0.0, "learning_rate": 1.1790107301753467e-07, "logps/chosen": -187.73519897460938, "logps/rejected": -231.53778076171875, "loss": 0.2707, "rewards/chosen": 2.8392274379730225, "rewards/margins": 5.255084037780762, "rewards/rejected": -2.41585636138916, "step": 2920 }, { "epoch": 0.76, "grad_norm": 27.83038330078125, "kl": 0.0, "learning_rate": 1.1777021722062287e-07, "logps/chosen": -269.1419677734375, "logps/rejected": -262.9184875488281, "loss": 0.1798, "rewards/chosen": 0.33988016843795776, "rewards/margins": 4.673036575317383, "rewards/rejected": -4.333156585693359, "step": 2921 }, { "epoch": 0.76, "grad_norm": 40.96464920043945, "kl": 0.0, "learning_rate": 1.1763936142371106e-07, "logps/chosen": -258.025146484375, "logps/rejected": -282.2995910644531, "loss": 0.2673, "rewards/chosen": 2.189265012741089, "rewards/margins": 5.157666206359863, "rewards/rejected": -2.9684009552001953, "step": 2922 }, { "epoch": 0.76, "grad_norm": 30.345224380493164, "kl": 0.0, "learning_rate": 1.1750850562679926e-07, "logps/chosen": -242.3159942626953, "logps/rejected": -242.1539306640625, "loss": 0.3145, "rewards/chosen": -1.015539288520813, "rewards/margins": 1.1760417222976685, "rewards/rejected": -2.1915810108184814, "step": 2923 }, { "epoch": 0.77, "grad_norm": 37.97049331665039, "kl": 0.0, "learning_rate": 1.1737764982988746e-07, "logps/chosen": -198.21041870117188, "logps/rejected": -361.84039306640625, "loss": 0.268, "rewards/chosen": 1.1192774772644043, "rewards/margins": 3.962264060974121, "rewards/rejected": -2.842986583709717, "step": 2924 }, { "epoch": 0.77, "grad_norm": 27.184776306152344, "kl": 0.0, "learning_rate": 1.1724679403297567e-07, "logps/chosen": -172.3856201171875, "logps/rejected": -172.68576049804688, "loss": 0.3505, "rewards/chosen": 0.21904900670051575, "rewards/margins": 2.9532828330993652, "rewards/rejected": -2.734233856201172, "step": 2925 }, { "epoch": 0.77, "grad_norm": 31.706565856933594, "kl": 0.0, "learning_rate": 1.1711593823606385e-07, "logps/chosen": -257.7572021484375, "logps/rejected": -253.30271911621094, "loss": 0.2443, "rewards/chosen": 2.2228496074676514, "rewards/margins": 6.374711990356445, "rewards/rejected": -4.151862144470215, "step": 2926 }, { "epoch": 0.77, "grad_norm": 29.20972442626953, "kl": 0.0, "learning_rate": 1.1698508243915204e-07, "logps/chosen": -159.6472625732422, "logps/rejected": -226.85902404785156, "loss": 0.1695, "rewards/chosen": 2.3836936950683594, "rewards/margins": 7.045234203338623, "rewards/rejected": -4.661540508270264, "step": 2927 }, { "epoch": 0.77, "grad_norm": 23.147682189941406, "kl": 0.0, "learning_rate": 1.1685422664224024e-07, "logps/chosen": -157.58294677734375, "logps/rejected": -241.945556640625, "loss": 0.2749, "rewards/chosen": 0.3442791998386383, "rewards/margins": 3.804286003112793, "rewards/rejected": -3.4600067138671875, "step": 2928 }, { "epoch": 0.77, "grad_norm": 32.65460968017578, "kl": 0.0, "learning_rate": 1.1672337084532845e-07, "logps/chosen": -195.1674346923828, "logps/rejected": -227.59368896484375, "loss": 0.2232, "rewards/chosen": 1.0592485666275024, "rewards/margins": 3.595780372619629, "rewards/rejected": -2.536531925201416, "step": 2929 }, { "epoch": 0.77, "grad_norm": 35.047645568847656, "kl": 0.0, "learning_rate": 1.1659251504841663e-07, "logps/chosen": -181.13290405273438, "logps/rejected": -268.6132507324219, "loss": 0.3072, "rewards/chosen": 0.24703586101531982, "rewards/margins": 3.9326653480529785, "rewards/rejected": -3.6856296062469482, "step": 2930 }, { "epoch": 0.77, "grad_norm": 39.733726501464844, "kl": 0.0, "learning_rate": 1.1646165925150483e-07, "logps/chosen": -255.24049377441406, "logps/rejected": -268.782958984375, "loss": 0.2573, "rewards/chosen": 1.9264146089553833, "rewards/margins": 6.034917831420898, "rewards/rejected": -4.108503341674805, "step": 2931 }, { "epoch": 0.77, "grad_norm": 25.3881778717041, "kl": 0.0, "learning_rate": 1.1633080345459304e-07, "logps/chosen": -149.32199096679688, "logps/rejected": -205.93418884277344, "loss": 0.2107, "rewards/chosen": 1.3308162689208984, "rewards/margins": 5.0680389404296875, "rewards/rejected": -3.73722243309021, "step": 2932 }, { "epoch": 0.77, "grad_norm": 28.647340774536133, "kl": 0.0, "learning_rate": 1.1619994765768123e-07, "logps/chosen": -220.11572265625, "logps/rejected": -307.8936462402344, "loss": 0.2313, "rewards/chosen": 0.6090463399887085, "rewards/margins": 3.7676281929016113, "rewards/rejected": -3.1585819721221924, "step": 2933 }, { "epoch": 0.77, "grad_norm": 31.28106117248535, "kl": 0.0, "learning_rate": 1.1606909186076942e-07, "logps/chosen": -245.37083435058594, "logps/rejected": -180.98590087890625, "loss": 0.1955, "rewards/chosen": 0.789535403251648, "rewards/margins": 3.903141975402832, "rewards/rejected": -3.1136064529418945, "step": 2934 }, { "epoch": 0.77, "grad_norm": 32.39136505126953, "kl": 0.0, "learning_rate": 1.1593823606385763e-07, "logps/chosen": -300.31298828125, "logps/rejected": -187.9770050048828, "loss": 0.269, "rewards/chosen": 2.049201488494873, "rewards/margins": 4.9029645919799805, "rewards/rejected": -2.8537633419036865, "step": 2935 }, { "epoch": 0.77, "grad_norm": 37.13324737548828, "kl": 0.0, "learning_rate": 1.1580738026694582e-07, "logps/chosen": -259.8029479980469, "logps/rejected": -213.43118286132812, "loss": 0.2899, "rewards/chosen": 1.5232011079788208, "rewards/margins": 4.917179584503174, "rewards/rejected": -3.3939785957336426, "step": 2936 }, { "epoch": 0.77, "grad_norm": 39.28556442260742, "kl": 0.0, "learning_rate": 1.1567652447003402e-07, "logps/chosen": -246.21910095214844, "logps/rejected": -221.6072235107422, "loss": 0.3189, "rewards/chosen": 1.0758863687515259, "rewards/margins": 5.4353718757629395, "rewards/rejected": -4.359485626220703, "step": 2937 }, { "epoch": 0.77, "grad_norm": 35.176692962646484, "kl": 0.0, "learning_rate": 1.1554566867312221e-07, "logps/chosen": -128.11904907226562, "logps/rejected": -278.5266418457031, "loss": 0.2562, "rewards/chosen": 0.3561898171901703, "rewards/margins": 2.309230089187622, "rewards/rejected": -1.953040361404419, "step": 2938 }, { "epoch": 0.77, "grad_norm": 35.85581970214844, "kl": 0.0, "learning_rate": 1.1541481287621041e-07, "logps/chosen": -228.60194396972656, "logps/rejected": -264.4442138671875, "loss": 0.3275, "rewards/chosen": 0.6116616129875183, "rewards/margins": 4.661001205444336, "rewards/rejected": -4.049339771270752, "step": 2939 }, { "epoch": 0.77, "grad_norm": 32.08247375488281, "kl": 0.0, "learning_rate": 1.1528395707929861e-07, "logps/chosen": -179.61607360839844, "logps/rejected": -192.24085998535156, "loss": 0.3021, "rewards/chosen": 1.5291475057601929, "rewards/margins": 3.2262120246887207, "rewards/rejected": -1.6970643997192383, "step": 2940 }, { "epoch": 0.77, "grad_norm": 35.77885055541992, "kl": 0.0, "learning_rate": 1.1515310128238682e-07, "logps/chosen": -248.62643432617188, "logps/rejected": -270.5456848144531, "loss": 0.16, "rewards/chosen": 2.307846784591675, "rewards/margins": 4.3499555587768555, "rewards/rejected": -2.0421087741851807, "step": 2941 }, { "epoch": 0.77, "grad_norm": 33.389156341552734, "kl": 0.0, "learning_rate": 1.15022245485475e-07, "logps/chosen": -264.442138671875, "logps/rejected": -233.51083374023438, "loss": 0.3071, "rewards/chosen": -0.4067654609680176, "rewards/margins": 1.7154138088226318, "rewards/rejected": -2.1221792697906494, "step": 2942 }, { "epoch": 0.77, "grad_norm": 34.11969757080078, "kl": 0.0, "learning_rate": 1.148913896885632e-07, "logps/chosen": -278.32305908203125, "logps/rejected": -266.03045654296875, "loss": 0.3154, "rewards/chosen": -0.757024347782135, "rewards/margins": 2.179401397705078, "rewards/rejected": -2.9364256858825684, "step": 2943 }, { "epoch": 0.77, "grad_norm": 29.74340057373047, "kl": 0.0, "learning_rate": 1.147605338916514e-07, "logps/chosen": -124.81636047363281, "logps/rejected": -243.23130798339844, "loss": 0.2078, "rewards/chosen": 1.5924134254455566, "rewards/margins": 5.299116611480713, "rewards/rejected": -3.7067031860351562, "step": 2944 }, { "epoch": 0.77, "grad_norm": 32.440956115722656, "kl": 0.0, "learning_rate": 1.146296780947396e-07, "logps/chosen": -161.70448303222656, "logps/rejected": -231.23211669921875, "loss": 0.2956, "rewards/chosen": 1.406182885169983, "rewards/margins": 4.8207926750183105, "rewards/rejected": -3.414609909057617, "step": 2945 }, { "epoch": 0.77, "grad_norm": 37.233802795410156, "kl": 0.0, "learning_rate": 1.1449882229782778e-07, "logps/chosen": -233.98989868164062, "logps/rejected": -275.0398254394531, "loss": 0.2631, "rewards/chosen": 0.17019644379615784, "rewards/margins": 3.401771306991577, "rewards/rejected": -3.231574773788452, "step": 2946 }, { "epoch": 0.77, "grad_norm": 68.66511535644531, "kl": 0.0, "learning_rate": 1.1436796650091598e-07, "logps/chosen": -267.5313720703125, "logps/rejected": -220.8348388671875, "loss": 0.3068, "rewards/chosen": -0.22980618476867676, "rewards/margins": 2.2407171726226807, "rewards/rejected": -2.4705233573913574, "step": 2947 }, { "epoch": 0.77, "grad_norm": 32.196678161621094, "kl": 0.0, "learning_rate": 1.1423711070400419e-07, "logps/chosen": -141.34901428222656, "logps/rejected": -292.7916259765625, "loss": 0.1982, "rewards/chosen": 0.8846850991249084, "rewards/margins": 5.876195430755615, "rewards/rejected": -4.991510391235352, "step": 2948 }, { "epoch": 0.77, "grad_norm": 38.09465408325195, "kl": 0.0, "learning_rate": 1.1410625490709237e-07, "logps/chosen": -175.3059539794922, "logps/rejected": -315.62493896484375, "loss": 0.2861, "rewards/chosen": -0.07770039886236191, "rewards/margins": 3.6531319618225098, "rewards/rejected": -3.73083233833313, "step": 2949 }, { "epoch": 0.77, "grad_norm": 33.7330322265625, "kl": 0.0, "learning_rate": 1.1397539911018057e-07, "logps/chosen": -209.34979248046875, "logps/rejected": -225.85885620117188, "loss": 0.2615, "rewards/chosen": 2.062551498413086, "rewards/margins": 4.068078994750977, "rewards/rejected": -2.0055277347564697, "step": 2950 }, { "epoch": 0.77, "grad_norm": 37.776817321777344, "kl": 0.0, "learning_rate": 1.1384454331326878e-07, "logps/chosen": -213.33070373535156, "logps/rejected": -256.4163818359375, "loss": 0.314, "rewards/chosen": 1.311199426651001, "rewards/margins": 3.4920144081115723, "rewards/rejected": -2.1808149814605713, "step": 2951 }, { "epoch": 0.77, "grad_norm": 34.075008392333984, "kl": 0.0, "learning_rate": 1.1371368751635697e-07, "logps/chosen": -207.1404266357422, "logps/rejected": -249.67095947265625, "loss": 0.1776, "rewards/chosen": 1.8990845680236816, "rewards/margins": 5.696239471435547, "rewards/rejected": -3.7971549034118652, "step": 2952 }, { "epoch": 0.77, "grad_norm": 39.363426208496094, "kl": 0.0, "learning_rate": 1.1358283171944516e-07, "logps/chosen": -206.06753540039062, "logps/rejected": -221.67047119140625, "loss": 0.2966, "rewards/chosen": 0.7883104681968689, "rewards/margins": 4.238680839538574, "rewards/rejected": -3.4503703117370605, "step": 2953 }, { "epoch": 0.77, "grad_norm": 18.948083877563477, "kl": 0.0, "learning_rate": 1.1345197592253337e-07, "logps/chosen": -122.21463775634766, "logps/rejected": -260.5376892089844, "loss": 0.1369, "rewards/chosen": 1.1754182577133179, "rewards/margins": 5.67502498626709, "rewards/rejected": -4.499606609344482, "step": 2954 }, { "epoch": 0.77, "grad_norm": 32.583106994628906, "kl": 0.0, "learning_rate": 1.1332112012562156e-07, "logps/chosen": -174.0437469482422, "logps/rejected": -196.47865295410156, "loss": 0.3576, "rewards/chosen": 0.3868805170059204, "rewards/margins": 3.155986785888672, "rewards/rejected": -2.769106388092041, "step": 2955 }, { "epoch": 0.77, "grad_norm": 34.119384765625, "kl": 0.0, "learning_rate": 1.1319026432870976e-07, "logps/chosen": -209.29136657714844, "logps/rejected": -220.05838012695312, "loss": 0.1649, "rewards/chosen": 1.6317064762115479, "rewards/margins": 5.285268783569336, "rewards/rejected": -3.653562307357788, "step": 2956 }, { "epoch": 0.77, "grad_norm": 32.831085205078125, "kl": 0.0, "learning_rate": 1.1305940853179795e-07, "logps/chosen": -199.46022033691406, "logps/rejected": -262.05328369140625, "loss": 0.2363, "rewards/chosen": 0.814264714717865, "rewards/margins": 4.46965217590332, "rewards/rejected": -3.6553874015808105, "step": 2957 }, { "epoch": 0.77, "grad_norm": 33.77021789550781, "kl": 0.0, "learning_rate": 1.1292855273488615e-07, "logps/chosen": -242.86158752441406, "logps/rejected": -310.06060791015625, "loss": 0.2165, "rewards/chosen": 1.675708532333374, "rewards/margins": 5.887651443481445, "rewards/rejected": -4.211942672729492, "step": 2958 }, { "epoch": 0.77, "grad_norm": 24.862733840942383, "kl": 0.0, "learning_rate": 1.1279769693797435e-07, "logps/chosen": -171.75450134277344, "logps/rejected": -218.28988647460938, "loss": 0.2095, "rewards/chosen": 2.0904273986816406, "rewards/margins": 5.917420387268066, "rewards/rejected": -3.826992988586426, "step": 2959 }, { "epoch": 0.77, "grad_norm": 32.68523406982422, "kl": 0.0, "learning_rate": 1.1266684114106256e-07, "logps/chosen": -218.8000030517578, "logps/rejected": -239.5182342529297, "loss": 0.2813, "rewards/chosen": 1.9658054113388062, "rewards/margins": 3.183340549468994, "rewards/rejected": -1.2175352573394775, "step": 2960 }, { "epoch": 0.77, "grad_norm": 21.125932693481445, "kl": 0.0, "learning_rate": 1.1253598534415074e-07, "logps/chosen": -103.0634536743164, "logps/rejected": -305.65045166015625, "loss": 0.2661, "rewards/chosen": 0.5404638051986694, "rewards/margins": 4.619140148162842, "rewards/rejected": -4.078676223754883, "step": 2961 }, { "epoch": 0.78, "grad_norm": 32.92940902709961, "kl": 0.0, "learning_rate": 1.1240512954723893e-07, "logps/chosen": -186.74815368652344, "logps/rejected": -226.94517517089844, "loss": 0.1668, "rewards/chosen": 1.232695460319519, "rewards/margins": 5.6889777183532715, "rewards/rejected": -4.456282138824463, "step": 2962 }, { "epoch": 0.78, "grad_norm": 30.524091720581055, "kl": 0.0, "learning_rate": 1.1227427375032713e-07, "logps/chosen": -180.26528930664062, "logps/rejected": -228.55606079101562, "loss": 0.3252, "rewards/chosen": 0.7677914500236511, "rewards/margins": 4.915521144866943, "rewards/rejected": -4.147729873657227, "step": 2963 }, { "epoch": 0.78, "grad_norm": 45.41800308227539, "kl": 0.0, "learning_rate": 1.1214341795341534e-07, "logps/chosen": -191.4619598388672, "logps/rejected": -252.8148193359375, "loss": 0.2961, "rewards/chosen": 1.1002200841903687, "rewards/margins": 4.552682876586914, "rewards/rejected": -3.452462673187256, "step": 2964 }, { "epoch": 0.78, "grad_norm": 26.6116886138916, "kl": 0.0, "learning_rate": 1.1201256215650352e-07, "logps/chosen": -240.11390686035156, "logps/rejected": -247.91156005859375, "loss": 0.3104, "rewards/chosen": 2.9228763580322266, "rewards/margins": 6.118831634521484, "rewards/rejected": -3.195955514907837, "step": 2965 }, { "epoch": 0.78, "grad_norm": 32.6483268737793, "kl": 0.0, "learning_rate": 1.1188170635959172e-07, "logps/chosen": -191.32887268066406, "logps/rejected": -302.2726745605469, "loss": 0.1808, "rewards/chosen": 1.0362077951431274, "rewards/margins": 4.585421562194824, "rewards/rejected": -3.5492136478424072, "step": 2966 }, { "epoch": 0.78, "grad_norm": 29.699861526489258, "kl": 0.0, "learning_rate": 1.1175085056267993e-07, "logps/chosen": -234.7059326171875, "logps/rejected": -196.4623260498047, "loss": 0.1724, "rewards/chosen": 0.14948131144046783, "rewards/margins": 3.898578405380249, "rewards/rejected": -3.7490971088409424, "step": 2967 }, { "epoch": 0.78, "grad_norm": 26.997941970825195, "kl": 0.0, "learning_rate": 1.1161999476576812e-07, "logps/chosen": -222.604736328125, "logps/rejected": -244.908935546875, "loss": 0.1768, "rewards/chosen": 1.4389097690582275, "rewards/margins": 5.924962043762207, "rewards/rejected": -4.4860520362854, "step": 2968 }, { "epoch": 0.78, "grad_norm": 34.620880126953125, "kl": 0.0, "learning_rate": 1.1148913896885631e-07, "logps/chosen": -190.08731079101562, "logps/rejected": -178.37429809570312, "loss": 0.2559, "rewards/chosen": 1.443699598312378, "rewards/margins": 4.743268013000488, "rewards/rejected": -3.2995681762695312, "step": 2969 }, { "epoch": 0.78, "grad_norm": 36.19744110107422, "kl": 0.0, "learning_rate": 1.1135828317194452e-07, "logps/chosen": -151.0272674560547, "logps/rejected": -201.56234741210938, "loss": 0.2313, "rewards/chosen": 0.9803556203842163, "rewards/margins": 3.7992358207702637, "rewards/rejected": -2.818880081176758, "step": 2970 }, { "epoch": 0.78, "grad_norm": 33.23929214477539, "kl": 0.0, "learning_rate": 1.1122742737503271e-07, "logps/chosen": -140.99327087402344, "logps/rejected": -244.1494903564453, "loss": 0.2124, "rewards/chosen": 1.0255264043807983, "rewards/margins": 5.782019138336182, "rewards/rejected": -4.756492614746094, "step": 2971 }, { "epoch": 0.78, "grad_norm": 28.24236297607422, "kl": 0.0, "learning_rate": 1.1109657157812091e-07, "logps/chosen": -147.13204956054688, "logps/rejected": -311.97467041015625, "loss": 0.2424, "rewards/chosen": 1.267746925354004, "rewards/margins": 6.412136077880859, "rewards/rejected": -5.1443891525268555, "step": 2972 }, { "epoch": 0.78, "grad_norm": 26.959043502807617, "kl": 0.0, "learning_rate": 1.109657157812091e-07, "logps/chosen": -223.3488311767578, "logps/rejected": -358.9703369140625, "loss": 0.2572, "rewards/chosen": 1.4740769863128662, "rewards/margins": 6.306148529052734, "rewards/rejected": -4.832071304321289, "step": 2973 }, { "epoch": 0.78, "grad_norm": 27.128389358520508, "kl": 0.0, "learning_rate": 1.108348599842973e-07, "logps/chosen": -102.9526596069336, "logps/rejected": -211.68307495117188, "loss": 0.2965, "rewards/chosen": -0.20596718788146973, "rewards/margins": 3.6257193088531494, "rewards/rejected": -3.831686496734619, "step": 2974 }, { "epoch": 0.78, "grad_norm": 33.70097351074219, "kl": 0.0, "learning_rate": 1.107040041873855e-07, "logps/chosen": -219.6284942626953, "logps/rejected": -213.84817504882812, "loss": 0.1994, "rewards/chosen": 3.291337728500366, "rewards/margins": 6.678530693054199, "rewards/rejected": -3.387192726135254, "step": 2975 }, { "epoch": 0.78, "grad_norm": 36.46519088745117, "kl": 0.0, "learning_rate": 1.1057314839047368e-07, "logps/chosen": -214.6738739013672, "logps/rejected": -232.84051513671875, "loss": 0.2328, "rewards/chosen": -0.8670411109924316, "rewards/margins": 2.0714240074157715, "rewards/rejected": -2.938465118408203, "step": 2976 }, { "epoch": 0.78, "grad_norm": 37.8907585144043, "kl": 0.0, "learning_rate": 1.1044229259356189e-07, "logps/chosen": -226.720458984375, "logps/rejected": -200.7624969482422, "loss": 0.2166, "rewards/chosen": 0.6200340986251831, "rewards/margins": 4.319954872131348, "rewards/rejected": -3.699920654296875, "step": 2977 }, { "epoch": 0.78, "grad_norm": 40.9567756652832, "kl": 0.0, "learning_rate": 1.1031143679665009e-07, "logps/chosen": -232.58010864257812, "logps/rejected": -379.1448669433594, "loss": 0.2943, "rewards/chosen": -0.21439041197299957, "rewards/margins": 4.33332633972168, "rewards/rejected": -4.5477166175842285, "step": 2978 }, { "epoch": 0.78, "grad_norm": 40.09104919433594, "kl": 0.0, "learning_rate": 1.1018058099973828e-07, "logps/chosen": -167.21389770507812, "logps/rejected": -303.8032531738281, "loss": 0.307, "rewards/chosen": 1.0971133708953857, "rewards/margins": 3.571532964706421, "rewards/rejected": -2.474419593811035, "step": 2979 }, { "epoch": 0.78, "grad_norm": 25.115385055541992, "kl": 0.0, "learning_rate": 1.1004972520282648e-07, "logps/chosen": -311.6871032714844, "logps/rejected": -211.0572509765625, "loss": 0.1853, "rewards/chosen": 2.4156370162963867, "rewards/margins": 6.687005043029785, "rewards/rejected": -4.271368026733398, "step": 2980 }, { "epoch": 0.78, "grad_norm": 25.411376953125, "kl": 0.0, "learning_rate": 1.0991886940591467e-07, "logps/chosen": -204.73744201660156, "logps/rejected": -330.9629821777344, "loss": 0.2114, "rewards/chosen": 0.5547571778297424, "rewards/margins": 6.988034725189209, "rewards/rejected": -6.433277606964111, "step": 2981 }, { "epoch": 0.78, "grad_norm": 34.39225387573242, "kl": 0.0, "learning_rate": 1.0978801360900287e-07, "logps/chosen": -265.6847229003906, "logps/rejected": -253.92202758789062, "loss": 0.2749, "rewards/chosen": 2.182427167892456, "rewards/margins": 5.48234748840332, "rewards/rejected": -3.299920082092285, "step": 2982 }, { "epoch": 0.78, "grad_norm": 37.2737922668457, "kl": 0.0, "learning_rate": 1.0965715781209108e-07, "logps/chosen": -220.30810546875, "logps/rejected": -286.2281188964844, "loss": 0.3222, "rewards/chosen": 0.6856479644775391, "rewards/margins": 3.8363304138183594, "rewards/rejected": -3.1506824493408203, "step": 2983 }, { "epoch": 0.78, "grad_norm": 34.21880340576172, "kl": 0.0, "learning_rate": 1.0952630201517926e-07, "logps/chosen": -256.91973876953125, "logps/rejected": -266.81805419921875, "loss": 0.2503, "rewards/chosen": 2.428946018218994, "rewards/margins": 6.026227951049805, "rewards/rejected": -3.5972819328308105, "step": 2984 }, { "epoch": 0.78, "grad_norm": 37.171661376953125, "kl": 0.0, "learning_rate": 1.0939544621826746e-07, "logps/chosen": -223.8563232421875, "logps/rejected": -173.02964782714844, "loss": 0.3763, "rewards/chosen": -0.8342050313949585, "rewards/margins": 2.1872963905334473, "rewards/rejected": -3.0215015411376953, "step": 2985 }, { "epoch": 0.78, "grad_norm": 30.91156768798828, "kl": 0.0, "learning_rate": 1.0926459042135567e-07, "logps/chosen": -216.64736938476562, "logps/rejected": -358.8420715332031, "loss": 0.2425, "rewards/chosen": 1.1421846151351929, "rewards/margins": 3.779019355773926, "rewards/rejected": -2.6368348598480225, "step": 2986 }, { "epoch": 0.78, "grad_norm": 32.80834197998047, "kl": 0.0, "learning_rate": 1.0913373462444386e-07, "logps/chosen": -257.1531677246094, "logps/rejected": -254.27359008789062, "loss": 0.2887, "rewards/chosen": 0.9386414885520935, "rewards/margins": 4.0823516845703125, "rewards/rejected": -3.143710136413574, "step": 2987 }, { "epoch": 0.78, "grad_norm": 26.716569900512695, "kl": 0.0, "learning_rate": 1.0900287882753205e-07, "logps/chosen": -203.5930938720703, "logps/rejected": -369.82818603515625, "loss": 0.2132, "rewards/chosen": 5.119094371795654, "rewards/margins": 8.760660171508789, "rewards/rejected": -3.6415653228759766, "step": 2988 }, { "epoch": 0.78, "grad_norm": 32.3159294128418, "kl": 0.0, "learning_rate": 1.0887202303062026e-07, "logps/chosen": -204.94692993164062, "logps/rejected": -323.8594665527344, "loss": 0.2114, "rewards/chosen": 1.6632729768753052, "rewards/margins": 5.176769733428955, "rewards/rejected": -3.5134966373443604, "step": 2989 }, { "epoch": 0.78, "grad_norm": 32.48981475830078, "kl": 0.0, "learning_rate": 1.0874116723370845e-07, "logps/chosen": -217.63912963867188, "logps/rejected": -200.53121948242188, "loss": 0.2789, "rewards/chosen": 0.8979160189628601, "rewards/margins": 4.462857723236084, "rewards/rejected": -3.564941644668579, "step": 2990 }, { "epoch": 0.78, "grad_norm": 24.33729362487793, "kl": 0.0, "learning_rate": 1.0861031143679665e-07, "logps/chosen": -182.7748565673828, "logps/rejected": -279.8739013671875, "loss": 0.1476, "rewards/chosen": 3.3167998790740967, "rewards/margins": 7.395058631896973, "rewards/rejected": -4.078258991241455, "step": 2991 }, { "epoch": 0.78, "grad_norm": 37.343414306640625, "kl": 0.0, "learning_rate": 1.0847945563988483e-07, "logps/chosen": -201.48919677734375, "logps/rejected": -130.3191375732422, "loss": 0.3314, "rewards/chosen": 0.16055479645729065, "rewards/margins": 2.0605573654174805, "rewards/rejected": -1.9000025987625122, "step": 2992 }, { "epoch": 0.78, "grad_norm": 64.43939971923828, "kl": 0.0, "learning_rate": 1.0834859984297304e-07, "logps/chosen": -174.33096313476562, "logps/rejected": -232.03768920898438, "loss": 0.3909, "rewards/chosen": 0.12005829811096191, "rewards/margins": 1.6579828262329102, "rewards/rejected": -1.5379245281219482, "step": 2993 }, { "epoch": 0.78, "grad_norm": 31.608854293823242, "kl": 0.0, "learning_rate": 1.0821774404606124e-07, "logps/chosen": -276.39581298828125, "logps/rejected": -246.9667510986328, "loss": 0.2036, "rewards/chosen": 2.626638412475586, "rewards/margins": 6.046180725097656, "rewards/rejected": -3.4195423126220703, "step": 2994 }, { "epoch": 0.78, "grad_norm": 33.15080642700195, "kl": 0.0, "learning_rate": 1.0808688824914943e-07, "logps/chosen": -141.77513122558594, "logps/rejected": -343.1261901855469, "loss": 0.1914, "rewards/chosen": 1.7029200792312622, "rewards/margins": 6.784095764160156, "rewards/rejected": -5.081175804138184, "step": 2995 }, { "epoch": 0.78, "grad_norm": 32.62825393676758, "kl": 0.0, "learning_rate": 1.0795603245223763e-07, "logps/chosen": -179.11143493652344, "logps/rejected": -298.8634948730469, "loss": 0.2007, "rewards/chosen": 1.274958610534668, "rewards/margins": 5.084338665008545, "rewards/rejected": -3.809380054473877, "step": 2996 }, { "epoch": 0.78, "grad_norm": 48.24810791015625, "kl": 0.0, "learning_rate": 1.0782517665532582e-07, "logps/chosen": -235.58192443847656, "logps/rejected": -326.9761657714844, "loss": 0.17, "rewards/chosen": 1.9319114685058594, "rewards/margins": 6.2909417152404785, "rewards/rejected": -4.359030246734619, "step": 2997 }, { "epoch": 0.78, "grad_norm": 26.545345306396484, "kl": 0.0, "learning_rate": 1.0769432085841402e-07, "logps/chosen": -223.30014038085938, "logps/rejected": -226.85134887695312, "loss": 0.2063, "rewards/chosen": 1.05927312374115, "rewards/margins": 6.737154960632324, "rewards/rejected": -5.677881717681885, "step": 2998 }, { "epoch": 0.78, "grad_norm": 33.72455978393555, "kl": 0.0, "learning_rate": 1.0756346506150223e-07, "logps/chosen": -100.30730438232422, "logps/rejected": -345.95928955078125, "loss": 0.2833, "rewards/chosen": 0.41382449865341187, "rewards/margins": 5.438047885894775, "rewards/rejected": -5.024223327636719, "step": 2999 }, { "epoch": 0.79, "grad_norm": 28.084556579589844, "kl": 0.0, "learning_rate": 1.0743260926459041e-07, "logps/chosen": -166.98902893066406, "logps/rejected": -267.23040771484375, "loss": 0.2217, "rewards/chosen": 0.7845678925514221, "rewards/margins": 5.768924236297607, "rewards/rejected": -4.98435640335083, "step": 3000 }, { "epoch": 0.79, "grad_norm": 36.8436393737793, "kl": 0.0, "learning_rate": 1.0730175346767861e-07, "logps/chosen": -138.873046875, "logps/rejected": -246.13360595703125, "loss": 0.2673, "rewards/chosen": 0.9685492515563965, "rewards/margins": 3.8232574462890625, "rewards/rejected": -2.854708194732666, "step": 3001 }, { "epoch": 0.79, "grad_norm": 28.18307113647461, "kl": 0.0, "learning_rate": 1.0717089767076682e-07, "logps/chosen": -263.03448486328125, "logps/rejected": -228.5595703125, "loss": 0.2002, "rewards/chosen": 1.27859365940094, "rewards/margins": 4.943618297576904, "rewards/rejected": -3.665024757385254, "step": 3002 }, { "epoch": 0.79, "grad_norm": 30.899982452392578, "kl": 0.0, "learning_rate": 1.0704004187385501e-07, "logps/chosen": -220.91151428222656, "logps/rejected": -276.31787109375, "loss": 0.3037, "rewards/chosen": 2.9530606269836426, "rewards/margins": 6.3132548332214355, "rewards/rejected": -3.360194206237793, "step": 3003 }, { "epoch": 0.79, "grad_norm": 32.476722717285156, "kl": 0.0, "learning_rate": 1.069091860769432e-07, "logps/chosen": -192.40228271484375, "logps/rejected": -217.82337951660156, "loss": 0.3322, "rewards/chosen": -0.3618360459804535, "rewards/margins": 2.2618818283081055, "rewards/rejected": -2.623717784881592, "step": 3004 }, { "epoch": 0.79, "grad_norm": 29.309284210205078, "kl": 0.0, "learning_rate": 1.067783302800314e-07, "logps/chosen": -203.76046752929688, "logps/rejected": -190.60350036621094, "loss": 0.3283, "rewards/chosen": 0.03439127281308174, "rewards/margins": 3.6848862171173096, "rewards/rejected": -3.6504950523376465, "step": 3005 }, { "epoch": 0.79, "grad_norm": 31.234689712524414, "kl": 0.0, "learning_rate": 1.066474744831196e-07, "logps/chosen": -197.9490203857422, "logps/rejected": -288.17608642578125, "loss": 0.1237, "rewards/chosen": 1.1934818029403687, "rewards/margins": 4.739086627960205, "rewards/rejected": -3.545604705810547, "step": 3006 }, { "epoch": 0.79, "grad_norm": 28.367952346801758, "kl": 0.0, "learning_rate": 1.0651661868620779e-07, "logps/chosen": -253.59230041503906, "logps/rejected": -246.80653381347656, "loss": 0.3031, "rewards/chosen": -0.3414022922515869, "rewards/margins": 3.4762983322143555, "rewards/rejected": -3.8177006244659424, "step": 3007 }, { "epoch": 0.79, "grad_norm": 31.36299705505371, "kl": 0.0, "learning_rate": 1.06385762889296e-07, "logps/chosen": -146.8282928466797, "logps/rejected": -191.821044921875, "loss": 0.3346, "rewards/chosen": 0.6221804022789001, "rewards/margins": 3.2980802059173584, "rewards/rejected": -2.6758997440338135, "step": 3008 }, { "epoch": 0.79, "grad_norm": 39.00789260864258, "kl": 0.0, "learning_rate": 1.0625490709238419e-07, "logps/chosen": -156.52456665039062, "logps/rejected": -229.74844360351562, "loss": 0.329, "rewards/chosen": -0.1007121205329895, "rewards/margins": 2.1996805667877197, "rewards/rejected": -2.3003926277160645, "step": 3009 }, { "epoch": 0.79, "grad_norm": 29.693687438964844, "kl": 0.0, "learning_rate": 1.0612405129547239e-07, "logps/chosen": -211.4034423828125, "logps/rejected": -303.6903076171875, "loss": 0.3138, "rewards/chosen": 0.26724064350128174, "rewards/margins": 4.005509853363037, "rewards/rejected": -3.738269090652466, "step": 3010 }, { "epoch": 0.79, "grad_norm": 37.14421081542969, "kl": 0.0, "learning_rate": 1.0599319549856057e-07, "logps/chosen": -326.74212646484375, "logps/rejected": -236.1087646484375, "loss": 0.3227, "rewards/chosen": 0.06870659440755844, "rewards/margins": 3.472843647003174, "rewards/rejected": -3.404137134552002, "step": 3011 }, { "epoch": 0.79, "grad_norm": 37.60954284667969, "kl": 0.0, "learning_rate": 1.0586233970164878e-07, "logps/chosen": -205.22088623046875, "logps/rejected": -304.60211181640625, "loss": 0.2637, "rewards/chosen": 2.5140395164489746, "rewards/margins": 6.879363536834717, "rewards/rejected": -4.365324020385742, "step": 3012 }, { "epoch": 0.79, "grad_norm": 40.19622802734375, "kl": 0.0, "learning_rate": 1.0573148390473697e-07, "logps/chosen": -229.6803741455078, "logps/rejected": -237.748291015625, "loss": 0.3501, "rewards/chosen": 0.07104342430830002, "rewards/margins": 2.646733045578003, "rewards/rejected": -2.5756895542144775, "step": 3013 }, { "epoch": 0.79, "grad_norm": 33.84101486206055, "kl": 0.0, "learning_rate": 1.0560062810782517e-07, "logps/chosen": -198.72361755371094, "logps/rejected": -315.6944274902344, "loss": 0.2828, "rewards/chosen": 1.207344889640808, "rewards/margins": 6.238935470581055, "rewards/rejected": -5.031590461730957, "step": 3014 }, { "epoch": 0.79, "grad_norm": 28.22218894958496, "kl": 0.0, "learning_rate": 1.0546977231091337e-07, "logps/chosen": -249.1780242919922, "logps/rejected": -194.42227172851562, "loss": 0.1329, "rewards/chosen": 2.790569543838501, "rewards/margins": 6.9953813552856445, "rewards/rejected": -4.2048115730285645, "step": 3015 }, { "epoch": 0.79, "grad_norm": 39.59652328491211, "kl": 0.0, "learning_rate": 1.0533891651400156e-07, "logps/chosen": -318.8900451660156, "logps/rejected": -175.8162078857422, "loss": 0.1602, "rewards/chosen": 1.4444025754928589, "rewards/margins": 5.619593620300293, "rewards/rejected": -4.1751909255981445, "step": 3016 }, { "epoch": 0.79, "grad_norm": 34.01449203491211, "kl": 0.0, "learning_rate": 1.0520806071708976e-07, "logps/chosen": -234.33010864257812, "logps/rejected": -186.9846954345703, "loss": 0.2612, "rewards/chosen": 0.5609415769577026, "rewards/margins": 2.225095510482788, "rewards/rejected": -1.6641539335250854, "step": 3017 }, { "epoch": 0.79, "grad_norm": 38.461822509765625, "kl": 0.0, "learning_rate": 1.0507720492017797e-07, "logps/chosen": -148.9252471923828, "logps/rejected": -353.3999328613281, "loss": 0.3001, "rewards/chosen": 1.1917644739151, "rewards/margins": 4.060915946960449, "rewards/rejected": -2.8691513538360596, "step": 3018 }, { "epoch": 0.79, "grad_norm": 34.158775329589844, "kl": 0.0, "learning_rate": 1.0494634912326615e-07, "logps/chosen": -278.2842102050781, "logps/rejected": -328.3468322753906, "loss": 0.3007, "rewards/chosen": -1.3085110187530518, "rewards/margins": 4.603449821472168, "rewards/rejected": -5.911961078643799, "step": 3019 }, { "epoch": 0.79, "grad_norm": 33.55435562133789, "kl": 0.0, "learning_rate": 1.0481549332635435e-07, "logps/chosen": -193.94454956054688, "logps/rejected": -240.3863983154297, "loss": 0.2211, "rewards/chosen": 0.9673369526863098, "rewards/margins": 4.179528713226318, "rewards/rejected": -3.2121918201446533, "step": 3020 }, { "epoch": 0.79, "grad_norm": 31.77188491821289, "kl": 0.0, "learning_rate": 1.0468463752944256e-07, "logps/chosen": -275.8807678222656, "logps/rejected": -209.6526336669922, "loss": 0.1991, "rewards/chosen": 2.488818645477295, "rewards/margins": 6.772157669067383, "rewards/rejected": -4.283339023590088, "step": 3021 }, { "epoch": 0.79, "grad_norm": 30.585309982299805, "kl": 0.0, "learning_rate": 1.0455378173253075e-07, "logps/chosen": -213.80178833007812, "logps/rejected": -218.0395050048828, "loss": 0.1788, "rewards/chosen": 1.2606884241104126, "rewards/margins": 5.7575297355651855, "rewards/rejected": -4.4968414306640625, "step": 3022 }, { "epoch": 0.79, "grad_norm": 23.492050170898438, "kl": 0.0, "learning_rate": 1.0442292593561894e-07, "logps/chosen": -174.97213745117188, "logps/rejected": -341.42987060546875, "loss": 0.181, "rewards/chosen": 0.9902437329292297, "rewards/margins": 6.303700923919678, "rewards/rejected": -5.313457012176514, "step": 3023 }, { "epoch": 0.79, "grad_norm": 33.66410827636719, "kl": 0.0, "learning_rate": 1.0429207013870715e-07, "logps/chosen": -193.3638153076172, "logps/rejected": -289.1599426269531, "loss": 0.3278, "rewards/chosen": -0.5050497651100159, "rewards/margins": 3.2536935806274414, "rewards/rejected": -3.7587432861328125, "step": 3024 }, { "epoch": 0.79, "grad_norm": 28.773303985595703, "kl": 0.0, "learning_rate": 1.0416121434179534e-07, "logps/chosen": -164.4617156982422, "logps/rejected": -325.7907409667969, "loss": 0.1888, "rewards/chosen": 1.4648340940475464, "rewards/margins": 8.07848834991455, "rewards/rejected": -6.613654136657715, "step": 3025 }, { "epoch": 0.79, "grad_norm": 26.393327713012695, "kl": 0.0, "learning_rate": 1.0403035854488354e-07, "logps/chosen": -232.78707885742188, "logps/rejected": -268.3695983886719, "loss": 0.2201, "rewards/chosen": 0.41276469826698303, "rewards/margins": 4.607279300689697, "rewards/rejected": -4.194514751434326, "step": 3026 }, { "epoch": 0.79, "grad_norm": 29.967241287231445, "kl": 0.0, "learning_rate": 1.0389950274797172e-07, "logps/chosen": -176.5338134765625, "logps/rejected": -262.23046875, "loss": 0.2127, "rewards/chosen": 1.2708486318588257, "rewards/margins": 4.850653648376465, "rewards/rejected": -3.5798051357269287, "step": 3027 }, { "epoch": 0.79, "grad_norm": 33.31254196166992, "kl": 0.0, "learning_rate": 1.0376864695105993e-07, "logps/chosen": -221.40109252929688, "logps/rejected": -205.8263702392578, "loss": 0.229, "rewards/chosen": 1.289694905281067, "rewards/margins": 4.560299873352051, "rewards/rejected": -3.2706050872802734, "step": 3028 }, { "epoch": 0.79, "grad_norm": 32.369956970214844, "kl": 0.0, "learning_rate": 1.0363779115414813e-07, "logps/chosen": -283.633544921875, "logps/rejected": -215.62583923339844, "loss": 0.2673, "rewards/chosen": 1.1404141187667847, "rewards/margins": 4.866934299468994, "rewards/rejected": -3.726520299911499, "step": 3029 }, { "epoch": 0.79, "grad_norm": 28.29862403869629, "kl": 0.0, "learning_rate": 1.0350693535723632e-07, "logps/chosen": -282.5250549316406, "logps/rejected": -267.06903076171875, "loss": 0.2235, "rewards/chosen": 1.0757641792297363, "rewards/margins": 5.065523147583008, "rewards/rejected": -3.9897589683532715, "step": 3030 }, { "epoch": 0.79, "grad_norm": 29.409870147705078, "kl": 0.0, "learning_rate": 1.0337607956032452e-07, "logps/chosen": -221.21969604492188, "logps/rejected": -300.745849609375, "loss": 0.3934, "rewards/chosen": -0.5117583274841309, "rewards/margins": 2.25506591796875, "rewards/rejected": -2.766824245452881, "step": 3031 }, { "epoch": 0.79, "grad_norm": 30.991165161132812, "kl": 0.0, "learning_rate": 1.0324522376341271e-07, "logps/chosen": -215.63917541503906, "logps/rejected": -270.92657470703125, "loss": 0.2509, "rewards/chosen": 1.370503306388855, "rewards/margins": 4.434596061706543, "rewards/rejected": -3.0640928745269775, "step": 3032 }, { "epoch": 0.79, "grad_norm": 34.13918685913086, "kl": 0.0, "learning_rate": 1.0311436796650091e-07, "logps/chosen": -212.4232635498047, "logps/rejected": -166.4032440185547, "loss": 0.1785, "rewards/chosen": -0.247951939702034, "rewards/margins": 4.787047386169434, "rewards/rejected": -5.034999370574951, "step": 3033 }, { "epoch": 0.79, "grad_norm": 31.52064323425293, "kl": 0.0, "learning_rate": 1.029835121695891e-07, "logps/chosen": -270.621826171875, "logps/rejected": -242.9385223388672, "loss": 0.2086, "rewards/chosen": 0.8697998523712158, "rewards/margins": 4.359752655029297, "rewards/rejected": -3.489952564239502, "step": 3034 }, { "epoch": 0.79, "grad_norm": 41.24197006225586, "kl": 0.0, "learning_rate": 1.028526563726773e-07, "logps/chosen": -266.4942626953125, "logps/rejected": -229.0302734375, "loss": 0.1731, "rewards/chosen": 1.4553301334381104, "rewards/margins": 5.062695503234863, "rewards/rejected": -3.607365369796753, "step": 3035 }, { "epoch": 0.79, "grad_norm": 34.90055465698242, "kl": 0.0, "learning_rate": 1.027218005757655e-07, "logps/chosen": -273.8673400878906, "logps/rejected": -152.23277282714844, "loss": 0.2045, "rewards/chosen": 0.7700100541114807, "rewards/margins": 2.528456449508667, "rewards/rejected": -1.7584463357925415, "step": 3036 }, { "epoch": 0.79, "grad_norm": 32.30648422241211, "kl": 0.0, "learning_rate": 1.0259094477885371e-07, "logps/chosen": -193.05447387695312, "logps/rejected": -302.3185729980469, "loss": 0.1595, "rewards/chosen": 1.5815995931625366, "rewards/margins": 6.047704696655273, "rewards/rejected": -4.466104984283447, "step": 3037 }, { "epoch": 0.8, "grad_norm": 37.760581970214844, "kl": 0.0, "learning_rate": 1.0246008898194189e-07, "logps/chosen": -229.99197387695312, "logps/rejected": -275.3894958496094, "loss": 0.242, "rewards/chosen": 0.6548804044723511, "rewards/margins": 4.6089677810668945, "rewards/rejected": -3.954087257385254, "step": 3038 }, { "epoch": 0.8, "grad_norm": 79.86377716064453, "kl": 0.0, "learning_rate": 1.0232923318503009e-07, "logps/chosen": -220.7820587158203, "logps/rejected": -213.56016540527344, "loss": 0.2907, "rewards/chosen": -0.5391662120819092, "rewards/margins": 3.075031042098999, "rewards/rejected": -3.614197254180908, "step": 3039 }, { "epoch": 0.8, "grad_norm": 32.71912384033203, "kl": 0.0, "learning_rate": 1.021983773881183e-07, "logps/chosen": -200.76583862304688, "logps/rejected": -203.33946228027344, "loss": 0.2753, "rewards/chosen": 0.9402883648872375, "rewards/margins": 5.0086469650268555, "rewards/rejected": -4.068358421325684, "step": 3040 }, { "epoch": 0.8, "grad_norm": 40.06270217895508, "kl": 0.0, "learning_rate": 1.0206752159120649e-07, "logps/chosen": -197.82125854492188, "logps/rejected": -200.48231506347656, "loss": 0.1958, "rewards/chosen": 1.3599319458007812, "rewards/margins": 4.730039596557617, "rewards/rejected": -3.370107650756836, "step": 3041 }, { "epoch": 0.8, "grad_norm": 29.32779884338379, "kl": 0.0, "learning_rate": 1.0193666579429467e-07, "logps/chosen": -238.9820556640625, "logps/rejected": -223.41419982910156, "loss": 0.1964, "rewards/chosen": 2.0803043842315674, "rewards/margins": 6.614651679992676, "rewards/rejected": -4.5343475341796875, "step": 3042 }, { "epoch": 0.8, "grad_norm": 28.266084671020508, "kl": 0.0, "learning_rate": 1.0180580999738287e-07, "logps/chosen": -229.38027954101562, "logps/rejected": -307.1300048828125, "loss": 0.2579, "rewards/chosen": 2.4810798168182373, "rewards/margins": 6.782350540161133, "rewards/rejected": -4.301270484924316, "step": 3043 }, { "epoch": 0.8, "grad_norm": 32.78583908081055, "kl": 0.0, "learning_rate": 1.0167495420047108e-07, "logps/chosen": -164.74696350097656, "logps/rejected": -225.92079162597656, "loss": 0.2339, "rewards/chosen": 1.194899320602417, "rewards/margins": 5.182594299316406, "rewards/rejected": -3.98769474029541, "step": 3044 }, { "epoch": 0.8, "grad_norm": 28.55518913269043, "kl": 0.0, "learning_rate": 1.0154409840355928e-07, "logps/chosen": -168.16722106933594, "logps/rejected": -226.09352111816406, "loss": 0.2739, "rewards/chosen": 0.7534551620483398, "rewards/margins": 3.9352455139160156, "rewards/rejected": -3.181790351867676, "step": 3045 }, { "epoch": 0.8, "grad_norm": 38.69511032104492, "kl": 0.0, "learning_rate": 1.0141324260664746e-07, "logps/chosen": -159.96542358398438, "logps/rejected": -276.47259521484375, "loss": 0.2454, "rewards/chosen": 0.6250556111335754, "rewards/margins": 4.5782928466796875, "rewards/rejected": -3.9532370567321777, "step": 3046 }, { "epoch": 0.8, "grad_norm": 25.491989135742188, "kl": 0.0, "learning_rate": 1.0128238680973567e-07, "logps/chosen": -162.81649780273438, "logps/rejected": -222.93453979492188, "loss": 0.1371, "rewards/chosen": 2.8242900371551514, "rewards/margins": 6.870565414428711, "rewards/rejected": -4.046275615692139, "step": 3047 }, { "epoch": 0.8, "grad_norm": 72.17855834960938, "kl": 0.0, "learning_rate": 1.0115153101282386e-07, "logps/chosen": -173.59512329101562, "logps/rejected": -192.33859252929688, "loss": 0.2457, "rewards/chosen": 2.209728240966797, "rewards/margins": 4.834383010864258, "rewards/rejected": -2.624654769897461, "step": 3048 }, { "epoch": 0.8, "grad_norm": 30.696731567382812, "kl": 0.0, "learning_rate": 1.0102067521591206e-07, "logps/chosen": -226.53488159179688, "logps/rejected": -261.52020263671875, "loss": 0.1878, "rewards/chosen": 1.8455479145050049, "rewards/margins": 6.685124397277832, "rewards/rejected": -4.839576721191406, "step": 3049 }, { "epoch": 0.8, "grad_norm": 33.9281120300293, "kl": 0.0, "learning_rate": 1.0088981941900026e-07, "logps/chosen": -170.5023651123047, "logps/rejected": -269.6197509765625, "loss": 0.2826, "rewards/chosen": -0.06187135726213455, "rewards/margins": 3.0018527507781982, "rewards/rejected": -3.0637240409851074, "step": 3050 }, { "epoch": 0.8, "grad_norm": 30.158000946044922, "kl": 0.0, "learning_rate": 1.0075896362208845e-07, "logps/chosen": -136.28103637695312, "logps/rejected": -260.2313537597656, "loss": 0.1391, "rewards/chosen": 0.9478456377983093, "rewards/margins": 5.134873390197754, "rewards/rejected": -4.187027931213379, "step": 3051 }, { "epoch": 0.8, "grad_norm": 34.88115692138672, "kl": 0.0, "learning_rate": 1.0062810782517665e-07, "logps/chosen": -159.8663330078125, "logps/rejected": -262.3153076171875, "loss": 0.3343, "rewards/chosen": 0.8258453607559204, "rewards/margins": 4.084915637969971, "rewards/rejected": -3.25907039642334, "step": 3052 }, { "epoch": 0.8, "grad_norm": 30.788469314575195, "kl": 0.0, "learning_rate": 1.0049725202826486e-07, "logps/chosen": -225.1278533935547, "logps/rejected": -225.47254943847656, "loss": 0.2801, "rewards/chosen": 1.7009352445602417, "rewards/margins": 4.015796661376953, "rewards/rejected": -2.314861297607422, "step": 3053 }, { "epoch": 0.8, "grad_norm": 37.72449493408203, "kl": 0.0, "learning_rate": 1.0036639623135304e-07, "logps/chosen": -222.17906188964844, "logps/rejected": -288.82110595703125, "loss": 0.1386, "rewards/chosen": 2.187119245529175, "rewards/margins": 6.962368965148926, "rewards/rejected": -4.77524995803833, "step": 3054 }, { "epoch": 0.8, "grad_norm": 37.279109954833984, "kl": 0.0, "learning_rate": 1.0023554043444124e-07, "logps/chosen": -167.89822387695312, "logps/rejected": -264.6224670410156, "loss": 0.2888, "rewards/chosen": 1.1163854598999023, "rewards/margins": 3.8963310718536377, "rewards/rejected": -2.7799456119537354, "step": 3055 }, { "epoch": 0.8, "grad_norm": 34.867244720458984, "kl": 0.0, "learning_rate": 1.0010468463752945e-07, "logps/chosen": -253.61399841308594, "logps/rejected": -304.2154235839844, "loss": 0.2146, "rewards/chosen": 1.490746021270752, "rewards/margins": 6.7905964851379395, "rewards/rejected": -5.2998504638671875, "step": 3056 }, { "epoch": 0.8, "grad_norm": 38.14105987548828, "kl": 0.0, "learning_rate": 9.997382884061764e-08, "logps/chosen": -143.0299530029297, "logps/rejected": -239.80638122558594, "loss": 0.2257, "rewards/chosen": 1.0738029479980469, "rewards/margins": 3.2807700634002686, "rewards/rejected": -2.2069671154022217, "step": 3057 }, { "epoch": 0.8, "grad_norm": 31.968135833740234, "kl": 0.0, "learning_rate": 9.984297304370583e-08, "logps/chosen": -222.02590942382812, "logps/rejected": -259.3979187011719, "loss": 0.18, "rewards/chosen": 1.1077613830566406, "rewards/margins": 6.395566463470459, "rewards/rejected": -5.287805080413818, "step": 3058 }, { "epoch": 0.8, "grad_norm": 30.759674072265625, "kl": 0.0, "learning_rate": 9.971211724679402e-08, "logps/chosen": -201.220458984375, "logps/rejected": -269.8891296386719, "loss": 0.1788, "rewards/chosen": 2.2144436836242676, "rewards/margins": 7.6725993156433105, "rewards/rejected": -5.458155632019043, "step": 3059 }, { "epoch": 0.8, "grad_norm": 34.30893325805664, "kl": 0.0, "learning_rate": 9.958126144988223e-08, "logps/chosen": -235.98963928222656, "logps/rejected": -171.18214416503906, "loss": 0.3627, "rewards/chosen": 0.8601173162460327, "rewards/margins": 3.5811686515808105, "rewards/rejected": -2.7210514545440674, "step": 3060 }, { "epoch": 0.8, "grad_norm": 36.614986419677734, "kl": 0.0, "learning_rate": 9.945040565297043e-08, "logps/chosen": -98.68263244628906, "logps/rejected": -207.23736572265625, "loss": 0.2558, "rewards/chosen": 0.7655553221702576, "rewards/margins": 3.5949623584747314, "rewards/rejected": -2.829406976699829, "step": 3061 }, { "epoch": 0.8, "grad_norm": 32.950923919677734, "kl": 0.0, "learning_rate": 9.931954985605861e-08, "logps/chosen": -197.07855224609375, "logps/rejected": -275.22430419921875, "loss": 0.2385, "rewards/chosen": 0.7969144582748413, "rewards/margins": 4.643563747406006, "rewards/rejected": -3.846649408340454, "step": 3062 }, { "epoch": 0.8, "grad_norm": 36.137454986572266, "kl": 0.0, "learning_rate": 9.918869405914682e-08, "logps/chosen": -171.88201904296875, "logps/rejected": -267.5704040527344, "loss": 0.314, "rewards/chosen": 2.4939815998077393, "rewards/margins": 4.7634735107421875, "rewards/rejected": -2.2694921493530273, "step": 3063 }, { "epoch": 0.8, "grad_norm": 31.99434471130371, "kl": 0.0, "learning_rate": 9.905783826223502e-08, "logps/chosen": -248.55218505859375, "logps/rejected": -202.24285888671875, "loss": 0.3361, "rewards/chosen": -0.03006848692893982, "rewards/margins": 2.202803134918213, "rewards/rejected": -2.2328715324401855, "step": 3064 }, { "epoch": 0.8, "grad_norm": 43.168006896972656, "kl": 0.0, "learning_rate": 9.89269824653232e-08, "logps/chosen": -233.45132446289062, "logps/rejected": -284.4450988769531, "loss": 0.1988, "rewards/chosen": 0.9019581079483032, "rewards/margins": 4.239256858825684, "rewards/rejected": -3.33729887008667, "step": 3065 }, { "epoch": 0.8, "grad_norm": 34.45249938964844, "kl": 0.0, "learning_rate": 9.879612666841141e-08, "logps/chosen": -278.5670471191406, "logps/rejected": -264.644775390625, "loss": 0.2263, "rewards/chosen": 2.1087911128997803, "rewards/margins": 5.218087196350098, "rewards/rejected": -3.1092960834503174, "step": 3066 }, { "epoch": 0.8, "grad_norm": 24.53069305419922, "kl": 0.0, "learning_rate": 9.86652708714996e-08, "logps/chosen": -190.2322540283203, "logps/rejected": -289.7329406738281, "loss": 0.1922, "rewards/chosen": 1.9357092380523682, "rewards/margins": 6.088913917541504, "rewards/rejected": -4.153204441070557, "step": 3067 }, { "epoch": 0.8, "grad_norm": 34.11714553833008, "kl": 0.0, "learning_rate": 9.85344150745878e-08, "logps/chosen": -245.92459106445312, "logps/rejected": -246.92396545410156, "loss": 0.183, "rewards/chosen": 0.07540301233530045, "rewards/margins": 3.0833206176757812, "rewards/rejected": -3.007917642593384, "step": 3068 }, { "epoch": 0.8, "grad_norm": 27.566394805908203, "kl": 0.0, "learning_rate": 9.8403559277676e-08, "logps/chosen": -217.20533752441406, "logps/rejected": -254.9765167236328, "loss": 0.2821, "rewards/chosen": 0.8280597925186157, "rewards/margins": 5.826291561126709, "rewards/rejected": -4.998231887817383, "step": 3069 }, { "epoch": 0.8, "grad_norm": 35.68161392211914, "kl": 0.0, "learning_rate": 9.827270348076419e-08, "logps/chosen": -216.37933349609375, "logps/rejected": -239.38841247558594, "loss": 0.2848, "rewards/chosen": 1.006296157836914, "rewards/margins": 3.833810329437256, "rewards/rejected": -2.827514171600342, "step": 3070 }, { "epoch": 0.8, "grad_norm": 31.20342445373535, "kl": 0.0, "learning_rate": 9.814184768385239e-08, "logps/chosen": -115.98530578613281, "logps/rejected": -222.46414184570312, "loss": 0.2491, "rewards/chosen": 1.8454002141952515, "rewards/margins": 4.2929582595825195, "rewards/rejected": -2.4475579261779785, "step": 3071 }, { "epoch": 0.8, "grad_norm": 33.1669807434082, "kl": 0.0, "learning_rate": 9.80109918869406e-08, "logps/chosen": -305.29730224609375, "logps/rejected": -164.39962768554688, "loss": 0.3077, "rewards/chosen": 1.2443612813949585, "rewards/margins": 3.3249335289001465, "rewards/rejected": -2.0805721282958984, "step": 3072 }, { "epoch": 0.8, "grad_norm": 26.35659408569336, "kl": 0.0, "learning_rate": 9.788013609002878e-08, "logps/chosen": -216.8447265625, "logps/rejected": -254.26968383789062, "loss": 0.2024, "rewards/chosen": 2.339702606201172, "rewards/margins": 7.2026872634887695, "rewards/rejected": -4.862984657287598, "step": 3073 }, { "epoch": 0.8, "grad_norm": 34.57036590576172, "kl": 0.0, "learning_rate": 9.774928029311698e-08, "logps/chosen": -225.0780792236328, "logps/rejected": -183.37353515625, "loss": 0.1963, "rewards/chosen": 1.5948823690414429, "rewards/margins": 3.4738454818725586, "rewards/rejected": -1.8789631128311157, "step": 3074 }, { "epoch": 0.8, "grad_norm": 55.75965881347656, "kl": 0.0, "learning_rate": 9.761842449620519e-08, "logps/chosen": -236.33316040039062, "logps/rejected": -229.29071044921875, "loss": 0.3662, "rewards/chosen": -0.06309998035430908, "rewards/margins": 4.684689521789551, "rewards/rejected": -4.74778938293457, "step": 3075 }, { "epoch": 0.81, "grad_norm": 29.502851486206055, "kl": 0.0, "learning_rate": 9.748756869929338e-08, "logps/chosen": -151.8082733154297, "logps/rejected": -215.3000946044922, "loss": 0.1216, "rewards/chosen": 1.3713454008102417, "rewards/margins": 5.170321464538574, "rewards/rejected": -3.798976182937622, "step": 3076 }, { "epoch": 0.81, "grad_norm": 28.440292358398438, "kl": 0.0, "learning_rate": 9.735671290238156e-08, "logps/chosen": -153.1015167236328, "logps/rejected": -242.10238647460938, "loss": 0.1821, "rewards/chosen": 1.123049259185791, "rewards/margins": 6.207904815673828, "rewards/rejected": -5.084855556488037, "step": 3077 }, { "epoch": 0.81, "grad_norm": 39.055419921875, "kl": 0.0, "learning_rate": 9.722585710546976e-08, "logps/chosen": -214.06842041015625, "logps/rejected": -252.7290802001953, "loss": 0.2904, "rewards/chosen": 0.05434975028038025, "rewards/margins": 3.1375062465667725, "rewards/rejected": -3.0831565856933594, "step": 3078 }, { "epoch": 0.81, "grad_norm": 25.828947067260742, "kl": 0.0, "learning_rate": 9.709500130855797e-08, "logps/chosen": -201.25909423828125, "logps/rejected": -267.3767395019531, "loss": 0.1955, "rewards/chosen": 3.0395989418029785, "rewards/margins": 6.232514381408691, "rewards/rejected": -3.192915439605713, "step": 3079 }, { "epoch": 0.81, "grad_norm": 30.39990997314453, "kl": 0.0, "learning_rate": 9.696414551164617e-08, "logps/chosen": -180.36355590820312, "logps/rejected": -194.8758544921875, "loss": 0.2182, "rewards/chosen": 0.9802899360656738, "rewards/margins": 4.716524124145508, "rewards/rejected": -3.736234188079834, "step": 3080 }, { "epoch": 0.81, "grad_norm": 38.28114700317383, "kl": 0.0, "learning_rate": 9.683328971473435e-08, "logps/chosen": -188.90740966796875, "logps/rejected": -207.32540893554688, "loss": 0.2517, "rewards/chosen": 1.9970073699951172, "rewards/margins": 4.750757694244385, "rewards/rejected": -2.7537503242492676, "step": 3081 }, { "epoch": 0.81, "grad_norm": 29.68705177307129, "kl": 0.0, "learning_rate": 9.670243391782256e-08, "logps/chosen": -161.58554077148438, "logps/rejected": -221.57876586914062, "loss": 0.2469, "rewards/chosen": 0.7848741412162781, "rewards/margins": 5.099874019622803, "rewards/rejected": -4.315000057220459, "step": 3082 }, { "epoch": 0.81, "grad_norm": 27.6368465423584, "kl": 0.0, "learning_rate": 9.657157812091075e-08, "logps/chosen": -253.8367462158203, "logps/rejected": -281.7640380859375, "loss": 0.1273, "rewards/chosen": 2.3699400424957275, "rewards/margins": 6.810979843139648, "rewards/rejected": -4.4410400390625, "step": 3083 }, { "epoch": 0.81, "grad_norm": 21.8061466217041, "kl": 0.0, "learning_rate": 9.644072232399895e-08, "logps/chosen": -178.3246307373047, "logps/rejected": -343.13031005859375, "loss": 0.2539, "rewards/chosen": -0.23348580300807953, "rewards/margins": 4.196106433868408, "rewards/rejected": -4.429592132568359, "step": 3084 }, { "epoch": 0.81, "grad_norm": 33.39321517944336, "kl": 0.0, "learning_rate": 9.630986652708715e-08, "logps/chosen": -177.02105712890625, "logps/rejected": -295.70208740234375, "loss": 0.2628, "rewards/chosen": -0.0624721497297287, "rewards/margins": 5.96860933303833, "rewards/rejected": -6.031081676483154, "step": 3085 }, { "epoch": 0.81, "grad_norm": 39.35239028930664, "kl": 0.0, "learning_rate": 9.617901073017534e-08, "logps/chosen": -204.71322631835938, "logps/rejected": -252.45391845703125, "loss": 0.2637, "rewards/chosen": -0.5764102935791016, "rewards/margins": 2.669811725616455, "rewards/rejected": -3.2462220191955566, "step": 3086 }, { "epoch": 0.81, "grad_norm": 29.97749900817871, "kl": 0.0, "learning_rate": 9.604815493326354e-08, "logps/chosen": -156.9673614501953, "logps/rejected": -241.162109375, "loss": 0.2395, "rewards/chosen": 1.5875059366226196, "rewards/margins": 5.665582656860352, "rewards/rejected": -4.0780768394470215, "step": 3087 }, { "epoch": 0.81, "grad_norm": 34.644081115722656, "kl": 0.0, "learning_rate": 9.591729913635175e-08, "logps/chosen": -209.1474609375, "logps/rejected": -223.16012573242188, "loss": 0.2186, "rewards/chosen": 0.9284906983375549, "rewards/margins": 5.58828067779541, "rewards/rejected": -4.6597900390625, "step": 3088 }, { "epoch": 0.81, "grad_norm": 48.65402603149414, "kl": 0.0, "learning_rate": 9.578644333943993e-08, "logps/chosen": -222.0018768310547, "logps/rejected": -224.7173309326172, "loss": 0.3318, "rewards/chosen": 0.18279579281806946, "rewards/margins": 2.2575557231903076, "rewards/rejected": -2.0747599601745605, "step": 3089 }, { "epoch": 0.81, "grad_norm": 47.96308517456055, "kl": 0.0, "learning_rate": 9.565558754252813e-08, "logps/chosen": -331.3664245605469, "logps/rejected": -217.60836791992188, "loss": 0.2445, "rewards/chosen": 0.6762347221374512, "rewards/margins": 5.298969745635986, "rewards/rejected": -4.622735023498535, "step": 3090 }, { "epoch": 0.81, "grad_norm": 31.31795310974121, "kl": 0.0, "learning_rate": 9.552473174561634e-08, "logps/chosen": -204.9393310546875, "logps/rejected": -170.1268310546875, "loss": 0.3128, "rewards/chosen": 1.282555103302002, "rewards/margins": 4.115183353424072, "rewards/rejected": -2.8326282501220703, "step": 3091 }, { "epoch": 0.81, "grad_norm": 34.949859619140625, "kl": 0.0, "learning_rate": 9.539387594870452e-08, "logps/chosen": -173.3063201904297, "logps/rejected": -293.59075927734375, "loss": 0.2331, "rewards/chosen": 1.1828242540359497, "rewards/margins": 6.227013111114502, "rewards/rejected": -5.044188976287842, "step": 3092 }, { "epoch": 0.81, "grad_norm": 42.05825424194336, "kl": 0.0, "learning_rate": 9.526302015179272e-08, "logps/chosen": -229.2870635986328, "logps/rejected": -262.56317138671875, "loss": 0.2754, "rewards/chosen": 2.785039186477661, "rewards/margins": 5.348743438720703, "rewards/rejected": -2.563704013824463, "step": 3093 }, { "epoch": 0.81, "grad_norm": 31.66534996032715, "kl": 0.0, "learning_rate": 9.513216435488091e-08, "logps/chosen": -170.36338806152344, "logps/rejected": -229.820556640625, "loss": 0.2134, "rewards/chosen": 0.22713103890419006, "rewards/margins": 3.0389466285705566, "rewards/rejected": -2.8118155002593994, "step": 3094 }, { "epoch": 0.81, "grad_norm": 40.741703033447266, "kl": 0.0, "learning_rate": 9.500130855796912e-08, "logps/chosen": -271.0154113769531, "logps/rejected": -269.4881591796875, "loss": 0.3646, "rewards/chosen": 0.5402659177780151, "rewards/margins": 4.598776340484619, "rewards/rejected": -4.0585103034973145, "step": 3095 }, { "epoch": 0.81, "grad_norm": 37.812774658203125, "kl": 0.0, "learning_rate": 9.48704527610573e-08, "logps/chosen": -165.02536010742188, "logps/rejected": -289.88604736328125, "loss": 0.2696, "rewards/chosen": -0.4038521647453308, "rewards/margins": 1.8678739070892334, "rewards/rejected": -2.271726131439209, "step": 3096 }, { "epoch": 0.81, "grad_norm": 35.640708923339844, "kl": 0.0, "learning_rate": 9.47395969641455e-08, "logps/chosen": -293.63604736328125, "logps/rejected": -247.36587524414062, "loss": 0.1799, "rewards/chosen": 3.4284310340881348, "rewards/margins": 6.821096420288086, "rewards/rejected": -3.392665386199951, "step": 3097 }, { "epoch": 0.81, "grad_norm": 27.39713478088379, "kl": 0.0, "learning_rate": 9.460874116723371e-08, "logps/chosen": -226.48678588867188, "logps/rejected": -239.13442993164062, "loss": 0.2349, "rewards/chosen": 2.1605045795440674, "rewards/margins": 7.0987749099731445, "rewards/rejected": -4.938270568847656, "step": 3098 }, { "epoch": 0.81, "grad_norm": 23.795522689819336, "kl": 0.0, "learning_rate": 9.44778853703219e-08, "logps/chosen": -133.2270965576172, "logps/rejected": -317.48931884765625, "loss": 0.1622, "rewards/chosen": 2.4678852558135986, "rewards/margins": 7.570594787597656, "rewards/rejected": -5.102709770202637, "step": 3099 }, { "epoch": 0.81, "grad_norm": 48.198516845703125, "kl": 0.0, "learning_rate": 9.434702957341009e-08, "logps/chosen": -268.0385437011719, "logps/rejected": -235.6272430419922, "loss": 0.2053, "rewards/chosen": -1.1146745681762695, "rewards/margins": 4.273561477661133, "rewards/rejected": -5.388236045837402, "step": 3100 }, { "epoch": 0.81, "grad_norm": 28.210222244262695, "kl": 0.0, "learning_rate": 9.42161737764983e-08, "logps/chosen": -179.96897888183594, "logps/rejected": -268.2703857421875, "loss": 0.174, "rewards/chosen": 1.6665213108062744, "rewards/margins": 7.456060409545898, "rewards/rejected": -5.789538860321045, "step": 3101 }, { "epoch": 0.81, "grad_norm": 43.28695297241211, "kl": 0.0, "learning_rate": 9.408531797958649e-08, "logps/chosen": -325.22027587890625, "logps/rejected": -166.30465698242188, "loss": 0.3118, "rewards/chosen": 0.15811419486999512, "rewards/margins": 3.1421120166778564, "rewards/rejected": -2.9839978218078613, "step": 3102 }, { "epoch": 0.81, "grad_norm": 36.104244232177734, "kl": 0.0, "learning_rate": 9.395446218267469e-08, "logps/chosen": -261.4429016113281, "logps/rejected": -263.02490234375, "loss": 0.2642, "rewards/chosen": 1.4801288843154907, "rewards/margins": 3.8422746658325195, "rewards/rejected": -2.3621459007263184, "step": 3103 }, { "epoch": 0.81, "grad_norm": 27.173694610595703, "kl": 0.0, "learning_rate": 9.382360638576289e-08, "logps/chosen": -144.93190002441406, "logps/rejected": -255.46478271484375, "loss": 0.26, "rewards/chosen": 1.242961049079895, "rewards/margins": 5.085175037384033, "rewards/rejected": -3.8422141075134277, "step": 3104 }, { "epoch": 0.81, "grad_norm": 46.38798904418945, "kl": 0.0, "learning_rate": 9.369275058885108e-08, "logps/chosen": -229.40740966796875, "logps/rejected": -259.7017822265625, "loss": 0.349, "rewards/chosen": -0.6156241297721863, "rewards/margins": 3.493295431137085, "rewards/rejected": -4.108919620513916, "step": 3105 }, { "epoch": 0.81, "grad_norm": 33.67306137084961, "kl": 0.0, "learning_rate": 9.356189479193928e-08, "logps/chosen": -202.85719299316406, "logps/rejected": -230.45826721191406, "loss": 0.2143, "rewards/chosen": 1.6097205877304077, "rewards/margins": 4.408946990966797, "rewards/rejected": -2.7992265224456787, "step": 3106 }, { "epoch": 0.81, "grad_norm": 28.656518936157227, "kl": 0.0, "learning_rate": 9.343103899502749e-08, "logps/chosen": -175.84097290039062, "logps/rejected": -281.7679443359375, "loss": 0.2748, "rewards/chosen": 1.2173833847045898, "rewards/margins": 5.868319034576416, "rewards/rejected": -4.650935649871826, "step": 3107 }, { "epoch": 0.81, "grad_norm": 47.980682373046875, "kl": 0.0, "learning_rate": 9.330018319811567e-08, "logps/chosen": -256.8486633300781, "logps/rejected": -277.4222412109375, "loss": 0.295, "rewards/chosen": 1.240675687789917, "rewards/margins": 6.917579650878906, "rewards/rejected": -5.676904201507568, "step": 3108 }, { "epoch": 0.81, "grad_norm": 41.02346420288086, "kl": 0.0, "learning_rate": 9.316932740120387e-08, "logps/chosen": -216.6624298095703, "logps/rejected": -325.7963562011719, "loss": 0.2024, "rewards/chosen": 0.9395179748535156, "rewards/margins": 3.6543073654174805, "rewards/rejected": -2.714789390563965, "step": 3109 }, { "epoch": 0.81, "grad_norm": 36.90616226196289, "kl": 0.0, "learning_rate": 9.303847160429206e-08, "logps/chosen": -180.5063018798828, "logps/rejected": -304.2553405761719, "loss": 0.2122, "rewards/chosen": 1.2775695323944092, "rewards/margins": 5.875765800476074, "rewards/rejected": -4.598196506500244, "step": 3110 }, { "epoch": 0.81, "grad_norm": 39.39779281616211, "kl": 0.0, "learning_rate": 9.290761580738027e-08, "logps/chosen": -241.53070068359375, "logps/rejected": -229.9088897705078, "loss": 0.3191, "rewards/chosen": 2.081984043121338, "rewards/margins": 6.383558750152588, "rewards/rejected": -4.30157470703125, "step": 3111 }, { "epoch": 0.81, "grad_norm": 32.545494079589844, "kl": 0.0, "learning_rate": 9.277676001046845e-08, "logps/chosen": -128.02120971679688, "logps/rejected": -202.7642059326172, "loss": 0.2003, "rewards/chosen": 2.5098001956939697, "rewards/margins": 5.570107460021973, "rewards/rejected": -3.060307502746582, "step": 3112 }, { "epoch": 0.81, "grad_norm": 26.97892189025879, "kl": 0.0, "learning_rate": 9.264590421355665e-08, "logps/chosen": -230.6981964111328, "logps/rejected": -196.49899291992188, "loss": 0.2041, "rewards/chosen": 2.649827241897583, "rewards/margins": 5.650113105773926, "rewards/rejected": -3.000286102294922, "step": 3113 }, { "epoch": 0.81, "grad_norm": 43.1445198059082, "kl": 0.0, "learning_rate": 9.251504841664486e-08, "logps/chosen": -194.0313720703125, "logps/rejected": -234.02328491210938, "loss": 0.2563, "rewards/chosen": 0.5206071138381958, "rewards/margins": 2.5637826919555664, "rewards/rejected": -2.043175458908081, "step": 3114 }, { "epoch": 0.82, "grad_norm": 34.82023620605469, "kl": 0.0, "learning_rate": 9.238419261973306e-08, "logps/chosen": -172.72352600097656, "logps/rejected": -207.0159912109375, "loss": 0.3734, "rewards/chosen": 0.12107408046722412, "rewards/margins": 3.2228050231933594, "rewards/rejected": -3.1017308235168457, "step": 3115 }, { "epoch": 0.82, "grad_norm": 38.627567291259766, "kl": 0.0, "learning_rate": 9.225333682282124e-08, "logps/chosen": -227.6630859375, "logps/rejected": -323.3218688964844, "loss": 0.2393, "rewards/chosen": 1.4282304048538208, "rewards/margins": 6.921871662139893, "rewards/rejected": -5.493641376495361, "step": 3116 }, { "epoch": 0.82, "grad_norm": 35.75371551513672, "kl": 0.0, "learning_rate": 9.212248102590945e-08, "logps/chosen": -199.0106201171875, "logps/rejected": -221.68994140625, "loss": 0.2503, "rewards/chosen": 1.679624319076538, "rewards/margins": 4.361454010009766, "rewards/rejected": -2.6818299293518066, "step": 3117 }, { "epoch": 0.82, "grad_norm": 24.521865844726562, "kl": 0.0, "learning_rate": 9.199162522899764e-08, "logps/chosen": -169.25497436523438, "logps/rejected": -308.73388671875, "loss": 0.1887, "rewards/chosen": 1.4655717611312866, "rewards/margins": 6.660384178161621, "rewards/rejected": -5.194812297821045, "step": 3118 }, { "epoch": 0.82, "grad_norm": 30.80224609375, "kl": 0.0, "learning_rate": 9.186076943208584e-08, "logps/chosen": -221.1312713623047, "logps/rejected": -227.04100036621094, "loss": 0.2005, "rewards/chosen": 1.6574556827545166, "rewards/margins": 5.227529525756836, "rewards/rejected": -3.5700738430023193, "step": 3119 }, { "epoch": 0.82, "grad_norm": 29.5427188873291, "kl": 0.0, "learning_rate": 9.172991363517404e-08, "logps/chosen": -170.79373168945312, "logps/rejected": -238.59603881835938, "loss": 0.1312, "rewards/chosen": 1.5359221696853638, "rewards/margins": 7.736850738525391, "rewards/rejected": -6.200928688049316, "step": 3120 }, { "epoch": 0.82, "grad_norm": 32.508872985839844, "kl": 0.0, "learning_rate": 9.159905783826223e-08, "logps/chosen": -310.5272521972656, "logps/rejected": -301.6024475097656, "loss": 0.1886, "rewards/chosen": 1.9752588272094727, "rewards/margins": 6.883115291595459, "rewards/rejected": -4.907856464385986, "step": 3121 }, { "epoch": 0.82, "grad_norm": 38.51951217651367, "kl": 0.0, "learning_rate": 9.146820204135043e-08, "logps/chosen": -228.05850219726562, "logps/rejected": -237.06768798828125, "loss": 0.3218, "rewards/chosen": -1.9714559316635132, "rewards/margins": 1.3739887475967407, "rewards/rejected": -3.345444679260254, "step": 3122 }, { "epoch": 0.82, "grad_norm": 31.491409301757812, "kl": 0.0, "learning_rate": 9.133734624443861e-08, "logps/chosen": -211.30628967285156, "logps/rejected": -281.8726501464844, "loss": 0.1737, "rewards/chosen": 1.4899225234985352, "rewards/margins": 5.676398277282715, "rewards/rejected": -4.18647575378418, "step": 3123 }, { "epoch": 0.82, "grad_norm": 26.910266876220703, "kl": 0.0, "learning_rate": 9.120649044752682e-08, "logps/chosen": -184.70957946777344, "logps/rejected": -263.70806884765625, "loss": 0.3086, "rewards/chosen": 0.3036360740661621, "rewards/margins": 4.226426124572754, "rewards/rejected": -3.922790288925171, "step": 3124 }, { "epoch": 0.82, "grad_norm": 39.30774688720703, "kl": 0.0, "learning_rate": 9.107563465061502e-08, "logps/chosen": -245.2247314453125, "logps/rejected": -242.0779266357422, "loss": 0.2454, "rewards/chosen": 0.7935837507247925, "rewards/margins": 4.356839179992676, "rewards/rejected": -3.563255548477173, "step": 3125 }, { "epoch": 0.82, "grad_norm": 36.49482345581055, "kl": 0.0, "learning_rate": 9.094477885370321e-08, "logps/chosen": -235.6526641845703, "logps/rejected": -206.50897216796875, "loss": 0.1504, "rewards/chosen": 2.0932252407073975, "rewards/margins": 6.678851127624512, "rewards/rejected": -4.585626125335693, "step": 3126 }, { "epoch": 0.82, "grad_norm": 35.809295654296875, "kl": 0.0, "learning_rate": 9.081392305679141e-08, "logps/chosen": -160.95736694335938, "logps/rejected": -175.06797790527344, "loss": 0.3326, "rewards/chosen": 0.812410831451416, "rewards/margins": 2.593329906463623, "rewards/rejected": -1.7809189558029175, "step": 3127 }, { "epoch": 0.82, "grad_norm": 38.98195266723633, "kl": 0.0, "learning_rate": 9.06830672598796e-08, "logps/chosen": -209.9936065673828, "logps/rejected": -307.116943359375, "loss": 0.2013, "rewards/chosen": 2.0579047203063965, "rewards/margins": 6.153413772583008, "rewards/rejected": -4.095509052276611, "step": 3128 }, { "epoch": 0.82, "grad_norm": 21.438058853149414, "kl": 0.0, "learning_rate": 9.05522114629678e-08, "logps/chosen": -157.38998413085938, "logps/rejected": -233.13430786132812, "loss": 0.3075, "rewards/chosen": -0.4898746609687805, "rewards/margins": 3.3792076110839844, "rewards/rejected": -3.86908221244812, "step": 3129 }, { "epoch": 0.82, "grad_norm": 29.17632293701172, "kl": 0.0, "learning_rate": 9.042135566605601e-08, "logps/chosen": -199.85702514648438, "logps/rejected": -213.35757446289062, "loss": 0.2223, "rewards/chosen": 2.3872687816619873, "rewards/margins": 5.311424255371094, "rewards/rejected": -2.9241557121276855, "step": 3130 }, { "epoch": 0.82, "grad_norm": 29.56111717224121, "kl": 0.0, "learning_rate": 9.029049986914419e-08, "logps/chosen": -213.77813720703125, "logps/rejected": -226.4279327392578, "loss": 0.2771, "rewards/chosen": 1.5997400283813477, "rewards/margins": 5.683679580688477, "rewards/rejected": -4.083939552307129, "step": 3131 }, { "epoch": 0.82, "grad_norm": 33.34666442871094, "kl": 0.0, "learning_rate": 9.015964407223239e-08, "logps/chosen": -209.387451171875, "logps/rejected": -244.4696044921875, "loss": 0.2269, "rewards/chosen": 0.500552237033844, "rewards/margins": 4.081018447875977, "rewards/rejected": -3.5804662704467773, "step": 3132 }, { "epoch": 0.82, "grad_norm": 30.667217254638672, "kl": 0.0, "learning_rate": 9.00287882753206e-08, "logps/chosen": -215.02833557128906, "logps/rejected": -169.5201873779297, "loss": 0.2921, "rewards/chosen": 0.9950622320175171, "rewards/margins": 3.234675407409668, "rewards/rejected": -2.2396130561828613, "step": 3133 }, { "epoch": 0.82, "grad_norm": 34.83656692504883, "kl": 0.0, "learning_rate": 8.98979324784088e-08, "logps/chosen": -212.655517578125, "logps/rejected": -229.4166259765625, "loss": 0.1796, "rewards/chosen": 1.0849932432174683, "rewards/margins": 5.354401111602783, "rewards/rejected": -4.269407749176025, "step": 3134 }, { "epoch": 0.82, "grad_norm": 25.507356643676758, "kl": 0.0, "learning_rate": 8.976707668149698e-08, "logps/chosen": -231.75711059570312, "logps/rejected": -250.7727813720703, "loss": 0.2336, "rewards/chosen": 1.0825836658477783, "rewards/margins": 4.7187910079956055, "rewards/rejected": -3.636207103729248, "step": 3135 }, { "epoch": 0.82, "grad_norm": 33.45425796508789, "kl": 0.0, "learning_rate": 8.963622088458519e-08, "logps/chosen": -207.46600341796875, "logps/rejected": -223.568359375, "loss": 0.3145, "rewards/chosen": 1.58919095993042, "rewards/margins": 3.960312843322754, "rewards/rejected": -2.371121883392334, "step": 3136 }, { "epoch": 0.82, "grad_norm": 30.995628356933594, "kl": 0.0, "learning_rate": 8.950536508767338e-08, "logps/chosen": -165.9071502685547, "logps/rejected": -265.2596130371094, "loss": 0.183, "rewards/chosen": 1.7723157405853271, "rewards/margins": 5.585575580596924, "rewards/rejected": -3.8132598400115967, "step": 3137 }, { "epoch": 0.82, "grad_norm": 44.489688873291016, "kl": 0.0, "learning_rate": 8.937450929076158e-08, "logps/chosen": -276.55706787109375, "logps/rejected": -195.18429565429688, "loss": 0.2868, "rewards/chosen": 2.160369396209717, "rewards/margins": 3.251467227935791, "rewards/rejected": -1.0910977125167847, "step": 3138 }, { "epoch": 0.82, "grad_norm": 30.777244567871094, "kl": 0.0, "learning_rate": 8.924365349384978e-08, "logps/chosen": -179.30703735351562, "logps/rejected": -178.9294891357422, "loss": 0.2333, "rewards/chosen": 1.8039159774780273, "rewards/margins": 4.568762302398682, "rewards/rejected": -2.7648463249206543, "step": 3139 }, { "epoch": 0.82, "grad_norm": 92.86982727050781, "kl": 0.0, "learning_rate": 8.911279769693797e-08, "logps/chosen": -151.19696044921875, "logps/rejected": -204.7473602294922, "loss": 0.2595, "rewards/chosen": 1.152949571609497, "rewards/margins": 5.322902679443359, "rewards/rejected": -4.169952869415283, "step": 3140 }, { "epoch": 0.82, "grad_norm": 36.63436508178711, "kl": 0.0, "learning_rate": 8.898194190002617e-08, "logps/chosen": -215.37210083007812, "logps/rejected": -314.5213317871094, "loss": 0.2001, "rewards/chosen": 2.3977222442626953, "rewards/margins": 5.928170204162598, "rewards/rejected": -3.5304479598999023, "step": 3141 }, { "epoch": 0.82, "grad_norm": 35.54846954345703, "kl": 0.0, "learning_rate": 8.885108610311436e-08, "logps/chosen": -189.39260864257812, "logps/rejected": -171.74534606933594, "loss": 0.2853, "rewards/chosen": 0.25655680894851685, "rewards/margins": 3.7083852291107178, "rewards/rejected": -3.4518284797668457, "step": 3142 }, { "epoch": 0.82, "grad_norm": 39.20547103881836, "kl": 0.0, "learning_rate": 8.872023030620256e-08, "logps/chosen": -197.0155029296875, "logps/rejected": -288.4905700683594, "loss": 0.2787, "rewards/chosen": 0.012832609005272388, "rewards/margins": 3.237461805343628, "rewards/rejected": -3.2246291637420654, "step": 3143 }, { "epoch": 0.82, "grad_norm": 38.3444938659668, "kl": 0.0, "learning_rate": 8.858937450929076e-08, "logps/chosen": -202.22515869140625, "logps/rejected": -263.79852294921875, "loss": 0.2313, "rewards/chosen": 2.373213529586792, "rewards/margins": 5.190075874328613, "rewards/rejected": -2.8168625831604004, "step": 3144 }, { "epoch": 0.82, "grad_norm": 29.71002769470215, "kl": 0.0, "learning_rate": 8.845851871237895e-08, "logps/chosen": -299.419921875, "logps/rejected": -257.6999206542969, "loss": 0.0992, "rewards/chosen": 2.6316373348236084, "rewards/margins": 6.821929931640625, "rewards/rejected": -4.190292835235596, "step": 3145 }, { "epoch": 0.82, "grad_norm": 26.98137092590332, "kl": 0.0, "learning_rate": 8.832766291546716e-08, "logps/chosen": -159.2989959716797, "logps/rejected": -230.6924591064453, "loss": 0.2329, "rewards/chosen": 0.03763216733932495, "rewards/margins": 4.52135705947876, "rewards/rejected": -4.483725070953369, "step": 3146 }, { "epoch": 0.82, "grad_norm": 33.412445068359375, "kl": 0.0, "learning_rate": 8.819680711855534e-08, "logps/chosen": -233.58352661132812, "logps/rejected": -160.78826904296875, "loss": 0.2634, "rewards/chosen": 0.21745315194129944, "rewards/margins": 3.5812926292419434, "rewards/rejected": -3.3638393878936768, "step": 3147 }, { "epoch": 0.82, "grad_norm": 36.100624084472656, "kl": 0.0, "learning_rate": 8.806595132164354e-08, "logps/chosen": -197.14633178710938, "logps/rejected": -150.0326385498047, "loss": 0.3934, "rewards/chosen": 0.8015373349189758, "rewards/margins": 2.3115899562835693, "rewards/rejected": -1.5100525617599487, "step": 3148 }, { "epoch": 0.82, "grad_norm": 36.14215850830078, "kl": 0.0, "learning_rate": 8.793509552473175e-08, "logps/chosen": -221.76010131835938, "logps/rejected": -222.3566131591797, "loss": 0.1804, "rewards/chosen": 0.9301916360855103, "rewards/margins": 5.47138786315918, "rewards/rejected": -4.541196346282959, "step": 3149 }, { "epoch": 0.82, "grad_norm": 33.10061264038086, "kl": 0.0, "learning_rate": 8.780423972781995e-08, "logps/chosen": -146.35745239257812, "logps/rejected": -314.1618347167969, "loss": 0.2803, "rewards/chosen": 0.9166402220726013, "rewards/margins": 2.8424408435821533, "rewards/rejected": -1.9258005619049072, "step": 3150 }, { "epoch": 0.82, "grad_norm": 25.759798049926758, "kl": 0.0, "learning_rate": 8.767338393090813e-08, "logps/chosen": -185.44004821777344, "logps/rejected": -284.9614562988281, "loss": 0.1916, "rewards/chosen": 1.393906593322754, "rewards/margins": 7.029627799987793, "rewards/rejected": -5.635721206665039, "step": 3151 }, { "epoch": 0.82, "grad_norm": 25.99704933166504, "kl": 0.0, "learning_rate": 8.754252813399634e-08, "logps/chosen": -290.52667236328125, "logps/rejected": -226.89105224609375, "loss": 0.1421, "rewards/chosen": 1.078086018562317, "rewards/margins": 5.805100917816162, "rewards/rejected": -4.727015018463135, "step": 3152 }, { "epoch": 0.83, "grad_norm": 33.87999725341797, "kl": 0.0, "learning_rate": 8.741167233708453e-08, "logps/chosen": -184.73353576660156, "logps/rejected": -241.3988037109375, "loss": 0.2319, "rewards/chosen": 1.0597847700119019, "rewards/margins": 4.234382152557373, "rewards/rejected": -3.1745972633361816, "step": 3153 }, { "epoch": 0.83, "grad_norm": 42.77838897705078, "kl": 0.0, "learning_rate": 8.728081654017272e-08, "logps/chosen": -162.68724060058594, "logps/rejected": -234.38644409179688, "loss": 0.2463, "rewards/chosen": 1.4074424505233765, "rewards/margins": 3.403412342071533, "rewards/rejected": -1.9959698915481567, "step": 3154 }, { "epoch": 0.83, "grad_norm": 29.633657455444336, "kl": 0.0, "learning_rate": 8.714996074326093e-08, "logps/chosen": -206.44908142089844, "logps/rejected": -217.1102752685547, "loss": 0.1632, "rewards/chosen": 1.5870789289474487, "rewards/margins": 4.748828887939453, "rewards/rejected": -3.161750078201294, "step": 3155 }, { "epoch": 0.83, "grad_norm": 33.112728118896484, "kl": 0.0, "learning_rate": 8.701910494634912e-08, "logps/chosen": -136.0078582763672, "logps/rejected": -235.0076904296875, "loss": 0.2412, "rewards/chosen": 1.6538118124008179, "rewards/margins": 4.994295120239258, "rewards/rejected": -3.3404834270477295, "step": 3156 }, { "epoch": 0.83, "grad_norm": 37.93458557128906, "kl": 0.0, "learning_rate": 8.688824914943732e-08, "logps/chosen": -231.58319091796875, "logps/rejected": -197.50265502929688, "loss": 0.2361, "rewards/chosen": 1.7503066062927246, "rewards/margins": 4.487408638000488, "rewards/rejected": -2.7371020317077637, "step": 3157 }, { "epoch": 0.83, "grad_norm": 45.65302276611328, "kl": 0.0, "learning_rate": 8.67573933525255e-08, "logps/chosen": -142.3944854736328, "logps/rejected": -182.17701721191406, "loss": 0.2098, "rewards/chosen": 1.1196633577346802, "rewards/margins": 4.7605438232421875, "rewards/rejected": -3.640880584716797, "step": 3158 }, { "epoch": 0.83, "grad_norm": 38.333126068115234, "kl": 0.0, "learning_rate": 8.662653755561371e-08, "logps/chosen": -209.46014404296875, "logps/rejected": -217.16123962402344, "loss": 0.2337, "rewards/chosen": -0.3171456456184387, "rewards/margins": 2.737488031387329, "rewards/rejected": -3.054633617401123, "step": 3159 }, { "epoch": 0.83, "grad_norm": 28.92974853515625, "kl": 0.0, "learning_rate": 8.64956817587019e-08, "logps/chosen": -213.71473693847656, "logps/rejected": -350.8567199707031, "loss": 0.2021, "rewards/chosen": 1.1655019521713257, "rewards/margins": 4.887118339538574, "rewards/rejected": -3.721616506576538, "step": 3160 }, { "epoch": 0.83, "grad_norm": 34.88011169433594, "kl": 0.0, "learning_rate": 8.63648259617901e-08, "logps/chosen": -208.52581787109375, "logps/rejected": -267.2923889160156, "loss": 0.2536, "rewards/chosen": 1.8001902103424072, "rewards/margins": 5.021884918212891, "rewards/rejected": -3.2216944694519043, "step": 3161 }, { "epoch": 0.83, "grad_norm": 37.72634506225586, "kl": 0.0, "learning_rate": 8.62339701648783e-08, "logps/chosen": -271.94342041015625, "logps/rejected": -235.6877899169922, "loss": 0.2743, "rewards/chosen": 1.3427650928497314, "rewards/margins": 5.064581871032715, "rewards/rejected": -3.7218167781829834, "step": 3162 }, { "epoch": 0.83, "grad_norm": 31.424816131591797, "kl": 0.0, "learning_rate": 8.61031143679665e-08, "logps/chosen": -225.39549255371094, "logps/rejected": -277.0132141113281, "loss": 0.2178, "rewards/chosen": 1.022281527519226, "rewards/margins": 4.791353702545166, "rewards/rejected": -3.7690722942352295, "step": 3163 }, { "epoch": 0.83, "grad_norm": 47.377498626708984, "kl": 0.0, "learning_rate": 8.597225857105469e-08, "logps/chosen": -183.3324737548828, "logps/rejected": -308.13092041015625, "loss": 0.2461, "rewards/chosen": 1.7639703750610352, "rewards/margins": 6.307750225067139, "rewards/rejected": -4.5437798500061035, "step": 3164 }, { "epoch": 0.83, "grad_norm": 26.83221435546875, "kl": 0.0, "learning_rate": 8.58414027741429e-08, "logps/chosen": -236.23025512695312, "logps/rejected": -288.96246337890625, "loss": 0.1597, "rewards/chosen": 1.581776738166809, "rewards/margins": 7.115726470947266, "rewards/rejected": -5.533949851989746, "step": 3165 }, { "epoch": 0.83, "grad_norm": 33.347965240478516, "kl": 0.0, "learning_rate": 8.571054697723108e-08, "logps/chosen": -196.5367889404297, "logps/rejected": -342.7577819824219, "loss": 0.1677, "rewards/chosen": 1.9373396635055542, "rewards/margins": 6.118312358856201, "rewards/rejected": -4.180972576141357, "step": 3166 }, { "epoch": 0.83, "grad_norm": 49.83609390258789, "kl": 0.0, "learning_rate": 8.557969118031928e-08, "logps/chosen": -233.40382385253906, "logps/rejected": -167.3038330078125, "loss": 0.2879, "rewards/chosen": 2.240593433380127, "rewards/margins": 5.706521034240723, "rewards/rejected": -3.4659276008605957, "step": 3167 }, { "epoch": 0.83, "grad_norm": 33.22333526611328, "kl": 0.0, "learning_rate": 8.544883538340749e-08, "logps/chosen": -177.73777770996094, "logps/rejected": -172.5456085205078, "loss": 0.1696, "rewards/chosen": 2.7628121376037598, "rewards/margins": 4.378305912017822, "rewards/rejected": -1.6154937744140625, "step": 3168 }, { "epoch": 0.83, "grad_norm": 35.4307975769043, "kl": 0.0, "learning_rate": 8.531797958649568e-08, "logps/chosen": -221.61648559570312, "logps/rejected": -269.2848815917969, "loss": 0.1578, "rewards/chosen": 1.386556625366211, "rewards/margins": 5.333159923553467, "rewards/rejected": -3.946603298187256, "step": 3169 }, { "epoch": 0.83, "grad_norm": 28.6766357421875, "kl": 0.0, "learning_rate": 8.518712378958387e-08, "logps/chosen": -216.0562286376953, "logps/rejected": -271.5375671386719, "loss": 0.2548, "rewards/chosen": -1.8031291961669922, "rewards/margins": 2.5379562377929688, "rewards/rejected": -4.341085433959961, "step": 3170 }, { "epoch": 0.83, "grad_norm": 28.462066650390625, "kl": 0.0, "learning_rate": 8.505626799267208e-08, "logps/chosen": -202.88999938964844, "logps/rejected": -323.1724853515625, "loss": 0.24, "rewards/chosen": 2.3474764823913574, "rewards/margins": 6.388828277587891, "rewards/rejected": -4.041351795196533, "step": 3171 }, { "epoch": 0.83, "grad_norm": 47.872802734375, "kl": 0.0, "learning_rate": 8.492541219576027e-08, "logps/chosen": -279.97625732421875, "logps/rejected": -255.40150451660156, "loss": 0.1864, "rewards/chosen": 2.105032444000244, "rewards/margins": 5.517726898193359, "rewards/rejected": -3.4126944541931152, "step": 3172 }, { "epoch": 0.83, "grad_norm": 36.41221618652344, "kl": 0.0, "learning_rate": 8.479455639884847e-08, "logps/chosen": -246.18795776367188, "logps/rejected": -206.72479248046875, "loss": 0.3049, "rewards/chosen": 1.8870315551757812, "rewards/margins": 4.641598224639893, "rewards/rejected": -2.7545666694641113, "step": 3173 }, { "epoch": 0.83, "grad_norm": 29.25550651550293, "kl": 0.0, "learning_rate": 8.466370060193665e-08, "logps/chosen": -235.82455444335938, "logps/rejected": -185.5269775390625, "loss": 0.2105, "rewards/chosen": 2.9374399185180664, "rewards/margins": 4.32605504989624, "rewards/rejected": -1.3886151313781738, "step": 3174 }, { "epoch": 0.83, "grad_norm": 32.950496673583984, "kl": 0.0, "learning_rate": 8.453284480502486e-08, "logps/chosen": -181.21908569335938, "logps/rejected": -235.7977752685547, "loss": 0.2509, "rewards/chosen": 1.955785870552063, "rewards/margins": 4.768850803375244, "rewards/rejected": -2.8130650520324707, "step": 3175 }, { "epoch": 0.83, "grad_norm": 38.27532958984375, "kl": 0.0, "learning_rate": 8.440198900811306e-08, "logps/chosen": -173.7947998046875, "logps/rejected": -211.60086059570312, "loss": 0.3025, "rewards/chosen": 0.642396867275238, "rewards/margins": 2.6172077655792236, "rewards/rejected": -1.9748109579086304, "step": 3176 }, { "epoch": 0.83, "grad_norm": 28.526336669921875, "kl": 0.0, "learning_rate": 8.427113321120125e-08, "logps/chosen": -205.12777709960938, "logps/rejected": -226.22775268554688, "loss": 0.2181, "rewards/chosen": 2.4885826110839844, "rewards/margins": 5.492410659790039, "rewards/rejected": -3.0038280487060547, "step": 3177 }, { "epoch": 0.83, "grad_norm": 30.261062622070312, "kl": 0.0, "learning_rate": 8.414027741428945e-08, "logps/chosen": -155.34446716308594, "logps/rejected": -308.4963073730469, "loss": 0.1552, "rewards/chosen": 1.2393455505371094, "rewards/margins": 6.379164218902588, "rewards/rejected": -5.1398186683654785, "step": 3178 }, { "epoch": 0.83, "grad_norm": 51.739192962646484, "kl": 0.0, "learning_rate": 8.400942161737765e-08, "logps/chosen": -218.99700927734375, "logps/rejected": -197.27005004882812, "loss": 0.2523, "rewards/chosen": 2.2029366493225098, "rewards/margins": 4.013391017913818, "rewards/rejected": -1.8104544878005981, "step": 3179 }, { "epoch": 0.83, "grad_norm": 37.45707702636719, "kl": 0.0, "learning_rate": 8.387856582046584e-08, "logps/chosen": -247.09036254882812, "logps/rejected": -280.1000061035156, "loss": 0.2628, "rewards/chosen": -0.34871169924736023, "rewards/margins": 3.9044902324676514, "rewards/rejected": -4.253201961517334, "step": 3180 }, { "epoch": 0.83, "grad_norm": 35.54933547973633, "kl": 0.0, "learning_rate": 8.374771002355404e-08, "logps/chosen": -289.2021179199219, "logps/rejected": -234.57896423339844, "loss": 0.1353, "rewards/chosen": 2.026050329208374, "rewards/margins": 6.521479606628418, "rewards/rejected": -4.495429039001465, "step": 3181 }, { "epoch": 0.83, "grad_norm": 36.762245178222656, "kl": 0.0, "learning_rate": 8.361685422664223e-08, "logps/chosen": -194.93182373046875, "logps/rejected": -270.8766174316406, "loss": 0.2034, "rewards/chosen": -0.8206627368927002, "rewards/margins": 2.1558444499969482, "rewards/rejected": -2.9765071868896484, "step": 3182 }, { "epoch": 0.83, "grad_norm": 26.107078552246094, "kl": 0.0, "learning_rate": 8.348599842973043e-08, "logps/chosen": -207.82778930664062, "logps/rejected": -352.070068359375, "loss": 0.125, "rewards/chosen": 1.0949733257293701, "rewards/margins": 7.711993217468262, "rewards/rejected": -6.6170196533203125, "step": 3183 }, { "epoch": 0.83, "grad_norm": 21.953126907348633, "kl": 0.0, "learning_rate": 8.335514263281864e-08, "logps/chosen": -177.77029418945312, "logps/rejected": -306.3573913574219, "loss": 0.1084, "rewards/chosen": 2.5120620727539062, "rewards/margins": 7.031912803649902, "rewards/rejected": -4.519850730895996, "step": 3184 }, { "epoch": 0.83, "grad_norm": 30.031909942626953, "kl": 0.0, "learning_rate": 8.322428683590682e-08, "logps/chosen": -233.8975372314453, "logps/rejected": -269.39111328125, "loss": 0.1616, "rewards/chosen": 1.1202813386917114, "rewards/margins": 6.912496566772461, "rewards/rejected": -5.792215347290039, "step": 3185 }, { "epoch": 0.83, "grad_norm": 33.76377868652344, "kl": 0.0, "learning_rate": 8.309343103899502e-08, "logps/chosen": -234.36184692382812, "logps/rejected": -317.62469482421875, "loss": 0.0828, "rewards/chosen": 0.8755607604980469, "rewards/margins": 5.862431049346924, "rewards/rejected": -4.986870288848877, "step": 3186 }, { "epoch": 0.83, "grad_norm": 25.309988021850586, "kl": 0.0, "learning_rate": 8.296257524208323e-08, "logps/chosen": -219.3397674560547, "logps/rejected": -196.34658813476562, "loss": 0.1625, "rewards/chosen": 0.9604638814926147, "rewards/margins": 5.255552768707275, "rewards/rejected": -4.295088768005371, "step": 3187 }, { "epoch": 0.83, "grad_norm": 24.442615509033203, "kl": 0.0, "learning_rate": 8.283171944517142e-08, "logps/chosen": -199.65635681152344, "logps/rejected": -239.56736755371094, "loss": 0.2406, "rewards/chosen": 1.292781949043274, "rewards/margins": 4.967615604400635, "rewards/rejected": -3.6748335361480713, "step": 3188 }, { "epoch": 0.83, "grad_norm": 27.33976173400879, "kl": 0.0, "learning_rate": 8.27008636482596e-08, "logps/chosen": -217.6470947265625, "logps/rejected": -220.995849609375, "loss": 0.1736, "rewards/chosen": 1.8959113359451294, "rewards/margins": 5.904285907745361, "rewards/rejected": -4.0083746910095215, "step": 3189 }, { "epoch": 0.83, "grad_norm": 29.29962158203125, "kl": 0.0, "learning_rate": 8.25700078513478e-08, "logps/chosen": -264.2421875, "logps/rejected": -306.50390625, "loss": 0.2428, "rewards/chosen": 1.8079992532730103, "rewards/margins": 7.141074180603027, "rewards/rejected": -5.333075046539307, "step": 3190 }, { "epoch": 0.84, "grad_norm": 39.233707427978516, "kl": 0.0, "learning_rate": 8.243915205443601e-08, "logps/chosen": -252.67047119140625, "logps/rejected": -298.96563720703125, "loss": 0.2574, "rewards/chosen": 2.291105270385742, "rewards/margins": 5.163873195648193, "rewards/rejected": -2.872767925262451, "step": 3191 }, { "epoch": 0.84, "grad_norm": 28.222354888916016, "kl": 0.0, "learning_rate": 8.230829625752421e-08, "logps/chosen": -224.46841430664062, "logps/rejected": -273.3043518066406, "loss": 0.1879, "rewards/chosen": 1.3577877283096313, "rewards/margins": 4.819271087646484, "rewards/rejected": -3.4614834785461426, "step": 3192 }, { "epoch": 0.84, "grad_norm": 35.241722106933594, "kl": 0.0, "learning_rate": 8.217744046061239e-08, "logps/chosen": -227.81884765625, "logps/rejected": -307.7884826660156, "loss": 0.2358, "rewards/chosen": 1.4391993284225464, "rewards/margins": 5.772876739501953, "rewards/rejected": -4.333677291870117, "step": 3193 }, { "epoch": 0.84, "grad_norm": 23.206933975219727, "kl": 0.0, "learning_rate": 8.20465846637006e-08, "logps/chosen": -206.38482666015625, "logps/rejected": -225.6373291015625, "loss": 0.1811, "rewards/chosen": 1.8850288391113281, "rewards/margins": 5.365200519561768, "rewards/rejected": -3.4801716804504395, "step": 3194 }, { "epoch": 0.84, "grad_norm": 25.119455337524414, "kl": 0.0, "learning_rate": 8.19157288667888e-08, "logps/chosen": -173.25401306152344, "logps/rejected": -264.8511657714844, "loss": 0.2923, "rewards/chosen": -0.01973581314086914, "rewards/margins": 3.4208624362945557, "rewards/rejected": -3.440598249435425, "step": 3195 }, { "epoch": 0.84, "grad_norm": 31.192039489746094, "kl": 0.0, "learning_rate": 8.178487306987699e-08, "logps/chosen": -157.18247985839844, "logps/rejected": -212.47877502441406, "loss": 0.3075, "rewards/chosen": 1.0312573909759521, "rewards/margins": 3.534039258956909, "rewards/rejected": -2.502781867980957, "step": 3196 }, { "epoch": 0.84, "grad_norm": 24.054067611694336, "kl": 0.0, "learning_rate": 8.165401727296519e-08, "logps/chosen": -145.00535583496094, "logps/rejected": -198.6532745361328, "loss": 0.2582, "rewards/chosen": -0.1868196278810501, "rewards/margins": 3.4146223068237305, "rewards/rejected": -3.6014418601989746, "step": 3197 }, { "epoch": 0.84, "grad_norm": 28.315120697021484, "kl": 0.0, "learning_rate": 8.152316147605338e-08, "logps/chosen": -310.1528625488281, "logps/rejected": -249.92959594726562, "loss": 0.2273, "rewards/chosen": -0.11365220695734024, "rewards/margins": 4.29476261138916, "rewards/rejected": -4.408414840698242, "step": 3198 }, { "epoch": 0.84, "grad_norm": 30.878515243530273, "kl": 0.0, "learning_rate": 8.139230567914158e-08, "logps/chosen": -269.9025573730469, "logps/rejected": -238.60348510742188, "loss": 0.1857, "rewards/chosen": 1.1058082580566406, "rewards/margins": 5.0078125, "rewards/rejected": -3.9020044803619385, "step": 3199 }, { "epoch": 0.84, "grad_norm": 36.15157699584961, "kl": 0.0, "learning_rate": 8.126144988222979e-08, "logps/chosen": -282.2198791503906, "logps/rejected": -272.2900085449219, "loss": 0.2517, "rewards/chosen": 0.5260896682739258, "rewards/margins": 5.039384841918945, "rewards/rejected": -4.5132951736450195, "step": 3200 }, { "epoch": 0.84, "grad_norm": 27.950164794921875, "kl": 0.0, "learning_rate": 8.113059408531797e-08, "logps/chosen": -236.32823181152344, "logps/rejected": -230.14259338378906, "loss": 0.1538, "rewards/chosen": 3.0094211101531982, "rewards/margins": 7.373923301696777, "rewards/rejected": -4.364501953125, "step": 3201 }, { "epoch": 0.84, "grad_norm": 38.57584762573242, "kl": 0.0, "learning_rate": 8.099973828840617e-08, "logps/chosen": -192.27294921875, "logps/rejected": -292.67828369140625, "loss": 0.1894, "rewards/chosen": 1.4443457126617432, "rewards/margins": 6.036561965942383, "rewards/rejected": -4.5922160148620605, "step": 3202 }, { "epoch": 0.84, "grad_norm": 38.501190185546875, "kl": 0.0, "learning_rate": 8.086888249149438e-08, "logps/chosen": -198.6505126953125, "logps/rejected": -278.82122802734375, "loss": 0.231, "rewards/chosen": 1.5485109090805054, "rewards/margins": 4.2544755935668945, "rewards/rejected": -2.7059645652770996, "step": 3203 }, { "epoch": 0.84, "grad_norm": 31.10076141357422, "kl": 0.0, "learning_rate": 8.073802669458257e-08, "logps/chosen": -126.0052490234375, "logps/rejected": -273.44488525390625, "loss": 0.2894, "rewards/chosen": 0.5634855031967163, "rewards/margins": 4.985409259796143, "rewards/rejected": -4.421923637390137, "step": 3204 }, { "epoch": 0.84, "grad_norm": 25.549251556396484, "kl": 0.0, "learning_rate": 8.060717089767076e-08, "logps/chosen": -222.45462036132812, "logps/rejected": -254.40975952148438, "loss": 0.2332, "rewards/chosen": 0.2601969540119171, "rewards/margins": 3.998560905456543, "rewards/rejected": -3.7383639812469482, "step": 3205 }, { "epoch": 0.84, "grad_norm": 33.01713562011719, "kl": 0.0, "learning_rate": 8.047631510075895e-08, "logps/chosen": -177.3705596923828, "logps/rejected": -257.90863037109375, "loss": 0.2629, "rewards/chosen": 0.653936505317688, "rewards/margins": 4.622261047363281, "rewards/rejected": -3.9683244228363037, "step": 3206 }, { "epoch": 0.84, "grad_norm": 38.29364776611328, "kl": 0.0, "learning_rate": 8.034545930384716e-08, "logps/chosen": -144.2410125732422, "logps/rejected": -287.2203674316406, "loss": 0.2483, "rewards/chosen": 0.690515398979187, "rewards/margins": 6.036245822906494, "rewards/rejected": -5.345730304718018, "step": 3207 }, { "epoch": 0.84, "grad_norm": 35.30019760131836, "kl": 0.0, "learning_rate": 8.021460350693536e-08, "logps/chosen": -192.75279235839844, "logps/rejected": -280.9093017578125, "loss": 0.2415, "rewards/chosen": 1.6136605739593506, "rewards/margins": 4.508999824523926, "rewards/rejected": -2.8953394889831543, "step": 3208 }, { "epoch": 0.84, "grad_norm": 30.80791473388672, "kl": 0.0, "learning_rate": 8.008374771002354e-08, "logps/chosen": -219.2487335205078, "logps/rejected": -264.60992431640625, "loss": 0.3327, "rewards/chosen": 0.5078659057617188, "rewards/margins": 4.614754676818848, "rewards/rejected": -4.106888771057129, "step": 3209 }, { "epoch": 0.84, "grad_norm": 28.3983154296875, "kl": 0.0, "learning_rate": 7.995289191311175e-08, "logps/chosen": -202.18222045898438, "logps/rejected": -248.34375, "loss": 0.153, "rewards/chosen": 2.136230707168579, "rewards/margins": 6.461572647094727, "rewards/rejected": -4.325342178344727, "step": 3210 }, { "epoch": 0.84, "grad_norm": 32.261634826660156, "kl": 0.0, "learning_rate": 7.982203611619995e-08, "logps/chosen": -181.50257873535156, "logps/rejected": -274.13519287109375, "loss": 0.2007, "rewards/chosen": 1.8369464874267578, "rewards/margins": 4.2242255210876465, "rewards/rejected": -2.3872790336608887, "step": 3211 }, { "epoch": 0.84, "grad_norm": 31.96755599975586, "kl": 0.0, "learning_rate": 7.969118031928813e-08, "logps/chosen": -151.80557250976562, "logps/rejected": -224.90036010742188, "loss": 0.4, "rewards/chosen": 0.3806571364402771, "rewards/margins": 2.918895721435547, "rewards/rejected": -2.538238525390625, "step": 3212 }, { "epoch": 0.84, "grad_norm": 27.303483963012695, "kl": 0.0, "learning_rate": 7.956032452237634e-08, "logps/chosen": -276.4804992675781, "logps/rejected": -318.6486511230469, "loss": 0.1806, "rewards/chosen": 0.7546684145927429, "rewards/margins": 6.069582462310791, "rewards/rejected": -5.314914226531982, "step": 3213 }, { "epoch": 0.84, "grad_norm": 51.559303283691406, "kl": 0.0, "learning_rate": 7.942946872546454e-08, "logps/chosen": -243.2832794189453, "logps/rejected": -192.78231811523438, "loss": 0.3287, "rewards/chosen": 1.5706136226654053, "rewards/margins": 4.6659345626831055, "rewards/rejected": -3.0953211784362793, "step": 3214 }, { "epoch": 0.84, "grad_norm": 38.36349868774414, "kl": 0.0, "learning_rate": 7.929861292855273e-08, "logps/chosen": -134.96987915039062, "logps/rejected": -270.34259033203125, "loss": 0.1782, "rewards/chosen": 2.149251937866211, "rewards/margins": 4.776583194732666, "rewards/rejected": -2.627331256866455, "step": 3215 }, { "epoch": 0.84, "grad_norm": 29.82472038269043, "kl": 0.0, "learning_rate": 7.916775713164093e-08, "logps/chosen": -255.9649200439453, "logps/rejected": -271.6165771484375, "loss": 0.2529, "rewards/chosen": 1.3246444463729858, "rewards/margins": 6.542586803436279, "rewards/rejected": -5.217942237854004, "step": 3216 }, { "epoch": 0.84, "grad_norm": 37.64690399169922, "kl": 0.0, "learning_rate": 7.903690133472912e-08, "logps/chosen": -192.82557678222656, "logps/rejected": -246.595947265625, "loss": 0.3331, "rewards/chosen": 1.6345527172088623, "rewards/margins": 5.916933059692383, "rewards/rejected": -4.282380104064941, "step": 3217 }, { "epoch": 0.84, "grad_norm": 28.811220169067383, "kl": 0.0, "learning_rate": 7.890604553781732e-08, "logps/chosen": -160.36233520507812, "logps/rejected": -200.77005004882812, "loss": 0.235, "rewards/chosen": 1.1709307432174683, "rewards/margins": 4.160086631774902, "rewards/rejected": -2.9891560077667236, "step": 3218 }, { "epoch": 0.84, "grad_norm": 44.311737060546875, "kl": 0.0, "learning_rate": 7.877518974090553e-08, "logps/chosen": -139.6275634765625, "logps/rejected": -238.56863403320312, "loss": 0.3518, "rewards/chosen": 0.10801827907562256, "rewards/margins": 2.9018564224243164, "rewards/rejected": -2.7938380241394043, "step": 3219 }, { "epoch": 0.84, "grad_norm": 31.64383316040039, "kl": 0.0, "learning_rate": 7.864433394399371e-08, "logps/chosen": -140.96168518066406, "logps/rejected": -304.3995666503906, "loss": 0.2882, "rewards/chosen": 0.13699030876159668, "rewards/margins": 8.668340682983398, "rewards/rejected": -8.531350135803223, "step": 3220 }, { "epoch": 0.84, "grad_norm": 35.97218322753906, "kl": 0.0, "learning_rate": 7.851347814708191e-08, "logps/chosen": -193.40135192871094, "logps/rejected": -165.84324645996094, "loss": 0.292, "rewards/chosen": 2.2008094787597656, "rewards/margins": 4.323521614074707, "rewards/rejected": -2.1227123737335205, "step": 3221 }, { "epoch": 0.84, "grad_norm": 34.96503829956055, "kl": 0.0, "learning_rate": 7.838262235017012e-08, "logps/chosen": -293.71673583984375, "logps/rejected": -264.406982421875, "loss": 0.2792, "rewards/chosen": 1.4762459993362427, "rewards/margins": 5.221151828765869, "rewards/rejected": -3.744905948638916, "step": 3222 }, { "epoch": 0.84, "grad_norm": 32.15884017944336, "kl": 0.0, "learning_rate": 7.825176655325831e-08, "logps/chosen": -148.90626525878906, "logps/rejected": -291.37860107421875, "loss": 0.2933, "rewards/chosen": -0.11567753553390503, "rewards/margins": 3.3869001865386963, "rewards/rejected": -3.502577781677246, "step": 3223 }, { "epoch": 0.84, "grad_norm": 27.66660499572754, "kl": 0.0, "learning_rate": 7.81209107563465e-08, "logps/chosen": -199.25677490234375, "logps/rejected": -200.94052124023438, "loss": 0.2217, "rewards/chosen": 1.435728907585144, "rewards/margins": 5.693033218383789, "rewards/rejected": -4.2573041915893555, "step": 3224 }, { "epoch": 0.84, "grad_norm": 27.243793487548828, "kl": 0.0, "learning_rate": 7.799005495943469e-08, "logps/chosen": -270.1687927246094, "logps/rejected": -238.72988891601562, "loss": 0.1522, "rewards/chosen": 2.963019609451294, "rewards/margins": 6.563299655914307, "rewards/rejected": -3.6002800464630127, "step": 3225 }, { "epoch": 0.84, "grad_norm": 24.84229850769043, "kl": 0.0, "learning_rate": 7.78591991625229e-08, "logps/chosen": -196.7878875732422, "logps/rejected": -330.93072509765625, "loss": 0.233, "rewards/chosen": 2.8932948112487793, "rewards/margins": 6.86652946472168, "rewards/rejected": -3.9732346534729004, "step": 3226 }, { "epoch": 0.84, "grad_norm": 31.127796173095703, "kl": 0.0, "learning_rate": 7.77283433656111e-08, "logps/chosen": -198.16998291015625, "logps/rejected": -208.06605529785156, "loss": 0.2182, "rewards/chosen": -1.5927191972732544, "rewards/margins": 2.7119503021240234, "rewards/rejected": -4.304669380187988, "step": 3227 }, { "epoch": 0.84, "grad_norm": 36.46774673461914, "kl": 0.0, "learning_rate": 7.759748756869928e-08, "logps/chosen": -239.41961669921875, "logps/rejected": -234.93087768554688, "loss": 0.223, "rewards/chosen": 2.2043843269348145, "rewards/margins": 4.640742301940918, "rewards/rejected": -2.4363577365875244, "step": 3228 }, { "epoch": 0.85, "grad_norm": 35.70278549194336, "kl": 0.0, "learning_rate": 7.746663177178749e-08, "logps/chosen": -233.97518920898438, "logps/rejected": -235.1247100830078, "loss": 0.1729, "rewards/chosen": 2.1642262935638428, "rewards/margins": 4.9302520751953125, "rewards/rejected": -2.7660255432128906, "step": 3229 }, { "epoch": 0.85, "grad_norm": 33.79775619506836, "kl": 0.0, "learning_rate": 7.733577597487569e-08, "logps/chosen": -186.15023803710938, "logps/rejected": -248.98333740234375, "loss": 0.2755, "rewards/chosen": 0.7999600172042847, "rewards/margins": 5.275784969329834, "rewards/rejected": -4.47582483291626, "step": 3230 }, { "epoch": 0.85, "grad_norm": 30.210243225097656, "kl": 0.0, "learning_rate": 7.720492017796388e-08, "logps/chosen": -284.4582824707031, "logps/rejected": -281.6883544921875, "loss": 0.2504, "rewards/chosen": 1.0357283353805542, "rewards/margins": 5.805393218994141, "rewards/rejected": -4.769664764404297, "step": 3231 }, { "epoch": 0.85, "grad_norm": 27.326990127563477, "kl": 0.0, "learning_rate": 7.707406438105208e-08, "logps/chosen": -143.6712188720703, "logps/rejected": -282.7164306640625, "loss": 0.317, "rewards/chosen": 0.5529003739356995, "rewards/margins": 4.33873176574707, "rewards/rejected": -3.7858314514160156, "step": 3232 }, { "epoch": 0.85, "grad_norm": 35.79087448120117, "kl": 0.0, "learning_rate": 7.694320858414027e-08, "logps/chosen": -196.32203674316406, "logps/rejected": -246.3926544189453, "loss": 0.2145, "rewards/chosen": 1.676134467124939, "rewards/margins": 6.369762897491455, "rewards/rejected": -4.693628311157227, "step": 3233 }, { "epoch": 0.85, "grad_norm": 31.284025192260742, "kl": 0.0, "learning_rate": 7.681235278722847e-08, "logps/chosen": -155.36402893066406, "logps/rejected": -301.85418701171875, "loss": 0.3093, "rewards/chosen": 0.6385982036590576, "rewards/margins": 3.672419309616089, "rewards/rejected": -3.0338211059570312, "step": 3234 }, { "epoch": 0.85, "grad_norm": 24.24800682067871, "kl": 0.0, "learning_rate": 7.668149699031668e-08, "logps/chosen": -172.85191345214844, "logps/rejected": -227.68849182128906, "loss": 0.1207, "rewards/chosen": 3.319450616836548, "rewards/margins": 6.2267866134643555, "rewards/rejected": -2.9073362350463867, "step": 3235 }, { "epoch": 0.85, "grad_norm": 34.73143005371094, "kl": 0.0, "learning_rate": 7.655064119340486e-08, "logps/chosen": -130.93258666992188, "logps/rejected": -256.797119140625, "loss": 0.3582, "rewards/chosen": -0.7385777235031128, "rewards/margins": 4.073555946350098, "rewards/rejected": -4.8121337890625, "step": 3236 }, { "epoch": 0.85, "grad_norm": 38.271507263183594, "kl": 0.0, "learning_rate": 7.641978539649306e-08, "logps/chosen": -225.26898193359375, "logps/rejected": -152.4866180419922, "loss": 0.3053, "rewards/chosen": 2.025780439376831, "rewards/margins": 4.564108371734619, "rewards/rejected": -2.538327932357788, "step": 3237 }, { "epoch": 0.85, "grad_norm": 32.1751708984375, "kl": 0.0, "learning_rate": 7.628892959958127e-08, "logps/chosen": -260.02423095703125, "logps/rejected": -164.86642456054688, "loss": 0.2417, "rewards/chosen": -0.08353383094072342, "rewards/margins": 2.7393460273742676, "rewards/rejected": -2.8228797912597656, "step": 3238 }, { "epoch": 0.85, "grad_norm": 29.464027404785156, "kl": 0.0, "learning_rate": 7.615807380266945e-08, "logps/chosen": -215.1149139404297, "logps/rejected": -328.502197265625, "loss": 0.3162, "rewards/chosen": 1.711369514465332, "rewards/margins": 7.5639872550964355, "rewards/rejected": -5.8526177406311035, "step": 3239 }, { "epoch": 0.85, "grad_norm": 29.200777053833008, "kl": 0.0, "learning_rate": 7.602721800575765e-08, "logps/chosen": -279.73236083984375, "logps/rejected": -186.07730102539062, "loss": 0.2846, "rewards/chosen": -0.08167314529418945, "rewards/margins": 3.102032423019409, "rewards/rejected": -3.1837055683135986, "step": 3240 }, { "epoch": 0.85, "grad_norm": 20.829153060913086, "kl": 0.0, "learning_rate": 7.589636220884584e-08, "logps/chosen": -168.58822631835938, "logps/rejected": -271.2360534667969, "loss": 0.1802, "rewards/chosen": 2.281903028488159, "rewards/margins": 4.783234596252441, "rewards/rejected": -2.501331329345703, "step": 3241 }, { "epoch": 0.85, "grad_norm": 28.79102325439453, "kl": 0.0, "learning_rate": 7.576550641193405e-08, "logps/chosen": -145.00428771972656, "logps/rejected": -232.7147979736328, "loss": 0.1955, "rewards/chosen": 1.831176996231079, "rewards/margins": 5.964174270629883, "rewards/rejected": -4.132997035980225, "step": 3242 }, { "epoch": 0.85, "grad_norm": 45.27934265136719, "kl": 0.0, "learning_rate": 7.563465061502223e-08, "logps/chosen": -227.49392700195312, "logps/rejected": -241.43299865722656, "loss": 0.3037, "rewards/chosen": 0.29154136776924133, "rewards/margins": 2.1040217876434326, "rewards/rejected": -1.8124803304672241, "step": 3243 }, { "epoch": 0.85, "grad_norm": 38.81378936767578, "kl": 0.0, "learning_rate": 7.550379481811043e-08, "logps/chosen": -193.5105438232422, "logps/rejected": -267.61676025390625, "loss": 0.2176, "rewards/chosen": 1.1929868459701538, "rewards/margins": 4.8053669929504395, "rewards/rejected": -3.612380266189575, "step": 3244 }, { "epoch": 0.85, "grad_norm": 31.02093505859375, "kl": 0.0, "learning_rate": 7.537293902119864e-08, "logps/chosen": -164.437255859375, "logps/rejected": -237.98223876953125, "loss": 0.2863, "rewards/chosen": 0.6644800901412964, "rewards/margins": 3.926863670349121, "rewards/rejected": -3.262383460998535, "step": 3245 }, { "epoch": 0.85, "grad_norm": 31.176836013793945, "kl": 0.0, "learning_rate": 7.524208322428684e-08, "logps/chosen": -250.9615936279297, "logps/rejected": -214.7868194580078, "loss": 0.1822, "rewards/chosen": 1.7779932022094727, "rewards/margins": 6.728030681610107, "rewards/rejected": -4.950037479400635, "step": 3246 }, { "epoch": 0.85, "grad_norm": 26.141632080078125, "kl": 0.0, "learning_rate": 7.511122742737502e-08, "logps/chosen": -206.78175354003906, "logps/rejected": -313.44171142578125, "loss": 0.2266, "rewards/chosen": 1.8484094142913818, "rewards/margins": 7.116808891296387, "rewards/rejected": -5.268399238586426, "step": 3247 }, { "epoch": 0.85, "grad_norm": 40.501155853271484, "kl": 0.0, "learning_rate": 7.498037163046323e-08, "logps/chosen": -258.9842529296875, "logps/rejected": -315.1947021484375, "loss": 0.2318, "rewards/chosen": 1.151440978050232, "rewards/margins": 7.068854331970215, "rewards/rejected": -5.917413234710693, "step": 3248 }, { "epoch": 0.85, "grad_norm": 37.80188751220703, "kl": 0.0, "learning_rate": 7.484951583355142e-08, "logps/chosen": -222.5956268310547, "logps/rejected": -309.52630615234375, "loss": 0.2064, "rewards/chosen": 1.6434623003005981, "rewards/margins": 4.907730579376221, "rewards/rejected": -3.264268398284912, "step": 3249 }, { "epoch": 0.85, "grad_norm": 43.29213333129883, "kl": 0.0, "learning_rate": 7.471866003663962e-08, "logps/chosen": -209.0088348388672, "logps/rejected": -259.30157470703125, "loss": 0.2872, "rewards/chosen": 1.5399799346923828, "rewards/margins": 4.152887344360352, "rewards/rejected": -2.6129074096679688, "step": 3250 }, { "epoch": 0.85, "grad_norm": 35.9322624206543, "kl": 0.0, "learning_rate": 7.458780423972782e-08, "logps/chosen": -174.74900817871094, "logps/rejected": -288.7847900390625, "loss": 0.2254, "rewards/chosen": 1.1389139890670776, "rewards/margins": 5.006952285766602, "rewards/rejected": -3.8680381774902344, "step": 3251 }, { "epoch": 0.85, "grad_norm": 35.05708694458008, "kl": 0.0, "learning_rate": 7.445694844281601e-08, "logps/chosen": -212.56893920898438, "logps/rejected": -207.61000061035156, "loss": 0.2539, "rewards/chosen": 1.387547492980957, "rewards/margins": 4.24867057800293, "rewards/rejected": -2.8611228466033936, "step": 3252 }, { "epoch": 0.85, "grad_norm": 28.2816104888916, "kl": 0.0, "learning_rate": 7.432609264590421e-08, "logps/chosen": -194.02349853515625, "logps/rejected": -223.91067504882812, "loss": 0.2686, "rewards/chosen": 1.438894510269165, "rewards/margins": 4.563453197479248, "rewards/rejected": -3.124558687210083, "step": 3253 }, { "epoch": 0.85, "grad_norm": 38.05796432495117, "kl": 0.0, "learning_rate": 7.419523684899242e-08, "logps/chosen": -141.80540466308594, "logps/rejected": -262.2084045410156, "loss": 0.2656, "rewards/chosen": 0.646595299243927, "rewards/margins": 3.703841209411621, "rewards/rejected": -3.057245969772339, "step": 3254 }, { "epoch": 0.85, "grad_norm": 29.759531021118164, "kl": 0.0, "learning_rate": 7.40643810520806e-08, "logps/chosen": -231.3519744873047, "logps/rejected": -239.27000427246094, "loss": 0.1509, "rewards/chosen": 1.9064635038375854, "rewards/margins": 5.785111427307129, "rewards/rejected": -3.878648042678833, "step": 3255 }, { "epoch": 0.85, "grad_norm": 31.95449447631836, "kl": 0.0, "learning_rate": 7.39335252551688e-08, "logps/chosen": -179.57452392578125, "logps/rejected": -195.614501953125, "loss": 0.2422, "rewards/chosen": 0.5285756587982178, "rewards/margins": 3.189023733139038, "rewards/rejected": -2.6604480743408203, "step": 3256 }, { "epoch": 0.85, "grad_norm": 33.531883239746094, "kl": 0.0, "learning_rate": 7.3802669458257e-08, "logps/chosen": -145.7747039794922, "logps/rejected": -197.13243103027344, "loss": 0.1107, "rewards/chosen": 2.2670633792877197, "rewards/margins": 5.13998556137085, "rewards/rejected": -2.87292218208313, "step": 3257 }, { "epoch": 0.85, "grad_norm": 28.455995559692383, "kl": 0.0, "learning_rate": 7.36718136613452e-08, "logps/chosen": -203.78224182128906, "logps/rejected": -243.5115203857422, "loss": 0.2562, "rewards/chosen": 1.287809133529663, "rewards/margins": 4.950334548950195, "rewards/rejected": -3.6625256538391113, "step": 3258 }, { "epoch": 0.85, "grad_norm": 37.13747787475586, "kl": 0.0, "learning_rate": 7.354095786443339e-08, "logps/chosen": -271.0595397949219, "logps/rejected": -266.4194030761719, "loss": 0.2399, "rewards/chosen": 0.8473371863365173, "rewards/margins": 4.8914594650268555, "rewards/rejected": -4.044122219085693, "step": 3259 }, { "epoch": 0.85, "grad_norm": 40.32930374145508, "kl": 0.0, "learning_rate": 7.341010206752158e-08, "logps/chosen": -184.6184844970703, "logps/rejected": -306.64208984375, "loss": 0.204, "rewards/chosen": 2.8387949466705322, "rewards/margins": 6.3892598152160645, "rewards/rejected": -3.5504648685455322, "step": 3260 }, { "epoch": 0.85, "grad_norm": 33.87854766845703, "kl": 0.0, "learning_rate": 7.327924627060979e-08, "logps/chosen": -162.88327026367188, "logps/rejected": -258.2253723144531, "loss": 0.2213, "rewards/chosen": -1.2969216108322144, "rewards/margins": 1.792932391166687, "rewards/rejected": -3.0898540019989014, "step": 3261 }, { "epoch": 0.85, "grad_norm": 34.421268463134766, "kl": 0.0, "learning_rate": 7.314839047369799e-08, "logps/chosen": -165.06387329101562, "logps/rejected": -290.7898254394531, "loss": 0.26, "rewards/chosen": 0.41494685411453247, "rewards/margins": 3.6931369304656982, "rewards/rejected": -3.2781901359558105, "step": 3262 }, { "epoch": 0.85, "grad_norm": 26.321603775024414, "kl": 0.0, "learning_rate": 7.301753467678617e-08, "logps/chosen": -157.8191680908203, "logps/rejected": -317.366943359375, "loss": 0.2229, "rewards/chosen": 1.980485439300537, "rewards/margins": 7.327960968017578, "rewards/rejected": -5.347475528717041, "step": 3263 }, { "epoch": 0.85, "grad_norm": 34.1356086730957, "kl": 0.0, "learning_rate": 7.288667887987438e-08, "logps/chosen": -121.9466552734375, "logps/rejected": -367.5460510253906, "loss": 0.195, "rewards/chosen": 0.6263151168823242, "rewards/margins": 9.286596298217773, "rewards/rejected": -8.66028118133545, "step": 3264 }, { "epoch": 0.85, "grad_norm": 29.905941009521484, "kl": 0.0, "learning_rate": 7.275582308296258e-08, "logps/chosen": -166.04644775390625, "logps/rejected": -160.56764221191406, "loss": 0.1754, "rewards/chosen": 1.7765103578567505, "rewards/margins": 5.152238368988037, "rewards/rejected": -3.375727891921997, "step": 3265 }, { "epoch": 0.85, "grad_norm": 31.1649169921875, "kl": 0.0, "learning_rate": 7.262496728605077e-08, "logps/chosen": -232.14187622070312, "logps/rejected": -367.91448974609375, "loss": 0.2476, "rewards/chosen": 0.11659705638885498, "rewards/margins": 4.91973876953125, "rewards/rejected": -4.8031415939331055, "step": 3266 }, { "epoch": 0.86, "grad_norm": 30.761837005615234, "kl": 0.0, "learning_rate": 7.249411148913897e-08, "logps/chosen": -301.6856994628906, "logps/rejected": -214.73681640625, "loss": 0.1979, "rewards/chosen": 1.8630826473236084, "rewards/margins": 4.209121227264404, "rewards/rejected": -2.346038579940796, "step": 3267 }, { "epoch": 0.86, "grad_norm": 33.79634094238281, "kl": 0.0, "learning_rate": 7.236325569222716e-08, "logps/chosen": -221.21762084960938, "logps/rejected": -207.97463989257812, "loss": 0.2969, "rewards/chosen": 0.7912971377372742, "rewards/margins": 6.043848514556885, "rewards/rejected": -5.252551555633545, "step": 3268 }, { "epoch": 0.86, "grad_norm": 26.53761863708496, "kl": 0.0, "learning_rate": 7.223239989531536e-08, "logps/chosen": -295.77362060546875, "logps/rejected": -300.87371826171875, "loss": 0.1873, "rewards/chosen": 4.822590351104736, "rewards/margins": 8.884696960449219, "rewards/rejected": -4.062106609344482, "step": 3269 }, { "epoch": 0.86, "grad_norm": 30.040515899658203, "kl": 0.0, "learning_rate": 7.210154409840356e-08, "logps/chosen": -157.37391662597656, "logps/rejected": -169.13473510742188, "loss": 0.2455, "rewards/chosen": 0.8005948066711426, "rewards/margins": 3.58113694190979, "rewards/rejected": -2.7805421352386475, "step": 3270 }, { "epoch": 0.86, "grad_norm": 39.841590881347656, "kl": 0.0, "learning_rate": 7.197068830149175e-08, "logps/chosen": -207.9867706298828, "logps/rejected": -267.37017822265625, "loss": 0.3974, "rewards/chosen": 0.5954592227935791, "rewards/margins": 2.9090476036071777, "rewards/rejected": -2.3135883808135986, "step": 3271 }, { "epoch": 0.86, "grad_norm": 26.90143585205078, "kl": 0.0, "learning_rate": 7.183983250457995e-08, "logps/chosen": -143.56216430664062, "logps/rejected": -245.6182403564453, "loss": 0.2192, "rewards/chosen": 0.7064897418022156, "rewards/margins": 5.181339740753174, "rewards/rejected": -4.474850177764893, "step": 3272 }, { "epoch": 0.86, "grad_norm": 39.29250717163086, "kl": 0.0, "learning_rate": 7.170897670766814e-08, "logps/chosen": -231.05426025390625, "logps/rejected": -326.5909118652344, "loss": 0.353, "rewards/chosen": -0.5797771215438843, "rewards/margins": 3.835221767425537, "rewards/rejected": -4.414999008178711, "step": 3273 }, { "epoch": 0.86, "grad_norm": 26.03618621826172, "kl": 0.0, "learning_rate": 7.157812091075634e-08, "logps/chosen": -222.4735870361328, "logps/rejected": -200.89833068847656, "loss": 0.3105, "rewards/chosen": -0.4272328019142151, "rewards/margins": 2.4853272438049316, "rewards/rejected": -2.912559986114502, "step": 3274 }, { "epoch": 0.86, "grad_norm": 32.362457275390625, "kl": 0.0, "learning_rate": 7.144726511384454e-08, "logps/chosen": -306.9471130371094, "logps/rejected": -284.0693359375, "loss": 0.23, "rewards/chosen": 0.10507598519325256, "rewards/margins": 3.6749560832977295, "rewards/rejected": -3.5698800086975098, "step": 3275 }, { "epoch": 0.86, "grad_norm": 22.9937744140625, "kl": 0.0, "learning_rate": 7.131640931693273e-08, "logps/chosen": -204.49484252929688, "logps/rejected": -254.06138610839844, "loss": 0.2437, "rewards/chosen": 0.7712520360946655, "rewards/margins": 5.286528587341309, "rewards/rejected": -4.5152764320373535, "step": 3276 }, { "epoch": 0.86, "grad_norm": 39.88124465942383, "kl": 0.0, "learning_rate": 7.118555352002094e-08, "logps/chosen": -245.06753540039062, "logps/rejected": -268.346923828125, "loss": 0.2285, "rewards/chosen": 2.043609142303467, "rewards/margins": 6.61504602432251, "rewards/rejected": -4.571436882019043, "step": 3277 }, { "epoch": 0.86, "grad_norm": 36.58185577392578, "kl": 0.0, "learning_rate": 7.105469772310912e-08, "logps/chosen": -263.1695251464844, "logps/rejected": -236.58145141601562, "loss": 0.1717, "rewards/chosen": 0.9724127650260925, "rewards/margins": 6.123079776763916, "rewards/rejected": -5.150667190551758, "step": 3278 }, { "epoch": 0.86, "grad_norm": 37.39385986328125, "kl": 0.0, "learning_rate": 7.092384192619732e-08, "logps/chosen": -216.4535369873047, "logps/rejected": -157.31016540527344, "loss": 0.1617, "rewards/chosen": 2.5863406658172607, "rewards/margins": 5.75681209564209, "rewards/rejected": -3.17047119140625, "step": 3279 }, { "epoch": 0.86, "grad_norm": 33.63765335083008, "kl": 0.0, "learning_rate": 7.079298612928553e-08, "logps/chosen": -185.89892578125, "logps/rejected": -231.67864990234375, "loss": 0.3398, "rewards/chosen": 0.6431490778923035, "rewards/margins": 4.488551616668701, "rewards/rejected": -3.845402717590332, "step": 3280 }, { "epoch": 0.86, "grad_norm": 37.0649528503418, "kl": 0.0, "learning_rate": 7.066213033237373e-08, "logps/chosen": -154.55319213867188, "logps/rejected": -238.2505340576172, "loss": 0.3235, "rewards/chosen": 0.45910343527793884, "rewards/margins": 4.324784278869629, "rewards/rejected": -3.8656809329986572, "step": 3281 }, { "epoch": 0.86, "grad_norm": 35.96867370605469, "kl": 0.0, "learning_rate": 7.053127453546191e-08, "logps/chosen": -263.711181640625, "logps/rejected": -323.4737243652344, "loss": 0.1867, "rewards/chosen": 1.104454755783081, "rewards/margins": 6.848848342895508, "rewards/rejected": -5.744393825531006, "step": 3282 }, { "epoch": 0.86, "grad_norm": 39.129539489746094, "kl": 0.0, "learning_rate": 7.040041873855012e-08, "logps/chosen": -151.68955993652344, "logps/rejected": -177.42762756347656, "loss": 0.2207, "rewards/chosen": 2.1430304050445557, "rewards/margins": 5.147785663604736, "rewards/rejected": -3.0047552585601807, "step": 3283 }, { "epoch": 0.86, "grad_norm": 28.86467742919922, "kl": 0.0, "learning_rate": 7.026956294163831e-08, "logps/chosen": -250.60304260253906, "logps/rejected": -249.148681640625, "loss": 0.1399, "rewards/chosen": 2.5337722301483154, "rewards/margins": 6.411539077758789, "rewards/rejected": -3.8777670860290527, "step": 3284 }, { "epoch": 0.86, "grad_norm": 32.93992233276367, "kl": 0.0, "learning_rate": 7.013870714472651e-08, "logps/chosen": -203.44247436523438, "logps/rejected": -197.58848571777344, "loss": 0.201, "rewards/chosen": 0.29153192043304443, "rewards/margins": 5.219209671020508, "rewards/rejected": -4.927677631378174, "step": 3285 }, { "epoch": 0.86, "grad_norm": 25.979219436645508, "kl": 0.0, "learning_rate": 7.000785134781471e-08, "logps/chosen": -167.48854064941406, "logps/rejected": -310.9127197265625, "loss": 0.1946, "rewards/chosen": 1.70915949344635, "rewards/margins": 6.76434850692749, "rewards/rejected": -5.05518913269043, "step": 3286 }, { "epoch": 0.86, "grad_norm": 25.11945343017578, "kl": 0.0, "learning_rate": 6.98769955509029e-08, "logps/chosen": -223.0096893310547, "logps/rejected": -205.82083129882812, "loss": 0.2614, "rewards/chosen": 1.862809419631958, "rewards/margins": 6.054323196411133, "rewards/rejected": -4.191514015197754, "step": 3287 }, { "epoch": 0.86, "grad_norm": 42.99819564819336, "kl": 0.0, "learning_rate": 6.97461397539911e-08, "logps/chosen": -249.3183135986328, "logps/rejected": -287.7580261230469, "loss": 0.2844, "rewards/chosen": -0.017606837674975395, "rewards/margins": 3.4193594455718994, "rewards/rejected": -3.4369661808013916, "step": 3288 }, { "epoch": 0.86, "grad_norm": 28.284883499145508, "kl": 0.0, "learning_rate": 6.961528395707931e-08, "logps/chosen": -209.82473754882812, "logps/rejected": -249.59402465820312, "loss": 0.1452, "rewards/chosen": 1.9604130983352661, "rewards/margins": 7.294702053070068, "rewards/rejected": -5.334289073944092, "step": 3289 }, { "epoch": 0.86, "grad_norm": 30.614185333251953, "kl": 0.0, "learning_rate": 6.948442816016749e-08, "logps/chosen": -171.9506072998047, "logps/rejected": -178.01907348632812, "loss": 0.2271, "rewards/chosen": 1.5359193086624146, "rewards/margins": 4.613659381866455, "rewards/rejected": -3.07774019241333, "step": 3290 }, { "epoch": 0.86, "grad_norm": 41.53820037841797, "kl": 0.0, "learning_rate": 6.935357236325569e-08, "logps/chosen": -209.81532287597656, "logps/rejected": -180.5158233642578, "loss": 0.244, "rewards/chosen": 1.3790901899337769, "rewards/margins": 4.9010844230651855, "rewards/rejected": -3.5219943523406982, "step": 3291 }, { "epoch": 0.86, "grad_norm": 40.008262634277344, "kl": 0.0, "learning_rate": 6.922271656634388e-08, "logps/chosen": -97.14312744140625, "logps/rejected": -326.3070373535156, "loss": 0.2114, "rewards/chosen": 1.234908938407898, "rewards/margins": 5.508755207061768, "rewards/rejected": -4.27384614944458, "step": 3292 }, { "epoch": 0.86, "grad_norm": 35.759037017822266, "kl": 0.0, "learning_rate": 6.909186076943209e-08, "logps/chosen": -243.43714904785156, "logps/rejected": -278.0218200683594, "loss": 0.1198, "rewards/chosen": 2.5602760314941406, "rewards/margins": 5.5889482498168945, "rewards/rejected": -3.028672456741333, "step": 3293 }, { "epoch": 0.86, "grad_norm": 37.18498229980469, "kl": 0.0, "learning_rate": 6.896100497252028e-08, "logps/chosen": -257.5518493652344, "logps/rejected": -204.18756103515625, "loss": 0.2803, "rewards/chosen": 0.5463675260543823, "rewards/margins": 4.813793182373047, "rewards/rejected": -4.267425537109375, "step": 3294 }, { "epoch": 0.86, "grad_norm": 25.23471450805664, "kl": 0.0, "learning_rate": 6.883014917560847e-08, "logps/chosen": -147.47364807128906, "logps/rejected": -310.4903259277344, "loss": 0.1438, "rewards/chosen": 1.7297723293304443, "rewards/margins": 6.705340385437012, "rewards/rejected": -4.9755682945251465, "step": 3295 }, { "epoch": 0.86, "grad_norm": 31.479604721069336, "kl": 0.0, "learning_rate": 6.869929337869668e-08, "logps/chosen": -287.01898193359375, "logps/rejected": -258.6340026855469, "loss": 0.184, "rewards/chosen": 1.9719114303588867, "rewards/margins": 6.917535781860352, "rewards/rejected": -4.945624351501465, "step": 3296 }, { "epoch": 0.86, "grad_norm": 39.46041488647461, "kl": 0.0, "learning_rate": 6.856843758178486e-08, "logps/chosen": -157.99221801757812, "logps/rejected": -350.6339416503906, "loss": 0.2945, "rewards/chosen": -0.05334752798080444, "rewards/margins": 2.8886020183563232, "rewards/rejected": -2.9419496059417725, "step": 3297 }, { "epoch": 0.86, "grad_norm": 27.979738235473633, "kl": 0.0, "learning_rate": 6.843758178487306e-08, "logps/chosen": -275.239990234375, "logps/rejected": -249.71923828125, "loss": 0.2181, "rewards/chosen": 1.6366877555847168, "rewards/margins": 5.247501850128174, "rewards/rejected": -3.610814094543457, "step": 3298 }, { "epoch": 0.86, "grad_norm": 29.67488670349121, "kl": 0.0, "learning_rate": 6.830672598796127e-08, "logps/chosen": -181.911865234375, "logps/rejected": -250.30233764648438, "loss": 0.2572, "rewards/chosen": 2.5218210220336914, "rewards/margins": 5.756906032562256, "rewards/rejected": -3.2350850105285645, "step": 3299 }, { "epoch": 0.86, "grad_norm": 42.35227966308594, "kl": 0.0, "learning_rate": 6.817587019104947e-08, "logps/chosen": -224.5271453857422, "logps/rejected": -224.19619750976562, "loss": 0.3821, "rewards/chosen": 0.25506705045700073, "rewards/margins": 3.4965734481811523, "rewards/rejected": -3.241506338119507, "step": 3300 }, { "epoch": 0.86, "grad_norm": 38.902870178222656, "kl": 0.0, "learning_rate": 6.804501439413765e-08, "logps/chosen": -217.0912322998047, "logps/rejected": -236.94139099121094, "loss": 0.3333, "rewards/chosen": 0.8560259938240051, "rewards/margins": 3.4678268432617188, "rewards/rejected": -2.6118009090423584, "step": 3301 }, { "epoch": 0.86, "grad_norm": 29.101722717285156, "kl": 0.0, "learning_rate": 6.791415859722586e-08, "logps/chosen": -188.09799194335938, "logps/rejected": -235.2723846435547, "loss": 0.1716, "rewards/chosen": 2.8118784427642822, "rewards/margins": 6.987157821655273, "rewards/rejected": -4.175279140472412, "step": 3302 }, { "epoch": 0.86, "grad_norm": 53.44160079956055, "kl": 0.0, "learning_rate": 6.778330280031405e-08, "logps/chosen": -146.15121459960938, "logps/rejected": -212.4782257080078, "loss": 0.2992, "rewards/chosen": 0.34400415420532227, "rewards/margins": 2.6334946155548096, "rewards/rejected": -2.2894904613494873, "step": 3303 }, { "epoch": 0.86, "grad_norm": 33.913021087646484, "kl": 0.0, "learning_rate": 6.765244700340225e-08, "logps/chosen": -233.7433624267578, "logps/rejected": -230.5489959716797, "loss": 0.1883, "rewards/chosen": 1.3600634336471558, "rewards/margins": 5.133440971374512, "rewards/rejected": -3.7733774185180664, "step": 3304 }, { "epoch": 0.86, "grad_norm": 37.881309509277344, "kl": 0.0, "learning_rate": 6.752159120649043e-08, "logps/chosen": -187.03089904785156, "logps/rejected": -321.6445617675781, "loss": 0.3124, "rewards/chosen": 0.576844334602356, "rewards/margins": 4.5928730964660645, "rewards/rejected": -4.016028881072998, "step": 3305 }, { "epoch": 0.87, "grad_norm": 28.14802360534668, "kl": 0.0, "learning_rate": 6.739073540957864e-08, "logps/chosen": -179.93328857421875, "logps/rejected": -330.87994384765625, "loss": 0.2017, "rewards/chosen": 1.1859331130981445, "rewards/margins": 6.022317886352539, "rewards/rejected": -4.8363847732543945, "step": 3306 }, { "epoch": 0.87, "grad_norm": 30.64641761779785, "kl": 0.0, "learning_rate": 6.725987961266684e-08, "logps/chosen": -189.5594024658203, "logps/rejected": -235.72720336914062, "loss": 0.1842, "rewards/chosen": -0.34744271636009216, "rewards/margins": 3.385176420211792, "rewards/rejected": -3.732619047164917, "step": 3307 }, { "epoch": 0.87, "grad_norm": 35.5704460144043, "kl": 0.0, "learning_rate": 6.712902381575503e-08, "logps/chosen": -184.53591918945312, "logps/rejected": -285.756591796875, "loss": 0.235, "rewards/chosen": 0.026567867025732994, "rewards/margins": 5.050618648529053, "rewards/rejected": -5.024050712585449, "step": 3308 }, { "epoch": 0.87, "grad_norm": 25.643348693847656, "kl": 0.0, "learning_rate": 6.699816801884323e-08, "logps/chosen": -160.59278869628906, "logps/rejected": -255.24618530273438, "loss": 0.1446, "rewards/chosen": 1.3220984935760498, "rewards/margins": 6.726505279541016, "rewards/rejected": -5.404406547546387, "step": 3309 }, { "epoch": 0.87, "grad_norm": 25.281702041625977, "kl": 0.0, "learning_rate": 6.686731222193143e-08, "logps/chosen": -208.46261596679688, "logps/rejected": -263.46209716796875, "loss": 0.2685, "rewards/chosen": 0.5003941059112549, "rewards/margins": 3.899710178375244, "rewards/rejected": -3.3993160724639893, "step": 3310 }, { "epoch": 0.87, "grad_norm": 37.242706298828125, "kl": 0.0, "learning_rate": 6.673645642501962e-08, "logps/chosen": -240.91317749023438, "logps/rejected": -254.6995849609375, "loss": 0.2309, "rewards/chosen": 2.887584924697876, "rewards/margins": 5.6153669357299805, "rewards/rejected": -2.7277822494506836, "step": 3311 }, { "epoch": 0.87, "grad_norm": 58.118492126464844, "kl": 0.0, "learning_rate": 6.660560062810783e-08, "logps/chosen": -233.1113739013672, "logps/rejected": -321.74786376953125, "loss": 0.2546, "rewards/chosen": 1.2985846996307373, "rewards/margins": 5.554841995239258, "rewards/rejected": -4.256257057189941, "step": 3312 }, { "epoch": 0.87, "grad_norm": 41.69536590576172, "kl": 0.0, "learning_rate": 6.647474483119601e-08, "logps/chosen": -250.2501678466797, "logps/rejected": -202.795654296875, "loss": 0.306, "rewards/chosen": 0.21787656843662262, "rewards/margins": 3.9046685695648193, "rewards/rejected": -3.6867918968200684, "step": 3313 }, { "epoch": 0.87, "grad_norm": 29.167028427124023, "kl": 0.0, "learning_rate": 6.634388903428421e-08, "logps/chosen": -247.29339599609375, "logps/rejected": -244.97125244140625, "loss": 0.2146, "rewards/chosen": 1.6185340881347656, "rewards/margins": 4.635621070861816, "rewards/rejected": -3.017086982727051, "step": 3314 }, { "epoch": 0.87, "grad_norm": 31.391908645629883, "kl": 0.0, "learning_rate": 6.621303323737242e-08, "logps/chosen": -205.63795471191406, "logps/rejected": -143.65188598632812, "loss": 0.1912, "rewards/chosen": 3.0524964332580566, "rewards/margins": 6.475236892700195, "rewards/rejected": -3.4227406978607178, "step": 3315 }, { "epoch": 0.87, "grad_norm": 32.993507385253906, "kl": 0.0, "learning_rate": 6.608217744046062e-08, "logps/chosen": -227.3526611328125, "logps/rejected": -240.81625366210938, "loss": 0.1422, "rewards/chosen": 2.2260305881500244, "rewards/margins": 5.8847575187683105, "rewards/rejected": -3.658726930618286, "step": 3316 }, { "epoch": 0.87, "grad_norm": 49.290889739990234, "kl": 0.0, "learning_rate": 6.59513216435488e-08, "logps/chosen": -175.3079376220703, "logps/rejected": -264.86151123046875, "loss": 0.2859, "rewards/chosen": 0.9824588894844055, "rewards/margins": 4.101996898651123, "rewards/rejected": -3.1195380687713623, "step": 3317 }, { "epoch": 0.87, "grad_norm": 29.216691970825195, "kl": 0.0, "learning_rate": 6.582046584663701e-08, "logps/chosen": -156.804443359375, "logps/rejected": -283.55572509765625, "loss": 0.3219, "rewards/chosen": 1.2649195194244385, "rewards/margins": 2.263371467590332, "rewards/rejected": -0.9984518885612488, "step": 3318 }, { "epoch": 0.87, "grad_norm": 29.11150550842285, "kl": 0.0, "learning_rate": 6.56896100497252e-08, "logps/chosen": -234.84364318847656, "logps/rejected": -303.1112060546875, "loss": 0.1699, "rewards/chosen": 1.790556788444519, "rewards/margins": 6.315253257751465, "rewards/rejected": -4.524696350097656, "step": 3319 }, { "epoch": 0.87, "grad_norm": 33.973350524902344, "kl": 0.0, "learning_rate": 6.55587542528134e-08, "logps/chosen": -187.39263916015625, "logps/rejected": -275.7961120605469, "loss": 0.3065, "rewards/chosen": 1.2405842542648315, "rewards/margins": 6.566118240356445, "rewards/rejected": -5.325533866882324, "step": 3320 }, { "epoch": 0.87, "grad_norm": 42.52359390258789, "kl": 0.0, "learning_rate": 6.542789845590158e-08, "logps/chosen": -132.62879943847656, "logps/rejected": -302.57452392578125, "loss": 0.2687, "rewards/chosen": 0.5341070890426636, "rewards/margins": 5.116555690765381, "rewards/rejected": -4.582448482513428, "step": 3321 }, { "epoch": 0.87, "grad_norm": 30.272220611572266, "kl": 0.0, "learning_rate": 6.529704265898979e-08, "logps/chosen": -213.6365966796875, "logps/rejected": -249.19174194335938, "loss": 0.2952, "rewards/chosen": 0.25924426317214966, "rewards/margins": 3.283485174179077, "rewards/rejected": -3.0242409706115723, "step": 3322 }, { "epoch": 0.87, "grad_norm": 24.300634384155273, "kl": 0.0, "learning_rate": 6.516618686207799e-08, "logps/chosen": -166.66082763671875, "logps/rejected": -324.5031433105469, "loss": 0.2112, "rewards/chosen": 1.5318686962127686, "rewards/margins": 5.843268394470215, "rewards/rejected": -4.311399936676025, "step": 3323 }, { "epoch": 0.87, "grad_norm": 33.69001770019531, "kl": 0.0, "learning_rate": 6.503533106516618e-08, "logps/chosen": -290.857421875, "logps/rejected": -209.4654083251953, "loss": 0.2298, "rewards/chosen": 0.6444933414459229, "rewards/margins": 4.194701194763184, "rewards/rejected": -3.5502076148986816, "step": 3324 }, { "epoch": 0.87, "grad_norm": 39.34164810180664, "kl": 0.0, "learning_rate": 6.490447526825438e-08, "logps/chosen": -179.45948791503906, "logps/rejected": -268.68505859375, "loss": 0.335, "rewards/chosen": -0.48030078411102295, "rewards/margins": 1.7185760736465454, "rewards/rejected": -2.1988768577575684, "step": 3325 }, { "epoch": 0.87, "grad_norm": 37.93251037597656, "kl": 0.0, "learning_rate": 6.477361947134258e-08, "logps/chosen": -226.54959106445312, "logps/rejected": -286.8560485839844, "loss": 0.2178, "rewards/chosen": 1.1966824531555176, "rewards/margins": 5.566013813018799, "rewards/rejected": -4.369331359863281, "step": 3326 }, { "epoch": 0.87, "grad_norm": 31.69426155090332, "kl": 0.0, "learning_rate": 6.464276367443077e-08, "logps/chosen": -254.9532470703125, "logps/rejected": -292.90997314453125, "loss": 0.3137, "rewards/chosen": 0.43578511476516724, "rewards/margins": 3.768958330154419, "rewards/rejected": -3.3331732749938965, "step": 3327 }, { "epoch": 0.87, "grad_norm": 38.56787109375, "kl": 0.0, "learning_rate": 6.451190787751897e-08, "logps/chosen": -229.85284423828125, "logps/rejected": -233.19021606445312, "loss": 0.2724, "rewards/chosen": 0.6514629125595093, "rewards/margins": 4.2708420753479, "rewards/rejected": -3.6193790435791016, "step": 3328 }, { "epoch": 0.87, "grad_norm": 24.048837661743164, "kl": 0.0, "learning_rate": 6.438105208060717e-08, "logps/chosen": -177.91517639160156, "logps/rejected": -227.61233520507812, "loss": 0.195, "rewards/chosen": 1.9322575330734253, "rewards/margins": 5.3044915199279785, "rewards/rejected": -3.3722338676452637, "step": 3329 }, { "epoch": 0.87, "grad_norm": 24.384130477905273, "kl": 0.0, "learning_rate": 6.425019628369536e-08, "logps/chosen": -123.97366333007812, "logps/rejected": -229.6993865966797, "loss": 0.2558, "rewards/chosen": 0.7002733945846558, "rewards/margins": 3.6762170791625977, "rewards/rejected": -2.9759435653686523, "step": 3330 }, { "epoch": 0.87, "grad_norm": 35.15760803222656, "kl": 0.0, "learning_rate": 6.411934048678357e-08, "logps/chosen": -119.36579132080078, "logps/rejected": -305.31610107421875, "loss": 0.1591, "rewards/chosen": 2.24487566947937, "rewards/margins": 6.488380432128906, "rewards/rejected": -4.243504524230957, "step": 3331 }, { "epoch": 0.87, "grad_norm": 31.893068313598633, "kl": 0.0, "learning_rate": 6.398848468987175e-08, "logps/chosen": -143.27931213378906, "logps/rejected": -316.27020263671875, "loss": 0.1997, "rewards/chosen": 0.03290824219584465, "rewards/margins": 4.4567461013793945, "rewards/rejected": -4.423837661743164, "step": 3332 }, { "epoch": 0.87, "grad_norm": 29.552108764648438, "kl": 0.0, "learning_rate": 6.385762889295995e-08, "logps/chosen": -142.15072631835938, "logps/rejected": -240.77963256835938, "loss": 0.243, "rewards/chosen": 1.4211618900299072, "rewards/margins": 4.207136154174805, "rewards/rejected": -2.7859745025634766, "step": 3333 }, { "epoch": 0.87, "grad_norm": 48.456520080566406, "kl": 0.0, "learning_rate": 6.372677309604816e-08, "logps/chosen": -230.8395233154297, "logps/rejected": -218.60824584960938, "loss": 0.2448, "rewards/chosen": 1.877655029296875, "rewards/margins": 4.602280139923096, "rewards/rejected": -2.7246251106262207, "step": 3334 }, { "epoch": 0.87, "grad_norm": 47.03510665893555, "kl": 0.0, "learning_rate": 6.359591729913635e-08, "logps/chosen": -210.4639129638672, "logps/rejected": -389.0422668457031, "loss": 0.1982, "rewards/chosen": 1.8622188568115234, "rewards/margins": 19.06804656982422, "rewards/rejected": -17.205827713012695, "step": 3335 }, { "epoch": 0.87, "grad_norm": 49.48558807373047, "kl": 0.0, "learning_rate": 6.346506150222454e-08, "logps/chosen": -173.88287353515625, "logps/rejected": -371.12628173828125, "loss": 0.326, "rewards/chosen": 0.8577790260314941, "rewards/margins": 6.012696743011475, "rewards/rejected": -5.1549177169799805, "step": 3336 }, { "epoch": 0.87, "grad_norm": 39.3420295715332, "kl": 0.0, "learning_rate": 6.333420570531273e-08, "logps/chosen": -222.27012634277344, "logps/rejected": -150.44436645507812, "loss": 0.2694, "rewards/chosen": 0.5377728939056396, "rewards/margins": 3.824592113494873, "rewards/rejected": -3.2868192195892334, "step": 3337 }, { "epoch": 0.87, "grad_norm": 28.24842643737793, "kl": 0.0, "learning_rate": 6.320334990840094e-08, "logps/chosen": -137.2236785888672, "logps/rejected": -277.0232238769531, "loss": 0.1481, "rewards/chosen": 1.2536040544509888, "rewards/margins": 6.276458263397217, "rewards/rejected": -5.022854328155518, "step": 3338 }, { "epoch": 0.87, "grad_norm": 36.254573822021484, "kl": 0.0, "learning_rate": 6.307249411148914e-08, "logps/chosen": -181.7095489501953, "logps/rejected": -241.9065399169922, "loss": 0.2699, "rewards/chosen": 1.2226327657699585, "rewards/margins": 4.4673848152160645, "rewards/rejected": -3.2447519302368164, "step": 3339 }, { "epoch": 0.87, "grad_norm": 25.44312858581543, "kl": 0.0, "learning_rate": 6.294163831457732e-08, "logps/chosen": -195.16006469726562, "logps/rejected": -225.5717315673828, "loss": 0.1913, "rewards/chosen": 2.599803924560547, "rewards/margins": 5.383212089538574, "rewards/rejected": -2.7834081649780273, "step": 3340 }, { "epoch": 0.87, "grad_norm": 44.52004623413086, "kl": 0.0, "learning_rate": 6.281078251766553e-08, "logps/chosen": -195.54818725585938, "logps/rejected": -170.70751953125, "loss": 0.2647, "rewards/chosen": 1.62178635597229, "rewards/margins": 3.412325859069824, "rewards/rejected": -1.7905396223068237, "step": 3341 }, { "epoch": 0.87, "grad_norm": 42.561214447021484, "kl": 0.0, "learning_rate": 6.267992672075373e-08, "logps/chosen": -242.3065948486328, "logps/rejected": -249.35528564453125, "loss": 0.2645, "rewards/chosen": 1.8589394092559814, "rewards/margins": 5.069093704223633, "rewards/rejected": -3.2101545333862305, "step": 3342 }, { "epoch": 0.87, "grad_norm": 26.688722610473633, "kl": 0.0, "learning_rate": 6.254907092384192e-08, "logps/chosen": -158.76341247558594, "logps/rejected": -157.71682739257812, "loss": 0.3068, "rewards/chosen": 0.2880159914493561, "rewards/margins": 3.0845084190368652, "rewards/rejected": -2.796492338180542, "step": 3343 }, { "epoch": 0.88, "grad_norm": 26.89533805847168, "kl": 0.0, "learning_rate": 6.241821512693012e-08, "logps/chosen": -279.21527099609375, "logps/rejected": -219.7892303466797, "loss": 0.1806, "rewards/chosen": 1.976793885231018, "rewards/margins": 6.151236534118652, "rewards/rejected": -4.174442768096924, "step": 3344 }, { "epoch": 0.88, "grad_norm": 39.72605895996094, "kl": 0.0, "learning_rate": 6.228735933001832e-08, "logps/chosen": -227.8719940185547, "logps/rejected": -207.28826904296875, "loss": 0.3296, "rewards/chosen": -0.17927008867263794, "rewards/margins": 3.6305737495422363, "rewards/rejected": -3.8098437786102295, "step": 3345 }, { "epoch": 0.88, "grad_norm": 37.94425582885742, "kl": 0.0, "learning_rate": 6.215650353310651e-08, "logps/chosen": -232.4866943359375, "logps/rejected": -313.95916748046875, "loss": 0.2654, "rewards/chosen": 0.6113022565841675, "rewards/margins": 4.760047912597656, "rewards/rejected": -4.148745536804199, "step": 3346 }, { "epoch": 0.88, "grad_norm": 29.92729949951172, "kl": 0.0, "learning_rate": 6.202564773619471e-08, "logps/chosen": -227.82040405273438, "logps/rejected": -236.7136688232422, "loss": 0.2515, "rewards/chosen": 2.12203049659729, "rewards/margins": 4.986681938171387, "rewards/rejected": -2.8646512031555176, "step": 3347 }, { "epoch": 0.88, "grad_norm": 34.22357940673828, "kl": 0.0, "learning_rate": 6.18947919392829e-08, "logps/chosen": -212.64195251464844, "logps/rejected": -177.64585876464844, "loss": 0.2566, "rewards/chosen": 0.41153281927108765, "rewards/margins": 3.8872270584106445, "rewards/rejected": -3.475694179534912, "step": 3348 }, { "epoch": 0.88, "grad_norm": 31.709651947021484, "kl": 0.0, "learning_rate": 6.17639361423711e-08, "logps/chosen": -227.3643035888672, "logps/rejected": -234.74111938476562, "loss": 0.2212, "rewards/chosen": 1.6882359981536865, "rewards/margins": 5.849997520446777, "rewards/rejected": -4.16176176071167, "step": 3349 }, { "epoch": 0.88, "grad_norm": 32.12677764892578, "kl": 0.0, "learning_rate": 6.163308034545931e-08, "logps/chosen": -222.95790100097656, "logps/rejected": -250.52259826660156, "loss": 0.21, "rewards/chosen": 0.7367315292358398, "rewards/margins": 4.7167205810546875, "rewards/rejected": -3.9799892902374268, "step": 3350 }, { "epoch": 0.88, "grad_norm": 32.62001419067383, "kl": 0.0, "learning_rate": 6.150222454854749e-08, "logps/chosen": -214.60733032226562, "logps/rejected": -329.4471130371094, "loss": 0.1048, "rewards/chosen": 2.412802219390869, "rewards/margins": 10.279458999633789, "rewards/rejected": -7.866656303405762, "step": 3351 }, { "epoch": 0.88, "grad_norm": 45.16449737548828, "kl": 0.0, "learning_rate": 6.13713687516357e-08, "logps/chosen": -269.85308837890625, "logps/rejected": -269.24371337890625, "loss": 0.3168, "rewards/chosen": 0.4667782783508301, "rewards/margins": 4.049206256866455, "rewards/rejected": -3.582427978515625, "step": 3352 }, { "epoch": 0.88, "grad_norm": 32.718441009521484, "kl": 0.0, "learning_rate": 6.12405129547239e-08, "logps/chosen": -177.99945068359375, "logps/rejected": -303.58941650390625, "loss": 0.3193, "rewards/chosen": 0.3383503258228302, "rewards/margins": 5.382714748382568, "rewards/rejected": -5.0443644523620605, "step": 3353 }, { "epoch": 0.88, "grad_norm": 41.32862854003906, "kl": 0.0, "learning_rate": 6.11096571578121e-08, "logps/chosen": -215.1548309326172, "logps/rejected": -329.5027770996094, "loss": 0.1712, "rewards/chosen": 1.8988428115844727, "rewards/margins": 6.0316267013549805, "rewards/rejected": -4.132783889770508, "step": 3354 }, { "epoch": 0.88, "grad_norm": 52.48097229003906, "kl": 0.0, "learning_rate": 6.097880136090029e-08, "logps/chosen": -199.16217041015625, "logps/rejected": -284.30108642578125, "loss": 0.2402, "rewards/chosen": -0.06404250860214233, "rewards/margins": 4.585357189178467, "rewards/rejected": -4.649399757385254, "step": 3355 }, { "epoch": 0.88, "grad_norm": 28.11062240600586, "kl": 0.0, "learning_rate": 6.084794556398847e-08, "logps/chosen": -156.20716857910156, "logps/rejected": -228.56581115722656, "loss": 0.2018, "rewards/chosen": 1.7657132148742676, "rewards/margins": 5.493755340576172, "rewards/rejected": -3.728041887283325, "step": 3356 }, { "epoch": 0.88, "grad_norm": 36.19900131225586, "kl": 0.0, "learning_rate": 6.071708976707668e-08, "logps/chosen": -171.58216857910156, "logps/rejected": -188.8887481689453, "loss": 0.2677, "rewards/chosen": 1.182680606842041, "rewards/margins": 2.961630344390869, "rewards/rejected": -1.7789497375488281, "step": 3357 }, { "epoch": 0.88, "grad_norm": 32.472721099853516, "kl": 0.0, "learning_rate": 6.058623397016488e-08, "logps/chosen": -341.27197265625, "logps/rejected": -182.00791931152344, "loss": 0.1525, "rewards/chosen": 3.77966046333313, "rewards/margins": 6.490664482116699, "rewards/rejected": -2.7110040187835693, "step": 3358 }, { "epoch": 0.88, "grad_norm": 27.687116622924805, "kl": 0.0, "learning_rate": 6.045537817325307e-08, "logps/chosen": -152.61212158203125, "logps/rejected": -318.43719482421875, "loss": 0.2795, "rewards/chosen": 0.9022289514541626, "rewards/margins": 4.633611679077148, "rewards/rejected": -3.7313828468322754, "step": 3359 }, { "epoch": 0.88, "grad_norm": 29.510347366333008, "kl": 0.0, "learning_rate": 6.032452237634127e-08, "logps/chosen": -184.28567504882812, "logps/rejected": -226.7508544921875, "loss": 0.2104, "rewards/chosen": 1.5323501825332642, "rewards/margins": 5.288782119750977, "rewards/rejected": -3.756431818008423, "step": 3360 }, { "epoch": 0.88, "grad_norm": 33.84703063964844, "kl": 0.0, "learning_rate": 6.019366657942947e-08, "logps/chosen": -202.72412109375, "logps/rejected": -185.4829559326172, "loss": 0.1574, "rewards/chosen": 0.6997545957565308, "rewards/margins": 3.3935866355895996, "rewards/rejected": -2.6938319206237793, "step": 3361 }, { "epoch": 0.88, "grad_norm": 36.224327087402344, "kl": 0.0, "learning_rate": 6.006281078251766e-08, "logps/chosen": -192.3488006591797, "logps/rejected": -253.87930297851562, "loss": 0.3257, "rewards/chosen": 0.5731613636016846, "rewards/margins": 2.5195703506469727, "rewards/rejected": -1.946408987045288, "step": 3362 }, { "epoch": 0.88, "grad_norm": 31.756702423095703, "kl": 0.0, "learning_rate": 5.993195498560586e-08, "logps/chosen": -220.3318634033203, "logps/rejected": -325.5659484863281, "loss": 0.2633, "rewards/chosen": 1.128380537033081, "rewards/margins": 5.160576820373535, "rewards/rejected": -4.032196044921875, "step": 3363 }, { "epoch": 0.88, "grad_norm": 29.23659896850586, "kl": 0.0, "learning_rate": 5.980109918869405e-08, "logps/chosen": -245.1772003173828, "logps/rejected": -314.14703369140625, "loss": 0.2172, "rewards/chosen": 1.6287401914596558, "rewards/margins": 5.567266464233398, "rewards/rejected": -3.9385263919830322, "step": 3364 }, { "epoch": 0.88, "grad_norm": 37.184505462646484, "kl": 0.0, "learning_rate": 5.967024339178225e-08, "logps/chosen": -262.8486022949219, "logps/rejected": -166.6172637939453, "loss": 0.2953, "rewards/chosen": 1.73638916015625, "rewards/margins": 3.695864200592041, "rewards/rejected": -1.9594751596450806, "step": 3365 }, { "epoch": 0.88, "grad_norm": 40.81242370605469, "kl": 0.0, "learning_rate": 5.953938759487045e-08, "logps/chosen": -137.64544677734375, "logps/rejected": -172.7340087890625, "loss": 0.3206, "rewards/chosen": 0.6962337493896484, "rewards/margins": 4.178898334503174, "rewards/rejected": -3.4826645851135254, "step": 3366 }, { "epoch": 0.88, "grad_norm": 47.62089920043945, "kl": 0.0, "learning_rate": 5.940853179795865e-08, "logps/chosen": -161.90464782714844, "logps/rejected": -241.15768432617188, "loss": 0.1864, "rewards/chosen": 1.6212987899780273, "rewards/margins": 4.349338054656982, "rewards/rejected": -2.728039264678955, "step": 3367 }, { "epoch": 0.88, "grad_norm": 28.67351531982422, "kl": 0.0, "learning_rate": 5.927767600104684e-08, "logps/chosen": -229.27951049804688, "logps/rejected": -155.9366455078125, "loss": 0.2764, "rewards/chosen": 1.6246337890625, "rewards/margins": 5.1172051429748535, "rewards/rejected": -3.4925713539123535, "step": 3368 }, { "epoch": 0.88, "grad_norm": 21.98739242553711, "kl": 0.0, "learning_rate": 5.914682020413504e-08, "logps/chosen": -253.05848693847656, "logps/rejected": -254.97067260742188, "loss": 0.3061, "rewards/chosen": 0.8975669145584106, "rewards/margins": 3.9961724281311035, "rewards/rejected": -3.0986056327819824, "step": 3369 }, { "epoch": 0.88, "grad_norm": 33.838050842285156, "kl": 0.0, "learning_rate": 5.901596440722324e-08, "logps/chosen": -179.86862182617188, "logps/rejected": -205.198486328125, "loss": 0.2797, "rewards/chosen": 0.18361665308475494, "rewards/margins": 3.2206592559814453, "rewards/rejected": -3.0370426177978516, "step": 3370 }, { "epoch": 0.88, "grad_norm": 40.067752838134766, "kl": 0.0, "learning_rate": 5.8885108610311434e-08, "logps/chosen": -206.2080841064453, "logps/rejected": -243.10455322265625, "loss": 0.2762, "rewards/chosen": 0.2042618691921234, "rewards/margins": 3.7257778644561768, "rewards/rejected": -3.5215160846710205, "step": 3371 }, { "epoch": 0.88, "grad_norm": 21.66316032409668, "kl": 0.0, "learning_rate": 5.875425281339963e-08, "logps/chosen": -236.3871612548828, "logps/rejected": -286.03564453125, "loss": 0.3134, "rewards/chosen": 1.657511591911316, "rewards/margins": 5.718562602996826, "rewards/rejected": -4.061050891876221, "step": 3372 }, { "epoch": 0.88, "grad_norm": 27.895227432250977, "kl": 0.0, "learning_rate": 5.862339701648783e-08, "logps/chosen": -246.6658477783203, "logps/rejected": -306.644775390625, "loss": 0.2309, "rewards/chosen": 0.20239301025867462, "rewards/margins": 6.392871856689453, "rewards/rejected": -6.190478801727295, "step": 3373 }, { "epoch": 0.88, "grad_norm": 25.335817337036133, "kl": 0.0, "learning_rate": 5.849254121957602e-08, "logps/chosen": -218.54864501953125, "logps/rejected": -249.16184997558594, "loss": 0.2853, "rewards/chosen": 1.310225248336792, "rewards/margins": 4.546543121337891, "rewards/rejected": -3.2363181114196777, "step": 3374 }, { "epoch": 0.88, "grad_norm": 40.887332916259766, "kl": 0.0, "learning_rate": 5.8361685422664225e-08, "logps/chosen": -132.6246795654297, "logps/rejected": -322.7361755371094, "loss": 0.2353, "rewards/chosen": 1.1172531843185425, "rewards/margins": 5.526000499725342, "rewards/rejected": -4.40874719619751, "step": 3375 }, { "epoch": 0.88, "grad_norm": 33.22807312011719, "kl": 0.0, "learning_rate": 5.8230829625752415e-08, "logps/chosen": -134.68377685546875, "logps/rejected": -236.12823486328125, "loss": 0.2749, "rewards/chosen": 0.8383082151412964, "rewards/margins": 4.732278823852539, "rewards/rejected": -3.8939707279205322, "step": 3376 }, { "epoch": 0.88, "grad_norm": 21.896162033081055, "kl": 0.0, "learning_rate": 5.809997382884062e-08, "logps/chosen": -179.1840057373047, "logps/rejected": -266.8948974609375, "loss": 0.141, "rewards/chosen": 1.1683275699615479, "rewards/margins": 5.748424530029297, "rewards/rejected": -4.580097198486328, "step": 3377 }, { "epoch": 0.88, "grad_norm": 46.306766510009766, "kl": 0.0, "learning_rate": 5.7969118031928813e-08, "logps/chosen": -234.8936767578125, "logps/rejected": -172.52682495117188, "loss": 0.321, "rewards/chosen": -0.017574980854988098, "rewards/margins": 3.2141342163085938, "rewards/rejected": -3.2317092418670654, "step": 3378 }, { "epoch": 0.88, "grad_norm": 34.68102264404297, "kl": 0.0, "learning_rate": 5.783826223501701e-08, "logps/chosen": -224.55445861816406, "logps/rejected": -277.49237060546875, "loss": 0.2509, "rewards/chosen": 1.5944043397903442, "rewards/margins": 5.50456428527832, "rewards/rejected": -3.9101598262786865, "step": 3379 }, { "epoch": 0.88, "grad_norm": 20.631187438964844, "kl": 0.0, "learning_rate": 5.7707406438105206e-08, "logps/chosen": -171.95504760742188, "logps/rejected": -330.640380859375, "loss": 0.091, "rewards/chosen": 2.3494856357574463, "rewards/margins": 7.262475967407227, "rewards/rejected": -4.912990570068359, "step": 3380 }, { "epoch": 0.88, "grad_norm": 24.329391479492188, "kl": 0.0, "learning_rate": 5.757655064119341e-08, "logps/chosen": -177.57826232910156, "logps/rejected": -240.90550231933594, "loss": 0.2097, "rewards/chosen": 1.9974151849746704, "rewards/margins": 6.527149677276611, "rewards/rejected": -4.5297346115112305, "step": 3381 }, { "epoch": 0.89, "grad_norm": 24.810420989990234, "kl": 0.0, "learning_rate": 5.74456948442816e-08, "logps/chosen": -177.7817840576172, "logps/rejected": -236.9996795654297, "loss": 0.2142, "rewards/chosen": -0.16691847145557404, "rewards/margins": 4.240876197814941, "rewards/rejected": -4.40779447555542, "step": 3382 }, { "epoch": 0.89, "grad_norm": 34.094512939453125, "kl": 0.0, "learning_rate": 5.73148390473698e-08, "logps/chosen": -189.9762725830078, "logps/rejected": -209.1165313720703, "loss": 0.2814, "rewards/chosen": 1.5954288244247437, "rewards/margins": 4.066312789916992, "rewards/rejected": -2.470883846282959, "step": 3383 }, { "epoch": 0.89, "grad_norm": 27.294633865356445, "kl": 0.0, "learning_rate": 5.718398325045799e-08, "logps/chosen": -159.4602813720703, "logps/rejected": -260.0506896972656, "loss": 0.2759, "rewards/chosen": 1.1249538660049438, "rewards/margins": 3.517695903778076, "rewards/rejected": -2.392742156982422, "step": 3384 }, { "epoch": 0.89, "grad_norm": 27.89652442932129, "kl": 0.0, "learning_rate": 5.7053127453546186e-08, "logps/chosen": -237.68202209472656, "logps/rejected": -262.3923645019531, "loss": 0.1512, "rewards/chosen": 2.2936761379241943, "rewards/margins": 6.410256385803223, "rewards/rejected": -4.116580009460449, "step": 3385 }, { "epoch": 0.89, "grad_norm": 35.205177307128906, "kl": 0.0, "learning_rate": 5.692227165663439e-08, "logps/chosen": -226.78341674804688, "logps/rejected": -203.07223510742188, "loss": 0.2688, "rewards/chosen": 0.8353214859962463, "rewards/margins": 5.021864414215088, "rewards/rejected": -4.186542987823486, "step": 3386 }, { "epoch": 0.89, "grad_norm": 29.306995391845703, "kl": 0.0, "learning_rate": 5.679141585972258e-08, "logps/chosen": -136.1898193359375, "logps/rejected": -186.20309448242188, "loss": 0.2487, "rewards/chosen": 0.21132591366767883, "rewards/margins": 4.531725883483887, "rewards/rejected": -4.320399761199951, "step": 3387 }, { "epoch": 0.89, "grad_norm": 33.45652389526367, "kl": 0.0, "learning_rate": 5.666056006281078e-08, "logps/chosen": -280.9603576660156, "logps/rejected": -256.38214111328125, "loss": 0.141, "rewards/chosen": 2.556107997894287, "rewards/margins": 6.635317325592041, "rewards/rejected": -4.079209327697754, "step": 3388 }, { "epoch": 0.89, "grad_norm": 31.559064865112305, "kl": 0.0, "learning_rate": 5.652970426589898e-08, "logps/chosen": -217.60220336914062, "logps/rejected": -288.64434814453125, "loss": 0.1909, "rewards/chosen": 3.050231456756592, "rewards/margins": 6.9026713371276855, "rewards/rejected": -3.8524398803710938, "step": 3389 }, { "epoch": 0.89, "grad_norm": 35.80206298828125, "kl": 0.0, "learning_rate": 5.639884846898717e-08, "logps/chosen": -221.58529663085938, "logps/rejected": -273.1722717285156, "loss": 0.2664, "rewards/chosen": 0.20219017565250397, "rewards/margins": 3.6827523708343506, "rewards/rejected": -3.480562210083008, "step": 3390 }, { "epoch": 0.89, "grad_norm": 36.897422790527344, "kl": 0.0, "learning_rate": 5.626799267207537e-08, "logps/chosen": -169.41078186035156, "logps/rejected": -203.75230407714844, "loss": 0.3144, "rewards/chosen": 0.9681345224380493, "rewards/margins": 2.774806022644043, "rewards/rejected": -1.806671380996704, "step": 3391 }, { "epoch": 0.89, "grad_norm": 34.7119026184082, "kl": 0.0, "learning_rate": 5.6137136875163565e-08, "logps/chosen": -191.32791137695312, "logps/rejected": -206.8913116455078, "loss": 0.2233, "rewards/chosen": 2.613614797592163, "rewards/margins": 6.831250190734863, "rewards/rejected": -4.217635154724121, "step": 3392 }, { "epoch": 0.89, "grad_norm": 29.810110092163086, "kl": 0.0, "learning_rate": 5.600628107825176e-08, "logps/chosen": -232.5635528564453, "logps/rejected": -232.62684631347656, "loss": 0.1641, "rewards/chosen": 1.6024681329727173, "rewards/margins": 5.8191447257995605, "rewards/rejected": -4.216676712036133, "step": 3393 }, { "epoch": 0.89, "grad_norm": 31.482484817504883, "kl": 0.0, "learning_rate": 5.5875425281339964e-08, "logps/chosen": -226.40219116210938, "logps/rejected": -293.3888854980469, "loss": 0.3131, "rewards/chosen": 0.6189436912536621, "rewards/margins": 4.934136867523193, "rewards/rejected": -4.315193176269531, "step": 3394 }, { "epoch": 0.89, "grad_norm": 40.35493850708008, "kl": 0.0, "learning_rate": 5.5744569484428154e-08, "logps/chosen": -156.18832397460938, "logps/rejected": -336.05908203125, "loss": 0.2243, "rewards/chosen": 1.6128734350204468, "rewards/margins": 4.650113105773926, "rewards/rejected": -3.0372395515441895, "step": 3395 }, { "epoch": 0.89, "grad_norm": 36.46339416503906, "kl": 0.0, "learning_rate": 5.5613713687516356e-08, "logps/chosen": -178.1088104248047, "logps/rejected": -283.3271484375, "loss": 0.2208, "rewards/chosen": 1.1255799531936646, "rewards/margins": 5.292938232421875, "rewards/rejected": -4.1673583984375, "step": 3396 }, { "epoch": 0.89, "grad_norm": 41.65345764160156, "kl": 0.0, "learning_rate": 5.548285789060455e-08, "logps/chosen": -280.9771423339844, "logps/rejected": -273.1884460449219, "loss": 0.3031, "rewards/chosen": 0.27189967036247253, "rewards/margins": 5.235050678253174, "rewards/rejected": -4.963150978088379, "step": 3397 }, { "epoch": 0.89, "grad_norm": 37.78427505493164, "kl": 0.0, "learning_rate": 5.535200209369275e-08, "logps/chosen": -246.3995361328125, "logps/rejected": -295.0464172363281, "loss": 0.222, "rewards/chosen": 1.722731351852417, "rewards/margins": 5.892146110534668, "rewards/rejected": -4.16941499710083, "step": 3398 }, { "epoch": 0.89, "grad_norm": 28.25185775756836, "kl": 0.0, "learning_rate": 5.5221146296780945e-08, "logps/chosen": -170.24465942382812, "logps/rejected": -228.2877655029297, "loss": 0.1923, "rewards/chosen": 1.01484215259552, "rewards/margins": 3.821547508239746, "rewards/rejected": -2.8067054748535156, "step": 3399 }, { "epoch": 0.89, "grad_norm": 33.23460388183594, "kl": 0.0, "learning_rate": 5.509029049986914e-08, "logps/chosen": -149.94390869140625, "logps/rejected": -215.99136352539062, "loss": 0.3428, "rewards/chosen": 1.3301762342453003, "rewards/margins": 4.284788131713867, "rewards/rejected": -2.9546117782592773, "step": 3400 }, { "epoch": 0.89, "grad_norm": 30.18558692932129, "kl": 0.0, "learning_rate": 5.495943470295734e-08, "logps/chosen": -178.04258728027344, "logps/rejected": -250.28799438476562, "loss": 0.1482, "rewards/chosen": 1.288857340812683, "rewards/margins": 5.212002754211426, "rewards/rejected": -3.9231455326080322, "step": 3401 }, { "epoch": 0.89, "grad_norm": 32.04692840576172, "kl": 0.0, "learning_rate": 5.482857890604554e-08, "logps/chosen": -170.0665740966797, "logps/rejected": -245.35592651367188, "loss": 0.2146, "rewards/chosen": 1.415221929550171, "rewards/margins": 7.141057968139648, "rewards/rejected": -5.725835800170898, "step": 3402 }, { "epoch": 0.89, "grad_norm": 34.108558654785156, "kl": 0.0, "learning_rate": 5.469772310913373e-08, "logps/chosen": -185.75926208496094, "logps/rejected": -248.67950439453125, "loss": 0.2838, "rewards/chosen": 0.09984666109085083, "rewards/margins": 3.3367090225219727, "rewards/rejected": -3.2368624210357666, "step": 3403 }, { "epoch": 0.89, "grad_norm": 33.87335968017578, "kl": 0.0, "learning_rate": 5.456686731222193e-08, "logps/chosen": -267.390625, "logps/rejected": -268.80877685546875, "loss": 0.2325, "rewards/chosen": 2.265165328979492, "rewards/margins": 5.600419044494629, "rewards/rejected": -3.3352534770965576, "step": 3404 }, { "epoch": 0.89, "grad_norm": 24.61138916015625, "kl": 0.0, "learning_rate": 5.443601151531013e-08, "logps/chosen": -142.63829040527344, "logps/rejected": -187.2084197998047, "loss": 0.1972, "rewards/chosen": 2.221682071685791, "rewards/margins": 6.340028762817383, "rewards/rejected": -4.118346691131592, "step": 3405 }, { "epoch": 0.89, "grad_norm": 32.826377868652344, "kl": 0.0, "learning_rate": 5.4305155718398324e-08, "logps/chosen": -232.07369995117188, "logps/rejected": -256.4215087890625, "loss": 0.2689, "rewards/chosen": 0.41094404458999634, "rewards/margins": 4.343574047088623, "rewards/rejected": -3.9326298236846924, "step": 3406 }, { "epoch": 0.89, "grad_norm": 43.49300765991211, "kl": 0.0, "learning_rate": 5.417429992148652e-08, "logps/chosen": -308.1954040527344, "logps/rejected": -250.01596069335938, "loss": 0.336, "rewards/chosen": 0.7856874465942383, "rewards/margins": 3.084690570831299, "rewards/rejected": -2.2990031242370605, "step": 3407 }, { "epoch": 0.89, "grad_norm": 31.34882354736328, "kl": 0.0, "learning_rate": 5.4043444124574716e-08, "logps/chosen": -118.74658203125, "logps/rejected": -236.03623962402344, "loss": 0.1779, "rewards/chosen": 2.0077106952667236, "rewards/margins": 5.9031219482421875, "rewards/rejected": -3.895411491394043, "step": 3408 }, { "epoch": 0.89, "grad_norm": 36.540462493896484, "kl": 0.0, "learning_rate": 5.391258832766291e-08, "logps/chosen": -235.9467010498047, "logps/rejected": -239.76263427734375, "loss": 0.3456, "rewards/chosen": 0.06817644834518433, "rewards/margins": 3.937302350997925, "rewards/rejected": -3.8691258430480957, "step": 3409 }, { "epoch": 0.89, "grad_norm": 29.51076889038086, "kl": 0.0, "learning_rate": 5.3781732530751115e-08, "logps/chosen": -258.8054504394531, "logps/rejected": -237.52342224121094, "loss": 0.2351, "rewards/chosen": 1.873181700706482, "rewards/margins": 6.267869472503662, "rewards/rejected": -4.394687652587891, "step": 3410 }, { "epoch": 0.89, "grad_norm": 36.962745666503906, "kl": 0.0, "learning_rate": 5.3650876733839304e-08, "logps/chosen": -270.0163879394531, "logps/rejected": -256.7657470703125, "loss": 0.2353, "rewards/chosen": 1.1174057722091675, "rewards/margins": 3.3676934242248535, "rewards/rejected": -2.2502877712249756, "step": 3411 }, { "epoch": 0.89, "grad_norm": 31.154197692871094, "kl": 0.0, "learning_rate": 5.352002093692751e-08, "logps/chosen": -211.06796264648438, "logps/rejected": -339.4208984375, "loss": 0.2567, "rewards/chosen": 1.7467280626296997, "rewards/margins": 4.826638698577881, "rewards/rejected": -3.0799107551574707, "step": 3412 }, { "epoch": 0.89, "grad_norm": 41.743247985839844, "kl": 0.0, "learning_rate": 5.33891651400157e-08, "logps/chosen": -251.5352020263672, "logps/rejected": -174.1204376220703, "loss": 0.2898, "rewards/chosen": 0.8819773197174072, "rewards/margins": 5.040376663208008, "rewards/rejected": -4.1583991050720215, "step": 3413 }, { "epoch": 0.89, "grad_norm": 20.825239181518555, "kl": 0.0, "learning_rate": 5.325830934310389e-08, "logps/chosen": -258.7743225097656, "logps/rejected": -294.87408447265625, "loss": 0.1343, "rewards/chosen": 2.4211437702178955, "rewards/margins": 6.7697038650512695, "rewards/rejected": -4.348559856414795, "step": 3414 }, { "epoch": 0.89, "grad_norm": 38.2694091796875, "kl": 0.0, "learning_rate": 5.3127453546192095e-08, "logps/chosen": -180.42234802246094, "logps/rejected": -224.37460327148438, "loss": 0.2061, "rewards/chosen": 2.412104606628418, "rewards/margins": 5.695418357849121, "rewards/rejected": -3.283313751220703, "step": 3415 }, { "epoch": 0.89, "grad_norm": 41.9177131652832, "kl": 0.0, "learning_rate": 5.2996597749280285e-08, "logps/chosen": -199.71421813964844, "logps/rejected": -248.69764709472656, "loss": 0.2886, "rewards/chosen": 0.07714885473251343, "rewards/margins": 3.2543482780456543, "rewards/rejected": -3.177199363708496, "step": 3416 }, { "epoch": 0.89, "grad_norm": 48.770668029785156, "kl": 0.0, "learning_rate": 5.286574195236849e-08, "logps/chosen": -191.14852905273438, "logps/rejected": -266.9022216796875, "loss": 0.2607, "rewards/chosen": 0.6749199628829956, "rewards/margins": 3.958329677581787, "rewards/rejected": -3.283409595489502, "step": 3417 }, { "epoch": 0.89, "grad_norm": 27.754985809326172, "kl": 0.0, "learning_rate": 5.2734886155456684e-08, "logps/chosen": -191.7516326904297, "logps/rejected": -189.91012573242188, "loss": 0.1772, "rewards/chosen": 0.6567466259002686, "rewards/margins": 3.608036756515503, "rewards/rejected": -2.9512901306152344, "step": 3418 }, { "epoch": 0.89, "grad_norm": 36.7862548828125, "kl": 0.0, "learning_rate": 5.260403035854488e-08, "logps/chosen": -152.87648010253906, "logps/rejected": -252.29257202148438, "loss": 0.2769, "rewards/chosen": 1.3332586288452148, "rewards/margins": 3.6783080101013184, "rewards/rejected": -2.3450493812561035, "step": 3419 }, { "epoch": 0.9, "grad_norm": 36.703426361083984, "kl": 0.0, "learning_rate": 5.2473174561633076e-08, "logps/chosen": -257.3609619140625, "logps/rejected": -244.8799591064453, "loss": 0.2853, "rewards/chosen": 0.4611107110977173, "rewards/margins": 5.041869640350342, "rewards/rejected": -4.580759048461914, "step": 3420 }, { "epoch": 0.9, "grad_norm": 34.300514221191406, "kl": 0.0, "learning_rate": 5.234231876472128e-08, "logps/chosen": -157.11154174804688, "logps/rejected": -214.3528289794922, "loss": 0.2159, "rewards/chosen": 1.7635271549224854, "rewards/margins": 5.321822166442871, "rewards/rejected": -3.558295249938965, "step": 3421 }, { "epoch": 0.9, "grad_norm": 37.2406120300293, "kl": 0.0, "learning_rate": 5.221146296780947e-08, "logps/chosen": -217.28884887695312, "logps/rejected": -221.66026306152344, "loss": 0.1318, "rewards/chosen": 3.0308799743652344, "rewards/margins": 7.286211967468262, "rewards/rejected": -4.255331993103027, "step": 3422 }, { "epoch": 0.9, "grad_norm": 30.882293701171875, "kl": 0.0, "learning_rate": 5.208060717089767e-08, "logps/chosen": -182.9688262939453, "logps/rejected": -254.4478302001953, "loss": 0.1496, "rewards/chosen": 1.0490161180496216, "rewards/margins": 4.701062202453613, "rewards/rejected": -3.6520462036132812, "step": 3423 }, { "epoch": 0.9, "grad_norm": 36.64192581176758, "kl": 0.0, "learning_rate": 5.194975137398586e-08, "logps/chosen": -188.86862182617188, "logps/rejected": -242.87950134277344, "loss": 0.2503, "rewards/chosen": -1.1615270376205444, "rewards/margins": 2.616291046142578, "rewards/rejected": -3.777817964553833, "step": 3424 }, { "epoch": 0.9, "grad_norm": 32.814735412597656, "kl": 0.0, "learning_rate": 5.181889557707406e-08, "logps/chosen": -206.17967224121094, "logps/rejected": -267.380615234375, "loss": 0.1992, "rewards/chosen": 3.037926197052002, "rewards/margins": 5.702183246612549, "rewards/rejected": -2.664257049560547, "step": 3425 }, { "epoch": 0.9, "grad_norm": 37.48391342163086, "kl": 0.0, "learning_rate": 5.168803978016226e-08, "logps/chosen": -156.80770874023438, "logps/rejected": -240.1800079345703, "loss": 0.2122, "rewards/chosen": 1.6631457805633545, "rewards/margins": 3.9029107093811035, "rewards/rejected": -2.239764928817749, "step": 3426 }, { "epoch": 0.9, "grad_norm": 28.892189025878906, "kl": 0.0, "learning_rate": 5.1557183983250455e-08, "logps/chosen": -166.68853759765625, "logps/rejected": -338.1997375488281, "loss": 0.3027, "rewards/chosen": 0.10017256438732147, "rewards/margins": 2.704484701156616, "rewards/rejected": -2.6043121814727783, "step": 3427 }, { "epoch": 0.9, "grad_norm": 25.72585105895996, "kl": 0.0, "learning_rate": 5.142632818633865e-08, "logps/chosen": -125.46946716308594, "logps/rejected": -273.8974609375, "loss": 0.1832, "rewards/chosen": 1.6431037187576294, "rewards/margins": 4.477502822875977, "rewards/rejected": -2.8343989849090576, "step": 3428 }, { "epoch": 0.9, "grad_norm": 32.71539306640625, "kl": 0.0, "learning_rate": 5.1295472389426854e-08, "logps/chosen": -202.12356567382812, "logps/rejected": -202.73655700683594, "loss": 0.2059, "rewards/chosen": -0.09025682508945465, "rewards/margins": 3.4574134349823, "rewards/rejected": -3.547670364379883, "step": 3429 }, { "epoch": 0.9, "grad_norm": 33.782875061035156, "kl": 0.0, "learning_rate": 5.116461659251504e-08, "logps/chosen": -207.37619018554688, "logps/rejected": -195.9562225341797, "loss": 0.2405, "rewards/chosen": 1.3065872192382812, "rewards/margins": 3.7297723293304443, "rewards/rejected": -2.423185110092163, "step": 3430 }, { "epoch": 0.9, "grad_norm": 41.365657806396484, "kl": 0.0, "learning_rate": 5.1033760795603246e-08, "logps/chosen": -129.8498077392578, "logps/rejected": -163.44862365722656, "loss": 0.2667, "rewards/chosen": 0.3854326903820038, "rewards/margins": 2.549304962158203, "rewards/rejected": -2.163872241973877, "step": 3431 }, { "epoch": 0.9, "grad_norm": 44.39329147338867, "kl": 0.0, "learning_rate": 5.0902904998691435e-08, "logps/chosen": -182.01693725585938, "logps/rejected": -238.8572998046875, "loss": 0.1059, "rewards/chosen": 0.8487826585769653, "rewards/margins": 4.960758686065674, "rewards/rejected": -4.111976146697998, "step": 3432 }, { "epoch": 0.9, "grad_norm": 25.658466339111328, "kl": 0.0, "learning_rate": 5.077204920177964e-08, "logps/chosen": -158.74652099609375, "logps/rejected": -301.00299072265625, "loss": 0.1193, "rewards/chosen": 0.1914927214384079, "rewards/margins": 5.276811599731445, "rewards/rejected": -5.0853190422058105, "step": 3433 }, { "epoch": 0.9, "grad_norm": 28.961124420166016, "kl": 0.0, "learning_rate": 5.0641193404867834e-08, "logps/chosen": -187.64295959472656, "logps/rejected": -201.07196044921875, "loss": 0.1929, "rewards/chosen": 1.7851966619491577, "rewards/margins": 5.109597682952881, "rewards/rejected": -3.3244009017944336, "step": 3434 }, { "epoch": 0.9, "grad_norm": 34.09691619873047, "kl": 0.0, "learning_rate": 5.051033760795603e-08, "logps/chosen": -243.12417602539062, "logps/rejected": -236.99737548828125, "loss": 0.2327, "rewards/chosen": 1.3128129243850708, "rewards/margins": 5.337040424346924, "rewards/rejected": -4.024227619171143, "step": 3435 }, { "epoch": 0.9, "grad_norm": 30.38089942932129, "kl": 0.0, "learning_rate": 5.0379481811044226e-08, "logps/chosen": -234.9790802001953, "logps/rejected": -238.9598388671875, "loss": 0.2465, "rewards/chosen": 1.983193039894104, "rewards/margins": 7.679162502288818, "rewards/rejected": -5.695969581604004, "step": 3436 }, { "epoch": 0.9, "grad_norm": 36.361236572265625, "kl": 0.0, "learning_rate": 5.024862601413243e-08, "logps/chosen": -166.4700469970703, "logps/rejected": -247.67010498046875, "loss": 0.2001, "rewards/chosen": 1.0058622360229492, "rewards/margins": 3.8643991947174072, "rewards/rejected": -2.858536958694458, "step": 3437 }, { "epoch": 0.9, "grad_norm": 27.40168571472168, "kl": 0.0, "learning_rate": 5.011777021722062e-08, "logps/chosen": -183.8133087158203, "logps/rejected": -230.89952087402344, "loss": 0.2022, "rewards/chosen": 1.5351980924606323, "rewards/margins": 4.581424236297607, "rewards/rejected": -3.0462260246276855, "step": 3438 }, { "epoch": 0.9, "grad_norm": 37.18417739868164, "kl": 0.0, "learning_rate": 4.998691442030882e-08, "logps/chosen": -236.43589782714844, "logps/rejected": -251.51925659179688, "loss": 0.2451, "rewards/chosen": 1.116813063621521, "rewards/margins": 4.605891704559326, "rewards/rejected": -3.4890785217285156, "step": 3439 }, { "epoch": 0.9, "grad_norm": 37.91105270385742, "kl": 0.0, "learning_rate": 4.985605862339701e-08, "logps/chosen": -191.96932983398438, "logps/rejected": -254.31973266601562, "loss": 0.2672, "rewards/chosen": 1.3671517372131348, "rewards/margins": 3.7806217670440674, "rewards/rejected": -2.4134700298309326, "step": 3440 }, { "epoch": 0.9, "grad_norm": 24.377700805664062, "kl": 0.0, "learning_rate": 4.9725202826485214e-08, "logps/chosen": -262.665283203125, "logps/rejected": -226.87684631347656, "loss": 0.2126, "rewards/chosen": 2.6146717071533203, "rewards/margins": 5.3994903564453125, "rewards/rejected": -2.784818410873413, "step": 3441 }, { "epoch": 0.9, "grad_norm": 33.40414810180664, "kl": 0.0, "learning_rate": 4.959434702957341e-08, "logps/chosen": -248.6764678955078, "logps/rejected": -247.5901641845703, "loss": 0.2668, "rewards/chosen": 1.8479644060134888, "rewards/margins": 5.598476886749268, "rewards/rejected": -3.7505125999450684, "step": 3442 }, { "epoch": 0.9, "grad_norm": 28.74833869934082, "kl": 0.0, "learning_rate": 4.94634912326616e-08, "logps/chosen": -209.05368041992188, "logps/rejected": -178.4486083984375, "loss": 0.287, "rewards/chosen": 0.11143312603235245, "rewards/margins": 3.713197708129883, "rewards/rejected": -3.601764678955078, "step": 3443 }, { "epoch": 0.9, "grad_norm": 34.230064392089844, "kl": 0.0, "learning_rate": 4.93326354357498e-08, "logps/chosen": -129.30055236816406, "logps/rejected": -276.4996643066406, "loss": 0.2192, "rewards/chosen": 2.003117561340332, "rewards/margins": 5.9718122482299805, "rewards/rejected": -3.9686944484710693, "step": 3444 }, { "epoch": 0.9, "grad_norm": 27.552433013916016, "kl": 0.0, "learning_rate": 4.9201779638838e-08, "logps/chosen": -198.81936645507812, "logps/rejected": -295.6492004394531, "loss": 0.2223, "rewards/chosen": 0.5861003994941711, "rewards/margins": 5.239863395690918, "rewards/rejected": -4.6537628173828125, "step": 3445 }, { "epoch": 0.9, "grad_norm": 22.962255477905273, "kl": 0.0, "learning_rate": 4.9070923841926194e-08, "logps/chosen": -241.2476348876953, "logps/rejected": -184.06736755371094, "loss": 0.1843, "rewards/chosen": 0.3511374294757843, "rewards/margins": 3.9717328548431396, "rewards/rejected": -3.6205954551696777, "step": 3446 }, { "epoch": 0.9, "grad_norm": 35.427024841308594, "kl": 0.0, "learning_rate": 4.894006804501439e-08, "logps/chosen": -175.5431671142578, "logps/rejected": -299.06829833984375, "loss": 0.1853, "rewards/chosen": 1.4518764019012451, "rewards/margins": 4.794173240661621, "rewards/rejected": -3.342296838760376, "step": 3447 }, { "epoch": 0.9, "grad_norm": 32.377662658691406, "kl": 0.0, "learning_rate": 4.880921224810259e-08, "logps/chosen": -258.3872985839844, "logps/rejected": -223.69058227539062, "loss": 0.2336, "rewards/chosen": 1.3112188577651978, "rewards/margins": 5.9051337242126465, "rewards/rejected": -4.593914985656738, "step": 3448 }, { "epoch": 0.9, "grad_norm": 31.69407844543457, "kl": 0.0, "learning_rate": 4.867835645119078e-08, "logps/chosen": -195.5200958251953, "logps/rejected": -281.95965576171875, "loss": 0.2164, "rewards/chosen": -0.44065719842910767, "rewards/margins": 2.2437925338745117, "rewards/rejected": -2.6844496726989746, "step": 3449 }, { "epoch": 0.9, "grad_norm": 36.76367950439453, "kl": 0.0, "learning_rate": 4.8547500654278985e-08, "logps/chosen": -146.5417022705078, "logps/rejected": -271.20166015625, "loss": 0.2169, "rewards/chosen": 1.526592493057251, "rewards/margins": 4.448322296142578, "rewards/rejected": -2.9217300415039062, "step": 3450 }, { "epoch": 0.9, "grad_norm": 67.0025863647461, "kl": 0.0, "learning_rate": 4.8416644857367174e-08, "logps/chosen": -212.27645874023438, "logps/rejected": -219.5655059814453, "loss": 0.2246, "rewards/chosen": 0.9446682333946228, "rewards/margins": 4.536525726318359, "rewards/rejected": -3.591857433319092, "step": 3451 }, { "epoch": 0.9, "grad_norm": 37.800968170166016, "kl": 0.0, "learning_rate": 4.828578906045538e-08, "logps/chosen": -203.21331787109375, "logps/rejected": -186.6897735595703, "loss": 0.2268, "rewards/chosen": 2.3236589431762695, "rewards/margins": 4.436846733093262, "rewards/rejected": -2.113187551498413, "step": 3452 }, { "epoch": 0.9, "grad_norm": 29.42912483215332, "kl": 0.0, "learning_rate": 4.815493326354357e-08, "logps/chosen": -216.91082763671875, "logps/rejected": -302.6053161621094, "loss": 0.1218, "rewards/chosen": 3.6041765213012695, "rewards/margins": 7.994859218597412, "rewards/rejected": -4.390682697296143, "step": 3453 }, { "epoch": 0.9, "grad_norm": 43.55042266845703, "kl": 0.0, "learning_rate": 4.802407746663177e-08, "logps/chosen": -172.40695190429688, "logps/rejected": -174.08834838867188, "loss": 0.3223, "rewards/chosen": 0.5470663905143738, "rewards/margins": 2.5851056575775146, "rewards/rejected": -2.038039207458496, "step": 3454 }, { "epoch": 0.9, "grad_norm": 25.590280532836914, "kl": 0.0, "learning_rate": 4.7893221669719965e-08, "logps/chosen": -184.07814025878906, "logps/rejected": -215.82809448242188, "loss": 0.1457, "rewards/chosen": 2.0870168209075928, "rewards/margins": 8.137537956237793, "rewards/rejected": -6.050520896911621, "step": 3455 }, { "epoch": 0.9, "grad_norm": 30.191423416137695, "kl": 0.0, "learning_rate": 4.776236587280817e-08, "logps/chosen": -205.03875732421875, "logps/rejected": -246.31654357910156, "loss": 0.2024, "rewards/chosen": 1.6703300476074219, "rewards/margins": 4.7895050048828125, "rewards/rejected": -3.1191751956939697, "step": 3456 }, { "epoch": 0.9, "grad_norm": 35.2052001953125, "kl": 0.0, "learning_rate": 4.763151007589636e-08, "logps/chosen": -239.1469268798828, "logps/rejected": -204.1580047607422, "loss": 0.2853, "rewards/chosen": 1.1633193492889404, "rewards/margins": 3.4346001148223877, "rewards/rejected": -2.2712807655334473, "step": 3457 }, { "epoch": 0.9, "grad_norm": 40.309967041015625, "kl": 0.0, "learning_rate": 4.750065427898456e-08, "logps/chosen": -141.7136688232422, "logps/rejected": -249.14077758789062, "loss": 0.2115, "rewards/chosen": 1.0800200700759888, "rewards/margins": 4.454376697540283, "rewards/rejected": -3.374356746673584, "step": 3458 }, { "epoch": 0.91, "grad_norm": 25.846149444580078, "kl": 0.0, "learning_rate": 4.736979848207275e-08, "logps/chosen": -199.12954711914062, "logps/rejected": -300.17291259765625, "loss": 0.1687, "rewards/chosen": 0.7269772887229919, "rewards/margins": 5.699728012084961, "rewards/rejected": -4.972750663757324, "step": 3459 }, { "epoch": 0.91, "grad_norm": 30.018512725830078, "kl": 0.0, "learning_rate": 4.723894268516095e-08, "logps/chosen": -225.64785766601562, "logps/rejected": -230.19851684570312, "loss": 0.2012, "rewards/chosen": 2.279531478881836, "rewards/margins": 6.719575881958008, "rewards/rejected": -4.440044403076172, "step": 3460 }, { "epoch": 0.91, "grad_norm": 33.65125274658203, "kl": 0.0, "learning_rate": 4.710808688824915e-08, "logps/chosen": -246.58177185058594, "logps/rejected": -316.8072509765625, "loss": 0.2009, "rewards/chosen": 2.155870199203491, "rewards/margins": 6.938811302185059, "rewards/rejected": -4.782940864562988, "step": 3461 }, { "epoch": 0.91, "grad_norm": 31.399215698242188, "kl": 0.0, "learning_rate": 4.6977231091337345e-08, "logps/chosen": -198.928955078125, "logps/rejected": -194.60580444335938, "loss": 0.2615, "rewards/chosen": 0.9209333658218384, "rewards/margins": 4.618176460266113, "rewards/rejected": -3.6972432136535645, "step": 3462 }, { "epoch": 0.91, "grad_norm": 27.609634399414062, "kl": 0.0, "learning_rate": 4.684637529442554e-08, "logps/chosen": -269.3661193847656, "logps/rejected": -219.11439514160156, "loss": 0.2958, "rewards/chosen": -0.05110529065132141, "rewards/margins": 3.0176358222961426, "rewards/rejected": -3.0687410831451416, "step": 3463 }, { "epoch": 0.91, "grad_norm": 33.409671783447266, "kl": 0.0, "learning_rate": 4.6715519497513743e-08, "logps/chosen": -162.5331268310547, "logps/rejected": -324.6145324707031, "loss": 0.2649, "rewards/chosen": 0.1856052577495575, "rewards/margins": 4.66796875, "rewards/rejected": -4.482363700866699, "step": 3464 }, { "epoch": 0.91, "grad_norm": 30.648984909057617, "kl": 0.0, "learning_rate": 4.658466370060193e-08, "logps/chosen": -200.22683715820312, "logps/rejected": -353.6416931152344, "loss": 0.2502, "rewards/chosen": 2.1871938705444336, "rewards/margins": 7.84035587310791, "rewards/rejected": -5.653162002563477, "step": 3465 }, { "epoch": 0.91, "grad_norm": 37.56059265136719, "kl": 0.0, "learning_rate": 4.6453807903690136e-08, "logps/chosen": -225.29098510742188, "logps/rejected": -352.63604736328125, "loss": 0.1575, "rewards/chosen": 0.413031667470932, "rewards/margins": 7.242162227630615, "rewards/rejected": -6.82913064956665, "step": 3466 }, { "epoch": 0.91, "grad_norm": 30.30290412902832, "kl": 0.0, "learning_rate": 4.6322952106778325e-08, "logps/chosen": -267.28814697265625, "logps/rejected": -205.54666137695312, "loss": 0.1733, "rewards/chosen": 1.5149755477905273, "rewards/margins": 3.9995875358581543, "rewards/rejected": -2.484611988067627, "step": 3467 }, { "epoch": 0.91, "grad_norm": 51.21653747558594, "kl": 0.0, "learning_rate": 4.619209630986653e-08, "logps/chosen": -284.357177734375, "logps/rejected": -295.5521240234375, "loss": 0.3575, "rewards/chosen": 1.1439176797866821, "rewards/margins": 3.0836286544799805, "rewards/rejected": -1.9397108554840088, "step": 3468 }, { "epoch": 0.91, "grad_norm": 40.627769470214844, "kl": 0.0, "learning_rate": 4.6061240512954724e-08, "logps/chosen": -257.2642822265625, "logps/rejected": -317.79730224609375, "loss": 0.2883, "rewards/chosen": 0.018114745616912842, "rewards/margins": 4.1845903396606445, "rewards/rejected": -4.166475772857666, "step": 3469 }, { "epoch": 0.91, "grad_norm": 32.87179183959961, "kl": 0.0, "learning_rate": 4.593038471604292e-08, "logps/chosen": -190.40333557128906, "logps/rejected": -224.46026611328125, "loss": 0.355, "rewards/chosen": 0.24565494060516357, "rewards/margins": 4.230736255645752, "rewards/rejected": -3.985081434249878, "step": 3470 }, { "epoch": 0.91, "grad_norm": 35.715606689453125, "kl": 0.0, "learning_rate": 4.5799528919131116e-08, "logps/chosen": -275.18035888671875, "logps/rejected": -206.77891540527344, "loss": 0.208, "rewards/chosen": 3.2653937339782715, "rewards/margins": 5.532741546630859, "rewards/rejected": -2.267347812652588, "step": 3471 }, { "epoch": 0.91, "grad_norm": 41.31447219848633, "kl": 0.0, "learning_rate": 4.5668673122219306e-08, "logps/chosen": -237.96063232421875, "logps/rejected": -322.6827392578125, "loss": 0.2782, "rewards/chosen": 1.0975438356399536, "rewards/margins": 4.81644868850708, "rewards/rejected": -3.718904972076416, "step": 3472 }, { "epoch": 0.91, "grad_norm": 39.48509979248047, "kl": 0.0, "learning_rate": 4.553781732530751e-08, "logps/chosen": -222.59228515625, "logps/rejected": -259.65008544921875, "loss": 0.342, "rewards/chosen": 1.1126902103424072, "rewards/margins": 5.725687026977539, "rewards/rejected": -4.612997055053711, "step": 3473 }, { "epoch": 0.91, "grad_norm": 28.43281364440918, "kl": 0.0, "learning_rate": 4.5406961528395704e-08, "logps/chosen": -238.9607391357422, "logps/rejected": -218.1140594482422, "loss": 0.1194, "rewards/chosen": 3.4507126808166504, "rewards/margins": 6.578547477722168, "rewards/rejected": -3.1278350353240967, "step": 3474 }, { "epoch": 0.91, "grad_norm": 27.3100528717041, "kl": 0.0, "learning_rate": 4.52761057314839e-08, "logps/chosen": -157.7848358154297, "logps/rejected": -222.90573120117188, "loss": 0.2354, "rewards/chosen": 0.6552536487579346, "rewards/margins": 3.9927327632904053, "rewards/rejected": -3.3374791145324707, "step": 3475 }, { "epoch": 0.91, "grad_norm": 35.11143112182617, "kl": 0.0, "learning_rate": 4.5145249934572097e-08, "logps/chosen": -230.7719268798828, "logps/rejected": -308.68560791015625, "loss": 0.2438, "rewards/chosen": 0.8573986291885376, "rewards/margins": 4.772648334503174, "rewards/rejected": -3.915249824523926, "step": 3476 }, { "epoch": 0.91, "grad_norm": 28.025171279907227, "kl": 0.0, "learning_rate": 4.50143941376603e-08, "logps/chosen": -155.75784301757812, "logps/rejected": -286.5352478027344, "loss": 0.1556, "rewards/chosen": 2.2150192260742188, "rewards/margins": 6.869057655334473, "rewards/rejected": -4.654038429260254, "step": 3477 }, { "epoch": 0.91, "grad_norm": 36.9254264831543, "kl": 0.0, "learning_rate": 4.488353834074849e-08, "logps/chosen": -175.12673950195312, "logps/rejected": -235.85813903808594, "loss": 0.2853, "rewards/chosen": -0.033377885818481445, "rewards/margins": 3.532153606414795, "rewards/rejected": -3.5655314922332764, "step": 3478 }, { "epoch": 0.91, "grad_norm": 34.92323684692383, "kl": 0.0, "learning_rate": 4.475268254383669e-08, "logps/chosen": -174.413818359375, "logps/rejected": -258.3853454589844, "loss": 0.2737, "rewards/chosen": 0.14707216620445251, "rewards/margins": 4.230539321899414, "rewards/rejected": -4.08346700668335, "step": 3479 }, { "epoch": 0.91, "grad_norm": 31.240678787231445, "kl": 0.0, "learning_rate": 4.462182674692489e-08, "logps/chosen": -156.48321533203125, "logps/rejected": -216.8871612548828, "loss": 0.2726, "rewards/chosen": 0.8429794311523438, "rewards/margins": 4.134203910827637, "rewards/rejected": -3.291224479675293, "step": 3480 }, { "epoch": 0.91, "grad_norm": 35.4827880859375, "kl": 0.0, "learning_rate": 4.4490970950013084e-08, "logps/chosen": -247.5883026123047, "logps/rejected": -203.5853271484375, "loss": 0.209, "rewards/chosen": 1.6337577104568481, "rewards/margins": 4.678426742553711, "rewards/rejected": -3.0446689128875732, "step": 3481 }, { "epoch": 0.91, "grad_norm": 30.35540199279785, "kl": 0.0, "learning_rate": 4.436011515310128e-08, "logps/chosen": -260.1644592285156, "logps/rejected": -248.724609375, "loss": 0.1345, "rewards/chosen": 3.037034273147583, "rewards/margins": 6.655192852020264, "rewards/rejected": -3.6181585788726807, "step": 3482 }, { "epoch": 0.91, "grad_norm": 42.778465270996094, "kl": 0.0, "learning_rate": 4.4229259356189476e-08, "logps/chosen": -260.2196044921875, "logps/rejected": -259.03521728515625, "loss": 0.2166, "rewards/chosen": 3.436540126800537, "rewards/margins": 4.734576225280762, "rewards/rejected": -1.2980358600616455, "step": 3483 }, { "epoch": 0.91, "grad_norm": 34.71338653564453, "kl": 0.0, "learning_rate": 4.409840355927767e-08, "logps/chosen": -247.28744506835938, "logps/rejected": -287.5233459472656, "loss": 0.2157, "rewards/chosen": 0.4272666573524475, "rewards/margins": 5.451603412628174, "rewards/rejected": -5.024336814880371, "step": 3484 }, { "epoch": 0.91, "grad_norm": 37.13381576538086, "kl": 0.0, "learning_rate": 4.3967547762365875e-08, "logps/chosen": -227.41146850585938, "logps/rejected": -302.9892578125, "loss": 0.2826, "rewards/chosen": 1.3567723035812378, "rewards/margins": 6.252913951873779, "rewards/rejected": -4.896141529083252, "step": 3485 }, { "epoch": 0.91, "grad_norm": 35.91691589355469, "kl": 0.0, "learning_rate": 4.3836691965454064e-08, "logps/chosen": -266.44287109375, "logps/rejected": -257.7109069824219, "loss": 0.2031, "rewards/chosen": 0.6967478394508362, "rewards/margins": 5.014280319213867, "rewards/rejected": -4.317532539367676, "step": 3486 }, { "epoch": 0.91, "grad_norm": 39.01267623901367, "kl": 0.0, "learning_rate": 4.370583616854227e-08, "logps/chosen": -219.94180297851562, "logps/rejected": -304.16461181640625, "loss": 0.2245, "rewards/chosen": 1.0676984786987305, "rewards/margins": 4.392019271850586, "rewards/rejected": -3.3243210315704346, "step": 3487 }, { "epoch": 0.91, "grad_norm": 36.86568832397461, "kl": 0.0, "learning_rate": 4.357498037163046e-08, "logps/chosen": -193.48902893066406, "logps/rejected": -194.73497009277344, "loss": 0.1368, "rewards/chosen": 2.4176418781280518, "rewards/margins": 6.041609287261963, "rewards/rejected": -3.623967409133911, "step": 3488 }, { "epoch": 0.91, "grad_norm": 32.746612548828125, "kl": 0.0, "learning_rate": 4.344412457471866e-08, "logps/chosen": -237.88595581054688, "logps/rejected": -221.19143676757812, "loss": 0.2488, "rewards/chosen": 2.1624321937561035, "rewards/margins": 6.177945137023926, "rewards/rejected": -4.015512943267822, "step": 3489 }, { "epoch": 0.91, "grad_norm": 33.666107177734375, "kl": 0.0, "learning_rate": 4.3313268777806855e-08, "logps/chosen": -215.32931518554688, "logps/rejected": -319.3660888671875, "loss": 0.2697, "rewards/chosen": 0.6944324970245361, "rewards/margins": 6.149470329284668, "rewards/rejected": -5.455037593841553, "step": 3490 }, { "epoch": 0.91, "grad_norm": 32.387325286865234, "kl": 0.0, "learning_rate": 4.318241298089505e-08, "logps/chosen": -168.0982666015625, "logps/rejected": -288.7040100097656, "loss": 0.2772, "rewards/chosen": 0.8535113334655762, "rewards/margins": 4.308161735534668, "rewards/rejected": -3.4546501636505127, "step": 3491 }, { "epoch": 0.91, "grad_norm": 37.205135345458984, "kl": 0.0, "learning_rate": 4.305155718398325e-08, "logps/chosen": -199.73460388183594, "logps/rejected": -291.91644287109375, "loss": 0.2238, "rewards/chosen": 1.210793137550354, "rewards/margins": 3.8120923042297363, "rewards/rejected": -2.6012990474700928, "step": 3492 }, { "epoch": 0.91, "grad_norm": 27.158855438232422, "kl": 0.0, "learning_rate": 4.292070138707145e-08, "logps/chosen": -92.06619262695312, "logps/rejected": -278.1312561035156, "loss": 0.2458, "rewards/chosen": 0.738717794418335, "rewards/margins": 4.438655853271484, "rewards/rejected": -3.6999380588531494, "step": 3493 }, { "epoch": 0.91, "grad_norm": 37.6988410949707, "kl": 0.0, "learning_rate": 4.278984559015964e-08, "logps/chosen": -102.7883529663086, "logps/rejected": -254.30300903320312, "loss": 0.2389, "rewards/chosen": -0.21390125155448914, "rewards/margins": 3.267529249191284, "rewards/rejected": -3.4814305305480957, "step": 3494 }, { "epoch": 0.91, "grad_norm": 44.969459533691406, "kl": 0.0, "learning_rate": 4.265898979324784e-08, "logps/chosen": -165.9117431640625, "logps/rejected": -234.60121154785156, "loss": 0.2802, "rewards/chosen": 0.9268661141395569, "rewards/margins": 3.578329086303711, "rewards/rejected": -2.651463031768799, "step": 3495 }, { "epoch": 0.91, "grad_norm": 29.521686553955078, "kl": 0.0, "learning_rate": 4.252813399633604e-08, "logps/chosen": -256.52166748046875, "logps/rejected": -304.9439697265625, "loss": 0.176, "rewards/chosen": 2.3784642219543457, "rewards/margins": 7.135860443115234, "rewards/rejected": -4.757396221160889, "step": 3496 }, { "epoch": 0.92, "grad_norm": 36.39180374145508, "kl": 0.0, "learning_rate": 4.2397278199424234e-08, "logps/chosen": -263.2144775390625, "logps/rejected": -237.9430389404297, "loss": 0.1611, "rewards/chosen": 1.8299251794815063, "rewards/margins": 4.7226996421813965, "rewards/rejected": -2.8927743434906006, "step": 3497 }, { "epoch": 0.92, "grad_norm": 28.184282302856445, "kl": 0.0, "learning_rate": 4.226642240251243e-08, "logps/chosen": -142.09413146972656, "logps/rejected": -358.89691162109375, "loss": 0.2323, "rewards/chosen": 0.06838560104370117, "rewards/margins": 6.167913913726807, "rewards/rejected": -6.0995283126831055, "step": 3498 }, { "epoch": 0.92, "grad_norm": 32.088775634765625, "kl": 0.0, "learning_rate": 4.2135566605600627e-08, "logps/chosen": -238.5006561279297, "logps/rejected": -242.0651397705078, "loss": 0.2115, "rewards/chosen": 1.0346657037734985, "rewards/margins": 6.6157379150390625, "rewards/rejected": -5.5810723304748535, "step": 3499 }, { "epoch": 0.92, "grad_norm": 33.76140594482422, "kl": 0.0, "learning_rate": 4.200471080868882e-08, "logps/chosen": -218.50735473632812, "logps/rejected": -223.3050994873047, "loss": 0.2968, "rewards/chosen": 0.24274462461471558, "rewards/margins": 2.7823526859283447, "rewards/rejected": -2.5396080017089844, "step": 3500 } ], "logging_steps": 1.0, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }