{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 1000, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 7.598883452709097, "learning_rate": 1.5873015873015874e-07, "logits/chosen": -1.7671998739242554, "logits/rejected": -2.2639822959899902, "logps/chosen": -46.430763244628906, "logps/rejected": -102.85381317138672, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.08, "grad_norm": 9.021711112045313, "learning_rate": 1.5873015873015873e-06, "logits/chosen": -1.6467827558517456, "logits/rejected": -2.05173921585083, "logps/chosen": -50.05971145629883, "logps/rejected": -110.39069366455078, "loss": 0.6403, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.0030777277424931526, "rewards/margins": 0.11931005865335464, "rewards/rejected": -0.11623234301805496, "step": 10 }, { "epoch": 0.16, "grad_norm": 1.8190332435996819, "learning_rate": 3.1746031746031746e-06, "logits/chosen": -1.3254443407058716, "logits/rejected": -1.779170036315918, "logps/chosen": -37.68037033081055, "logps/rejected": -313.1996154785156, "loss": 0.2472, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.09366801381111145, "rewards/margins": 2.184483051300049, "rewards/rejected": -2.0908150672912598, "step": 20 }, { "epoch": 0.24, "grad_norm": 8.393638030400462, "learning_rate": 4.761904761904762e-06, "logits/chosen": -0.7945032715797424, "logits/rejected": -1.1047414541244507, "logps/chosen": -55.5176887512207, "logps/rejected": -406.99609375, "loss": 0.1799, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.04824959114193916, "rewards/margins": 3.0284643173217773, "rewards/rejected": -3.076713800430298, "step": 30 }, { "epoch": 0.32, "grad_norm": 3.6170805565428488, "learning_rate": 6.349206349206349e-06, "logits/chosen": -0.3710061311721802, "logits/rejected": -0.7456581592559814, "logps/chosen": -50.16427230834961, "logps/rejected": -541.6275634765625, "loss": 0.1374, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.01431644894182682, "rewards/margins": 4.384586334228516, "rewards/rejected": -4.398902416229248, "step": 40 }, { "epoch": 0.4, "grad_norm": 3.795764554921293, "learning_rate": 7.936507936507936e-06, "logits/chosen": -0.581534206867218, "logits/rejected": -0.8648965954780579, "logps/chosen": -80.1789779663086, "logps/rejected": -672.813720703125, "loss": 0.0553, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3146038353443146, "rewards/margins": 5.392758846282959, "rewards/rejected": -5.707362174987793, "step": 50 }, { "epoch": 0.48, "grad_norm": 1.397363269705544, "learning_rate": 9.523809523809525e-06, "logits/chosen": -0.7689735293388367, "logits/rejected": -0.9561458826065063, "logps/chosen": -145.53469848632812, "logps/rejected": -831.8748779296875, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": -0.9438420534133911, "rewards/margins": 6.384782314300537, "rewards/rejected": -7.328624725341797, "step": 60 }, { "epoch": 0.56, "grad_norm": 0.13618082387486424, "learning_rate": 9.996172565322375e-06, "logits/chosen": -1.2459557056427002, "logits/rejected": -1.4872663021087646, "logps/chosen": -132.92080688476562, "logps/rejected": -911.9078979492188, "loss": 0.0409, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.8237069249153137, "rewards/margins": 7.28142786026001, "rewards/rejected": -8.105135917663574, "step": 70 }, { "epoch": 0.64, "grad_norm": 1.3606359582288166, "learning_rate": 9.97744005136599e-06, "logits/chosen": -1.503824234008789, "logits/rejected": -1.8992702960968018, "logps/chosen": -115.5097885131836, "logps/rejected": -916.9987182617188, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -0.6468590497970581, "rewards/margins": 7.513754367828369, "rewards/rejected": -8.160614013671875, "step": 80 }, { "epoch": 0.72, "grad_norm": 8.135027509581384, "learning_rate": 9.943157907471825e-06, "logits/chosen": -1.5570601224899292, "logits/rejected": -1.9588285684585571, "logps/chosen": -122.7470703125, "logps/rejected": -1007.5989379882812, "loss": 0.0185, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.71613609790802, "rewards/margins": 8.348057746887207, "rewards/rejected": -9.064192771911621, "step": 90 }, { "epoch": 0.8, "grad_norm": 0.07850356571109195, "learning_rate": 9.893433231795864e-06, "logits/chosen": -2.478653907775879, "logits/rejected": -2.867173671722412, "logps/chosen": -142.598388671875, "logps/rejected": -989.5442504882812, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -0.9251037836074829, "rewards/margins": 7.951480865478516, "rewards/rejected": -8.876585006713867, "step": 100 }, { "epoch": 0.88, "grad_norm": 0.010143597010992798, "learning_rate": 9.828421365296023e-06, "logits/chosen": -1.9893125295639038, "logits/rejected": -2.527003526687622, "logps/chosen": -127.56657409667969, "logps/rejected": -1111.281005859375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.7765494585037231, "rewards/margins": 9.32172679901123, "rewards/rejected": -10.098276138305664, "step": 110 }, { "epoch": 0.96, "grad_norm": 0.005924828079353313, "learning_rate": 9.748325406443647e-06, "logits/chosen": -1.330843448638916, "logits/rejected": -2.0016732215881348, "logps/chosen": -80.08639526367188, "logps/rejected": -1047.385009765625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.33333706855773926, "rewards/margins": 9.099153518676758, "rewards/rejected": -9.43249225616455, "step": 120 }, { "epoch": 1.04, "grad_norm": 0.006907045924266673, "learning_rate": 9.653395576739504e-06, "logits/chosen": -1.3165078163146973, "logits/rejected": -1.999441385269165, "logps/chosen": -142.83404541015625, "logps/rejected": -1252.3265380859375, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.9162995219230652, "rewards/margins": 10.60672378540039, "rewards/rejected": -11.52302360534668, "step": 130 }, { "epoch": 1.12, "grad_norm": 0.11065788109459221, "learning_rate": 9.543928439016445e-06, "logits/chosen": -2.337254047393799, "logits/rejected": -2.8029887676239014, "logps/chosen": -141.67652893066406, "logps/rejected": -1141.8990478515625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.9172345399856567, "rewards/margins": 9.48987102508545, "rewards/rejected": -10.407105445861816, "step": 140 }, { "epoch": 1.2, "grad_norm": 0.08367928566939553, "learning_rate": 9.42026597097071e-06, "logits/chosen": -2.5381298065185547, "logits/rejected": -3.0234386920928955, "logps/chosen": -162.30075073242188, "logps/rejected": -1239.561767578125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.1074072122573853, "rewards/margins": 10.28199291229248, "rewards/rejected": -11.389399528503418, "step": 150 }, { "epoch": 1.28, "grad_norm": 0.0036761092850006146, "learning_rate": 9.282794496816244e-06, "logits/chosen": -2.3002023696899414, "logits/rejected": -2.8048605918884277, "logps/chosen": -136.76681518554688, "logps/rejected": -1181.3677978515625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.8667460680007935, "rewards/margins": 9.932957649230957, "rewards/rejected": -10.799702644348145, "step": 160 }, { "epoch": 1.3599999999999999, "grad_norm": 0.0021196612287394884, "learning_rate": 9.131943480399531e-06, "logits/chosen": -2.4460995197296143, "logits/rejected": -2.9539546966552734, "logps/chosen": -154.1061553955078, "logps/rejected": -1166.7919921875, "loss": 0.0017, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0605894327163696, "rewards/margins": 9.586730003356934, "rewards/rejected": -10.647318840026855, "step": 170 }, { "epoch": 1.44, "grad_norm": 0.004351477647807626, "learning_rate": 8.968184183545285e-06, "logits/chosen": -1.3524295091629028, "logits/rejected": -2.148332357406616, "logps/chosen": -146.11656188964844, "logps/rejected": -1293.1431884765625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.959304928779602, "rewards/margins": 10.965489387512207, "rewards/rejected": -11.92479419708252, "step": 180 }, { "epoch": 1.52, "grad_norm": 0.004066031914054124, "learning_rate": 8.792028193824364e-06, "logits/chosen": -0.9521867036819458, "logits/rejected": -1.7849302291870117, "logps/chosen": -146.82675170898438, "logps/rejected": -1224.1842041015625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.9709192514419556, "rewards/margins": 10.264577865600586, "rewards/rejected": -11.235495567321777, "step": 190 }, { "epoch": 1.6, "grad_norm": 2.8254638303864446, "learning_rate": 8.604025826343167e-06, "logits/chosen": -1.4731212854385376, "logits/rejected": -2.2158398628234863, "logps/chosen": -166.7774200439453, "logps/rejected": -1286.593505859375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.1668568849563599, "rewards/margins": 10.681219100952148, "rewards/rejected": -11.848076820373535, "step": 200 }, { "epoch": 1.6800000000000002, "grad_norm": 0.027929253424062123, "learning_rate": 8.404764404547404e-06, "logits/chosen": -1.5285543203353882, "logits/rejected": -2.207693099975586, "logps/chosen": -145.4683837890625, "logps/rejected": -1240.819091796875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.9586501121520996, "rewards/margins": 10.435168266296387, "rewards/rejected": -11.393818855285645, "step": 210 }, { "epoch": 1.76, "grad_norm": 0.00891305422535769, "learning_rate": 8.194866425410984e-06, "logits/chosen": -1.2906019687652588, "logits/rejected": -1.9918200969696045, "logps/chosen": -109.07038879394531, "logps/rejected": -1155.131591796875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.6084994077682495, "rewards/margins": 9.912607192993164, "rewards/rejected": -10.521106719970703, "step": 220 }, { "epoch": 1.8399999999999999, "grad_norm": 0.0018316293380985671, "learning_rate": 7.974987614742066e-06, "logits/chosen": -0.9763646125793457, "logits/rejected": -1.7541002035140991, "logps/chosen": -96.57362365722656, "logps/rejected": -1161.369384765625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.4819282591342926, "rewards/margins": 10.10207462310791, "rewards/rejected": -10.584001541137695, "step": 230 }, { "epoch": 1.92, "grad_norm": 0.003080239253269156, "learning_rate": 7.745814878681516e-06, "logits/chosen": -1.5342729091644287, "logits/rejected": -2.221407175064087, "logps/chosen": -136.1558074951172, "logps/rejected": -1264.690673828125, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.8548258543014526, "rewards/margins": 10.780150413513184, "rewards/rejected": -11.634977340698242, "step": 240 }, { "epoch": 2.0, "grad_norm": 0.029482148279890704, "learning_rate": 7.50806415779332e-06, "logits/chosen": -2.258669853210449, "logits/rejected": -2.8217015266418457, "logps/chosen": -140.51853942871094, "logps/rejected": -1197.581787109375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.8967872858047485, "rewards/margins": 10.075855255126953, "rewards/rejected": -10.97264289855957, "step": 250 }, { "epoch": 2.08, "grad_norm": 0.020782306025551353, "learning_rate": 7.262478190450834e-06, "logits/chosen": -2.3182389736175537, "logits/rejected": -2.92055082321167, "logps/chosen": -120.69600677490234, "logps/rejected": -1217.408447265625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.7067109942436218, "rewards/margins": 10.442468643188477, "rewards/rejected": -11.14918041229248, "step": 260 }, { "epoch": 2.16, "grad_norm": 0.014378242296594932, "learning_rate": 7.0098241925061215e-06, "logits/chosen": -2.2274346351623535, "logits/rejected": -2.8495638370513916, "logps/chosen": -143.10069274902344, "logps/rejected": -1256.431640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.919533371925354, "rewards/margins": 10.646241188049316, "rewards/rejected": -11.565774917602539, "step": 270 }, { "epoch": 2.24, "grad_norm": 0.0011703826093326713, "learning_rate": 6.750891460491093e-06, "logits/chosen": -1.953466773033142, "logits/rejected": -2.550438642501831, "logps/chosen": -119.64122009277344, "logps/rejected": -1253.8546142578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.6891164779663086, "rewards/margins": 10.844141960144043, "rewards/rejected": -11.533258438110352, "step": 280 }, { "epoch": 2.32, "grad_norm": 0.016444776166175196, "learning_rate": 6.486488905838143e-06, "logits/chosen": -1.8186432123184204, "logits/rejected": -2.431183338165283, "logps/chosen": -114.60746002197266, "logps/rejected": -1220.325927734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.6494568586349487, "rewards/margins": 10.547185897827148, "rewards/rejected": -11.196642875671387, "step": 290 }, { "epoch": 2.4, "grad_norm": 0.0005496505009132721, "learning_rate": 6.2174425278234115e-06, "logits/chosen": -1.7246840000152588, "logits/rejected": -2.2873055934906006, "logps/chosen": -108.48016357421875, "logps/rejected": -1233.37744140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.5782238245010376, "rewards/margins": 10.742240905761719, "rewards/rejected": -11.320464134216309, "step": 300 }, { "epoch": 2.48, "grad_norm": 0.0006580056763243573, "learning_rate": 5.944592833127253e-06, "logits/chosen": -1.8141095638275146, "logits/rejected": -2.429810047149658, "logps/chosen": -119.1697006225586, "logps/rejected": -1326.865966796875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.6833556890487671, "rewards/margins": 11.57683277130127, "rewards/rejected": -12.260188102722168, "step": 310 }, { "epoch": 2.56, "grad_norm": 0.0011989998803146951, "learning_rate": 5.668792210073255e-06, "logits/chosen": -1.7204673290252686, "logits/rejected": -2.3327724933624268, "logps/chosen": -103.1290512084961, "logps/rejected": -1265.806640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.5410266518592834, "rewards/margins": 11.101877212524414, "rewards/rejected": -11.642904281616211, "step": 320 }, { "epoch": 2.64, "grad_norm": 0.01089396369261857, "learning_rate": 5.39090226574877e-06, "logits/chosen": -1.52898371219635, "logits/rejected": -2.140092134475708, "logps/chosen": -91.94349670410156, "logps/rejected": -1207.033447265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.43181926012039185, "rewards/margins": 10.611051559448242, "rewards/rejected": -11.042871475219727, "step": 330 }, { "epoch": 2.7199999999999998, "grad_norm": 0.0015016734818494304, "learning_rate": 5.111791134325793e-06, "logits/chosen": -1.6063101291656494, "logits/rejected": -2.3003313541412354, "logps/chosen": -126.36214447021484, "logps/rejected": -1331.2056884765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.7603046298027039, "rewards/margins": 11.541264533996582, "rewards/rejected": -12.301568984985352, "step": 340 }, { "epoch": 2.8, "grad_norm": 0.044167012422386985, "learning_rate": 4.832330764991131e-06, "logits/chosen": -1.6678760051727295, "logits/rejected": -2.332516670227051, "logps/chosen": -119.7556381225586, "logps/rejected": -1301.3248291015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.6866098642349243, "rewards/margins": 11.313838958740234, "rewards/rejected": -12.000448226928711, "step": 350 }, { "epoch": 2.88, "grad_norm": 0.0005385533629396381, "learning_rate": 4.553394197958339e-06, "logits/chosen": -1.9416593313217163, "logits/rejected": -2.6904163360595703, "logps/chosen": -121.2122802734375, "logps/rejected": -1295.510498046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.7095136642456055, "rewards/margins": 11.224905014038086, "rewards/rejected": -11.934419631958008, "step": 360 }, { "epoch": 2.96, "grad_norm": 0.0006203949257092619, "learning_rate": 4.275852837071309e-06, "logits/chosen": -1.8644075393676758, "logits/rejected": -2.7058677673339844, "logps/chosen": -114.23481750488281, "logps/rejected": -1273.7774658203125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.6469079256057739, "rewards/margins": 11.084717750549316, "rewards/rejected": -11.731626510620117, "step": 370 }, { "epoch": 3.04, "grad_norm": 0.0018701198703410247, "learning_rate": 4.000573727519868e-06, "logits/chosen": -1.6911453008651733, "logits/rejected": -2.5057501792907715, "logps/chosen": -115.5822982788086, "logps/rejected": -1267.122314453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.6596534848213196, "rewards/margins": 11.005937576293945, "rewards/rejected": -11.665590286254883, "step": 380 }, { "epoch": 3.12, "grad_norm": 0.004226077469779532, "learning_rate": 3.7284168471719527e-06, "logits/chosen": -1.646296501159668, "logits/rejected": -2.4523534774780273, "logps/chosen": -110.4627914428711, "logps/rejected": -1280.544921875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.6036595106124878, "rewards/margins": 11.18139934539795, "rewards/rejected": -11.785058975219727, "step": 390 }, { "epoch": 3.2, "grad_norm": 0.0006904055330191614, "learning_rate": 3.4602324199842026e-06, "logits/chosen": -1.7251287698745728, "logits/rejected": -2.5812900066375732, "logps/chosen": -138.866943359375, "logps/rejected": -1301.4326171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.8702784776687622, "rewards/margins": 11.145977973937988, "rewards/rejected": -12.016257286071777, "step": 400 }, { "epoch": 3.2800000000000002, "grad_norm": 0.0026124260205581465, "learning_rate": 3.1968582598840234e-06, "logits/chosen": -1.5730335712432861, "logits/rejected": -2.383654832839966, "logps/chosen": -109.4728012084961, "logps/rejected": -1262.404541015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.6025822758674622, "rewards/margins": 11.004800796508789, "rewards/rejected": -11.607383728027344, "step": 410 }, { "epoch": 3.36, "grad_norm": 0.016129813594014157, "learning_rate": 2.9391171534208185e-06, "logits/chosen": -1.7119560241699219, "logits/rejected": -2.51015043258667, "logps/chosen": -122.90666198730469, "logps/rejected": -1313.09228515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.7348455190658569, "rewards/margins": 11.394600868225098, "rewards/rejected": -12.129446983337402, "step": 420 }, { "epoch": 3.44, "grad_norm": 0.0024401290515470016, "learning_rate": 2.6878142893630904e-06, "logits/chosen": -1.6607134342193604, "logits/rejected": -2.4547019004821777, "logps/chosen": -137.77719116210938, "logps/rejected": -1349.2073974609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.8648022413253784, "rewards/margins": 11.622838020324707, "rewards/rejected": -12.487640380859375, "step": 430 }, { "epoch": 3.52, "grad_norm": 0.001375330125575793, "learning_rate": 2.4437347432713838e-06, "logits/chosen": -1.8045127391815186, "logits/rejected": -2.586901903152466, "logps/chosen": -156.62245178222656, "logps/rejected": -1351.8687744140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.0343620777130127, "rewards/margins": 11.499258995056152, "rewards/rejected": -12.533620834350586, "step": 440 }, { "epoch": 3.6, "grad_norm": 0.0016289824782741704, "learning_rate": 2.207641024905322e-06, "logits/chosen": -1.6454054117202759, "logits/rejected": -2.4403579235076904, "logps/chosen": -108.79969787597656, "logps/rejected": -1255.9583740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5948039293289185, "rewards/margins": 10.957615852355957, "rewards/rejected": -11.552419662475586, "step": 450 }, { "epoch": 3.68, "grad_norm": 0.0003909667399708806, "learning_rate": 1.9802706961266936e-06, "logits/chosen": -1.5957154035568237, "logits/rejected": -2.4980740547180176, "logps/chosen": -124.23823547363281, "logps/rejected": -1321.4207763671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.7479596138000488, "rewards/margins": 11.454780578613281, "rewards/rejected": -12.202739715576172, "step": 460 }, { "epoch": 3.76, "grad_norm": 0.0006870240069107201, "learning_rate": 1.7623340667403089e-06, "logits/chosen": -1.5271575450897217, "logits/rejected": -2.3313746452331543, "logps/chosen": -115.49822998046875, "logps/rejected": -1287.483642578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.6483306288719177, "rewards/margins": 11.213019371032715, "rewards/rejected": -11.861350059509277, "step": 470 }, { "epoch": 3.84, "grad_norm": 0.03279067348481457, "learning_rate": 1.5545119754708682e-06, "logits/chosen": -1.719366431236267, "logits/rejected": -2.5721096992492676, "logps/chosen": -141.73440551757812, "logps/rejected": -1358.951416015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.9100968241691589, "rewards/margins": 11.68384838104248, "rewards/rejected": -12.59394645690918, "step": 480 }, { "epoch": 3.92, "grad_norm": 0.004304506433089471, "learning_rate": 1.3574536630081208e-06, "logits/chosen": -1.5175102949142456, "logits/rejected": -2.292325496673584, "logps/chosen": -100.6703872680664, "logps/rejected": -1273.519775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5087094902992249, "rewards/margins": 11.203500747680664, "rewards/rejected": -11.71220874786377, "step": 490 }, { "epoch": 4.0, "grad_norm": 0.0004662700999661582, "learning_rate": 1.1717747437649657e-06, "logits/chosen": -1.5407252311706543, "logits/rejected": -2.3491063117980957, "logps/chosen": -122.54182434082031, "logps/rejected": -1335.7579345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.7224875092506409, "rewards/margins": 11.629507064819336, "rewards/rejected": -12.351993560791016, "step": 500 }, { "epoch": 4.08, "grad_norm": 0.0017439323111767312, "learning_rate": 9.980552826847635e-07, "logits/chosen": -1.6905838251113892, "logits/rejected": -2.487473487854004, "logps/chosen": -166.93624877929688, "logps/rejected": -1409.1981201171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.1514984369277954, "rewards/margins": 11.959856033325195, "rewards/rejected": -13.111355781555176, "step": 510 }, { "epoch": 4.16, "grad_norm": 0.0024902235420863005, "learning_rate": 8.368379831059592e-07, "logits/chosen": -1.5832383632659912, "logits/rejected": -2.3622446060180664, "logps/chosen": -130.75253295898438, "logps/rejected": -1372.863037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.806990921497345, "rewards/margins": 11.918740272521973, "rewards/rejected": -12.725730895996094, "step": 520 }, { "epoch": 4.24, "grad_norm": 0.0004030809278175782, "learning_rate": 6.886264913451635e-07, "logits/chosen": -1.5754905939102173, "logits/rejected": -2.314927101135254, "logps/chosen": -109.8852310180664, "logps/rejected": -1284.387451171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.6023492813110352, "rewards/margins": 11.232891082763672, "rewards/rejected": -11.835241317749023, "step": 530 }, { "epoch": 4.32, "grad_norm": 0.014990693875413815, "learning_rate": 5.538838232952104e-07, "logits/chosen": -1.6247650384902954, "logits/rejected": -2.395631790161133, "logps/chosen": -112.5528335571289, "logps/rejected": -1308.465087890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.6267929673194885, "rewards/margins": 11.434622764587402, "rewards/rejected": -12.06141471862793, "step": 540 }, { "epoch": 4.4, "grad_norm": 0.0008931683311901964, "learning_rate": 4.3303091795353024e-07, "logits/chosen": -1.7327635288238525, "logits/rejected": -2.521054744720459, "logps/chosen": -193.12649536132812, "logps/rejected": -1453.8939208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.3928953409194946, "rewards/margins": 12.179082870483398, "rewards/rejected": -13.571980476379395, "step": 550 }, { "epoch": 4.48, "grad_norm": 0.0006090223103858115, "learning_rate": 3.2644532239966444e-07, "logits/chosen": -1.540307879447937, "logits/rejected": -2.274130344390869, "logps/chosen": -116.36529541015625, "logps/rejected": -1331.5732421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.6516658663749695, "rewards/margins": 11.651314735412598, "rewards/rejected": -12.302980422973633, "step": 560 }, { "epoch": 4.5600000000000005, "grad_norm": 0.0012600919102292293, "learning_rate": 2.3446001233004333e-07, "logits/chosen": -1.545082926750183, "logits/rejected": -2.3070833683013916, "logps/chosen": -118.43846130371094, "logps/rejected": -1310.844482421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.6805299520492554, "rewards/margins": 11.414255142211914, "rewards/rejected": -12.094785690307617, "step": 570 }, { "epoch": 4.64, "grad_norm": 0.0033895825817679913, "learning_rate": 1.573623518347517e-07, "logits/chosen": -1.3955062627792358, "logits/rejected": -2.1139488220214844, "logps/chosen": -95.18065643310547, "logps/rejected": -1255.7984619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.46435871720314026, "rewards/margins": 11.06248664855957, "rewards/rejected": -11.52684497833252, "step": 580 }, { "epoch": 4.72, "grad_norm": 0.0005277953893460651, "learning_rate": 9.539319566590766e-08, "logits/chosen": -1.4713037014007568, "logits/rejected": -2.1673684120178223, "logps/chosen": -103.91761779785156, "logps/rejected": -1269.9681396484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5508452653884888, "rewards/margins": 11.134601593017578, "rewards/rejected": -11.685447692871094, "step": 590 }, { "epoch": 4.8, "grad_norm": 0.0038828625096458437, "learning_rate": 4.8746136802240716e-08, "logits/chosen": -1.60639226436615, "logits/rejected": -2.2908573150634766, "logps/chosen": -111.3029556274414, "logps/rejected": -1284.128173828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.6181883811950684, "rewards/margins": 11.210031509399414, "rewards/rejected": -11.82822036743164, "step": 600 }, { "epoch": 4.88, "grad_norm": 0.0016152355239910333, "learning_rate": 1.75669016604485e-08, "logits/chosen": -1.5895562171936035, "logits/rejected": -2.336843490600586, "logps/chosen": -118.6875991821289, "logps/rejected": -1293.158935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.6965440511703491, "rewards/margins": 11.21881103515625, "rewards/rejected": -11.915353775024414, "step": 610 }, { "epoch": 4.96, "grad_norm": 0.0004233911082118195, "learning_rate": 1.952894842735531e-09, "logits/chosen": -1.5527576208114624, "logits/rejected": -2.2691588401794434, "logps/chosen": -98.73721313476562, "logps/rejected": -1254.2822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.4978357255458832, "rewards/margins": 11.023382186889648, "rewards/rejected": -11.52121639251709, "step": 620 }, { "epoch": 5.0, "step": 625, "total_flos": 0.0, "train_loss": 0.022511796173290348, "train_runtime": 65461.2558, "train_samples_per_second": 0.611, "train_steps_per_second": 0.01 } ], "logging_steps": 10, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }