diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6516 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9940933254577673, + "eval_steps": 43, + "global_step": 422, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004725339633786178, + "grad_norm": 1.1176782705315427, + "learning_rate": 3.846153846153846e-08, + "logits/chosen": -1.5937305688858032, + "logits/rejected": -1.7021960020065308, + "logps/chosen": -247.54559326171875, + "logps/rejected": -179.0218048095703, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.009450679267572357, + "grad_norm": 1.2535773078918948, + "learning_rate": 7.692307692307692e-08, + "logits/chosen": -1.7181015014648438, + "logits/rejected": -1.644026756286621, + "logps/chosen": -259.1505432128906, + "logps/rejected": -241.68020629882812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.014176018901358535, + "grad_norm": 0.995484839907434, + "learning_rate": 1.1538461538461539e-07, + "logits/chosen": -0.8613071441650391, + "logits/rejected": -0.8891040682792664, + "logps/chosen": -230.91070556640625, + "logps/rejected": -219.62979125976562, + "loss": 0.6934, + "rewards/accuracies": 0.390625, + "rewards/chosen": 6.782727723475546e-05, + "rewards/margins": -0.0004302160523366183, + "rewards/rejected": 0.0004980433732271194, + "step": 3 + }, + { + "epoch": 0.018901358535144713, + "grad_norm": 1.1568688062326662, + "learning_rate": 1.5384615384615385e-07, + "logits/chosen": -1.1649622917175293, + "logits/rejected": -1.131172776222229, + "logps/chosen": -184.93499755859375, + "logps/rejected": -184.127197265625, + "loss": 0.693, + "rewards/accuracies": 0.421875, + "rewards/chosen": -0.0002185619086958468, + "rewards/margins": -0.0006310059688985348, + "rewards/rejected": 0.0004124442348256707, + "step": 4 + }, + { + "epoch": 0.02362669816893089, + "grad_norm": 1.0155652752384032, + "learning_rate": 1.9230769230769231e-07, + "logits/chosen": -1.8650751113891602, + "logits/rejected": -1.9386688470840454, + "logps/chosen": -193.21636962890625, + "logps/rejected": -175.6696014404297, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00019388733198866248, + "rewards/margins": -0.00023763455101288855, + "rewards/rejected": 0.0004315219703130424, + "step": 5 + }, + { + "epoch": 0.02835203780271707, + "grad_norm": 1.2022303509332404, + "learning_rate": 2.3076923076923078e-07, + "logits/chosen": -1.8849067687988281, + "logits/rejected": -1.8837637901306152, + "logps/chosen": -234.3896484375, + "logps/rejected": -218.09625244140625, + "loss": 0.6934, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0004923291853629053, + "rewards/margins": 0.0006284262635745108, + "rewards/rejected": -0.00013609707821160555, + "step": 6 + }, + { + "epoch": 0.03307737743650325, + "grad_norm": 1.136952564893261, + "learning_rate": 2.692307692307692e-07, + "logits/chosen": -1.816144585609436, + "logits/rejected": -1.934072494506836, + "logps/chosen": -246.14027404785156, + "logps/rejected": -177.02993774414062, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.0834413589909673e-05, + "rewards/margins": 0.001102915033698082, + "rewards/rejected": -0.0011237493017688394, + "step": 7 + }, + { + "epoch": 0.03780271707028943, + "grad_norm": 1.1596582232433024, + "learning_rate": 3.076923076923077e-07, + "logits/chosen": -1.8363107442855835, + "logits/rejected": -1.8167006969451904, + "logps/chosen": -239.65370178222656, + "logps/rejected": -221.20333862304688, + "loss": 0.6932, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.00017343120998702943, + "rewards/margins": -0.0007941695512272418, + "rewards/rejected": 0.0006207384867593646, + "step": 8 + }, + { + "epoch": 0.042528056704075605, + "grad_norm": 1.1810520949859216, + "learning_rate": 3.461538461538461e-07, + "logits/chosen": -1.6903538703918457, + "logits/rejected": -1.7734307050704956, + "logps/chosen": -239.26535034179688, + "logps/rejected": -193.875244140625, + "loss": 0.6929, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0003690614248625934, + "rewards/margins": 0.00022412401449400932, + "rewards/rejected": 0.00014493743947241455, + "step": 9 + }, + { + "epoch": 0.04725339633786178, + "grad_norm": 1.253373850013781, + "learning_rate": 3.8461538461538463e-07, + "logits/chosen": -1.5372889041900635, + "logits/rejected": -1.5536653995513916, + "logps/chosen": -222.93399047851562, + "logps/rejected": -223.1899871826172, + "loss": 0.6929, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0005048069870099425, + "rewards/margins": 0.000533790560439229, + "rewards/rejected": -0.0010385976638644934, + "step": 10 + }, + { + "epoch": 0.05197873597164796, + "grad_norm": 1.1609876885112298, + "learning_rate": 4.2307692307692304e-07, + "logits/chosen": -1.20901620388031, + "logits/rejected": -1.2452011108398438, + "logps/chosen": -274.1497497558594, + "logps/rejected": -227.36790466308594, + "loss": 0.6929, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0004496507463045418, + "rewards/margins": -0.0006296815699897707, + "rewards/rejected": 0.0001800309109967202, + "step": 11 + }, + { + "epoch": 0.05670407560543414, + "grad_norm": 1.1166995258772006, + "learning_rate": 4.6153846153846156e-07, + "logits/chosen": -1.8701603412628174, + "logits/rejected": -1.8107975721359253, + "logps/chosen": -219.3397674560547, + "logps/rejected": -232.28269958496094, + "loss": 0.6927, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0005328843253664672, + "rewards/margins": 0.000161867166752927, + "rewards/rejected": -0.0006947515066713095, + "step": 12 + }, + { + "epoch": 0.06142941523922032, + "grad_norm": 1.1010862122779934, + "learning_rate": 5e-07, + "logits/chosen": -1.3035281896591187, + "logits/rejected": -1.3319075107574463, + "logps/chosen": -200.9400634765625, + "logps/rejected": -177.5894317626953, + "loss": 0.6924, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00024598470190539956, + "rewards/margins": 0.001570576336234808, + "rewards/rejected": -0.0013245916925370693, + "step": 13 + }, + { + "epoch": 0.0661547548730065, + "grad_norm": 1.109450926366569, + "learning_rate": 4.999926250172797e-07, + "logits/chosen": -1.2467422485351562, + "logits/rejected": -1.2619496583938599, + "logps/chosen": -247.1371612548828, + "logps/rejected": -232.03895568847656, + "loss": 0.6925, + "rewards/accuracies": 0.640625, + "rewards/chosen": 0.00045037176460027695, + "rewards/margins": 0.0029086670838296413, + "rewards/rejected": -0.0024582953192293644, + "step": 14 + }, + { + "epoch": 0.07088009450679268, + "grad_norm": 1.0686336079566285, + "learning_rate": 4.999705005042417e-07, + "logits/chosen": -0.9053488969802856, + "logits/rejected": -0.9105295538902283, + "logps/chosen": -200.528076171875, + "logps/rejected": -191.2373046875, + "loss": 0.6924, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.0003694885817822069, + "rewards/margins": 0.001977597363293171, + "rewards/rejected": -0.00234708609059453, + "step": 15 + }, + { + "epoch": 0.07560543414057885, + "grad_norm": 1.1198403704053106, + "learning_rate": 4.999336277662292e-07, + "logits/chosen": -1.382132887840271, + "logits/rejected": -1.3727940320968628, + "logps/chosen": -237.94508361816406, + "logps/rejected": -251.71519470214844, + "loss": 0.6924, + "rewards/accuracies": 0.640625, + "rewards/chosen": -3.6539247957989573e-05, + "rewards/margins": 0.002266494557261467, + "rewards/rejected": -0.002303033834323287, + "step": 16 + }, + { + "epoch": 0.08033077377436504, + "grad_norm": 1.022581880493408, + "learning_rate": 4.998820089787287e-07, + "logits/chosen": -1.0172172784805298, + "logits/rejected": -1.0724008083343506, + "logps/chosen": -232.03070068359375, + "logps/rejected": -214.65155029296875, + "loss": 0.6919, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0009355681831948459, + "rewards/margins": 0.002296539256349206, + "rewards/rejected": -0.003232107497751713, + "step": 17 + }, + { + "epoch": 0.08505611340815121, + "grad_norm": 1.127368600628108, + "learning_rate": 4.998156471872415e-07, + "logits/chosen": -1.6294444799423218, + "logits/rejected": -1.6482771635055542, + "logps/chosen": -226.63442993164062, + "logps/rejected": -209.50799560546875, + "loss": 0.6913, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.0007066840189509094, + "rewards/margins": 0.005803423933684826, + "rewards/rejected": -0.005096739623695612, + "step": 18 + }, + { + "epoch": 0.0897814530419374, + "grad_norm": 1.1009230447057081, + "learning_rate": 4.997345463071041e-07, + "logits/chosen": -1.9955631494522095, + "logits/rejected": -1.9177535772323608, + "logps/chosen": -219.0171661376953, + "logps/rejected": -204.72091674804688, + "loss": 0.6906, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.001279333489947021, + "rewards/margins": 0.003984270617365837, + "rewards/rejected": -0.005263603758066893, + "step": 19 + }, + { + "epoch": 0.09450679267572357, + "grad_norm": 1.1219432468145276, + "learning_rate": 4.996387111232572e-07, + "logits/chosen": -0.9923038482666016, + "logits/rejected": -1.0317736864089966, + "logps/chosen": -207.70364379882812, + "logps/rejected": -210.37612915039062, + "loss": 0.6903, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0004186414007563144, + "rewards/margins": 0.007054868154227734, + "rewards/rejected": -0.007473509293049574, + "step": 20 + }, + { + "epoch": 0.09923213230950975, + "grad_norm": 1.1456255677406275, + "learning_rate": 4.995281472899636e-07, + "logits/chosen": -1.2455811500549316, + "logits/rejected": -1.2986594438552856, + "logps/chosen": -260.4434814453125, + "logps/rejected": -239.2939453125, + "loss": 0.6899, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.001413986086845398, + "rewards/margins": 0.006085187196731567, + "rewards/rejected": -0.007499172817915678, + "step": 21 + }, + { + "epoch": 0.10395747194329592, + "grad_norm": 1.0796373972108642, + "learning_rate": 4.99402861330474e-07, + "logits/chosen": -1.6776717901229858, + "logits/rejected": -1.6839845180511475, + "logps/chosen": -243.64987182617188, + "logps/rejected": -224.65371704101562, + "loss": 0.6898, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.0021368232555687428, + "rewards/margins": 0.007371032610535622, + "rewards/rejected": -0.009507855400443077, + "step": 22 + }, + { + "epoch": 0.10868281157708211, + "grad_norm": 1.067721897185461, + "learning_rate": 4.992628606366425e-07, + "logits/chosen": -1.6994775533676147, + "logits/rejected": -1.6823248863220215, + "logps/chosen": -184.07522583007812, + "logps/rejected": -201.24017333984375, + "loss": 0.6892, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0037174485623836517, + "rewards/margins": 0.005765101406723261, + "rewards/rejected": -0.009482549503445625, + "step": 23 + }, + { + "epoch": 0.11340815121086828, + "grad_norm": 1.1222480052149575, + "learning_rate": 4.991081534684911e-07, + "logits/chosen": -1.3170721530914307, + "logits/rejected": -1.3209936618804932, + "logps/chosen": -173.28440856933594, + "logps/rejected": -180.39111328125, + "loss": 0.6882, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.0013278307160362601, + "rewards/margins": 0.01227161381393671, + "rewards/rejected": -0.013599444180727005, + "step": 24 + }, + { + "epoch": 0.11813349084465447, + "grad_norm": 1.1336846530783633, + "learning_rate": 4.98938748953721e-07, + "logits/chosen": -1.2200762033462524, + "logits/rejected": -1.2801724672317505, + "logps/chosen": -228.53701782226562, + "logps/rejected": -209.99066162109375, + "loss": 0.6876, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.0031590620055794716, + "rewards/margins": 0.014617552980780602, + "rewards/rejected": -0.017776615917682648, + "step": 25 + }, + { + "epoch": 0.12285883047844064, + "grad_norm": 1.1070134845717687, + "learning_rate": 4.987546570871754e-07, + "logits/chosen": -1.7048778533935547, + "logits/rejected": -1.6716248989105225, + "logps/chosen": -237.16851806640625, + "logps/rejected": -234.78970336914062, + "loss": 0.6868, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.006663296837359667, + "rewards/margins": 0.013624398037791252, + "rewards/rejected": -0.02028769627213478, + "step": 26 + }, + { + "epoch": 0.1275841701122268, + "grad_norm": 1.1180888707264458, + "learning_rate": 4.985558887302488e-07, + "logits/chosen": -1.692581057548523, + "logits/rejected": -1.7662020921707153, + "logps/chosen": -197.77049255371094, + "logps/rejected": -182.52011108398438, + "loss": 0.686, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.003295771311968565, + "rewards/margins": 0.009790323674678802, + "rewards/rejected": -0.01308609452098608, + "step": 27 + }, + { + "epoch": 0.132309509746013, + "grad_norm": 1.1912060262420856, + "learning_rate": 4.983424556102468e-07, + "logits/chosen": -1.8331196308135986, + "logits/rejected": -1.8659313917160034, + "logps/chosen": -200.51959228515625, + "logps/rejected": -181.59939575195312, + "loss": 0.6841, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.004111767280846834, + "rewards/margins": 0.017347747460007668, + "rewards/rejected": -0.021459516137838364, + "step": 28 + }, + { + "epoch": 0.13703484937979918, + "grad_norm": 1.1795634739349266, + "learning_rate": 4.981143703196941e-07, + "logits/chosen": -2.1984832286834717, + "logits/rejected": -2.172135591506958, + "logps/chosen": -193.10231018066406, + "logps/rejected": -180.98953247070312, + "loss": 0.6844, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.010889173485338688, + "rewards/margins": 0.014179128222167492, + "rewards/rejected": -0.02506830170750618, + "step": 29 + }, + { + "epoch": 0.14176018901358536, + "grad_norm": 1.066586203576245, + "learning_rate": 4.978716463155912e-07, + "logits/chosen": -2.06459379196167, + "logits/rejected": -2.0417637825012207, + "logps/chosen": -159.03701782226562, + "logps/rejected": -195.22523498535156, + "loss": 0.6843, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.0062042661011219025, + "rewards/margins": 0.016392884775996208, + "rewards/rejected": -0.02259715273976326, + "step": 30 + }, + { + "epoch": 0.14648552864737152, + "grad_norm": 1.2110413924457768, + "learning_rate": 4.976142979186209e-07, + "logits/chosen": -1.9228863716125488, + "logits/rejected": -1.9126472473144531, + "logps/chosen": -202.9162139892578, + "logps/rejected": -175.83433532714844, + "loss": 0.6821, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.007678491994738579, + "rewards/margins": 0.02073112316429615, + "rewards/rejected": -0.02840961515903473, + "step": 31 + }, + { + "epoch": 0.1512108682811577, + "grad_norm": 1.2866090020461936, + "learning_rate": 4.973423403123028e-07, + "logits/chosen": -1.701865792274475, + "logits/rejected": -1.7730942964553833, + "logps/chosen": -235.43563842773438, + "logps/rejected": -229.68411254882812, + "loss": 0.6798, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.0038016163744032383, + "rewards/margins": 0.025650067254900932, + "rewards/rejected": -0.029451683163642883, + "step": 32 + }, + { + "epoch": 0.1559362079149439, + "grad_norm": 1.1184970866462265, + "learning_rate": 4.970557895420983e-07, + "logits/chosen": -1.7996641397476196, + "logits/rejected": -1.7775640487670898, + "logps/chosen": -172.83163452148438, + "logps/rejected": -201.3394317626953, + "loss": 0.6818, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.014111585915088654, + "rewards/margins": 0.02376263216137886, + "rewards/rejected": -0.03787422180175781, + "step": 33 + }, + { + "epoch": 0.16066154754873008, + "grad_norm": 1.1028286808678158, + "learning_rate": 4.967546625144633e-07, + "logits/chosen": -1.1831226348876953, + "logits/rejected": -1.1660494804382324, + "logps/chosen": -177.04531860351562, + "logps/rejected": -188.2403564453125, + "loss": 0.6822, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.011519413441419601, + "rewards/margins": 0.02097604051232338, + "rewards/rejected": -0.03249545022845268, + "step": 34 + }, + { + "epoch": 0.16538688718251623, + "grad_norm": 1.1486945876483394, + "learning_rate": 4.964389769958506e-07, + "logits/chosen": -1.382279634475708, + "logits/rejected": -1.419837474822998, + "logps/chosen": -156.91551208496094, + "logps/rejected": -157.46437072753906, + "loss": 0.6784, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.012027422897517681, + "rewards/margins": 0.030697565525770187, + "rewards/rejected": -0.04272499307990074, + "step": 35 + }, + { + "epoch": 0.17011222681630242, + "grad_norm": 1.1027256033620638, + "learning_rate": 4.961087516116621e-07, + "logits/chosen": -1.1302804946899414, + "logits/rejected": -1.1647142171859741, + "logps/chosen": -263.8468322753906, + "logps/rejected": -243.76544189453125, + "loss": 0.6793, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.020269813016057014, + "rewards/margins": 0.02116047963500023, + "rewards/rejected": -0.041430290788412094, + "step": 36 + }, + { + "epoch": 0.1748375664500886, + "grad_norm": 1.0823022111801957, + "learning_rate": 4.957640058451501e-07, + "logits/chosen": -1.5351812839508057, + "logits/rejected": -1.5872442722320557, + "logps/chosen": -204.82644653320312, + "logps/rejected": -176.9189453125, + "loss": 0.6799, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02254084125161171, + "rewards/margins": 0.017112018540501595, + "rewards/rejected": -0.039652857929468155, + "step": 37 + }, + { + "epoch": 0.1795629060838748, + "grad_norm": 1.0721194904364362, + "learning_rate": 4.954047600362669e-07, + "logits/chosen": -1.8736214637756348, + "logits/rejected": -1.819676160812378, + "logps/chosen": -178.29270935058594, + "logps/rejected": -191.61605834960938, + "loss": 0.6803, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.02358250319957733, + "rewards/margins": 0.031186437234282494, + "rewards/rejected": -0.054768942296504974, + "step": 38 + }, + { + "epoch": 0.18428824571766095, + "grad_norm": 1.0629161817956538, + "learning_rate": 4.950310353804659e-07, + "logits/chosen": -1.7329224348068237, + "logits/rejected": -1.780181646347046, + "logps/chosen": -188.88072204589844, + "logps/rejected": -177.74427795410156, + "loss": 0.6784, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.024656984955072403, + "rewards/margins": 0.03351202234625816, + "rewards/rejected": -0.058169007301330566, + "step": 39 + }, + { + "epoch": 0.18901358535144713, + "grad_norm": 1.0963449505388767, + "learning_rate": 4.946428539274497e-07, + "logits/chosen": -1.920142650604248, + "logits/rejected": -1.9512125253677368, + "logps/chosen": -237.4847412109375, + "logps/rejected": -210.24838256835938, + "loss": 0.6759, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030252018943428993, + "rewards/margins": 0.040137603878974915, + "rewards/rejected": -0.07038962841033936, + "step": 40 + }, + { + "epoch": 0.19373892498523332, + "grad_norm": 1.039447855381113, + "learning_rate": 4.942402385798706e-07, + "logits/chosen": -1.246740698814392, + "logits/rejected": -1.281036138534546, + "logps/chosen": -240.5216064453125, + "logps/rejected": -188.30795288085938, + "loss": 0.6762, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.032288651913404465, + "rewards/margins": 0.03159747272729874, + "rewards/rejected": -0.0638861209154129, + "step": 41 + }, + { + "epoch": 0.1984642646190195, + "grad_norm": 1.0133081629791365, + "learning_rate": 4.938232130919785e-07, + "logits/chosen": -2.049900531768799, + "logits/rejected": -2.0438833236694336, + "logps/chosen": -241.96177673339844, + "logps/rejected": -223.10511779785156, + "loss": 0.6775, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.05470336228609085, + "rewards/margins": 0.040740497410297394, + "rewards/rejected": -0.09544385224580765, + "step": 42 + }, + { + "epoch": 0.20318960425280566, + "grad_norm": 1.046962172171356, + "learning_rate": 4.933918020682195e-07, + "logits/chosen": -2.0764129161834717, + "logits/rejected": -1.9940263032913208, + "logps/chosen": -204.7505340576172, + "logps/rejected": -206.99880981445312, + "loss": 0.6727, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.03313834220170975, + "rewards/margins": 0.06050185114145279, + "rewards/rejected": -0.09364018589258194, + "step": 43 + }, + { + "epoch": 0.20318960425280566, + "eval_logits/chosen": -2.2005555629730225, + "eval_logits/rejected": -2.2166874408721924, + "eval_logps/chosen": -216.52699279785156, + "eval_logps/rejected": -209.94314575195312, + "eval_loss": 0.6714360117912292, + "eval_rewards/accuracies": 0.5871211886405945, + "eval_rewards/chosen": -0.05296258255839348, + "eval_rewards/margins": 0.046969976276159286, + "eval_rewards/rejected": -0.09993256628513336, + "eval_runtime": 225.63, + "eval_samples_per_second": 16.204, + "eval_steps_per_second": 0.293, + "step": 43 + }, + { + "epoch": 0.20791494388659185, + "grad_norm": 1.1285157956991194, + "learning_rate": 4.929460309617843e-07, + "logits/chosen": -2.0923304557800293, + "logits/rejected": -2.151911973953247, + "logps/chosen": -252.43092346191406, + "logps/rejected": -221.0844268798828, + "loss": 0.6707, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.04450148344039917, + "rewards/margins": 0.058394819498062134, + "rewards/rejected": -0.10289628803730011, + "step": 44 + }, + { + "epoch": 0.21264028352037803, + "grad_norm": 1.0651629243936434, + "learning_rate": 4.924859260731066e-07, + "logits/chosen": -2.0476608276367188, + "logits/rejected": -2.174062490463257, + "logps/chosen": -219.19517517089844, + "logps/rejected": -189.25193786621094, + "loss": 0.6718, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.06256880611181259, + "rewards/margins": 0.04902214929461479, + "rewards/rejected": -0.11159095913171768, + "step": 45 + }, + { + "epoch": 0.21736562315416422, + "grad_norm": 1.0516408835233841, + "learning_rate": 4.920115145483112e-07, + "logits/chosen": -1.602857232093811, + "logits/rejected": -1.6103214025497437, + "logps/chosen": -252.79664611816406, + "logps/rejected": -229.30770874023438, + "loss": 0.6711, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.08149686455726624, + "rewards/margins": 0.037921737879514694, + "rewards/rejected": -0.11941860616207123, + "step": 46 + }, + { + "epoch": 0.22209096278795037, + "grad_norm": 1.1789732456475222, + "learning_rate": 4.915228243776124e-07, + "logits/chosen": -1.1500588655471802, + "logits/rejected": -1.1534898281097412, + "logps/chosen": -253.38931274414062, + "logps/rejected": -220.110595703125, + "loss": 0.6634, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.06619597226381302, + "rewards/margins": 0.07172124832868576, + "rewards/rejected": -0.13791722059249878, + "step": 47 + }, + { + "epoch": 0.22681630242173656, + "grad_norm": 1.056306792669834, + "learning_rate": 4.91019884393663e-07, + "logits/chosen": -0.8076485991477966, + "logits/rejected": -0.8253241181373596, + "logps/chosen": -211.2886962890625, + "logps/rejected": -174.4979705810547, + "loss": 0.6668, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.05709861218929291, + "rewards/margins": 0.05306413769721985, + "rewards/rejected": -0.11016274988651276, + "step": 48 + }, + { + "epoch": 0.23154164205552275, + "grad_norm": 1.1483517794441522, + "learning_rate": 4.905027242698521e-07, + "logits/chosen": -1.5992224216461182, + "logits/rejected": -1.7107007503509521, + "logps/chosen": -258.3623962402344, + "logps/rejected": -208.75918579101562, + "loss": 0.6608, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.07419726252555847, + "rewards/margins": 0.07502313703298569, + "rewards/rejected": -0.14922040700912476, + "step": 49 + }, + { + "epoch": 0.23626698168930893, + "grad_norm": 1.0512679200538415, + "learning_rate": 4.89971374518556e-07, + "logits/chosen": -2.0465784072875977, + "logits/rejected": -2.033297300338745, + "logps/chosen": -185.704345703125, + "logps/rejected": -203.28013610839844, + "loss": 0.6639, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.08691324293613434, + "rewards/margins": 0.06635289639234543, + "rewards/rejected": -0.15326614677906036, + "step": 50 + }, + { + "epoch": 0.2409923213230951, + "grad_norm": 1.112120640811307, + "learning_rate": 4.894258664893363e-07, + "logits/chosen": -1.7012823820114136, + "logits/rejected": -1.7330955266952515, + "logps/chosen": -208.68597412109375, + "logps/rejected": -218.7354278564453, + "loss": 0.6594, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.07796823233366013, + "rewards/margins": 0.038594380021095276, + "rewards/rejected": -0.1165626123547554, + "step": 51 + }, + { + "epoch": 0.24571766095688127, + "grad_norm": 1.1579361743422294, + "learning_rate": 4.888662323670913e-07, + "logits/chosen": -1.6541762351989746, + "logits/rejected": -1.7235084772109985, + "logps/chosen": -269.59210205078125, + "logps/rejected": -243.711669921875, + "loss": 0.6629, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.06561748683452606, + "rewards/margins": 0.06448770314455032, + "rewards/rejected": -0.1301051825284958, + "step": 52 + }, + { + "epoch": 0.25044300059066743, + "grad_norm": 1.1521413307343888, + "learning_rate": 4.882925051701568e-07, + "logits/chosen": -1.853175163269043, + "logits/rejected": -1.8978935480117798, + "logps/chosen": -225.21856689453125, + "logps/rejected": -228.90325927734375, + "loss": 0.6578, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.09534727036952972, + "rewards/margins": 0.077830970287323, + "rewards/rejected": -0.17317824065685272, + "step": 53 + }, + { + "epoch": 0.2551683402244536, + "grad_norm": 1.0619319741777762, + "learning_rate": 4.877047187483582e-07, + "logits/chosen": -1.6998298168182373, + "logits/rejected": -1.7790082693099976, + "logps/chosen": -212.0916748046875, + "logps/rejected": -191.58206176757812, + "loss": 0.6631, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.10848715156316757, + "rewards/margins": 0.048885174095630646, + "rewards/rejected": -0.1573723405599594, + "step": 54 + }, + { + "epoch": 0.2598936798582398, + "grad_norm": 1.1160678416368461, + "learning_rate": 4.871029077810132e-07, + "logits/chosen": -1.675370216369629, + "logits/rejected": -1.7553473711013794, + "logps/chosen": -225.4173126220703, + "logps/rejected": -198.70562744140625, + "loss": 0.6558, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.10019448399543762, + "rewards/margins": 0.08370313048362732, + "rewards/rejected": -0.18389761447906494, + "step": 55 + }, + { + "epoch": 0.264619019492026, + "grad_norm": 1.180700159151371, + "learning_rate": 4.864871077748857e-07, + "logits/chosen": -2.015566110610962, + "logits/rejected": -2.080444812774658, + "logps/chosen": -229.28094482421875, + "logps/rejected": -215.07102966308594, + "loss": 0.6587, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.1043662577867508, + "rewards/margins": 0.07935845851898193, + "rewards/rejected": -0.18372471630573273, + "step": 56 + }, + { + "epoch": 0.26934435912581217, + "grad_norm": 1.2229942654627657, + "learning_rate": 4.858573550620908e-07, + "logits/chosen": -2.024144411087036, + "logits/rejected": -2.0165395736694336, + "logps/chosen": -266.3519287109375, + "logps/rejected": -233.6390380859375, + "loss": 0.6454, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.12464563548564911, + "rewards/margins": 0.12967334687709808, + "rewards/rejected": -0.2543190121650696, + "step": 57 + }, + { + "epoch": 0.27406969875959836, + "grad_norm": 1.0980503837148203, + "learning_rate": 4.852136867979515e-07, + "logits/chosen": -2.3049449920654297, + "logits/rejected": -2.286456346511841, + "logps/chosen": -187.29776000976562, + "logps/rejected": -195.98187255859375, + "loss": 0.6559, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.13345083594322205, + "rewards/margins": 0.05029616504907608, + "rewards/rejected": -0.18374700844287872, + "step": 58 + }, + { + "epoch": 0.27879503839338454, + "grad_norm": 1.222277907011155, + "learning_rate": 4.845561409588065e-07, + "logits/chosen": -2.3418726921081543, + "logits/rejected": -2.2416603565216064, + "logps/chosen": -184.75540161132812, + "logps/rejected": -186.7329864501953, + "loss": 0.6424, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.10166566073894501, + "rewards/margins": 0.12095416337251663, + "rewards/rejected": -0.22261981666088104, + "step": 59 + }, + { + "epoch": 0.2835203780271707, + "grad_norm": 1.0214927771783529, + "learning_rate": 4.838847563397693e-07, + "logits/chosen": -1.5123400688171387, + "logits/rejected": -1.6280531883239746, + "logps/chosen": -237.2841796875, + "logps/rejected": -212.3114013671875, + "loss": 0.6583, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.1481347680091858, + "rewards/margins": 0.09539347887039185, + "rewards/rejected": -0.24352826178073883, + "step": 60 + }, + { + "epoch": 0.28824571766095686, + "grad_norm": 1.1567243525055237, + "learning_rate": 4.831995725524398e-07, + "logits/chosen": -2.612375497817993, + "logits/rejected": -2.534623861312866, + "logps/chosen": -163.80526733398438, + "logps/rejected": -201.24952697753906, + "loss": 0.6481, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.14992724359035492, + "rewards/margins": 0.10543593764305115, + "rewards/rejected": -0.2553631663322449, + "step": 61 + }, + { + "epoch": 0.29297105729474304, + "grad_norm": 1.2299920626415959, + "learning_rate": 4.825006300225665e-07, + "logits/chosen": -2.0585803985595703, + "logits/rejected": -2.1235456466674805, + "logps/chosen": -218.53924560546875, + "logps/rejected": -219.55938720703125, + "loss": 0.6439, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.15479950606822968, + "rewards/margins": 0.11182530224323273, + "rewards/rejected": -0.2666248381137848, + "step": 62 + }, + { + "epoch": 0.2976963969285292, + "grad_norm": 1.203647663122135, + "learning_rate": 4.817879699876622e-07, + "logits/chosen": -1.9584053754806519, + "logits/rejected": -2.047368288040161, + "logps/chosen": -180.12673950195312, + "logps/rejected": -169.58712768554688, + "loss": 0.6365, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.15420033037662506, + "rewards/margins": 0.07548267394304276, + "rewards/rejected": -0.2296830117702484, + "step": 63 + }, + { + "epoch": 0.3024217365623154, + "grad_norm": 1.1158018473166338, + "learning_rate": 4.810616344945705e-07, + "logits/chosen": -1.889503002166748, + "logits/rejected": -1.8674815893173218, + "logps/chosen": -205.70799255371094, + "logps/rejected": -200.00503540039062, + "loss": 0.6564, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.20557425916194916, + "rewards/margins": 0.07413503527641296, + "rewards/rejected": -0.2797092795372009, + "step": 64 + }, + { + "epoch": 0.3071470761961016, + "grad_norm": 1.1145309518931334, + "learning_rate": 4.803216663969849e-07, + "logits/chosen": -2.5508382320404053, + "logits/rejected": -2.58968186378479, + "logps/chosen": -222.9154052734375, + "logps/rejected": -205.91091918945312, + "loss": 0.646, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.16130368411540985, + "rewards/margins": 0.11249940097332001, + "rewards/rejected": -0.27380311489105225, + "step": 65 + }, + { + "epoch": 0.3118724158298878, + "grad_norm": 1.1907287315768882, + "learning_rate": 4.795681093529209e-07, + "logits/chosen": -1.9771151542663574, + "logits/rejected": -1.9131364822387695, + "logps/chosen": -181.78024291992188, + "logps/rejected": -203.05455017089844, + "loss": 0.6367, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.17563070356845856, + "rewards/margins": 0.1282222419977188, + "rewards/rejected": -0.30385297536849976, + "step": 66 + }, + { + "epoch": 0.31659775546367397, + "grad_norm": 1.0642807389889697, + "learning_rate": 4.7880100782214e-07, + "logits/chosen": -2.139569044113159, + "logits/rejected": -2.139991283416748, + "logps/chosen": -200.4115447998047, + "logps/rejected": -209.0381317138672, + "loss": 0.6515, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.25353577733039856, + "rewards/margins": 0.025102369487285614, + "rewards/rejected": -0.2786381244659424, + "step": 67 + }, + { + "epoch": 0.32132309509746015, + "grad_norm": 1.2361659779944367, + "learning_rate": 4.780204070635266e-07, + "logits/chosen": -2.3622794151306152, + "logits/rejected": -2.3103740215301514, + "logps/chosen": -241.8636474609375, + "logps/rejected": -253.21530151367188, + "loss": 0.6309, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.2274862378835678, + "rewards/margins": 0.18495142459869385, + "rewards/rejected": -0.41243770718574524, + "step": 68 + }, + { + "epoch": 0.3260484347312463, + "grad_norm": 1.2216304493799137, + "learning_rate": 4.772263531324172e-07, + "logits/chosen": -2.275869369506836, + "logits/rejected": -2.296611785888672, + "logps/chosen": -255.72305297851562, + "logps/rejected": -237.79794311523438, + "loss": 0.631, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.24048469960689545, + "rewards/margins": 0.11058272421360016, + "rewards/rejected": -0.351067453622818, + "step": 69 + }, + { + "epoch": 0.33077377436503247, + "grad_norm": 1.2564703689206955, + "learning_rate": 4.764188928778843e-07, + "logits/chosen": -1.9974974393844604, + "logits/rejected": -2.011615514755249, + "logps/chosen": -195.90382385253906, + "logps/rejected": -214.2757110595703, + "loss": 0.6336, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.2238045632839203, + "rewards/margins": 0.1463470458984375, + "rewards/rejected": -0.3701516091823578, + "step": 70 + }, + { + "epoch": 0.33549911399881865, + "grad_norm": 1.2749948068499388, + "learning_rate": 4.755980739399711e-07, + "logits/chosen": -2.3057668209075928, + "logits/rejected": -2.2394864559173584, + "logps/chosen": -193.60552978515625, + "logps/rejected": -239.27389526367188, + "loss": 0.6525, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2364463210105896, + "rewards/margins": 0.1507556438446045, + "rewards/rejected": -0.3872019648551941, + "step": 71 + }, + { + "epoch": 0.34022445363260484, + "grad_norm": 1.3097724846280558, + "learning_rate": 4.747639447468816e-07, + "logits/chosen": -2.2665905952453613, + "logits/rejected": -2.279989719390869, + "logps/chosen": -285.88140869140625, + "logps/rejected": -304.027587890625, + "loss": 0.6232, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.23830385506153107, + "rewards/margins": 0.22441713511943817, + "rewards/rejected": -0.4627210199832916, + "step": 72 + }, + { + "epoch": 0.344949793266391, + "grad_norm": 1.267134897055487, + "learning_rate": 4.739165545121228e-07, + "logits/chosen": -1.9879568815231323, + "logits/rejected": -1.9682663679122925, + "logps/chosen": -246.3922882080078, + "logps/rejected": -247.4912109375, + "loss": 0.6229, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.3345012366771698, + "rewards/margins": 0.17793008685112, + "rewards/rejected": -0.5124313235282898, + "step": 73 + }, + { + "epoch": 0.3496751329001772, + "grad_norm": 1.2766840060333855, + "learning_rate": 4.730559532316014e-07, + "logits/chosen": -2.3778185844421387, + "logits/rejected": -2.4101202487945557, + "logps/chosen": -221.90524291992188, + "logps/rejected": -241.57962036132812, + "loss": 0.6395, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.32600241899490356, + "rewards/margins": 0.14937394857406616, + "rewards/rejected": -0.4753763973712921, + "step": 74 + }, + { + "epoch": 0.3544004725339634, + "grad_norm": 1.3566530419485627, + "learning_rate": 4.721821916806741e-07, + "logits/chosen": -2.4602699279785156, + "logits/rejected": -2.4688634872436523, + "logps/chosen": -268.510498046875, + "logps/rejected": -286.26165771484375, + "loss": 0.6237, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.31091028451919556, + "rewards/margins": 0.29035893082618713, + "rewards/rejected": -0.6012692451477051, + "step": 75 + }, + { + "epoch": 0.3591258121677496, + "grad_norm": 1.4205593895904036, + "learning_rate": 4.7129532141115145e-07, + "logits/chosen": -2.5466036796569824, + "logits/rejected": -2.5945773124694824, + "logps/chosen": -318.98370361328125, + "logps/rejected": -309.7763366699219, + "loss": 0.6112, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.3561764359474182, + "rewards/margins": 0.18557003140449524, + "rewards/rejected": -0.5417464375495911, + "step": 76 + }, + { + "epoch": 0.3638511518015357, + "grad_norm": 1.528828829984918, + "learning_rate": 4.7039539474825683e-07, + "logits/chosen": -2.2787909507751465, + "logits/rejected": -2.3680973052978516, + "logps/chosen": -305.41925048828125, + "logps/rejected": -288.8333435058594, + "loss": 0.6006, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.39139410853385925, + "rewards/margins": 0.2935311794281006, + "rewards/rejected": -0.6849253177642822, + "step": 77 + }, + { + "epoch": 0.3685764914353219, + "grad_norm": 1.6985476984319592, + "learning_rate": 4.6948246478753903e-07, + "logits/chosen": -2.261338472366333, + "logits/rejected": -2.323387622833252, + "logps/chosen": -247.09033203125, + "logps/rejected": -257.6181640625, + "loss": 0.6314, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5037412047386169, + "rewards/margins": 0.1979576051235199, + "rewards/rejected": -0.7016987800598145, + "step": 78 + }, + { + "epoch": 0.3733018310691081, + "grad_norm": 1.729313425365047, + "learning_rate": 4.6855658539173946e-07, + "logits/chosen": -2.698389768600464, + "logits/rejected": -2.646217107772827, + "logps/chosen": -314.0598449707031, + "logps/rejected": -283.12530517578125, + "loss": 0.6206, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.5247927904129028, + "rewards/margins": 0.22078382968902588, + "rewards/rejected": -0.7455766201019287, + "step": 79 + }, + { + "epoch": 0.37802717070289427, + "grad_norm": 1.66621636166947, + "learning_rate": 4.6761781118761446e-07, + "logits/chosen": -2.343153238296509, + "logits/rejected": -2.3706839084625244, + "logps/chosen": -296.689453125, + "logps/rejected": -309.9665222167969, + "loss": 0.6257, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.44052672386169434, + "rewards/margins": 0.23874229192733765, + "rewards/rejected": -0.6792689561843872, + "step": 80 + }, + { + "epoch": 0.38275251033668045, + "grad_norm": 1.5082652270101988, + "learning_rate": 4.666661975627123e-07, + "logits/chosen": -2.4629459381103516, + "logits/rejected": -2.486161470413208, + "logps/chosen": -233.51983642578125, + "logps/rejected": -262.89447021484375, + "loss": 0.6154, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.4172506630420685, + "rewards/margins": 0.3244992196559906, + "rewards/rejected": -0.7417498826980591, + "step": 81 + }, + { + "epoch": 0.38747784997046664, + "grad_norm": 1.5314338973522612, + "learning_rate": 4.657018006621053e-07, + "logits/chosen": -2.1910758018493652, + "logits/rejected": -2.273641586303711, + "logps/chosen": -225.42889404296875, + "logps/rejected": -239.92913818359375, + "loss": 0.6058, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4517180323600769, + "rewards/margins": 0.22106432914733887, + "rewards/rejected": -0.6727824211120605, + "step": 82 + }, + { + "epoch": 0.3922031896042528, + "grad_norm": 1.6363344671283797, + "learning_rate": 4.6472467738507724e-07, + "logits/chosen": -2.4740734100341797, + "logits/rejected": -2.593059539794922, + "logps/chosen": -325.74298095703125, + "logps/rejected": -297.570068359375, + "loss": 0.6088, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6095322370529175, + "rewards/margins": 0.1409136950969696, + "rewards/rejected": -0.7504459619522095, + "step": 83 + }, + { + "epoch": 0.396928529238039, + "grad_norm": 1.7032682560793109, + "learning_rate": 4.6373488538176656e-07, + "logits/chosen": -2.627995014190674, + "logits/rejected": -2.585923433303833, + "logps/chosen": -278.2349853515625, + "logps/rejected": -319.6769104003906, + "loss": 0.6177, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6383107900619507, + "rewards/margins": 0.17859135568141937, + "rewards/rejected": -0.8169021010398865, + "step": 84 + }, + { + "epoch": 0.40165386887182514, + "grad_norm": 1.6927306950035537, + "learning_rate": 4.627324830497645e-07, + "logits/chosen": -2.3960776329040527, + "logits/rejected": -2.4317002296447754, + "logps/chosen": -217.64886474609375, + "logps/rejected": -255.28555297851562, + "loss": 0.6034, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.581438422203064, + "rewards/margins": 0.2544565200805664, + "rewards/rejected": -0.8358950018882751, + "step": 85 + }, + { + "epoch": 0.4063792085056113, + "grad_norm": 1.517574514685826, + "learning_rate": 4.617175295306701e-07, + "logits/chosen": -2.342132329940796, + "logits/rejected": -2.317411422729492, + "logps/chosen": -249.52786254882812, + "logps/rejected": -272.2576904296875, + "loss": 0.6056, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4862441420555115, + "rewards/margins": 0.2543086111545563, + "rewards/rejected": -0.7405527830123901, + "step": 86 + }, + { + "epoch": 0.4063792085056113, + "eval_logits/chosen": -3.0177347660064697, + "eval_logits/rejected": -3.027693748474121, + "eval_logps/chosen": -269.99395751953125, + "eval_logps/rejected": -288.7347106933594, + "eval_loss": 0.6040579080581665, + "eval_rewards/accuracies": 0.6022727489471436, + "eval_rewards/chosen": -0.5876324772834778, + "eval_rewards/margins": 0.3002159297466278, + "eval_rewards/rejected": -0.8878483772277832, + "eval_runtime": 225.4053, + "eval_samples_per_second": 16.22, + "eval_steps_per_second": 0.293, + "step": 86 + }, + { + "epoch": 0.4111045481393975, + "grad_norm": 1.828247945135766, + "learning_rate": 4.6069008470660057e-07, + "logits/chosen": -2.776036500930786, + "logits/rejected": -2.8447940349578857, + "logps/chosen": -286.0122985839844, + "logps/rejected": -318.0850830078125, + "loss": 0.6202, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6256659626960754, + "rewards/margins": 0.2720867395401001, + "rewards/rejected": -0.8977527022361755, + "step": 87 + }, + { + "epoch": 0.4158298877731837, + "grad_norm": 1.7339677719729851, + "learning_rate": 4.596502091966587e-07, + "logits/chosen": -2.6904196739196777, + "logits/rejected": -2.8044652938842773, + "logps/chosen": -318.77557373046875, + "logps/rejected": -326.3057556152344, + "loss": 0.6133, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.6397165060043335, + "rewards/margins": 0.23243850469589233, + "rewards/rejected": -0.8721550107002258, + "step": 88 + }, + { + "epoch": 0.4205552274069699, + "grad_norm": 1.5787512385428173, + "learning_rate": 4.5859796435335575e-07, + "logits/chosen": -2.5359115600585938, + "logits/rejected": -2.5303304195404053, + "logps/chosen": -256.5873718261719, + "logps/rejected": -295.2117004394531, + "loss": 0.6062, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.5649542212486267, + "rewards/margins": 0.2900598347187042, + "rewards/rejected": -0.8550140857696533, + "step": 89 + }, + { + "epoch": 0.42528056704075606, + "grad_norm": 1.6858517491129223, + "learning_rate": 4.5753341225899195e-07, + "logits/chosen": -2.4564006328582764, + "logits/rejected": -2.470282793045044, + "logps/chosen": -327.185791015625, + "logps/rejected": -319.4944152832031, + "loss": 0.6017, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.6457672715187073, + "rewards/margins": 0.2363281399011612, + "rewards/rejected": -0.882095456123352, + "step": 90 + }, + { + "epoch": 0.43000590667454225, + "grad_norm": 1.6776313748940133, + "learning_rate": 4.564566157219938e-07, + "logits/chosen": -2.467501163482666, + "logits/rejected": -2.570307970046997, + "logps/chosen": -318.20867919921875, + "logps/rejected": -302.66949462890625, + "loss": 0.6015, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.5671995878219604, + "rewards/margins": 0.2143457680940628, + "rewards/rejected": -0.7815454006195068, + "step": 91 + }, + { + "epoch": 0.43473124630832843, + "grad_norm": 2.0614895261976747, + "learning_rate": 4.5536763827320803e-07, + "logits/chosen": -2.3631057739257812, + "logits/rejected": -2.54693341255188, + "logps/chosen": -273.6968078613281, + "logps/rejected": -246.98641967773438, + "loss": 0.5856, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.4555537700653076, + "rewards/margins": 0.22295869886875153, + "rewards/rejected": -0.6785125136375427, + "step": 92 + }, + { + "epoch": 0.43945658594211456, + "grad_norm": 1.8016509044142324, + "learning_rate": 4.5426654416215367e-07, + "logits/chosen": -2.71864652633667, + "logits/rejected": -2.7026920318603516, + "logps/chosen": -288.259033203125, + "logps/rejected": -330.29132080078125, + "loss": 0.6038, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.5275314450263977, + "rewards/margins": 0.5003238916397095, + "rewards/rejected": -1.027855396270752, + "step": 93 + }, + { + "epoch": 0.44418192557590075, + "grad_norm": 2.243457029027687, + "learning_rate": 4.5315339835323095e-07, + "logits/chosen": -2.884897232055664, + "logits/rejected": -2.8569068908691406, + "logps/chosen": -269.7425537109375, + "logps/rejected": -297.4085693359375, + "loss": 0.5748, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.5586040019989014, + "rewards/margins": 0.3271249234676361, + "rewards/rejected": -0.8857288956642151, + "step": 94 + }, + { + "epoch": 0.44890726520968693, + "grad_norm": 2.296742463720749, + "learning_rate": 4.520282665218889e-07, + "logits/chosen": -2.790522336959839, + "logits/rejected": -2.84653377532959, + "logps/chosen": -265.41339111328125, + "logps/rejected": -315.9186096191406, + "loss": 0.5625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5728757977485657, + "rewards/margins": 0.515011191368103, + "rewards/rejected": -1.0878870487213135, + "step": 95 + }, + { + "epoch": 0.4536326048434731, + "grad_norm": 1.922804376035695, + "learning_rate": 4.5089121505074987e-07, + "logits/chosen": -2.6556386947631836, + "logits/rejected": -2.806910276412964, + "logps/chosen": -244.31153869628906, + "logps/rejected": -247.36322021484375, + "loss": 0.5787, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5964667797088623, + "rewards/margins": 0.29180973768234253, + "rewards/rejected": -0.8882765173912048, + "step": 96 + }, + { + "epoch": 0.4583579444772593, + "grad_norm": 1.6125061422080407, + "learning_rate": 4.4974231102569355e-07, + "logits/chosen": -2.7232208251953125, + "logits/rejected": -2.8808493614196777, + "logps/chosen": -272.30206298828125, + "logps/rejected": -273.55487060546875, + "loss": 0.5764, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.5672851800918579, + "rewards/margins": 0.39647072553634644, + "rewards/rejected": -0.9637559056282043, + "step": 97 + }, + { + "epoch": 0.4630832841110455, + "grad_norm": 2.0273197043813598, + "learning_rate": 4.4858162223189853e-07, + "logits/chosen": -2.691676616668701, + "logits/rejected": -2.7318079471588135, + "logps/chosen": -331.65582275390625, + "logps/rejected": -321.1370544433594, + "loss": 0.6022, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7305989861488342, + "rewards/margins": 0.20282992720603943, + "rewards/rejected": -0.933428943157196, + "step": 98 + }, + { + "epoch": 0.4678086237448317, + "grad_norm": 1.772177669462408, + "learning_rate": 4.474092171498434e-07, + "logits/chosen": -2.5423169136047363, + "logits/rejected": -2.5960371494293213, + "logps/chosen": -260.9735107421875, + "logps/rejected": -277.1812744140625, + "loss": 0.5888, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.6579495668411255, + "rewards/margins": 0.32558560371398926, + "rewards/rejected": -0.9835351705551147, + "step": 99 + }, + { + "epoch": 0.47253396337861786, + "grad_norm": 2.031475270950697, + "learning_rate": 4.462251649512656e-07, + "logits/chosen": -2.805039167404175, + "logits/rejected": -2.7704780101776123, + "logps/chosen": -235.95053100585938, + "logps/rejected": -292.6195373535156, + "loss": 0.5634, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.612291157245636, + "rewards/margins": 0.49476855993270874, + "rewards/rejected": -1.1070597171783447, + "step": 100 + }, + { + "epoch": 0.477259303012404, + "grad_norm": 2.8786657503544033, + "learning_rate": 4.4502953549508135e-07, + "logits/chosen": -2.829331398010254, + "logits/rejected": -2.8344359397888184, + "logps/chosen": -309.11285400390625, + "logps/rejected": -331.1981201171875, + "loss": 0.596, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7622178792953491, + "rewards/margins": 0.48595699667930603, + "rewards/rejected": -1.2481749057769775, + "step": 101 + }, + { + "epoch": 0.4819846426461902, + "grad_norm": 2.6747243514730163, + "learning_rate": 4.438223993232634e-07, + "logits/chosen": -2.71714186668396, + "logits/rejected": -2.7164933681488037, + "logps/chosen": -319.4408264160156, + "logps/rejected": -335.07025146484375, + "loss": 0.5873, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7349227666854858, + "rewards/margins": 0.5957677960395813, + "rewards/rejected": -1.3306907415390015, + "step": 102 + }, + { + "epoch": 0.48670998227997636, + "grad_norm": 2.184532115681571, + "learning_rate": 4.426038276566787e-07, + "logits/chosen": -2.595947027206421, + "logits/rejected": -2.6409239768981934, + "logps/chosen": -302.1788330078125, + "logps/rejected": -300.60736083984375, + "loss": 0.5929, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.6437785625457764, + "rewards/margins": 0.4648054838180542, + "rewards/rejected": -1.1085840463638306, + "step": 103 + }, + { + "epoch": 0.49143532191376255, + "grad_norm": 2.351448463153027, + "learning_rate": 4.413738923908874e-07, + "logits/chosen": -2.820120334625244, + "logits/rejected": -2.881047248840332, + "logps/chosen": -321.73577880859375, + "logps/rejected": -336.362548828125, + "loss": 0.6113, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6863315105438232, + "rewards/margins": 0.3078695237636566, + "rewards/rejected": -0.9942010641098022, + "step": 104 + }, + { + "epoch": 0.49616066154754873, + "grad_norm": 2.0712718540984976, + "learning_rate": 4.4013266609190016e-07, + "logits/chosen": -2.726858139038086, + "logits/rejected": -2.8224053382873535, + "logps/chosen": -318.89129638671875, + "logps/rejected": -321.7113037109375, + "loss": 0.5567, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.7669743299484253, + "rewards/margins": 0.5382488369941711, + "rewards/rejected": -1.3052233457565308, + "step": 105 + }, + { + "epoch": 0.5008860011813349, + "grad_norm": 2.7161021836994412, + "learning_rate": 4.3888022199189684e-07, + "logits/chosen": -2.5977838039398193, + "logits/rejected": -2.568969488143921, + "logps/chosen": -270.40380859375, + "logps/rejected": -321.2980041503906, + "loss": 0.6029, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.6543689966201782, + "rewards/margins": 0.5025177001953125, + "rewards/rejected": -1.1568866968154907, + "step": 106 + }, + { + "epoch": 0.505611340815121, + "grad_norm": 2.133467086083558, + "learning_rate": 4.3761663398490634e-07, + "logits/chosen": -2.5719194412231445, + "logits/rejected": -2.569828510284424, + "logps/chosen": -278.3515930175781, + "logps/rejected": -290.2523193359375, + "loss": 0.5601, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.6445180773735046, + "rewards/margins": 0.44993308186531067, + "rewards/rejected": -1.0944510698318481, + "step": 107 + }, + { + "epoch": 0.5103366804489072, + "grad_norm": 2.581357357210363, + "learning_rate": 4.363419766224464e-07, + "logits/chosen": -2.5332443714141846, + "logits/rejected": -2.5566651821136475, + "logps/chosen": -255.7954559326172, + "logps/rejected": -285.38604736328125, + "loss": 0.5701, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7009232044219971, + "rewards/margins": 0.5014970302581787, + "rewards/rejected": -1.2024202346801758, + "step": 108 + }, + { + "epoch": 0.5150620200826934, + "grad_norm": 2.0545799713212913, + "learning_rate": 4.3505632510912515e-07, + "logits/chosen": -2.5492563247680664, + "logits/rejected": -2.6753411293029785, + "logps/chosen": -252.5239715576172, + "logps/rejected": -250.57691955566406, + "loss": 0.5745, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.580327033996582, + "rewards/margins": 0.28667935729026794, + "rewards/rejected": -0.8670063614845276, + "step": 109 + }, + { + "epoch": 0.5197873597164796, + "grad_norm": 2.2034797994763458, + "learning_rate": 4.3375975529820414e-07, + "logits/chosen": -2.492084503173828, + "logits/rejected": -2.432577610015869, + "logps/chosen": -310.48046875, + "logps/rejected": -362.7835388183594, + "loss": 0.5392, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.6999268531799316, + "rewards/margins": 0.7504494190216064, + "rewards/rejected": -1.450376272201538, + "step": 110 + }, + { + "epoch": 0.5245126993502658, + "grad_norm": 1.9829084137694295, + "learning_rate": 4.3245234368712304e-07, + "logits/chosen": -2.7556090354919434, + "logits/rejected": -2.754206418991089, + "logps/chosen": -270.7232666015625, + "logps/rejected": -324.1688232421875, + "loss": 0.5701, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.6123411655426025, + "rewards/margins": 0.4949289858341217, + "rewards/rejected": -1.1072702407836914, + "step": 111 + }, + { + "epoch": 0.529238038984052, + "grad_norm": 2.0360711261149596, + "learning_rate": 4.3113416741298616e-07, + "logits/chosen": -2.659914016723633, + "logits/rejected": -2.673081874847412, + "logps/chosen": -292.6357727050781, + "logps/rejected": -286.8974914550781, + "loss": 0.5785, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.645796537399292, + "rewards/margins": 0.4479163587093353, + "rewards/rejected": -1.0937130451202393, + "step": 112 + }, + { + "epoch": 0.5339633786178382, + "grad_norm": 2.3215552370547945, + "learning_rate": 4.298053042480114e-07, + "logits/chosen": -2.6102089881896973, + "logits/rejected": -2.666215419769287, + "logps/chosen": -284.019775390625, + "logps/rejected": -313.7834777832031, + "loss": 0.5434, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.6691496968269348, + "rewards/margins": 0.5124155879020691, + "rewards/rejected": -1.181565284729004, + "step": 113 + }, + { + "epoch": 0.5386887182516243, + "grad_norm": 2.1290543043289434, + "learning_rate": 4.2846583259494185e-07, + "logits/chosen": -2.795818328857422, + "logits/rejected": -2.9482498168945312, + "logps/chosen": -291.4162292480469, + "logps/rejected": -279.486083984375, + "loss": 0.5576, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.791326105594635, + "rewards/margins": 0.29795482754707336, + "rewards/rejected": -1.0892809629440308, + "step": 114 + }, + { + "epoch": 0.5434140578854105, + "grad_norm": 2.123022825050298, + "learning_rate": 4.271158314824199e-07, + "logits/chosen": -2.5966644287109375, + "logits/rejected": -2.67663836479187, + "logps/chosen": -286.2615661621094, + "logps/rejected": -300.17669677734375, + "loss": 0.5549, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.6490954160690308, + "rewards/margins": 0.37902897596359253, + "rewards/rejected": -1.028124451637268, + "step": 115 + }, + { + "epoch": 0.5481393975191967, + "grad_norm": 2.3410590679680854, + "learning_rate": 4.2575538056032446e-07, + "logits/chosen": -2.3392884731292725, + "logits/rejected": -2.3976926803588867, + "logps/chosen": -309.89501953125, + "logps/rejected": -347.65264892578125, + "loss": 0.5519, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.8080885410308838, + "rewards/margins": 0.5476577877998352, + "rewards/rejected": -1.3557462692260742, + "step": 116 + }, + { + "epoch": 0.5528647371529829, + "grad_norm": 2.1818324792915305, + "learning_rate": 4.2438456009507195e-07, + "logits/chosen": -2.751250743865967, + "logits/rejected": -2.683605909347534, + "logps/chosen": -304.8617248535156, + "logps/rejected": -357.66583251953125, + "loss": 0.5637, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7783306837081909, + "rewards/margins": 0.6774348020553589, + "rewards/rejected": -1.4557652473449707, + "step": 117 + }, + { + "epoch": 0.5575900767867691, + "grad_norm": 2.2239562359532994, + "learning_rate": 4.230034509648803e-07, + "logits/chosen": -2.653618335723877, + "logits/rejected": -2.6241607666015625, + "logps/chosen": -311.1373596191406, + "logps/rejected": -379.66473388671875, + "loss": 0.5332, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.6985858678817749, + "rewards/margins": 0.754062294960022, + "rewards/rejected": -1.4526481628417969, + "step": 118 + }, + { + "epoch": 0.5623154164205553, + "grad_norm": 2.2500972070282024, + "learning_rate": 4.216121346549973e-07, + "logits/chosen": -3.0888874530792236, + "logits/rejected": -2.974677562713623, + "logps/chosen": -266.66705322265625, + "logps/rejected": -335.3798828125, + "loss": 0.5606, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.6862295866012573, + "rewards/margins": 0.5550753474235535, + "rewards/rejected": -1.2413049936294556, + "step": 119 + }, + { + "epoch": 0.5670407560543415, + "grad_norm": 2.18698230287775, + "learning_rate": 4.202106932528928e-07, + "logits/chosen": -2.6840288639068604, + "logits/rejected": -2.7219927310943604, + "logps/chosen": -349.6735534667969, + "logps/rejected": -351.0025634765625, + "loss": 0.5593, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9063121676445007, + "rewards/margins": 0.5141624808311462, + "rewards/rejected": -1.420474648475647, + "step": 120 + }, + { + "epoch": 0.5717660956881275, + "grad_norm": 2.3004644364082734, + "learning_rate": 4.1879920944341593e-07, + "logits/chosen": -2.914316177368164, + "logits/rejected": -2.9891955852508545, + "logps/chosen": -279.8473205566406, + "logps/rejected": -313.1243591308594, + "loss": 0.544, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.754368245601654, + "rewards/margins": 0.5700761079788208, + "rewards/rejected": -1.3244441747665405, + "step": 121 + }, + { + "epoch": 0.5764914353219137, + "grad_norm": 2.3085447219221504, + "learning_rate": 4.1737776650391625e-07, + "logits/chosen": -2.5704903602600098, + "logits/rejected": -2.6333065032958984, + "logps/chosen": -274.203369140625, + "logps/rejected": -333.9256896972656, + "loss": 0.5438, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.7395837903022766, + "rewards/margins": 0.5535318851470947, + "rewards/rejected": -1.2931156158447266, + "step": 122 + }, + { + "epoch": 0.5812167749556999, + "grad_norm": 2.1555107869656234, + "learning_rate": 4.1594644829933074e-07, + "logits/chosen": -3.0889954566955566, + "logits/rejected": -3.00903582572937, + "logps/chosen": -287.2713623046875, + "logps/rejected": -352.50244140625, + "loss": 0.5432, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7744563221931458, + "rewards/margins": 0.6194370985031128, + "rewards/rejected": -1.3938933610916138, + "step": 123 + }, + { + "epoch": 0.5859421145894861, + "grad_norm": 2.3124214479042147, + "learning_rate": 4.1450533927723563e-07, + "logits/chosen": -2.7649660110473633, + "logits/rejected": -2.7645654678344727, + "logps/chosen": -323.98663330078125, + "logps/rejected": -377.0577697753906, + "loss": 0.5562, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.9616256952285767, + "rewards/margins": 0.6629120707511902, + "rewards/rejected": -1.624537706375122, + "step": 124 + }, + { + "epoch": 0.5906674542232723, + "grad_norm": 2.201123781604531, + "learning_rate": 4.130545244628638e-07, + "logits/chosen": -2.8170711994171143, + "logits/rejected": -2.801412582397461, + "logps/chosen": -284.6482849121094, + "logps/rejected": -339.86328125, + "loss": 0.5695, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9369969964027405, + "rewards/margins": 0.6950640678405762, + "rewards/rejected": -1.6320611238479614, + "step": 125 + }, + { + "epoch": 0.5953927938570585, + "grad_norm": 2.4679791573291907, + "learning_rate": 4.11594089454089e-07, + "logits/chosen": -2.7288601398468018, + "logits/rejected": -2.732513427734375, + "logps/chosen": -331.1778259277344, + "logps/rejected": -422.12139892578125, + "loss": 0.566, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8497134447097778, + "rewards/margins": 0.6027945280075073, + "rewards/rejected": -1.4525080919265747, + "step": 126 + }, + { + "epoch": 0.6001181334908446, + "grad_norm": 2.5170488926988073, + "learning_rate": 4.101241204163748e-07, + "logits/chosen": -2.66646671295166, + "logits/rejected": -2.621904134750366, + "logps/chosen": -301.6587829589844, + "logps/rejected": -365.4261474609375, + "loss": 0.5407, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9232099056243896, + "rewards/margins": 0.4544805884361267, + "rewards/rejected": -1.377690315246582, + "step": 127 + }, + { + "epoch": 0.6048434731246308, + "grad_norm": 2.3673200269773718, + "learning_rate": 4.086447040776911e-07, + "logits/chosen": -3.108903169631958, + "logits/rejected": -3.056070327758789, + "logps/chosen": -276.7916259765625, + "logps/rejected": -342.23876953125, + "loss": 0.5449, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8404449820518494, + "rewards/margins": 0.6569064855575562, + "rewards/rejected": -1.4973516464233398, + "step": 128 + }, + { + "epoch": 0.609568812758417, + "grad_norm": 3.2943893516301768, + "learning_rate": 4.071559277233975e-07, + "logits/chosen": -3.1619277000427246, + "logits/rejected": -3.1500680446624756, + "logps/chosen": -283.8644104003906, + "logps/rejected": -360.0433044433594, + "loss": 0.573, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.969115674495697, + "rewards/margins": 0.5388011932373047, + "rewards/rejected": -1.507916808128357, + "step": 129 + }, + { + "epoch": 0.609568812758417, + "eval_logits/chosen": -2.9238109588623047, + "eval_logits/rejected": -2.93009352684021, + "eval_logps/chosen": -304.09130859375, + "eval_logps/rejected": -360.09600830078125, + "eval_loss": 0.5451335906982422, + "eval_rewards/accuracies": 0.6174242496490479, + "eval_rewards/chosen": -0.9286060333251953, + "eval_rewards/margins": 0.6728550791740417, + "eval_rewards/rejected": -1.6014612913131714, + "eval_runtime": 226.5871, + "eval_samples_per_second": 16.135, + "eval_steps_per_second": 0.291, + "step": 129 + }, + { + "epoch": 0.6142941523922032, + "grad_norm": 2.3717766371975086, + "learning_rate": 4.05657879191093e-07, + "logits/chosen": -2.86085844039917, + "logits/rejected": -3.0779123306274414, + "logps/chosen": -353.9771423339844, + "logps/rejected": -372.76788330078125, + "loss": 0.5611, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.9203246235847473, + "rewards/margins": 0.6556491851806641, + "rewards/rejected": -1.575973629951477, + "step": 130 + }, + { + "epoch": 0.6190194920259894, + "grad_norm": 2.9558527636533025, + "learning_rate": 4.04150646865434e-07, + "logits/chosen": -2.6664750576019287, + "logits/rejected": -2.6207780838012695, + "logps/chosen": -283.3083801269531, + "logps/rejected": -348.6871032714844, + "loss": 0.5525, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8656440377235413, + "rewards/margins": 0.7297619581222534, + "rewards/rejected": -1.5954060554504395, + "step": 131 + }, + { + "epoch": 0.6237448316597756, + "grad_norm": 2.7269741124418165, + "learning_rate": 4.0263431967291934e-07, + "logits/chosen": -2.8708412647247314, + "logits/rejected": -2.836123466491699, + "logps/chosen": -251.37966918945312, + "logps/rejected": -302.320068359375, + "loss": 0.5554, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8969355821609497, + "rewards/margins": 0.4451577365398407, + "rewards/rejected": -1.3420933485031128, + "step": 132 + }, + { + "epoch": 0.6284701712935618, + "grad_norm": 2.3559260676043547, + "learning_rate": 4.011089870766437e-07, + "logits/chosen": -2.9185516834259033, + "logits/rejected": -3.0862460136413574, + "logps/chosen": -340.1504211425781, + "logps/rejected": -376.01629638671875, + "loss": 0.5223, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8409585952758789, + "rewards/margins": 0.9160802364349365, + "rewards/rejected": -1.7570387125015259, + "step": 133 + }, + { + "epoch": 0.6331955109273479, + "grad_norm": 2.551925501157886, + "learning_rate": 3.995747390710196e-07, + "logits/chosen": -2.895480155944824, + "logits/rejected": -2.939868211746216, + "logps/chosen": -323.12164306640625, + "logps/rejected": -397.951171875, + "loss": 0.5385, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.1145464181900024, + "rewards/margins": 1.0304715633392334, + "rewards/rejected": -2.1450178623199463, + "step": 134 + }, + { + "epoch": 0.6379208505611341, + "grad_norm": 3.0639374327823625, + "learning_rate": 3.98031666176467e-07, + "logits/chosen": -3.221116781234741, + "logits/rejected": -3.125380516052246, + "logps/chosen": -277.20684814453125, + "logps/rejected": -368.4245300292969, + "loss": 0.5453, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9265705943107605, + "rewards/margins": 0.8099436163902283, + "rewards/rejected": -1.7365142107009888, + "step": 135 + }, + { + "epoch": 0.6426461901949203, + "grad_norm": 3.0229894732050537, + "learning_rate": 3.9647985943407345e-07, + "logits/chosen": -2.7229156494140625, + "logits/rejected": -2.624408006668091, + "logps/chosen": -296.76507568359375, + "logps/rejected": -366.4635925292969, + "loss": 0.5446, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.911621630191803, + "rewards/margins": 0.3694719672203064, + "rewards/rejected": -1.2810935974121094, + "step": 136 + }, + { + "epoch": 0.6473715298287065, + "grad_norm": 2.777240365107515, + "learning_rate": 3.949194104002224e-07, + "logits/chosen": -3.008553981781006, + "logits/rejected": -3.0245308876037598, + "logps/chosen": -278.2191162109375, + "logps/rejected": -372.57672119140625, + "loss": 0.5333, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.9170963168144226, + "rewards/margins": 0.91133052110672, + "rewards/rejected": -1.8284270763397217, + "step": 137 + }, + { + "epoch": 0.6520968694624926, + "grad_norm": 2.5983010643513222, + "learning_rate": 3.93350411141191e-07, + "logits/chosen": -2.984111785888672, + "logits/rejected": -2.9940693378448486, + "logps/chosen": -256.7328796386719, + "logps/rejected": -304.14361572265625, + "loss": 0.5481, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8156505227088928, + "rewards/margins": 0.6031564474105835, + "rewards/rejected": -1.418807029724121, + "step": 138 + }, + { + "epoch": 0.6568222090962788, + "grad_norm": 2.758553605218817, + "learning_rate": 3.917729542277187e-07, + "logits/chosen": -2.739635944366455, + "logits/rejected": -2.8373708724975586, + "logps/chosen": -355.6564025878906, + "logps/rejected": -444.02471923828125, + "loss": 0.5188, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.9943738579750061, + "rewards/margins": 0.8635731339454651, + "rewards/rejected": -1.8579471111297607, + "step": 139 + }, + { + "epoch": 0.6615475487300649, + "grad_norm": 2.5137621293297605, + "learning_rate": 3.901871327295453e-07, + "logits/chosen": -2.6592538356781006, + "logits/rejected": -2.8847031593322754, + "logps/chosen": -330.49609375, + "logps/rejected": -349.5611572265625, + "loss": 0.5406, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.8463073968887329, + "rewards/margins": 0.6520651578903198, + "rewards/rejected": -1.4983725547790527, + "step": 140 + }, + { + "epoch": 0.6662728883638511, + "grad_norm": 2.285013562717476, + "learning_rate": 3.885930402099199e-07, + "logits/chosen": -2.5882949829101562, + "logits/rejected": -2.6650753021240234, + "logps/chosen": -335.5899658203125, + "logps/rejected": -360.06451416015625, + "loss": 0.5074, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9886775016784668, + "rewards/margins": 0.6825906038284302, + "rewards/rejected": -1.6712682247161865, + "step": 141 + }, + { + "epoch": 0.6709982279976373, + "grad_norm": 2.554720405760107, + "learning_rate": 3.8699077072008085e-07, + "logits/chosen": -2.8670525550842285, + "logits/rejected": -2.866511344909668, + "logps/chosen": -271.18243408203125, + "logps/rejected": -343.53582763671875, + "loss": 0.5163, + "rewards/accuracies": 0.578125, + "rewards/chosen": -1.0047590732574463, + "rewards/margins": 0.5558298826217651, + "rewards/rejected": -1.5605889558792114, + "step": 142 + }, + { + "epoch": 0.6757235676314235, + "grad_norm": 2.830180877879511, + "learning_rate": 3.8538041879370657e-07, + "logits/chosen": -3.037707567214966, + "logits/rejected": -3.063495397567749, + "logps/chosen": -347.0140380859375, + "logps/rejected": -392.6544494628906, + "loss": 0.5505, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9989163875579834, + "rewards/margins": 0.8129914402961731, + "rewards/rejected": -1.8119077682495117, + "step": 143 + }, + { + "epoch": 0.6804489072652097, + "grad_norm": 2.6000164530484224, + "learning_rate": 3.8376207944133817e-07, + "logits/chosen": -3.087387800216675, + "logits/rejected": -3.079148530960083, + "logps/chosen": -316.996337890625, + "logps/rejected": -361.9954528808594, + "loss": 0.5365, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.111214518547058, + "rewards/margins": 0.5254876017570496, + "rewards/rejected": -1.636702060699463, + "step": 144 + }, + { + "epoch": 0.6851742468989959, + "grad_norm": 2.6115954536025408, + "learning_rate": 3.8213584814477363e-07, + "logits/chosen": -3.111316442489624, + "logits/rejected": -3.184953451156616, + "logps/chosen": -311.6522521972656, + "logps/rejected": -325.0541076660156, + "loss": 0.4982, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.9516675472259521, + "rewards/margins": 0.6691429018974304, + "rewards/rejected": -1.6208105087280273, + "step": 145 + }, + { + "epoch": 0.689899586532782, + "grad_norm": 2.976191491134426, + "learning_rate": 3.8050182085143464e-07, + "logits/chosen": -2.9731078147888184, + "logits/rejected": -3.072920083999634, + "logps/chosen": -309.2756042480469, + "logps/rejected": -352.2410888671875, + "loss": 0.5572, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.080316185951233, + "rewards/margins": 0.8104506134986877, + "rewards/rejected": -1.8907668590545654, + "step": 146 + }, + { + "epoch": 0.6946249261665682, + "grad_norm": 2.837971176112728, + "learning_rate": 3.7886009396870564e-07, + "logits/chosen": -2.8793129920959473, + "logits/rejected": -2.8820691108703613, + "logps/chosen": -288.26116943359375, + "logps/rejected": -363.0924072265625, + "loss": 0.5536, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.1091738939285278, + "rewards/margins": 0.493495374917984, + "rewards/rejected": -1.602669358253479, + "step": 147 + }, + { + "epoch": 0.6993502658003544, + "grad_norm": 2.928245357032695, + "learning_rate": 3.7721076435824585e-07, + "logits/chosen": -2.7040960788726807, + "logits/rejected": -2.8614137172698975, + "logps/chosen": -415.0562438964844, + "logps/rejected": -420.36297607421875, + "loss": 0.5321, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.0266618728637695, + "rewards/margins": 0.8255325555801392, + "rewards/rejected": -1.8521945476531982, + "step": 148 + }, + { + "epoch": 0.7040756054341406, + "grad_norm": 2.983452098073201, + "learning_rate": 3.755539293302742e-07, + "logits/chosen": -2.614259958267212, + "logits/rejected": -2.6951889991760254, + "logps/chosen": -352.3388366699219, + "logps/rejected": -375.2248840332031, + "loss": 0.5457, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1011898517608643, + "rewards/margins": 0.613980233669281, + "rewards/rejected": -1.715169906616211, + "step": 149 + }, + { + "epoch": 0.7088009450679268, + "grad_norm": 2.6652814589714997, + "learning_rate": 3.738896866378282e-07, + "logits/chosen": -2.7506563663482666, + "logits/rejected": -2.690138339996338, + "logps/chosen": -322.7042236328125, + "logps/rejected": -373.8511047363281, + "loss": 0.516, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9171172380447388, + "rewards/margins": 0.8513570427894592, + "rewards/rejected": -1.7684742212295532, + "step": 150 + }, + { + "epoch": 0.713526284701713, + "grad_norm": 2.5934419246508096, + "learning_rate": 3.722181344709969e-07, + "logits/chosen": -2.7295525074005127, + "logits/rejected": -2.855721950531006, + "logps/chosen": -332.5438232421875, + "logps/rejected": -359.502197265625, + "loss": 0.5121, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9578840136528015, + "rewards/margins": 0.7636557221412659, + "rewards/rejected": -1.7215397357940674, + "step": 151 + }, + { + "epoch": 0.7182516243354992, + "grad_norm": 3.1206430500674975, + "learning_rate": 3.705393714511268e-07, + "logits/chosen": -2.845468282699585, + "logits/rejected": -2.7837162017822266, + "logps/chosen": -314.1913757324219, + "logps/rejected": -419.7186584472656, + "loss": 0.5321, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.9525049924850464, + "rewards/margins": 0.6129012703895569, + "rewards/rejected": -1.5654062032699585, + "step": 152 + }, + { + "epoch": 0.7229769639692853, + "grad_norm": 3.3791075672510336, + "learning_rate": 3.688534966250042e-07, + "logits/chosen": -3.007288932800293, + "logits/rejected": -2.9382283687591553, + "logps/chosen": -297.81622314453125, + "logps/rejected": -355.0559387207031, + "loss": 0.54, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8055727481842041, + "rewards/margins": 0.7948654890060425, + "rewards/rejected": -1.6004382371902466, + "step": 153 + }, + { + "epoch": 0.7277023036030714, + "grad_norm": 2.7374735457469384, + "learning_rate": 3.671606094590108e-07, + "logits/chosen": -2.7088348865509033, + "logits/rejected": -2.7453291416168213, + "logps/chosen": -337.4541931152344, + "logps/rejected": -411.91485595703125, + "loss": 0.5429, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0184537172317505, + "rewards/margins": 0.8295416235923767, + "rewards/rejected": -1.8479952812194824, + "step": 154 + }, + { + "epoch": 0.7324276432368576, + "grad_norm": 2.977388710936637, + "learning_rate": 3.6546080983325523e-07, + "logits/chosen": -2.824364185333252, + "logits/rejected": -2.911698579788208, + "logps/chosen": -308.72161865234375, + "logps/rejected": -312.67431640625, + "loss": 0.5497, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.054483413696289, + "rewards/margins": 0.345467209815979, + "rewards/rejected": -1.399950623512268, + "step": 155 + }, + { + "epoch": 0.7371529828706438, + "grad_norm": 3.297987493267062, + "learning_rate": 3.6375419803568046e-07, + "logits/chosen": -2.938750743865967, + "logits/rejected": -3.12616229057312, + "logps/chosen": -380.7288513183594, + "logps/rejected": -393.94683837890625, + "loss": 0.5366, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.0641953945159912, + "rewards/margins": 0.899326741695404, + "rewards/rejected": -1.96352219581604, + "step": 156 + }, + { + "epoch": 0.74187832250443, + "grad_norm": 2.5345279316698983, + "learning_rate": 3.6204087475614676e-07, + "logits/chosen": -2.923267364501953, + "logits/rejected": -2.8579440116882324, + "logps/chosen": -302.9458923339844, + "logps/rejected": -387.598388671875, + "loss": 0.5246, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.0088130235671997, + "rewards/margins": 0.8763782382011414, + "rewards/rejected": -1.8851913213729858, + "step": 157 + }, + { + "epoch": 0.7466036621382162, + "grad_norm": 3.13993186485945, + "learning_rate": 3.603209410804906e-07, + "logits/chosen": -2.7970800399780273, + "logits/rejected": -2.77022123336792, + "logps/chosen": -260.28485107421875, + "logps/rejected": -377.06585693359375, + "loss": 0.5166, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9031432867050171, + "rewards/margins": 0.9541431665420532, + "rewards/rejected": -1.8572864532470703, + "step": 158 + }, + { + "epoch": 0.7513290017720023, + "grad_norm": 2.777966751571199, + "learning_rate": 3.5859449848456123e-07, + "logits/chosen": -2.83420991897583, + "logits/rejected": -2.9197449684143066, + "logps/chosen": -270.993896484375, + "logps/rejected": -329.7164001464844, + "loss": 0.5438, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.9526958465576172, + "rewards/margins": 0.5992559194564819, + "rewards/rejected": -1.5519516468048096, + "step": 159 + }, + { + "epoch": 0.7560543414057885, + "grad_norm": 2.816810414145774, + "learning_rate": 3.5686164882823313e-07, + "logits/chosen": -2.4739251136779785, + "logits/rejected": -2.5660862922668457, + "logps/chosen": -325.5205383300781, + "logps/rejected": -357.9018859863281, + "loss": 0.5207, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.059773564338684, + "rewards/margins": 0.855665922164917, + "rewards/rejected": -1.9154393672943115, + "step": 160 + }, + { + "epoch": 0.7607796810395747, + "grad_norm": 2.7264976697102217, + "learning_rate": 3.5512249434939634e-07, + "logits/chosen": -3.020364284515381, + "logits/rejected": -3.1138038635253906, + "logps/chosen": -292.44451904296875, + "logps/rejected": -370.27581787109375, + "loss": 0.5324, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0052980184555054, + "rewards/margins": 0.916731059551239, + "rewards/rejected": -1.9220290184020996, + "step": 161 + }, + { + "epoch": 0.7655050206733609, + "grad_norm": 4.143376503088087, + "learning_rate": 3.533771376579249e-07, + "logits/chosen": -2.8414347171783447, + "logits/rejected": -2.8189237117767334, + "logps/chosen": -298.5760192871094, + "logps/rejected": -407.37457275390625, + "loss": 0.5111, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.9188227653503418, + "rewards/margins": 1.0365185737609863, + "rewards/rejected": -1.9553413391113281, + "step": 162 + }, + { + "epoch": 0.7702303603071471, + "grad_norm": 2.851627509613439, + "learning_rate": 3.5162568172962215e-07, + "logits/chosen": -2.737412214279175, + "logits/rejected": -2.8929431438446045, + "logps/chosen": -325.90631103515625, + "logps/rejected": -372.39410400390625, + "loss": 0.5193, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.134871244430542, + "rewards/margins": 0.6576811075210571, + "rewards/rejected": -1.7925523519515991, + "step": 163 + }, + { + "epoch": 0.7749556999409333, + "grad_norm": 3.4081328480353985, + "learning_rate": 3.498682299001459e-07, + "logits/chosen": -2.622042179107666, + "logits/rejected": -2.759326696395874, + "logps/chosen": -354.53448486328125, + "logps/rejected": -397.6759338378906, + "loss": 0.5084, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.1062355041503906, + "rewards/margins": 0.7745749354362488, + "rewards/rejected": -1.8808104991912842, + "step": 164 + }, + { + "epoch": 0.7796810395747195, + "grad_norm": 5.113522770893731, + "learning_rate": 3.4810488585891103e-07, + "logits/chosen": -2.7235350608825684, + "logits/rejected": -2.8394298553466797, + "logps/chosen": -339.3172912597656, + "logps/rejected": -403.5739440917969, + "loss": 0.5256, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0079370737075806, + "rewards/margins": 0.9361110925674438, + "rewards/rejected": -1.9440481662750244, + "step": 165 + }, + { + "epoch": 0.7844063792085056, + "grad_norm": 2.845383053447366, + "learning_rate": 3.4633575364297224e-07, + "logits/chosen": -3.0904507637023926, + "logits/rejected": -3.1447291374206543, + "logps/chosen": -308.4082336425781, + "logps/rejected": -403.6357421875, + "loss": 0.5258, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1904851198196411, + "rewards/margins": 1.1804500818252563, + "rewards/rejected": -2.3709352016448975, + "step": 166 + }, + { + "epoch": 0.7891317188422918, + "grad_norm": 3.6331311614486483, + "learning_rate": 3.445609376308857e-07, + "logits/chosen": -2.7289986610412598, + "logits/rejected": -2.64235520362854, + "logps/chosen": -351.40728759765625, + "logps/rejected": -435.42010498046875, + "loss": 0.4979, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.273624300956726, + "rewards/margins": 1.0576740503311157, + "rewards/rejected": -2.331298351287842, + "step": 167 + }, + { + "epoch": 0.793857058476078, + "grad_norm": 3.6152830110317655, + "learning_rate": 3.4278054253655086e-07, + "logits/chosen": -2.8024775981903076, + "logits/rejected": -2.853891611099243, + "logps/chosen": -322.92181396484375, + "logps/rejected": -396.2481689453125, + "loss": 0.5094, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.0655136108398438, + "rewards/margins": 0.8890769481658936, + "rewards/rejected": -1.9545905590057373, + "step": 168 + }, + { + "epoch": 0.7985823981098642, + "grad_norm": 3.119357172634371, + "learning_rate": 3.4099467340303214e-07, + "logits/chosen": -3.0272624492645264, + "logits/rejected": -3.140334129333496, + "logps/chosen": -307.60479736328125, + "logps/rejected": -376.73590087890625, + "loss": 0.5064, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1390501260757446, + "rewards/margins": 0.9899504780769348, + "rewards/rejected": -2.129000425338745, + "step": 169 + }, + { + "epoch": 0.8033077377436503, + "grad_norm": 3.457360676304788, + "learning_rate": 3.392034355963614e-07, + "logits/chosen": -2.8242180347442627, + "logits/rejected": -2.8203928470611572, + "logps/chosen": -338.4170837402344, + "logps/rejected": -375.84893798828125, + "loss": 0.5257, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.182822346687317, + "rewards/margins": 0.7907478213310242, + "rewards/rejected": -1.9735702276229858, + "step": 170 + }, + { + "epoch": 0.8080330773774365, + "grad_norm": 5.011470232701305, + "learning_rate": 3.374069347993218e-07, + "logits/chosen": -2.6921019554138184, + "logits/rejected": -2.7788760662078857, + "logps/chosen": -329.470703125, + "logps/rejected": -467.9057922363281, + "loss": 0.5396, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1266531944274902, + "rewards/margins": 0.8941323757171631, + "rewards/rejected": -2.0207855701446533, + "step": 171 + }, + { + "epoch": 0.8127584170112226, + "grad_norm": 3.7253097168704294, + "learning_rate": 3.356052770052119e-07, + "logits/chosen": -2.716782331466675, + "logits/rejected": -2.6556971073150635, + "logps/chosen": -309.70672607421875, + "logps/rejected": -426.3639221191406, + "loss": 0.5239, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2285258769989014, + "rewards/margins": 0.8051817417144775, + "rewards/rejected": -2.033707618713379, + "step": 172 + }, + { + "epoch": 0.8127584170112226, + "eval_logits/chosen": -2.9802942276000977, + "eval_logits/rejected": -2.9883527755737305, + "eval_logps/chosen": -339.8587646484375, + "eval_logps/rejected": -423.5323791503906, + "eval_loss": 0.51226407289505, + "eval_rewards/accuracies": 0.6287878751754761, + "eval_rewards/chosen": -1.2862800359725952, + "eval_rewards/margins": 0.9495444893836975, + "eval_rewards/rejected": -2.2358245849609375, + "eval_runtime": 225.4382, + "eval_samples_per_second": 16.217, + "eval_steps_per_second": 0.293, + "step": 172 + }, + { + "epoch": 0.8174837566450088, + "grad_norm": 4.039346342004204, + "learning_rate": 3.337985685115926e-07, + "logits/chosen": -2.9982471466064453, + "logits/rejected": -2.934654712677002, + "logps/chosen": -345.1435546875, + "logps/rejected": -418.8468017578125, + "loss": 0.5227, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.2590099573135376, + "rewards/margins": 0.7078821659088135, + "rewards/rejected": -1.9668920040130615, + "step": 173 + }, + { + "epoch": 0.822209096278795, + "grad_norm": 3.355839857834348, + "learning_rate": 3.319869159140152e-07, + "logits/chosen": -2.412257432937622, + "logits/rejected": -2.5528626441955566, + "logps/chosen": -313.64422607421875, + "logps/rejected": -387.2737121582031, + "loss": 0.5154, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2666703462600708, + "rewards/margins": 0.9918266534805298, + "rewards/rejected": -2.2584969997406006, + "step": 174 + }, + { + "epoch": 0.8269344359125812, + "grad_norm": 3.8883187343016528, + "learning_rate": 3.301704260997325e-07, + "logits/chosen": -2.835768222808838, + "logits/rejected": -2.861588716506958, + "logps/chosen": -310.4635314941406, + "logps/rejected": -418.094970703125, + "loss": 0.5263, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.149375557899475, + "rewards/margins": 1.2631374597549438, + "rewards/rejected": -2.41251277923584, + "step": 175 + }, + { + "epoch": 0.8316597755463674, + "grad_norm": 3.943606898054145, + "learning_rate": 3.283492062413925e-07, + "logits/chosen": -2.8773105144500732, + "logits/rejected": -2.919630527496338, + "logps/chosen": -315.5, + "logps/rejected": -414.37908935546875, + "loss": 0.5139, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2905051708221436, + "rewards/margins": 1.0659555196762085, + "rewards/rejected": -2.3564605712890625, + "step": 176 + }, + { + "epoch": 0.8363851151801536, + "grad_norm": 4.394795298842495, + "learning_rate": 3.2652336379071506e-07, + "logits/chosen": -2.7635695934295654, + "logits/rejected": -2.8052563667297363, + "logps/chosen": -371.2409973144531, + "logps/rejected": -429.0415954589844, + "loss": 0.4828, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2244281768798828, + "rewards/margins": 1.045049786567688, + "rewards/rejected": -2.2694778442382812, + "step": 177 + }, + { + "epoch": 0.8411104548139398, + "grad_norm": 3.6672724994810593, + "learning_rate": 3.246930064721523e-07, + "logits/chosen": -2.7490479946136475, + "logits/rejected": -2.6954996585845947, + "logps/chosen": -265.0568542480469, + "logps/rejected": -373.4683532714844, + "loss": 0.5134, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.1422216892242432, + "rewards/margins": 1.010473370552063, + "rewards/rejected": -2.1526949405670166, + "step": 178 + }, + { + "epoch": 0.8458357944477259, + "grad_norm": 3.438810171868027, + "learning_rate": 3.228582422765331e-07, + "logits/chosen": -2.661006212234497, + "logits/rejected": -2.756809711456299, + "logps/chosen": -363.7588806152344, + "logps/rejected": -392.8025207519531, + "loss": 0.5044, + "rewards/accuracies": 0.609375, + "rewards/chosen": -1.341508388519287, + "rewards/margins": 0.7005224227905273, + "rewards/rejected": -2.0420308113098145, + "step": 179 + }, + { + "epoch": 0.8505611340815121, + "grad_norm": 4.113182426182398, + "learning_rate": 3.2101917945469135e-07, + "logits/chosen": -2.700942277908325, + "logits/rejected": -2.82185435295105, + "logps/chosen": -364.240234375, + "logps/rejected": -429.81292724609375, + "loss": 0.4816, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1254488229751587, + "rewards/margins": 0.9122301340103149, + "rewards/rejected": -2.0376789569854736, + "step": 180 + }, + { + "epoch": 0.8552864737152983, + "grad_norm": 4.022329679746118, + "learning_rate": 3.1917592651107927e-07, + "logits/chosen": -2.8973255157470703, + "logits/rejected": -2.794524908065796, + "logps/chosen": -352.9599914550781, + "logps/rejected": -422.71197509765625, + "loss": 0.5153, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.2179690599441528, + "rewards/margins": 0.7948740124702454, + "rewards/rejected": -2.012843132019043, + "step": 181 + }, + { + "epoch": 0.8600118133490845, + "grad_norm": 3.7009474178127353, + "learning_rate": 3.173285921973657e-07, + "logits/chosen": -2.793835401535034, + "logits/rejected": -2.7703464031219482, + "logps/chosen": -320.6808776855469, + "logps/rejected": -454.6962890625, + "loss": 0.512, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.1430678367614746, + "rewards/margins": 1.6093158721923828, + "rewards/rejected": -2.7523837089538574, + "step": 182 + }, + { + "epoch": 0.8647371529828707, + "grad_norm": 3.2328530891675182, + "learning_rate": 3.1547728550601983e-07, + "logits/chosen": -2.6808881759643555, + "logits/rejected": -2.676565170288086, + "logps/chosen": -301.3103332519531, + "logps/rejected": -402.32281494140625, + "loss": 0.5161, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1162338256835938, + "rewards/margins": 1.0167958736419678, + "rewards/rejected": -2.1330299377441406, + "step": 183 + }, + { + "epoch": 0.8694624926166569, + "grad_norm": 3.899851002215307, + "learning_rate": 3.1362211566388057e-07, + "logits/chosen": -2.8450677394866943, + "logits/rejected": -2.836167097091675, + "logps/chosen": -341.335693359375, + "logps/rejected": -407.4953918457031, + "loss": 0.5422, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.2810035943984985, + "rewards/margins": 0.6536043882369995, + "rewards/rejected": -1.934607982635498, + "step": 184 + }, + { + "epoch": 0.874187832250443, + "grad_norm": 3.2712338921521518, + "learning_rate": 3.1176319212571204e-07, + "logits/chosen": -2.405541181564331, + "logits/rejected": -2.4331328868865967, + "logps/chosen": -288.65325927734375, + "logps/rejected": -364.7303466796875, + "loss": 0.5114, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2161575555801392, + "rewards/margins": 0.9309273362159729, + "rewards/rejected": -2.147084951400757, + "step": 185 + }, + { + "epoch": 0.8789131718842291, + "grad_norm": 3.8818256203788852, + "learning_rate": 3.099006245677461e-07, + "logits/chosen": -2.4217336177825928, + "logits/rejected": -2.360973358154297, + "logps/chosen": -381.958740234375, + "logps/rejected": -487.387451171875, + "loss": 0.5393, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2569398880004883, + "rewards/margins": 0.7359199523925781, + "rewards/rejected": -1.9928598403930664, + "step": 186 + }, + { + "epoch": 0.8836385115180153, + "grad_norm": 3.241850575925625, + "learning_rate": 3.0803452288121113e-07, + "logits/chosen": -2.6186816692352295, + "logits/rejected": -2.5035552978515625, + "logps/chosen": -356.3555603027344, + "logps/rejected": -538.8609619140625, + "loss": 0.4985, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2594354152679443, + "rewards/margins": 1.667135238647461, + "rewards/rejected": -2.9265708923339844, + "step": 187 + }, + { + "epoch": 0.8883638511518015, + "grad_norm": 2.994070703824197, + "learning_rate": 3.0616499716584874e-07, + "logits/chosen": -2.936795473098755, + "logits/rejected": -2.855689525604248, + "logps/chosen": -339.5811767578125, + "logps/rejected": -430.01483154296875, + "loss": 0.5116, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1258800029754639, + "rewards/margins": 0.6621576547622681, + "rewards/rejected": -1.7880375385284424, + "step": 188 + }, + { + "epoch": 0.8930891907855877, + "grad_norm": 3.4626212317934146, + "learning_rate": 3.042921577234177e-07, + "logits/chosen": -2.6947526931762695, + "logits/rejected": -2.8166098594665527, + "logps/chosen": -328.1815185546875, + "logps/rejected": -340.3358154296875, + "loss": 0.5048, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0728776454925537, + "rewards/margins": 0.6414874792098999, + "rewards/rejected": -1.7143651247024536, + "step": 189 + }, + { + "epoch": 0.8978145304193739, + "grad_norm": 3.1647602539221182, + "learning_rate": 3.024161150511861e-07, + "logits/chosen": -2.9085636138916016, + "logits/rejected": -3.0228068828582764, + "logps/chosen": -323.0364074707031, + "logps/rejected": -358.6117248535156, + "loss": 0.5248, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.1753543615341187, + "rewards/margins": 0.9408324956893921, + "rewards/rejected": -2.1161868572235107, + "step": 190 + }, + { + "epoch": 0.90253987005316, + "grad_norm": 3.025256770279786, + "learning_rate": 3.0053697983541247e-07, + "logits/chosen": -2.545339345932007, + "logits/rejected": -2.5873234272003174, + "logps/chosen": -364.4934387207031, + "logps/rejected": -404.4205322265625, + "loss": 0.5114, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3082777261734009, + "rewards/margins": 0.9447382688522339, + "rewards/rejected": -2.2530159950256348, + "step": 191 + }, + { + "epoch": 0.9072652096869462, + "grad_norm": 3.2107060614205682, + "learning_rate": 2.986548629448146e-07, + "logits/chosen": -2.5320749282836914, + "logits/rejected": -2.6858551502227783, + "logps/chosen": -357.6763916015625, + "logps/rejected": -418.71173095703125, + "loss": 0.5194, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.0752955675125122, + "rewards/margins": 1.176898717880249, + "rewards/rejected": -2.252194404602051, + "step": 192 + }, + { + "epoch": 0.9119905493207324, + "grad_norm": 4.290813644018959, + "learning_rate": 2.967698754240289e-07, + "logits/chosen": -2.6695892810821533, + "logits/rejected": -2.660761833190918, + "logps/chosen": -347.8155517578125, + "logps/rejected": -403.8181457519531, + "loss": 0.5543, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.207077145576477, + "rewards/margins": 0.7051151394844055, + "rewards/rejected": -1.9121922254562378, + "step": 193 + }, + { + "epoch": 0.9167158889545186, + "grad_norm": 3.656365041410898, + "learning_rate": 2.948821284870585e-07, + "logits/chosen": -3.009221315383911, + "logits/rejected": -2.9382755756378174, + "logps/chosen": -338.1603698730469, + "logps/rejected": -425.35931396484375, + "loss": 0.5007, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3645328283309937, + "rewards/margins": 0.9410390257835388, + "rewards/rejected": -2.305572032928467, + "step": 194 + }, + { + "epoch": 0.9214412285883048, + "grad_norm": 3.166267065915439, + "learning_rate": 2.9299173351071176e-07, + "logits/chosen": -2.544590950012207, + "logits/rejected": -2.5580813884735107, + "logps/chosen": -375.34112548828125, + "logps/rejected": -437.5019836425781, + "loss": 0.5057, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1270973682403564, + "rewards/margins": 0.8198693990707397, + "rewards/rejected": -1.9469666481018066, + "step": 195 + }, + { + "epoch": 0.926166568222091, + "grad_norm": 2.9416082082853805, + "learning_rate": 2.9109880202803097e-07, + "logits/chosen": -2.5898144245147705, + "logits/rejected": -2.590017795562744, + "logps/chosen": -325.3600769042969, + "logps/rejected": -417.5689392089844, + "loss": 0.5251, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1922552585601807, + "rewards/margins": 1.2383495569229126, + "rewards/rejected": -2.4306044578552246, + "step": 196 + }, + { + "epoch": 0.9308919078558772, + "grad_norm": 4.344454107439186, + "learning_rate": 2.892034457217119e-07, + "logits/chosen": -2.793138027191162, + "logits/rejected": -2.6765213012695312, + "logps/chosen": -393.07080078125, + "logps/rejected": -508.9129638671875, + "loss": 0.5302, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.202684998512268, + "rewards/margins": 1.2748346328735352, + "rewards/rejected": -2.4775197505950928, + "step": 197 + }, + { + "epoch": 0.9356172474896634, + "grad_norm": 4.341995602773921, + "learning_rate": 2.8730577641751474e-07, + "logits/chosen": -2.5986645221710205, + "logits/rejected": -2.7119102478027344, + "logps/chosen": -301.242431640625, + "logps/rejected": -423.28619384765625, + "loss": 0.5145, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1418884992599487, + "rewards/margins": 1.2161611318588257, + "rewards/rejected": -2.3580498695373535, + "step": 198 + }, + { + "epoch": 0.9403425871234495, + "grad_norm": 3.5092494052777536, + "learning_rate": 2.854059060776659e-07, + "logits/chosen": -2.4758505821228027, + "logits/rejected": -2.5398991107940674, + "logps/chosen": -311.9541015625, + "logps/rejected": -413.2162780761719, + "loss": 0.4867, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.084080457687378, + "rewards/margins": 1.512494683265686, + "rewards/rejected": -2.5965750217437744, + "step": 199 + }, + { + "epoch": 0.9450679267572357, + "grad_norm": 3.5187855585130294, + "learning_rate": 2.835039467942529e-07, + "logits/chosen": -2.5920052528381348, + "logits/rejected": -2.7105183601379395, + "logps/chosen": -340.66351318359375, + "logps/rejected": -416.5780944824219, + "loss": 0.5222, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0971770286560059, + "rewards/margins": 0.9351829290390015, + "rewards/rejected": -2.032360315322876, + "step": 200 + }, + { + "epoch": 0.9497932663910219, + "grad_norm": 3.793717608121411, + "learning_rate": 2.8160001078261055e-07, + "logits/chosen": -2.498663902282715, + "logits/rejected": -2.4728269577026367, + "logps/chosen": -301.9998474121094, + "logps/rejected": -426.3241271972656, + "loss": 0.5251, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.031512975692749, + "rewards/margins": 1.083221197128296, + "rewards/rejected": -2.114734172821045, + "step": 201 + }, + { + "epoch": 0.954518606024808, + "grad_norm": 2.9737105768220244, + "learning_rate": 2.7969421037470033e-07, + "logits/chosen": -2.7715539932250977, + "logits/rejected": -2.7849650382995605, + "logps/chosen": -345.68963623046875, + "logps/rejected": -444.9283752441406, + "loss": 0.5044, + "rewards/accuracies": 0.609375, + "rewards/chosen": -1.262139081954956, + "rewards/margins": 0.9420502185821533, + "rewards/rejected": -2.2041893005371094, + "step": 202 + }, + { + "epoch": 0.9592439456585942, + "grad_norm": 3.4484596414693893, + "learning_rate": 2.777866580124829e-07, + "logits/chosen": -2.491079330444336, + "logits/rejected": -2.7043981552124023, + "logps/chosen": -395.48211669921875, + "logps/rejected": -463.1803894042969, + "loss": 0.5037, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2879524230957031, + "rewards/margins": 0.926118791103363, + "rewards/rejected": -2.214071273803711, + "step": 203 + }, + { + "epoch": 0.9639692852923804, + "grad_norm": 3.553922161828863, + "learning_rate": 2.758774662412838e-07, + "logits/chosen": -2.5517563819885254, + "logits/rejected": -2.4807627201080322, + "logps/chosen": -305.34637451171875, + "logps/rejected": -486.2059631347656, + "loss": 0.5027, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.346076250076294, + "rewards/margins": 1.699803352355957, + "rewards/rejected": -3.04587984085083, + "step": 204 + }, + { + "epoch": 0.9686946249261665, + "grad_norm": 3.191519565656703, + "learning_rate": 2.739667477031538e-07, + "logits/chosen": -2.7191619873046875, + "logits/rejected": -2.7111451625823975, + "logps/chosen": -344.4493408203125, + "logps/rejected": -480.70135498046875, + "loss": 0.4982, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2324879169464111, + "rewards/margins": 1.2888904809951782, + "rewards/rejected": -2.521378517150879, + "step": 205 + }, + { + "epoch": 0.9734199645599527, + "grad_norm": 4.672920315021906, + "learning_rate": 2.7205461513022233e-07, + "logits/chosen": -2.0949220657348633, + "logits/rejected": -2.1367533206939697, + "logps/chosen": -390.1219482421875, + "logps/rejected": -415.38037109375, + "loss": 0.4815, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.0024380683898926, + "rewards/margins": 0.9189928770065308, + "rewards/rejected": -1.9214308261871338, + "step": 206 + }, + { + "epoch": 0.9781453041937389, + "grad_norm": 3.5306370227740262, + "learning_rate": 2.70141181338047e-07, + "logits/chosen": -2.406205177307129, + "logits/rejected": -2.38879132270813, + "logps/chosen": -356.2658386230469, + "logps/rejected": -456.2667236328125, + "loss": 0.4946, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.220351219177246, + "rewards/margins": 1.0728812217712402, + "rewards/rejected": -2.2932324409484863, + "step": 207 + }, + { + "epoch": 0.9828706438275251, + "grad_norm": 3.479933152007338, + "learning_rate": 2.6822655921895693e-07, + "logits/chosen": -2.446387767791748, + "logits/rejected": -2.346810817718506, + "logps/chosen": -312.9122619628906, + "logps/rejected": -459.10040283203125, + "loss": 0.5015, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.142736792564392, + "rewards/margins": 1.2611982822418213, + "rewards/rejected": -2.403934955596924, + "step": 208 + }, + { + "epoch": 0.9875959834613113, + "grad_norm": 3.8539520740205018, + "learning_rate": 2.663108617353926e-07, + "logits/chosen": -2.7008585929870605, + "logits/rejected": -2.7585320472717285, + "logps/chosen": -410.00634765625, + "logps/rejected": -455.63812255859375, + "loss": 0.5103, + "rewards/accuracies": 0.609375, + "rewards/chosen": -1.2332756519317627, + "rewards/margins": 0.693227231502533, + "rewards/rejected": -1.9265029430389404, + "step": 209 + }, + { + "epoch": 0.9923213230950975, + "grad_norm": 3.845322423212567, + "learning_rate": 2.6439420191324064e-07, + "logits/chosen": -2.462545871734619, + "logits/rejected": -2.5116636753082275, + "logps/chosen": -305.0135803222656, + "logps/rejected": -405.6048583984375, + "loss": 0.4911, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.231694221496582, + "rewards/margins": 0.9429072141647339, + "rewards/rejected": -2.1746013164520264, + "step": 210 + }, + { + "epoch": 0.9970466627288836, + "grad_norm": 5.6022942181300905, + "learning_rate": 2.6247669283516556e-07, + "logits/chosen": -2.544132947921753, + "logits/rejected": -2.5758156776428223, + "logps/chosen": -366.85009765625, + "logps/rejected": -442.0340881347656, + "loss": 0.5449, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.4171723127365112, + "rewards/margins": 1.0115364789962769, + "rewards/rejected": -2.428708791732788, + "step": 211 + }, + { + "epoch": 1.0017720023626697, + "grad_norm": 3.186023679965871, + "learning_rate": 2.60558447633938e-07, + "logits/chosen": -2.244544506072998, + "logits/rejected": -2.3220551013946533, + "logps/chosen": -366.2366638183594, + "logps/rejected": -452.18707275390625, + "loss": 0.5058, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4039456844329834, + "rewards/margins": 1.031445860862732, + "rewards/rejected": -2.435391426086426, + "step": 212 + }, + { + "epoch": 1.006497341996456, + "grad_norm": 3.6450488482030523, + "learning_rate": 2.5863957948575963e-07, + "logits/chosen": -2.3453848361968994, + "logits/rejected": -2.468796968460083, + "logps/chosen": -326.9305114746094, + "logps/rejected": -368.9020690917969, + "loss": 0.4954, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1048694849014282, + "rewards/margins": 0.7470109462738037, + "rewards/rejected": -1.8518803119659424, + "step": 213 + }, + { + "epoch": 1.011222681630242, + "grad_norm": 4.137227043648897, + "learning_rate": 2.567202016035859e-07, + "logits/chosen": -2.5369303226470947, + "logits/rejected": -2.535897731781006, + "logps/chosen": -310.4713439941406, + "logps/rejected": -399.39398193359375, + "loss": 0.4987, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2701810598373413, + "rewards/margins": 0.7798756957054138, + "rewards/rejected": -2.0500564575195312, + "step": 214 + }, + { + "epoch": 1.0159480212640284, + "grad_norm": 4.02450997349156, + "learning_rate": 2.5480042723044653e-07, + "logits/chosen": -2.567810535430908, + "logits/rejected": -2.575981378555298, + "logps/chosen": -350.37957763671875, + "logps/rejected": -428.45745849609375, + "loss": 0.4668, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.4323270320892334, + "rewards/margins": 1.0882251262664795, + "rewards/rejected": -2.520552396774292, + "step": 215 + }, + { + "epoch": 1.0159480212640284, + "eval_logits/chosen": -2.584289789199829, + "eval_logits/rejected": -2.5910158157348633, + "eval_logps/chosen": -361.1751708984375, + "eval_logps/rejected": -463.7195129394531, + "eval_loss": 0.49447911977767944, + "eval_rewards/accuracies": 0.6439393758773804, + "eval_rewards/chosen": -1.499444603919983, + "eval_rewards/margins": 1.138251781463623, + "eval_rewards/rejected": -2.6376962661743164, + "eval_runtime": 224.7997, + "eval_samples_per_second": 16.263, + "eval_steps_per_second": 0.294, + "step": 215 + }, + { + "epoch": 1.0206733608978145, + "grad_norm": 4.359766282431931, + "learning_rate": 2.5288036963276414e-07, + "logits/chosen": -2.4916322231292725, + "logits/rejected": -2.482870101928711, + "logps/chosen": -297.8340759277344, + "logps/rejected": -387.14703369140625, + "loss": 0.4968, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1933519840240479, + "rewards/margins": 1.0262653827667236, + "rewards/rejected": -2.2196173667907715, + "step": 216 + }, + { + "epoch": 1.0253987005316008, + "grad_norm": 3.593941578167617, + "learning_rate": 2.509601420936717e-07, + "logits/chosen": -2.5306057929992676, + "logits/rejected": -2.440415382385254, + "logps/chosen": -341.2530212402344, + "logps/rejected": -464.18487548828125, + "loss": 0.488, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4438025951385498, + "rewards/margins": 1.340946078300476, + "rewards/rejected": -2.7847485542297363, + "step": 217 + }, + { + "epoch": 1.0301240401653868, + "grad_norm": 3.664745354346814, + "learning_rate": 2.490398579063283e-07, + "logits/chosen": -2.6102001667022705, + "logits/rejected": -2.529940128326416, + "logps/chosen": -345.69677734375, + "logps/rejected": -484.93035888671875, + "loss": 0.5001, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.5673726797103882, + "rewards/margins": 1.314273476600647, + "rewards/rejected": -2.8816463947296143, + "step": 218 + }, + { + "epoch": 1.0348493797991731, + "grad_norm": 5.084728530421535, + "learning_rate": 2.4711963036723583e-07, + "logits/chosen": -2.3622119426727295, + "logits/rejected": -2.365530490875244, + "logps/chosen": -367.52984619140625, + "logps/rejected": -406.59375, + "loss": 0.4928, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4226937294006348, + "rewards/margins": 0.49759745597839355, + "rewards/rejected": -1.9202911853790283, + "step": 219 + }, + { + "epoch": 1.0395747194329592, + "grad_norm": 3.6814125755312093, + "learning_rate": 2.451995727695535e-07, + "logits/chosen": -2.4771206378936768, + "logits/rejected": -2.5038909912109375, + "logps/chosen": -365.91815185546875, + "logps/rejected": -477.00347900390625, + "loss": 0.4837, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5666453838348389, + "rewards/margins": 1.2746491432189941, + "rewards/rejected": -2.841294527053833, + "step": 220 + }, + { + "epoch": 1.0443000590667455, + "grad_norm": 4.348311431123137, + "learning_rate": 2.432797983964141e-07, + "logits/chosen": -2.514589786529541, + "logits/rejected": -2.460106134414673, + "logps/chosen": -358.013427734375, + "logps/rejected": -444.3507080078125, + "loss": 0.4922, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.4816346168518066, + "rewards/margins": 0.9881500601768494, + "rewards/rejected": -2.469784736633301, + "step": 221 + }, + { + "epoch": 1.0490253987005316, + "grad_norm": 4.474335447495181, + "learning_rate": 2.413604205142404e-07, + "logits/chosen": -2.2786381244659424, + "logits/rejected": -2.371419668197632, + "logps/chosen": -377.5779113769531, + "logps/rejected": -431.80316162109375, + "loss": 0.5267, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.8127973079681396, + "rewards/margins": 0.9888743162155151, + "rewards/rejected": -2.8016717433929443, + "step": 222 + }, + { + "epoch": 1.0537507383343179, + "grad_norm": 3.9171412475826557, + "learning_rate": 2.3944155236606196e-07, + "logits/chosen": -2.4430832862854004, + "logits/rejected": -2.409916639328003, + "logps/chosen": -344.40399169921875, + "logps/rejected": -461.02349853515625, + "loss": 0.4846, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4063448905944824, + "rewards/margins": 1.1203367710113525, + "rewards/rejected": -2.526681661605835, + "step": 223 + }, + { + "epoch": 1.058476077968104, + "grad_norm": 5.279702970403167, + "learning_rate": 2.3752330716483444e-07, + "logits/chosen": -2.7835280895233154, + "logits/rejected": -2.7787797451019287, + "logps/chosen": -353.5722351074219, + "logps/rejected": -415.2908935546875, + "loss": 0.5057, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6296898126602173, + "rewards/margins": 0.9214704632759094, + "rewards/rejected": -2.5511598587036133, + "step": 224 + }, + { + "epoch": 1.0632014176018902, + "grad_norm": 3.8586230520693157, + "learning_rate": 2.356057980867594e-07, + "logits/chosen": -2.547018527984619, + "logits/rejected": -2.5479323863983154, + "logps/chosen": -411.9010009765625, + "logps/rejected": -543.1708984375, + "loss": 0.4834, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.6013555526733398, + "rewards/margins": 1.6427891254425049, + "rewards/rejected": -3.244144916534424, + "step": 225 + }, + { + "epoch": 1.0679267572356763, + "grad_norm": 3.836310963297125, + "learning_rate": 2.3368913826460742e-07, + "logits/chosen": -2.622213840484619, + "logits/rejected": -2.699857473373413, + "logps/chosen": -383.7267150878906, + "logps/rejected": -406.25152587890625, + "loss": 0.5085, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.4025287628173828, + "rewards/margins": 0.56195068359375, + "rewards/rejected": -1.9644795656204224, + "step": 226 + }, + { + "epoch": 1.0726520968694624, + "grad_norm": 4.999193459222086, + "learning_rate": 2.3177344078104305e-07, + "logits/chosen": -2.6717772483825684, + "logits/rejected": -2.672889232635498, + "logps/chosen": -346.08544921875, + "logps/rejected": -417.90545654296875, + "loss": 0.4906, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.6554023027420044, + "rewards/margins": 0.8562977313995361, + "rewards/rejected": -2.51170015335083, + "step": 227 + }, + { + "epoch": 1.0773774365032487, + "grad_norm": 4.6410271396992595, + "learning_rate": 2.2985881866195304e-07, + "logits/chosen": -2.646639347076416, + "logits/rejected": -2.7165169715881348, + "logps/chosen": -355.2235107421875, + "logps/rejected": -395.57550048828125, + "loss": 0.5089, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.3660098314285278, + "rewards/margins": 0.7286862134933472, + "rewards/rejected": -2.094696044921875, + "step": 228 + }, + { + "epoch": 1.0821027761370348, + "grad_norm": 3.9004711748243412, + "learning_rate": 2.2794538486977765e-07, + "logits/chosen": -2.573826313018799, + "logits/rejected": -2.6808300018310547, + "logps/chosen": -355.0180969238281, + "logps/rejected": -473.90252685546875, + "loss": 0.4922, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4073047637939453, + "rewards/margins": 1.1750006675720215, + "rewards/rejected": -2.582305431365967, + "step": 229 + }, + { + "epoch": 1.086828115770821, + "grad_norm": 3.6316010874552704, + "learning_rate": 2.2603325229684628e-07, + "logits/chosen": -2.4409735202789307, + "logits/rejected": -2.4068901538848877, + "logps/chosen": -306.1089172363281, + "logps/rejected": -447.4771423339844, + "loss": 0.4552, + "rewards/accuracies": 0.609375, + "rewards/chosen": -1.4156140089035034, + "rewards/margins": 1.0425467491149902, + "rewards/rejected": -2.458160877227783, + "step": 230 + }, + { + "epoch": 1.0915534554046071, + "grad_norm": 4.420363258356965, + "learning_rate": 2.2412253375871618e-07, + "logits/chosen": -2.681562662124634, + "logits/rejected": -2.5477468967437744, + "logps/chosen": -338.50775146484375, + "logps/rejected": -500.01568603515625, + "loss": 0.4909, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4533276557922363, + "rewards/margins": 1.5830377340316772, + "rewards/rejected": -3.036365270614624, + "step": 231 + }, + { + "epoch": 1.0962787950383934, + "grad_norm": 3.3398932376367116, + "learning_rate": 2.2221334198751717e-07, + "logits/chosen": -2.566534996032715, + "logits/rejected": -2.663203716278076, + "logps/chosen": -423.2144470214844, + "logps/rejected": -525.0465087890625, + "loss": 0.4902, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.552110195159912, + "rewards/margins": 1.6297154426574707, + "rewards/rejected": -3.181825876235962, + "step": 232 + }, + { + "epoch": 1.1010041346721795, + "grad_norm": 3.258958216613353, + "learning_rate": 2.2030578962529964e-07, + "logits/chosen": -2.533161163330078, + "logits/rejected": -2.598869800567627, + "logps/chosen": -384.85443115234375, + "logps/rejected": -436.78302001953125, + "loss": 0.4943, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.1795471906661987, + "rewards/margins": 0.7562814354896545, + "rewards/rejected": -1.935828685760498, + "step": 233 + }, + { + "epoch": 1.1057294743059658, + "grad_norm": 4.3977570998277695, + "learning_rate": 2.1839998921738948e-07, + "logits/chosen": -2.682744026184082, + "logits/rejected": -2.8130013942718506, + "logps/chosen": -375.19012451171875, + "logps/rejected": -446.29486083984375, + "loss": 0.4962, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.338588833808899, + "rewards/margins": 0.9564355611801147, + "rewards/rejected": -2.2950241565704346, + "step": 234 + }, + { + "epoch": 1.1104548139397519, + "grad_norm": 3.711007822076339, + "learning_rate": 2.1649605320574715e-07, + "logits/chosen": -2.770697832107544, + "logits/rejected": -2.7958450317382812, + "logps/chosen": -263.7291259765625, + "logps/rejected": -410.407470703125, + "loss": 0.4984, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2530757188796997, + "rewards/margins": 1.3193039894104004, + "rewards/rejected": -2.5723795890808105, + "step": 235 + }, + { + "epoch": 1.1151801535735382, + "grad_norm": 5.205159446883962, + "learning_rate": 2.1459409392233414e-07, + "logits/chosen": -2.509124279022217, + "logits/rejected": -2.4804513454437256, + "logps/chosen": -431.5054931640625, + "logps/rejected": -569.149169921875, + "loss": 0.4849, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6995575428009033, + "rewards/margins": 1.6153841018676758, + "rewards/rejected": -3.3149421215057373, + "step": 236 + }, + { + "epoch": 1.1199054932073242, + "grad_norm": 3.6720237761578276, + "learning_rate": 2.1269422358248534e-07, + "logits/chosen": -2.2790334224700928, + "logits/rejected": -2.5470128059387207, + "logps/chosen": -368.87750244140625, + "logps/rejected": -383.09478759765625, + "loss": 0.4829, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3113298416137695, + "rewards/margins": 0.9517258405685425, + "rewards/rejected": -2.2630558013916016, + "step": 237 + }, + { + "epoch": 1.1246308328411105, + "grad_norm": 4.38203569485804, + "learning_rate": 2.1079655427828804e-07, + "logits/chosen": -2.484546184539795, + "logits/rejected": -2.513195753097534, + "logps/chosen": -329.1628723144531, + "logps/rejected": -448.17327880859375, + "loss": 0.4783, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3690838813781738, + "rewards/margins": 1.233577847480774, + "rewards/rejected": -2.602661609649658, + "step": 238 + }, + { + "epoch": 1.1293561724748966, + "grad_norm": 4.1915678394143665, + "learning_rate": 2.0890119797196904e-07, + "logits/chosen": -2.489327907562256, + "logits/rejected": -2.619502544403076, + "logps/chosen": -367.636474609375, + "logps/rejected": -442.066650390625, + "loss": 0.4619, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.260221004486084, + "rewards/margins": 1.034293293952942, + "rewards/rejected": -2.2945144176483154, + "step": 239 + }, + { + "epoch": 1.1340815121086827, + "grad_norm": 4.912500043289106, + "learning_rate": 2.0700826648928827e-07, + "logits/chosen": -2.445549964904785, + "logits/rejected": -2.4925429821014404, + "logps/chosen": -403.63580322265625, + "logps/rejected": -492.6475830078125, + "loss": 0.4837, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4245916604995728, + "rewards/margins": 1.291830062866211, + "rewards/rejected": -2.7164220809936523, + "step": 240 + }, + { + "epoch": 1.138806851742469, + "grad_norm": 3.973403985509937, + "learning_rate": 2.0511787151294153e-07, + "logits/chosen": -2.6269099712371826, + "logits/rejected": -2.6633949279785156, + "logps/chosen": -362.7601318359375, + "logps/rejected": -465.685791015625, + "loss": 0.4772, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.3287506103515625, + "rewards/margins": 1.2157796621322632, + "rewards/rejected": -2.544530153274536, + "step": 241 + }, + { + "epoch": 1.1435321913762553, + "grad_norm": 3.706128513697955, + "learning_rate": 2.0323012457597113e-07, + "logits/chosen": -2.6716468334198, + "logits/rejected": -2.5537989139556885, + "logps/chosen": -313.87188720703125, + "logps/rejected": -456.5760498046875, + "loss": 0.4663, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.3276413679122925, + "rewards/margins": 1.286893367767334, + "rewards/rejected": -2.614534854888916, + "step": 242 + }, + { + "epoch": 1.1482575310100414, + "grad_norm": 4.436897843537744, + "learning_rate": 2.0134513705518544e-07, + "logits/chosen": -2.442168951034546, + "logits/rejected": -2.475062608718872, + "logps/chosen": -349.3672180175781, + "logps/rejected": -437.71075439453125, + "loss": 0.4738, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4281671047210693, + "rewards/margins": 1.169339656829834, + "rewards/rejected": -2.5975069999694824, + "step": 243 + }, + { + "epoch": 1.1529828706438274, + "grad_norm": 3.739187250501708, + "learning_rate": 1.9946302016458754e-07, + "logits/chosen": -2.5069191455841064, + "logits/rejected": -2.4427642822265625, + "logps/chosen": -378.0785827636719, + "logps/rejected": -515.2130126953125, + "loss": 0.4537, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5397615432739258, + "rewards/margins": 1.4216468334197998, + "rewards/rejected": -2.9614078998565674, + "step": 244 + }, + { + "epoch": 1.1577082102776137, + "grad_norm": 4.618667609463329, + "learning_rate": 1.975838849488139e-07, + "logits/chosen": -2.6840784549713135, + "logits/rejected": -2.608031988143921, + "logps/chosen": -353.92388916015625, + "logps/rejected": -443.1690368652344, + "loss": 0.4789, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5656286478042603, + "rewards/margins": 0.7766789793968201, + "rewards/rejected": -2.3423075675964355, + "step": 245 + }, + { + "epoch": 1.1624335499113998, + "grad_norm": 3.85370799421685, + "learning_rate": 1.957078422765823e-07, + "logits/chosen": -2.512361764907837, + "logits/rejected": -2.664332389831543, + "logps/chosen": -423.3899230957031, + "logps/rejected": -493.901123046875, + "loss": 0.508, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5689921379089355, + "rewards/margins": 1.2216757535934448, + "rewards/rejected": -2.790667772293091, + "step": 246 + }, + { + "epoch": 1.167158889545186, + "grad_norm": 4.071196958136995, + "learning_rate": 1.9383500283415127e-07, + "logits/chosen": -2.7708868980407715, + "logits/rejected": -2.928670644760132, + "logps/chosen": -413.68780517578125, + "logps/rejected": -419.13287353515625, + "loss": 0.4674, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.6208066940307617, + "rewards/margins": 0.8534724116325378, + "rewards/rejected": -2.4742789268493652, + "step": 247 + }, + { + "epoch": 1.1718842291789722, + "grad_norm": 4.6057615724394445, + "learning_rate": 1.9196547711878882e-07, + "logits/chosen": -2.7272331714630127, + "logits/rejected": -2.7651190757751465, + "logps/chosen": -391.68218994140625, + "logps/rejected": -536.7830810546875, + "loss": 0.4898, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3025808334350586, + "rewards/margins": 1.699794888496399, + "rewards/rejected": -3.002375602722168, + "step": 248 + }, + { + "epoch": 1.1766095688127585, + "grad_norm": 4.356687184481579, + "learning_rate": 1.9009937543225393e-07, + "logits/chosen": -2.6314167976379395, + "logits/rejected": -2.5595688819885254, + "logps/chosen": -311.2470703125, + "logps/rejected": -458.4511413574219, + "loss": 0.4912, + "rewards/accuracies": 0.609375, + "rewards/chosen": -1.5766427516937256, + "rewards/margins": 1.1243988275527954, + "rewards/rejected": -2.7010414600372314, + "step": 249 + }, + { + "epoch": 1.1813349084465445, + "grad_norm": 3.8646816254377168, + "learning_rate": 1.8823680787428804e-07, + "logits/chosen": -2.628770589828491, + "logits/rejected": -2.6632299423217773, + "logps/chosen": -362.2929382324219, + "logps/rejected": -451.97686767578125, + "loss": 0.5168, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.444336175918579, + "rewards/margins": 1.026198387145996, + "rewards/rejected": -2.470534324645996, + "step": 250 + }, + { + "epoch": 1.1860602480803308, + "grad_norm": 4.972160856261013, + "learning_rate": 1.8637788433611946e-07, + "logits/chosen": -2.6531898975372314, + "logits/rejected": -2.6115176677703857, + "logps/chosen": -374.62554931640625, + "logps/rejected": -556.6736450195312, + "loss": 0.4812, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.495612382888794, + "rewards/margins": 1.505082368850708, + "rewards/rejected": -3.000694751739502, + "step": 251 + }, + { + "epoch": 1.190785587714117, + "grad_norm": 5.493325018181506, + "learning_rate": 1.8452271449398015e-07, + "logits/chosen": -2.6560559272766113, + "logits/rejected": -2.7018203735351562, + "logps/chosen": -352.3895568847656, + "logps/rejected": -441.25994873046875, + "loss": 0.4814, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6333725452423096, + "rewards/margins": 1.1176269054412842, + "rewards/rejected": -2.7509994506835938, + "step": 252 + }, + { + "epoch": 1.1955109273479032, + "grad_norm": 4.859694388307905, + "learning_rate": 1.8267140780263424e-07, + "logits/chosen": -2.634824514389038, + "logits/rejected": -2.638603448867798, + "logps/chosen": -336.9228515625, + "logps/rejected": -454.23883056640625, + "loss": 0.4996, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.460708498954773, + "rewards/margins": 1.1815128326416016, + "rewards/rejected": -2.642221450805664, + "step": 253 + }, + { + "epoch": 1.2002362669816893, + "grad_norm": 4.23344077907717, + "learning_rate": 1.8082407348892076e-07, + "logits/chosen": -2.581425666809082, + "logits/rejected": -2.4411489963531494, + "logps/chosen": -360.4097595214844, + "logps/rejected": -511.65484619140625, + "loss": 0.4754, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.5460442304611206, + "rewards/margins": 1.2290416955947876, + "rewards/rejected": -2.775085687637329, + "step": 254 + }, + { + "epoch": 1.2049616066154756, + "grad_norm": 5.021641074779002, + "learning_rate": 1.7898082054530868e-07, + "logits/chosen": -2.5814576148986816, + "logits/rejected": -2.562331199645996, + "logps/chosen": -391.0801696777344, + "logps/rejected": -486.2853698730469, + "loss": 0.4886, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.5095934867858887, + "rewards/margins": 0.9785588979721069, + "rewards/rejected": -2.488152503967285, + "step": 255 + }, + { + "epoch": 1.2096869462492617, + "grad_norm": 4.032966746831722, + "learning_rate": 1.7714175772346683e-07, + "logits/chosen": -2.73325514793396, + "logits/rejected": -2.8232052326202393, + "logps/chosen": -361.27423095703125, + "logps/rejected": -461.78125, + "loss": 0.5027, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.4946091175079346, + "rewards/margins": 1.1646876335144043, + "rewards/rejected": -2.659296751022339, + "step": 256 + }, + { + "epoch": 1.2144122858830477, + "grad_norm": 4.348859426367674, + "learning_rate": 1.753069935278477e-07, + "logits/chosen": -2.5508053302764893, + "logits/rejected": -2.6324799060821533, + "logps/chosen": -349.6580810546875, + "logps/rejected": -436.2727966308594, + "loss": 0.4904, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.5449261665344238, + "rewards/margins": 1.1617025136947632, + "rewards/rejected": -2.7066287994384766, + "step": 257 + }, + { + "epoch": 1.219137625516834, + "grad_norm": 3.7399337348111246, + "learning_rate": 1.7347663620928494e-07, + "logits/chosen": -2.6396665573120117, + "logits/rejected": -2.686690330505371, + "logps/chosen": -373.62762451171875, + "logps/rejected": -456.91876220703125, + "loss": 0.4607, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5388782024383545, + "rewards/margins": 1.0700188875198364, + "rewards/rejected": -2.6088972091674805, + "step": 258 + }, + { + "epoch": 1.219137625516834, + "eval_logits/chosen": -2.7951161861419678, + "eval_logits/rejected": -2.8025708198547363, + "eval_logps/chosen": -369.3280029296875, + "eval_logps/rejected": -488.8177490234375, + "eval_loss": 0.48161956667900085, + "eval_rewards/accuracies": 0.6401515007019043, + "eval_rewards/chosen": -1.5809730291366577, + "eval_rewards/margins": 1.307705044746399, + "eval_rewards/rejected": -2.8886778354644775, + "eval_runtime": 225.1752, + "eval_samples_per_second": 16.236, + "eval_steps_per_second": 0.293, + "step": 258 + }, + { + "epoch": 1.2238629651506203, + "grad_norm": 4.565788447191084, + "learning_rate": 1.7165079375860752e-07, + "logits/chosen": -2.7769393920898438, + "logits/rejected": -2.7542011737823486, + "logps/chosen": -327.9093322753906, + "logps/rejected": -441.0569763183594, + "loss": 0.5122, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.4275953769683838, + "rewards/margins": 1.174314022064209, + "rewards/rejected": -2.6019093990325928, + "step": 259 + }, + { + "epoch": 1.2285883047844064, + "grad_norm": 3.8161499063717206, + "learning_rate": 1.6982957390026748e-07, + "logits/chosen": -2.5881879329681396, + "logits/rejected": -2.5274972915649414, + "logps/chosen": -380.05670166015625, + "logps/rejected": -548.7481079101562, + "loss": 0.4618, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.7176233530044556, + "rewards/margins": 1.5351811647415161, + "rewards/rejected": -3.2528045177459717, + "step": 260 + }, + { + "epoch": 1.2333136444181925, + "grad_norm": 4.44964808900954, + "learning_rate": 1.680130840859848e-07, + "logits/chosen": -2.6286840438842773, + "logits/rejected": -2.606605291366577, + "logps/chosen": -325.35833740234375, + "logps/rejected": -435.7322082519531, + "loss": 0.4912, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.2944529056549072, + "rewards/margins": 1.2223491668701172, + "rewards/rejected": -2.5168020725250244, + "step": 261 + }, + { + "epoch": 1.2380389840519788, + "grad_norm": 4.264258297752704, + "learning_rate": 1.662014314884074e-07, + "logits/chosen": -2.6938886642456055, + "logits/rejected": -2.6410956382751465, + "logps/chosen": -341.89141845703125, + "logps/rejected": -533.8803100585938, + "loss": 0.4583, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.6077433824539185, + "rewards/margins": 1.743915319442749, + "rewards/rejected": -3.351658821105957, + "step": 262 + }, + { + "epoch": 1.2427643236857648, + "grad_norm": 3.8551209282619374, + "learning_rate": 1.64394722994788e-07, + "logits/chosen": -2.644559383392334, + "logits/rejected": -2.672788381576538, + "logps/chosen": -332.5156555175781, + "logps/rejected": -419.4230041503906, + "loss": 0.4707, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4292283058166504, + "rewards/margins": 0.7848268151283264, + "rewards/rejected": -2.214055299758911, + "step": 263 + }, + { + "epoch": 1.2474896633195511, + "grad_norm": 5.862600550730873, + "learning_rate": 1.625930652006782e-07, + "logits/chosen": -2.806763172149658, + "logits/rejected": -2.818608283996582, + "logps/chosen": -332.0267639160156, + "logps/rejected": -421.3615417480469, + "loss": 0.5, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.3579407930374146, + "rewards/margins": 1.1036723852157593, + "rewards/rejected": -2.461613178253174, + "step": 264 + }, + { + "epoch": 1.2522150029533372, + "grad_norm": 4.952960168127134, + "learning_rate": 1.607965644036386e-07, + "logits/chosen": -2.397037982940674, + "logits/rejected": -2.370586633682251, + "logps/chosen": -380.63433837890625, + "logps/rejected": -567.65478515625, + "loss": 0.4619, + "rewards/accuracies": 0.796875, + "rewards/chosen": -1.3215548992156982, + "rewards/margins": 1.9972357749938965, + "rewards/rejected": -3.318790912628174, + "step": 265 + }, + { + "epoch": 1.2569403425871235, + "grad_norm": 4.451231917032084, + "learning_rate": 1.5900532659696786e-07, + "logits/chosen": -2.5172245502471924, + "logits/rejected": -2.549943208694458, + "logps/chosen": -310.1299743652344, + "logps/rejected": -405.5203857421875, + "loss": 0.493, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.271753191947937, + "rewards/margins": 0.9632126688957214, + "rewards/rejected": -2.2349658012390137, + "step": 266 + }, + { + "epoch": 1.2616656822209096, + "grad_norm": 3.9305163776837313, + "learning_rate": 1.5721945746344914e-07, + "logits/chosen": -2.5553438663482666, + "logits/rejected": -2.6015634536743164, + "logps/chosen": -336.09326171875, + "logps/rejected": -444.8936462402344, + "loss": 0.4871, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.417650580406189, + "rewards/margins": 1.3679817914962769, + "rewards/rejected": -2.785632371902466, + "step": 267 + }, + { + "epoch": 1.2663910218546959, + "grad_norm": 3.688073971085107, + "learning_rate": 1.5543906236911423e-07, + "logits/chosen": -2.798358917236328, + "logits/rejected": -2.803248167037964, + "logps/chosen": -307.88250732421875, + "logps/rejected": -455.02081298828125, + "loss": 0.4952, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4341715574264526, + "rewards/margins": 1.0945826768875122, + "rewards/rejected": -2.528754234313965, + "step": 268 + }, + { + "epoch": 1.271116361488482, + "grad_norm": 4.072630703078056, + "learning_rate": 1.5366424635702773e-07, + "logits/chosen": -2.4071907997131348, + "logits/rejected": -2.469078540802002, + "logps/chosen": -344.1185302734375, + "logps/rejected": -481.708740234375, + "loss": 0.4607, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.4194681644439697, + "rewards/margins": 1.5102123022079468, + "rewards/rejected": -2.929680109024048, + "step": 269 + }, + { + "epoch": 1.2758417011222682, + "grad_norm": 4.509575176085059, + "learning_rate": 1.5189511414108902e-07, + "logits/chosen": -2.6352696418762207, + "logits/rejected": -2.5769996643066406, + "logps/chosen": -299.04486083984375, + "logps/rejected": -428.2113952636719, + "loss": 0.4548, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.261251449584961, + "rewards/margins": 1.3529876470565796, + "rewards/rejected": -2.61423921585083, + "step": 270 + }, + { + "epoch": 1.2805670407560543, + "grad_norm": 4.337503774611465, + "learning_rate": 1.5013177009985412e-07, + "logits/chosen": -2.492708683013916, + "logits/rejected": -2.5158631801605225, + "logps/chosen": -369.5946960449219, + "logps/rejected": -539.3471069335938, + "loss": 0.4839, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.4575221538543701, + "rewards/margins": 1.7974143028259277, + "rewards/rejected": -3.2549362182617188, + "step": 271 + }, + { + "epoch": 1.2852923803898406, + "grad_norm": 3.7985527417770797, + "learning_rate": 1.4837431827037786e-07, + "logits/chosen": -2.588874101638794, + "logits/rejected": -2.6791810989379883, + "logps/chosen": -399.30633544921875, + "logps/rejected": -391.2076416015625, + "loss": 0.4848, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.1733232736587524, + "rewards/margins": 0.8876461982727051, + "rewards/rejected": -2.060969591140747, + "step": 272 + }, + { + "epoch": 1.2900177200236267, + "grad_norm": 4.177927201443006, + "learning_rate": 1.466228623420751e-07, + "logits/chosen": -2.498131036758423, + "logits/rejected": -2.5749406814575195, + "logps/chosen": -323.9309997558594, + "logps/rejected": -450.7756042480469, + "loss": 0.4591, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2552162408828735, + "rewards/margins": 1.4370646476745605, + "rewards/rejected": -2.6922807693481445, + "step": 273 + }, + { + "epoch": 1.2947430596574128, + "grad_norm": 3.6814005701875523, + "learning_rate": 1.448775056506036e-07, + "logits/chosen": -2.469701051712036, + "logits/rejected": -2.5528130531311035, + "logps/chosen": -382.1181335449219, + "logps/rejected": -487.5738525390625, + "loss": 0.4747, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.516263723373413, + "rewards/margins": 1.4078060388565063, + "rewards/rejected": -2.92406964302063, + "step": 274 + }, + { + "epoch": 1.299468399291199, + "grad_norm": 3.7200492832656766, + "learning_rate": 1.4313835117176692e-07, + "logits/chosen": -2.9147932529449463, + "logits/rejected": -2.988351821899414, + "logps/chosen": -376.6600341796875, + "logps/rejected": -456.061279296875, + "loss": 0.4718, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.3018884658813477, + "rewards/margins": 1.2087831497192383, + "rewards/rejected": -2.510671615600586, + "step": 275 + }, + { + "epoch": 1.3041937389249854, + "grad_norm": 4.704523030577493, + "learning_rate": 1.4140550151543872e-07, + "logits/chosen": -2.5561208724975586, + "logits/rejected": -2.669656276702881, + "logps/chosen": -391.99273681640625, + "logps/rejected": -470.0566711425781, + "loss": 0.4779, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.384698510169983, + "rewards/margins": 1.2832698822021484, + "rewards/rejected": -2.667968511581421, + "step": 276 + }, + { + "epoch": 1.3089190785587714, + "grad_norm": 3.890211089107488, + "learning_rate": 1.3967905891950936e-07, + "logits/chosen": -2.525979995727539, + "logits/rejected": -2.5187900066375732, + "logps/chosen": -329.0636291503906, + "logps/rejected": -518.5650024414062, + "loss": 0.4978, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.3041456937789917, + "rewards/margins": 1.7594897747039795, + "rewards/rejected": -3.0636353492736816, + "step": 277 + }, + { + "epoch": 1.3136444181925575, + "grad_norm": 3.900954884610721, + "learning_rate": 1.3795912524385322e-07, + "logits/chosen": -2.6802122592926025, + "logits/rejected": -2.7386527061462402, + "logps/chosen": -394.46905517578125, + "logps/rejected": -530.927978515625, + "loss": 0.4819, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5945687294006348, + "rewards/margins": 1.4686897993087769, + "rewards/rejected": -3.063258647918701, + "step": 278 + }, + { + "epoch": 1.3183697578263438, + "grad_norm": 5.012829563655198, + "learning_rate": 1.3624580196431952e-07, + "logits/chosen": -2.735568046569824, + "logits/rejected": -2.7510178089141846, + "logps/chosen": -375.1429748535156, + "logps/rejected": -484.94390869140625, + "loss": 0.4769, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.396562099456787, + "rewards/margins": 1.301504135131836, + "rewards/rejected": -2.698065996170044, + "step": 279 + }, + { + "epoch": 1.3230950974601299, + "grad_norm": 6.486088065803327, + "learning_rate": 1.3453919016674483e-07, + "logits/chosen": -2.5972790718078613, + "logits/rejected": -2.6828713417053223, + "logps/chosen": -317.81280517578125, + "logps/rejected": -378.3294982910156, + "loss": 0.4867, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.1421467065811157, + "rewards/margins": 1.074657917022705, + "rewards/rejected": -2.2168045043945312, + "step": 280 + }, + { + "epoch": 1.3278204370939162, + "grad_norm": 4.08450025143174, + "learning_rate": 1.328393905409892e-07, + "logits/chosen": -2.4976108074188232, + "logits/rejected": -2.5105791091918945, + "logps/chosen": -394.51141357421875, + "logps/rejected": -482.7172546386719, + "loss": 0.4878, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.521199107170105, + "rewards/margins": 1.0865689516067505, + "rewards/rejected": -2.6077680587768555, + "step": 281 + }, + { + "epoch": 1.3325457767277022, + "grad_norm": 4.246089354170224, + "learning_rate": 1.3114650337499578e-07, + "logits/chosen": -2.629361152648926, + "logits/rejected": -2.595665693283081, + "logps/chosen": -334.8639831542969, + "logps/rejected": -423.8031921386719, + "loss": 0.4994, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5956919193267822, + "rewards/margins": 0.8941479921340942, + "rewards/rejected": -2.489840030670166, + "step": 282 + }, + { + "epoch": 1.3372711163614885, + "grad_norm": 3.6260185771038103, + "learning_rate": 1.2946062854887314e-07, + "logits/chosen": -2.430432081222534, + "logits/rejected": -2.3944997787475586, + "logps/chosen": -371.2943115234375, + "logps/rejected": -522.4259033203125, + "loss": 0.4836, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5247899293899536, + "rewards/margins": 1.4179781675338745, + "rewards/rejected": -2.942767858505249, + "step": 283 + }, + { + "epoch": 1.3419964559952746, + "grad_norm": 4.374381091295728, + "learning_rate": 1.2778186552900316e-07, + "logits/chosen": -2.7117838859558105, + "logits/rejected": -2.761711597442627, + "logps/chosen": -400.6455078125, + "logps/rejected": -503.24371337890625, + "loss": 0.484, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.717864990234375, + "rewards/margins": 1.5052025318145752, + "rewards/rejected": -3.2230677604675293, + "step": 284 + }, + { + "epoch": 1.346721795629061, + "grad_norm": 3.8541867456367847, + "learning_rate": 1.261103133621718e-07, + "logits/chosen": -2.594248056411743, + "logits/rejected": -2.603362798690796, + "logps/chosen": -360.4008483886719, + "logps/rejected": -533.9642944335938, + "loss": 0.4763, + "rewards/accuracies": 0.546875, + "rewards/chosen": -1.626147747039795, + "rewards/margins": 1.3955036401748657, + "rewards/rejected": -3.021651268005371, + "step": 285 + }, + { + "epoch": 1.351447135262847, + "grad_norm": 3.8033607627593815, + "learning_rate": 1.2444607066972583e-07, + "logits/chosen": -2.385476589202881, + "logits/rejected": -2.4700021743774414, + "logps/chosen": -379.0484924316406, + "logps/rejected": -446.36932373046875, + "loss": 0.4707, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.5443971157073975, + "rewards/margins": 1.0206142663955688, + "rewards/rejected": -2.5650112628936768, + "step": 286 + }, + { + "epoch": 1.356172474896633, + "grad_norm": 3.9322741636171017, + "learning_rate": 1.227892356417542e-07, + "logits/chosen": -2.8771088123321533, + "logits/rejected": -2.838731050491333, + "logps/chosen": -366.8982849121094, + "logps/rejected": -519.6290893554688, + "loss": 0.4657, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.7020255327224731, + "rewards/margins": 1.5927929878234863, + "rewards/rejected": -3.29481840133667, + "step": 287 + }, + { + "epoch": 1.3608978145304194, + "grad_norm": 4.441774997254426, + "learning_rate": 1.211399060312943e-07, + "logits/chosen": -2.6161060333251953, + "logits/rejected": -2.6837587356567383, + "logps/chosen": -333.47747802734375, + "logps/rejected": -396.8973388671875, + "loss": 0.4805, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4345372915267944, + "rewards/margins": 0.6499552726745605, + "rewards/rejected": -2.0844926834106445, + "step": 288 + }, + { + "epoch": 1.3656231541642057, + "grad_norm": 4.102742239919904, + "learning_rate": 1.1949817914856539e-07, + "logits/chosen": -2.6814827919006348, + "logits/rejected": -2.6281232833862305, + "logps/chosen": -309.97998046875, + "logps/rejected": -483.1641540527344, + "loss": 0.4876, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.347307801246643, + "rewards/margins": 1.4000307321548462, + "rewards/rejected": -2.7473385334014893, + "step": 289 + }, + { + "epoch": 1.3703484937979917, + "grad_norm": 3.510044851118614, + "learning_rate": 1.1786415185522644e-07, + "logits/chosen": -2.4141433238983154, + "logits/rejected": -2.440483331680298, + "logps/chosen": -371.58050537109375, + "logps/rejected": -485.5816345214844, + "loss": 0.462, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.6337575912475586, + "rewards/margins": 1.3851597309112549, + "rewards/rejected": -3.0189173221588135, + "step": 290 + }, + { + "epoch": 1.3750738334317778, + "grad_norm": 4.439828210645555, + "learning_rate": 1.1623792055866182e-07, + "logits/chosen": -2.9460198879241943, + "logits/rejected": -2.792397975921631, + "logps/chosen": -306.8702392578125, + "logps/rejected": -499.4562072753906, + "loss": 0.4817, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.5415101051330566, + "rewards/margins": 1.6944992542266846, + "rewards/rejected": -3.2360095977783203, + "step": 291 + }, + { + "epoch": 1.379799173065564, + "grad_norm": 3.808563735975941, + "learning_rate": 1.1461958120629345e-07, + "logits/chosen": -2.601799488067627, + "logits/rejected": -2.6055119037628174, + "logps/chosen": -349.54119873046875, + "logps/rejected": -453.0423278808594, + "loss": 0.4919, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.38273286819458, + "rewards/margins": 1.1617132425308228, + "rewards/rejected": -2.5444459915161133, + "step": 292 + }, + { + "epoch": 1.3845245126993504, + "grad_norm": 3.614960145049863, + "learning_rate": 1.1300922927991912e-07, + "logits/chosen": -2.3846492767333984, + "logits/rejected": -2.35745906829834, + "logps/chosen": -377.6761169433594, + "logps/rejected": -496.96405029296875, + "loss": 0.4831, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.5498136281967163, + "rewards/margins": 1.1908408403396606, + "rewards/rejected": -2.740654468536377, + "step": 293 + }, + { + "epoch": 1.3892498523331365, + "grad_norm": 5.27301677074656, + "learning_rate": 1.1140695979008017e-07, + "logits/chosen": -2.359983205795288, + "logits/rejected": -2.346726894378662, + "logps/chosen": -323.6959533691406, + "logps/rejected": -452.9242248535156, + "loss": 0.4725, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4340472221374512, + "rewards/margins": 1.28817880153656, + "rewards/rejected": -2.722226142883301, + "step": 294 + }, + { + "epoch": 1.3939751919669225, + "grad_norm": 3.7242744476436704, + "learning_rate": 1.0981286727045483e-07, + "logits/chosen": -2.4720327854156494, + "logits/rejected": -2.353053331375122, + "logps/chosen": -352.6996765136719, + "logps/rejected": -499.5018615722656, + "loss": 0.476, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4100433588027954, + "rewards/margins": 1.4266828298568726, + "rewards/rejected": -2.836726188659668, + "step": 295 + }, + { + "epoch": 1.3987005316007088, + "grad_norm": 4.955539561562781, + "learning_rate": 1.0822704577228131e-07, + "logits/chosen": -2.642940044403076, + "logits/rejected": -2.639770746231079, + "logps/chosen": -338.60595703125, + "logps/rejected": -491.56805419921875, + "loss": 0.4635, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.4772026538848877, + "rewards/margins": 1.517985224723816, + "rewards/rejected": -2.995187759399414, + "step": 296 + }, + { + "epoch": 1.403425871234495, + "grad_norm": 4.549161233388332, + "learning_rate": 1.0664958885880901e-07, + "logits/chosen": -2.5309808254241943, + "logits/rejected": -2.6360573768615723, + "logps/chosen": -335.5814514160156, + "logps/rejected": -437.315673828125, + "loss": 0.4888, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.466670036315918, + "rewards/margins": 1.2143834829330444, + "rewards/rejected": -2.681053638458252, + "step": 297 + }, + { + "epoch": 1.4081512108682812, + "grad_norm": 3.7991551857385533, + "learning_rate": 1.0508058959977756e-07, + "logits/chosen": -2.673210382461548, + "logits/rejected": -2.5948569774627686, + "logps/chosen": -333.4312744140625, + "logps/rejected": -518.317626953125, + "loss": 0.47, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2860051393508911, + "rewards/margins": 1.857041358947754, + "rewards/rejected": -3.1430463790893555, + "step": 298 + }, + { + "epoch": 1.4128765505020673, + "grad_norm": 4.382508044742225, + "learning_rate": 1.0352014056592653e-07, + "logits/chosen": -2.746319055557251, + "logits/rejected": -2.8318300247192383, + "logps/chosen": -361.51220703125, + "logps/rejected": -441.3756103515625, + "loss": 0.492, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3820509910583496, + "rewards/margins": 1.1350202560424805, + "rewards/rejected": -2.517071485519409, + "step": 299 + }, + { + "epoch": 1.4176018901358536, + "grad_norm": 4.674467616661016, + "learning_rate": 1.0196833382353303e-07, + "logits/chosen": -2.731412887573242, + "logits/rejected": -2.6908507347106934, + "logps/chosen": -322.404052734375, + "logps/rejected": -462.2546081542969, + "loss": 0.4942, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4098780155181885, + "rewards/margins": 1.386682391166687, + "rewards/rejected": -2.796560287475586, + "step": 300 + }, + { + "epoch": 1.4223272297696397, + "grad_norm": 4.067329027842824, + "learning_rate": 1.0042526092898049e-07, + "logits/chosen": -2.8876852989196777, + "logits/rejected": -2.7597179412841797, + "logps/chosen": -328.04437255859375, + "logps/rejected": -431.107666015625, + "loss": 0.5068, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4123650789260864, + "rewards/margins": 1.0988068580627441, + "rewards/rejected": -2.511171817779541, + "step": 301 + }, + { + "epoch": 1.4223272297696397, + "eval_logits/chosen": -2.751302719116211, + "eval_logits/rejected": -2.7585830688476562, + "eval_logps/chosen": -369.27899169921875, + "eval_logps/rejected": -500.55902099609375, + "eval_loss": 0.47641003131866455, + "eval_rewards/accuracies": 0.6401515007019043, + "eval_rewards/chosen": -1.5804827213287354, + "eval_rewards/margins": 1.4256082773208618, + "eval_rewards/rejected": -3.0060908794403076, + "eval_runtime": 225.4536, + "eval_samples_per_second": 16.216, + "eval_steps_per_second": 0.293, + "step": 301 + }, + { + "epoch": 1.427052569403426, + "grad_norm": 4.636151437447421, + "learning_rate": 9.889101292335625e-08, + "logits/chosen": -2.6484196186065674, + "logits/rejected": -2.6770687103271484, + "logps/chosen": -402.7945251464844, + "logps/rejected": -453.43646240234375, + "loss": 0.4782, + "rewards/accuracies": 0.796875, + "rewards/chosen": -1.1831938028335571, + "rewards/margins": 1.1924808025360107, + "rewards/rejected": -2.3756744861602783, + "step": 302 + }, + { + "epoch": 1.431777909037212, + "grad_norm": 4.004369409074385, + "learning_rate": 9.736568032708068e-08, + "logits/chosen": -2.5602633953094482, + "logits/rejected": -2.6748807430267334, + "logps/chosen": -366.6900634765625, + "logps/rejected": -485.0727844238281, + "loss": 0.482, + "rewards/accuracies": 0.578125, + "rewards/chosen": -1.4285987615585327, + "rewards/margins": 1.416813611984253, + "rewards/rejected": -2.845412254333496, + "step": 303 + }, + { + "epoch": 1.436503248670998, + "grad_norm": 4.878881366827008, + "learning_rate": 9.584935313456596e-08, + "logits/chosen": -2.231307029724121, + "logits/rejected": -2.287929058074951, + "logps/chosen": -352.46282958984375, + "logps/rejected": -453.6793212890625, + "loss": 0.4942, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5504112243652344, + "rewards/margins": 1.3251781463623047, + "rewards/rejected": -2.875589609146118, + "step": 304 + }, + { + "epoch": 1.4412285883047844, + "grad_norm": 4.241730784056723, + "learning_rate": 9.4342120808907e-08, + "logits/chosen": -2.703420639038086, + "logits/rejected": -2.6591320037841797, + "logps/chosen": -383.8643493652344, + "logps/rejected": -594.2821655273438, + "loss": 0.4792, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5567654371261597, + "rewards/margins": 2.0447824001312256, + "rewards/rejected": -3.6015477180480957, + "step": 305 + }, + { + "epoch": 1.4459539279385707, + "grad_norm": 3.8843212431707284, + "learning_rate": 9.284407227660249e-08, + "logits/chosen": -2.8023083209991455, + "logits/rejected": -2.7946949005126953, + "logps/chosen": -337.86090087890625, + "logps/rejected": -434.48260498046875, + "loss": 0.4894, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2694463729858398, + "rewards/margins": 1.1360807418823242, + "rewards/rejected": -2.405527114868164, + "step": 306 + }, + { + "epoch": 1.4506792675723568, + "grad_norm": 3.978483022316159, + "learning_rate": 9.13552959223089e-08, + "logits/chosen": -2.676576852798462, + "logits/rejected": -2.5387959480285645, + "logps/chosen": -313.46954345703125, + "logps/rejected": -421.6120300292969, + "loss": 0.4765, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3958847522735596, + "rewards/margins": 0.8875546455383301, + "rewards/rejected": -2.2834396362304688, + "step": 307 + }, + { + "epoch": 1.4554046072061428, + "grad_norm": 4.395780053942796, + "learning_rate": 8.987587958362516e-08, + "logits/chosen": -2.856872320175171, + "logits/rejected": -2.8188376426696777, + "logps/chosen": -371.1302795410156, + "logps/rejected": -465.9201354980469, + "loss": 0.4681, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.4001535177230835, + "rewards/margins": 1.085311770439148, + "rewards/rejected": -2.4854652881622314, + "step": 308 + }, + { + "epoch": 1.4601299468399291, + "grad_norm": 3.8656297909933772, + "learning_rate": 8.840591054591096e-08, + "logits/chosen": -2.5090444087982178, + "logits/rejected": -2.6375505924224854, + "logps/chosen": -410.7342529296875, + "logps/rejected": -467.5154724121094, + "loss": 0.4832, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.2629905939102173, + "rewards/margins": 1.3948893547058105, + "rewards/rejected": -2.6578800678253174, + "step": 309 + }, + { + "epoch": 1.4648552864737154, + "grad_norm": 4.334534835752672, + "learning_rate": 8.694547553713618e-08, + "logits/chosen": -2.759681224822998, + "logits/rejected": -2.73026180267334, + "logps/chosen": -355.33624267578125, + "logps/rejected": -529.0985107421875, + "loss": 0.4896, + "rewards/accuracies": 0.796875, + "rewards/chosen": -1.521816372871399, + "rewards/margins": 1.6343505382537842, + "rewards/rejected": -3.1561670303344727, + "step": 310 + }, + { + "epoch": 1.4695806261075015, + "grad_norm": 4.853059543604455, + "learning_rate": 8.54946607227644e-08, + "logits/chosen": -2.4591803550720215, + "logits/rejected": -2.5936403274536133, + "logps/chosen": -391.37298583984375, + "logps/rejected": -434.3221435546875, + "loss": 0.4857, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5846033096313477, + "rewards/margins": 1.0046393871307373, + "rewards/rejected": -2.589242696762085, + "step": 311 + }, + { + "epoch": 1.4743059657412876, + "grad_norm": 5.028900388618828, + "learning_rate": 8.405355170066925e-08, + "logits/chosen": -2.7181220054626465, + "logits/rejected": -2.590919017791748, + "logps/chosen": -380.94818115234375, + "logps/rejected": -548.143798828125, + "loss": 0.4851, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.6279135942459106, + "rewards/margins": 1.5961647033691406, + "rewards/rejected": -3.2240781784057617, + "step": 312 + }, + { + "epoch": 1.4790313053750739, + "grad_norm": 4.261294839829192, + "learning_rate": 8.262223349608366e-08, + "logits/chosen": -2.7533867359161377, + "logits/rejected": -2.8102259635925293, + "logps/chosen": -394.7510681152344, + "logps/rejected": -499.4160461425781, + "loss": 0.4965, + "rewards/accuracies": 0.546875, + "rewards/chosen": -1.5835039615631104, + "rewards/margins": 1.1018283367156982, + "rewards/rejected": -2.6853325366973877, + "step": 313 + }, + { + "epoch": 1.48375664500886, + "grad_norm": 4.418632236406977, + "learning_rate": 8.120079055658402e-08, + "logits/chosen": -2.642446279525757, + "logits/rejected": -2.6767466068267822, + "logps/chosen": -323.5433349609375, + "logps/rejected": -469.7967834472656, + "loss": 0.475, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2628949880599976, + "rewards/margins": 1.5510131120681763, + "rewards/rejected": -2.813908100128174, + "step": 314 + }, + { + "epoch": 1.4884819846426462, + "grad_norm": 4.312754452868253, + "learning_rate": 7.978930674710719e-08, + "logits/chosen": -2.4338035583496094, + "logits/rejected": -2.444002151489258, + "logps/chosen": -374.0926513671875, + "logps/rejected": -496.3292236328125, + "loss": 0.4875, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.539766788482666, + "rewards/margins": 1.5390400886535645, + "rewards/rejected": -3.0788071155548096, + "step": 315 + }, + { + "epoch": 1.4932073242764323, + "grad_norm": 4.036827603807639, + "learning_rate": 7.838786534500269e-08, + "logits/chosen": -2.7101027965545654, + "logits/rejected": -2.7315640449523926, + "logps/chosen": -367.41107177734375, + "logps/rejected": -477.13916015625, + "loss": 0.4933, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.501757025718689, + "rewards/margins": 1.17995023727417, + "rewards/rejected": -2.6817073822021484, + "step": 316 + }, + { + "epoch": 1.4979326639102186, + "grad_norm": 4.1034959013822645, + "learning_rate": 7.699654903511971e-08, + "logits/chosen": -2.4980247020721436, + "logits/rejected": -2.569985866546631, + "logps/chosen": -306.99346923828125, + "logps/rejected": -443.2658996582031, + "loss": 0.4761, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.0971927642822266, + "rewards/margins": 1.3962390422821045, + "rewards/rejected": -2.49343204498291, + "step": 317 + }, + { + "epoch": 1.5026580035440047, + "grad_norm": 3.8908709553914083, + "learning_rate": 7.561543990492803e-08, + "logits/chosen": -2.5545809268951416, + "logits/rejected": -2.7232065200805664, + "logps/chosen": -382.45367431640625, + "logps/rejected": -496.840087890625, + "loss": 0.4696, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.5967504978179932, + "rewards/margins": 1.5658732652664185, + "rewards/rejected": -3.162623643875122, + "step": 318 + }, + { + "epoch": 1.507383343177791, + "grad_norm": 4.1097836221394095, + "learning_rate": 7.424461943967555e-08, + "logits/chosen": -2.5873563289642334, + "logits/rejected": -2.7303099632263184, + "logps/chosen": -394.63916015625, + "logps/rejected": -551.1517333984375, + "loss": 0.4793, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5619065761566162, + "rewards/margins": 1.4349513053894043, + "rewards/rejected": -2.9968576431274414, + "step": 319 + }, + { + "epoch": 1.512108682811577, + "grad_norm": 3.8818233399377373, + "learning_rate": 7.288416851758016e-08, + "logits/chosen": -2.638657808303833, + "logits/rejected": -2.547767162322998, + "logps/chosen": -392.5679626464844, + "logps/rejected": -627.8018188476562, + "loss": 0.4782, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5840284824371338, + "rewards/margins": 2.276487112045288, + "rewards/rejected": -3.860515594482422, + "step": 320 + }, + { + "epoch": 1.5168340224453631, + "grad_norm": 4.298412046444391, + "learning_rate": 7.153416740505814e-08, + "logits/chosen": -2.473698854446411, + "logits/rejected": -2.580441951751709, + "logps/chosen": -402.406982421875, + "logps/rejected": -486.8596496582031, + "loss": 0.4701, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.5713368654251099, + "rewards/margins": 1.1730860471725464, + "rewards/rejected": -2.7444231510162354, + "step": 321 + }, + { + "epoch": 1.5215593620791494, + "grad_norm": 3.844411445149988, + "learning_rate": 7.01946957519886e-08, + "logits/chosen": -2.414036273956299, + "logits/rejected": -2.5014243125915527, + "logps/chosen": -408.16455078125, + "logps/rejected": -474.1005859375, + "loss": 0.4766, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.5195822715759277, + "rewards/margins": 1.162341594696045, + "rewards/rejected": -2.6819238662719727, + "step": 322 + }, + { + "epoch": 1.5262847017129357, + "grad_norm": 4.081324399203426, + "learning_rate": 6.88658325870138e-08, + "logits/chosen": -2.6832616329193115, + "logits/rejected": -2.7357096672058105, + "logps/chosen": -385.5028076171875, + "logps/rejected": -469.4779052734375, + "loss": 0.4665, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.399859070777893, + "rewards/margins": 1.3238856792449951, + "rewards/rejected": -2.7237446308135986, + "step": 323 + }, + { + "epoch": 1.5310100413467218, + "grad_norm": 5.228883712742776, + "learning_rate": 6.754765631287695e-08, + "logits/chosen": -2.544619560241699, + "logits/rejected": -2.655355453491211, + "logps/chosen": -355.68328857421875, + "logps/rejected": -468.37738037109375, + "loss": 0.4799, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6572238206863403, + "rewards/margins": 1.6221368312835693, + "rewards/rejected": -3.27936053276062, + "step": 324 + }, + { + "epoch": 1.5357353809805079, + "grad_norm": 3.85359128112212, + "learning_rate": 6.62402447017959e-08, + "logits/chosen": -2.255566358566284, + "logits/rejected": -2.2861790657043457, + "logps/chosen": -374.5622863769531, + "logps/rejected": -525.822265625, + "loss": 0.4804, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5297114849090576, + "rewards/margins": 1.5574190616607666, + "rewards/rejected": -3.087130546569824, + "step": 325 + }, + { + "epoch": 1.5404607206142942, + "grad_norm": 3.9384759163947733, + "learning_rate": 6.494367489087488e-08, + "logits/chosen": -2.310734987258911, + "logits/rejected": -2.331479072570801, + "logps/chosen": -353.3757629394531, + "logps/rejected": -456.4471740722656, + "loss": 0.4973, + "rewards/accuracies": 0.796875, + "rewards/chosen": -1.2544013261795044, + "rewards/margins": 0.9933812618255615, + "rewards/rejected": -2.2477827072143555, + "step": 326 + }, + { + "epoch": 1.5451860602480805, + "grad_norm": 4.707537372396568, + "learning_rate": 6.365802337755364e-08, + "logits/chosen": -2.5162229537963867, + "logits/rejected": -2.5781514644622803, + "logps/chosen": -359.474365234375, + "logps/rejected": -459.92413330078125, + "loss": 0.4737, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3576124906539917, + "rewards/margins": 1.2539989948272705, + "rewards/rejected": -2.6116113662719727, + "step": 327 + }, + { + "epoch": 1.5499113998818665, + "grad_norm": 3.5347435278013024, + "learning_rate": 6.238336601509364e-08, + "logits/chosen": -2.4307329654693604, + "logits/rejected": -2.413248300552368, + "logps/chosen": -346.82537841796875, + "logps/rejected": -487.83111572265625, + "loss": 0.4487, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.5922647714614868, + "rewards/margins": 1.7097985744476318, + "rewards/rejected": -3.302063465118408, + "step": 328 + }, + { + "epoch": 1.5546367395156526, + "grad_norm": 4.5192043219385925, + "learning_rate": 6.111977800810316e-08, + "logits/chosen": -2.4796946048736572, + "logits/rejected": -2.3877992630004883, + "logps/chosen": -332.8540344238281, + "logps/rejected": -508.25360107421875, + "loss": 0.4836, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.7624458074569702, + "rewards/margins": 1.4423408508300781, + "rewards/rejected": -3.204786777496338, + "step": 329 + }, + { + "epoch": 1.559362079149439, + "grad_norm": 4.386475280759837, + "learning_rate": 5.986733390809993e-08, + "logits/chosen": -2.400326728820801, + "logits/rejected": -2.2906858921051025, + "logps/chosen": -386.3865966796875, + "logps/rejected": -556.1158447265625, + "loss": 0.4623, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.8006043434143066, + "rewards/margins": 1.7274423837661743, + "rewards/rejected": -3.5280466079711914, + "step": 330 + }, + { + "epoch": 1.564087418783225, + "grad_norm": 4.101101033266814, + "learning_rate": 5.862610760911257e-08, + "logits/chosen": -2.4113216400146484, + "logits/rejected": -2.3908936977386475, + "logps/chosen": -364.8887939453125, + "logps/rejected": -459.69854736328125, + "loss": 0.4372, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4839426279067993, + "rewards/margins": 1.3043051958084106, + "rewards/rejected": -2.78824782371521, + "step": 331 + }, + { + "epoch": 1.5688127584170113, + "grad_norm": 4.330222141211974, + "learning_rate": 5.739617234332131e-08, + "logits/chosen": -2.6859869956970215, + "logits/rejected": -2.7122979164123535, + "logps/chosen": -405.88018798828125, + "logps/rejected": -452.80926513671875, + "loss": 0.4648, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.5999912023544312, + "rewards/margins": 0.9445231556892395, + "rewards/rejected": -2.5445144176483154, + "step": 332 + }, + { + "epoch": 1.5735380980507974, + "grad_norm": 5.485683540439284, + "learning_rate": 5.6177600676736656e-08, + "logits/chosen": -2.5637035369873047, + "logits/rejected": -2.5589380264282227, + "logps/chosen": -393.2574462890625, + "logps/rejected": -529.729248046875, + "loss": 0.5169, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7628885507583618, + "rewards/margins": 1.4583940505981445, + "rewards/rejected": -3.221282482147217, + "step": 333 + }, + { + "epoch": 1.5782634376845834, + "grad_norm": 3.9160914653232983, + "learning_rate": 5.4970464504918654e-08, + "logits/chosen": -2.7670090198516846, + "logits/rejected": -2.7049059867858887, + "logps/chosen": -369.9678039550781, + "logps/rejected": -494.07196044921875, + "loss": 0.4565, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4675428867340088, + "rewards/margins": 1.4338531494140625, + "rewards/rejected": -2.901395797729492, + "step": 334 + }, + { + "epoch": 1.5829887773183697, + "grad_norm": 3.637994981997343, + "learning_rate": 5.37748350487344e-08, + "logits/chosen": -2.6616098880767822, + "logits/rejected": -2.6261179447174072, + "logps/chosen": -340.9493408203125, + "logps/rejected": -505.6307067871094, + "loss": 0.4528, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.4446711540222168, + "rewards/margins": 1.6602320671081543, + "rewards/rejected": -3.10490345954895, + "step": 335 + }, + { + "epoch": 1.587714116952156, + "grad_norm": 3.742063853340133, + "learning_rate": 5.2590782850156667e-08, + "logits/chosen": -2.755837917327881, + "logits/rejected": -2.6713008880615234, + "logps/chosen": -421.58154296875, + "logps/rejected": -625.4422607421875, + "loss": 0.4608, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.701414704322815, + "rewards/margins": 1.9440244436264038, + "rewards/rejected": -3.6454391479492188, + "step": 336 + }, + { + "epoch": 1.592439456585942, + "grad_norm": 3.5742676774235154, + "learning_rate": 5.14183777681014e-08, + "logits/chosen": -2.466548442840576, + "logits/rejected": -2.5580849647521973, + "logps/chosen": -414.5927734375, + "logps/rejected": -506.8092956542969, + "loss": 0.5037, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.518671989440918, + "rewards/margins": 1.1154457330703735, + "rewards/rejected": -2.634117603302002, + "step": 337 + }, + { + "epoch": 1.5971647962197282, + "grad_norm": 3.6653171076626205, + "learning_rate": 5.0257688974306436e-08, + "logits/chosen": -2.8738627433776855, + "logits/rejected": -3.015981435775757, + "logps/chosen": -381.5254211425781, + "logps/rejected": -457.140869140625, + "loss": 0.4883, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.607730507850647, + "rewards/margins": 1.3426533937454224, + "rewards/rejected": -2.9503836631774902, + "step": 338 + }, + { + "epoch": 1.6018901358535145, + "grad_norm": 3.837456752855929, + "learning_rate": 4.910878494925008e-08, + "logits/chosen": -2.6002864837646484, + "logits/rejected": -2.6144702434539795, + "logps/chosen": -402.1292724609375, + "logps/rejected": -593.8958740234375, + "loss": 0.4436, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5465564727783203, + "rewards/margins": 1.739166498184204, + "rewards/rejected": -3.2857229709625244, + "step": 339 + }, + { + "epoch": 1.6066154754873008, + "grad_norm": 3.9821598438730565, + "learning_rate": 4.7971733478111094e-08, + "logits/chosen": -2.5669634342193604, + "logits/rejected": -2.5983939170837402, + "logps/chosen": -384.938720703125, + "logps/rejected": -557.4461669921875, + "loss": 0.471, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5989502668380737, + "rewards/margins": 1.8718353509902954, + "rewards/rejected": -3.4707858562469482, + "step": 340 + }, + { + "epoch": 1.6113408151210868, + "grad_norm": 4.1505854551229735, + "learning_rate": 4.684660164676896e-08, + "logits/chosen": -2.4149627685546875, + "logits/rejected": -2.3984737396240234, + "logps/chosen": -367.2955322265625, + "logps/rejected": -546.0213623046875, + "loss": 0.4849, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5166963338851929, + "rewards/margins": 1.747159719467163, + "rewards/rejected": -3.2638559341430664, + "step": 341 + }, + { + "epoch": 1.616066154754873, + "grad_norm": 4.195319716535671, + "learning_rate": 4.5733455837846325e-08, + "logits/chosen": -2.6863296031951904, + "logits/rejected": -2.740741014480591, + "logps/chosen": -388.5705261230469, + "logps/rejected": -527.72265625, + "loss": 0.4803, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.659536600112915, + "rewards/margins": 1.501134991645813, + "rewards/rejected": -3.1606712341308594, + "step": 342 + }, + { + "epoch": 1.6207914943886592, + "grad_norm": 4.414273407573815, + "learning_rate": 4.4632361726791914e-08, + "logits/chosen": -2.6036033630371094, + "logits/rejected": -2.6751937866210938, + "logps/chosen": -401.4815979003906, + "logps/rejected": -471.8222961425781, + "loss": 0.5018, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.435232162475586, + "rewards/margins": 1.3340187072753906, + "rewards/rejected": -2.7692506313323975, + "step": 343 + }, + { + "epoch": 1.6255168340224455, + "grad_norm": 5.041321789664628, + "learning_rate": 4.354338427800619e-08, + "logits/chosen": -2.6457765102386475, + "logits/rejected": -2.5664923191070557, + "logps/chosen": -325.753662109375, + "logps/rejected": -515.0615844726562, + "loss": 0.4724, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4971141815185547, + "rewards/margins": 1.5831940174102783, + "rewards/rejected": -3.080307960510254, + "step": 344 + }, + { + "epoch": 1.6255168340224455, + "eval_logits/chosen": -2.623720645904541, + "eval_logits/rejected": -2.6295571327209473, + "eval_logps/chosen": -379.5492858886719, + "eval_logps/rejected": -517.3630981445312, + "eval_loss": 0.47301965951919556, + "eval_rewards/accuracies": 0.6382575631141663, + "eval_rewards/chosen": -1.683185338973999, + "eval_rewards/margins": 1.4909465312957764, + "eval_rewards/rejected": -3.1741318702697754, + "eval_runtime": 225.158, + "eval_samples_per_second": 16.237, + "eval_steps_per_second": 0.293, + "step": 344 + }, + { + "epoch": 1.6302421736562316, + "grad_norm": 3.9177181536561396, + "learning_rate": 4.246658774100803e-08, + "logits/chosen": -2.5313777923583984, + "logits/rejected": -2.6277084350585938, + "logps/chosen": -407.18621826171875, + "logps/rejected": -496.5007629394531, + "loss": 0.4807, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.6058672666549683, + "rewards/margins": 1.1191009283065796, + "rewards/rejected": -2.724968194961548, + "step": 345 + }, + { + "epoch": 1.6349675132900177, + "grad_norm": 3.841031231021936, + "learning_rate": 4.140203564664421e-08, + "logits/chosen": -2.6209938526153564, + "logits/rejected": -2.650219678878784, + "logps/chosen": -361.2805480957031, + "logps/rejected": -478.2210998535156, + "loss": 0.4595, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4036047458648682, + "rewards/margins": 1.4318450689315796, + "rewards/rejected": -2.8354499340057373, + "step": 346 + }, + { + "epoch": 1.6396928529238037, + "grad_norm": 3.522512280432432, + "learning_rate": 4.0349790803341274e-08, + "logits/chosen": -2.8207521438598633, + "logits/rejected": -2.7012057304382324, + "logps/chosen": -348.5565490722656, + "logps/rejected": -503.2954406738281, + "loss": 0.4711, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.5178475379943848, + "rewards/margins": 1.5804749727249146, + "rewards/rejected": -3.0983223915100098, + "step": 347 + }, + { + "epoch": 1.64441819255759, + "grad_norm": 4.0111825885134245, + "learning_rate": 3.930991529339936e-08, + "logits/chosen": -2.629011631011963, + "logits/rejected": -2.583953380584717, + "logps/chosen": -376.8917236328125, + "logps/rejected": -623.347412109375, + "loss": 0.4686, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.57673978805542, + "rewards/margins": 2.396878719329834, + "rewards/rejected": -3.973618268966675, + "step": 348 + }, + { + "epoch": 1.6491435321913763, + "grad_norm": 4.471413252688273, + "learning_rate": 3.828247046932992e-08, + "logits/chosen": -2.6523804664611816, + "logits/rejected": -2.6967380046844482, + "logps/chosen": -339.6617126464844, + "logps/rejected": -441.6883239746094, + "loss": 0.4632, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.344817876815796, + "rewards/margins": 1.1860837936401367, + "rewards/rejected": -2.5309014320373535, + "step": 349 + }, + { + "epoch": 1.6538688718251624, + "grad_norm": 3.9015948159102902, + "learning_rate": 3.7267516950235525e-08, + "logits/chosen": -2.590344190597534, + "logits/rejected": -2.6862494945526123, + "logps/chosen": -347.7746887207031, + "logps/rejected": -500.335693359375, + "loss": 0.4407, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.4618185758590698, + "rewards/margins": 1.6277704238891602, + "rewards/rejected": -3.0895891189575195, + "step": 350 + }, + { + "epoch": 1.6585942114589485, + "grad_norm": 4.642097450354513, + "learning_rate": 3.62651146182334e-08, + "logits/chosen": -2.656710624694824, + "logits/rejected": -2.673964738845825, + "logps/chosen": -364.386962890625, + "logps/rejected": -458.63348388671875, + "loss": 0.5201, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.533918023109436, + "rewards/margins": 1.1320774555206299, + "rewards/rejected": -2.6659955978393555, + "step": 351 + }, + { + "epoch": 1.6633195510927348, + "grad_norm": 4.219548915754237, + "learning_rate": 3.527532261492272e-08, + "logits/chosen": -2.572221517562866, + "logits/rejected": -2.526576042175293, + "logps/chosen": -370.37396240234375, + "logps/rejected": -487.1659240722656, + "loss": 0.4737, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.4427790641784668, + "rewards/margins": 1.223225712776184, + "rewards/rejected": -2.6660046577453613, + "step": 352 + }, + { + "epoch": 1.668044890726521, + "grad_norm": 4.167493075518676, + "learning_rate": 3.4298199337894685e-08, + "logits/chosen": -2.6304097175598145, + "logits/rejected": -2.6287808418273926, + "logps/chosen": -387.1392822265625, + "logps/rejected": -581.6126708984375, + "loss": 0.4896, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.8162624835968018, + "rewards/margins": 2.150383234024048, + "rewards/rejected": -3.9666457176208496, + "step": 353 + }, + { + "epoch": 1.6727702303603071, + "grad_norm": 5.897960463157494, + "learning_rate": 3.333380243728773e-08, + "logits/chosen": -2.4372665882110596, + "logits/rejected": -2.5481488704681396, + "logps/chosen": -380.5074157714844, + "logps/rejected": -474.3066711425781, + "loss": 0.4799, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3929729461669922, + "rewards/margins": 1.5269252061843872, + "rewards/rejected": -2.919898271560669, + "step": 354 + }, + { + "epoch": 1.6774955699940932, + "grad_norm": 4.6734891602402, + "learning_rate": 3.238218881238558e-08, + "logits/chosen": -2.68146014213562, + "logits/rejected": -2.764531135559082, + "logps/chosen": -385.89788818359375, + "logps/rejected": -453.99371337890625, + "loss": 0.4799, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.5357441902160645, + "rewards/margins": 1.310141921043396, + "rewards/rejected": -2.845885992050171, + "step": 355 + }, + { + "epoch": 1.6822209096278795, + "grad_norm": 3.9989351330319503, + "learning_rate": 3.1443414608260526e-08, + "logits/chosen": -2.7942676544189453, + "logits/rejected": -2.7728328704833984, + "logps/chosen": -417.6398010253906, + "logps/rejected": -534.0235595703125, + "loss": 0.4725, + "rewards/accuracies": 0.796875, + "rewards/chosen": -1.648948073387146, + "rewards/margins": 1.4638010263442993, + "rewards/rejected": -3.1127490997314453, + "step": 356 + }, + { + "epoch": 1.6869462492616658, + "grad_norm": 4.114134264165304, + "learning_rate": 3.0517535212460946e-08, + "logits/chosen": -2.6477482318878174, + "logits/rejected": -2.7860238552093506, + "logps/chosen": -470.87103271484375, + "logps/rejected": -513.6318969726562, + "loss": 0.469, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.568166732788086, + "rewards/margins": 0.9764127731323242, + "rewards/rejected": -2.5445797443389893, + "step": 357 + }, + { + "epoch": 1.6916715888954519, + "grad_norm": 4.425834666691968, + "learning_rate": 2.960460525174313e-08, + "logits/chosen": -2.8916306495666504, + "logits/rejected": -2.853088855743408, + "logps/chosen": -339.81610107421875, + "logps/rejected": -483.9287109375, + "loss": 0.472, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.479756236076355, + "rewards/margins": 1.5092005729675293, + "rewards/rejected": -2.9889566898345947, + "step": 358 + }, + { + "epoch": 1.696396928529238, + "grad_norm": 4.184257952905371, + "learning_rate": 2.8704678588848535e-08, + "logits/chosen": -2.52712345123291, + "logits/rejected": -2.5050594806671143, + "logps/chosen": -368.9576416015625, + "logps/rejected": -528.0908813476562, + "loss": 0.4573, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.523732304573059, + "rewards/margins": 1.6147029399871826, + "rewards/rejected": -3.1384353637695312, + "step": 359 + }, + { + "epoch": 1.7011222681630243, + "grad_norm": 4.172727890906571, + "learning_rate": 2.781780831932595e-08, + "logits/chosen": -2.6111361980438232, + "logits/rejected": -2.634065628051758, + "logps/chosen": -437.898681640625, + "logps/rejected": -492.3134460449219, + "loss": 0.4917, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5102522373199463, + "rewards/margins": 0.8861820697784424, + "rewards/rejected": -2.3964343070983887, + "step": 360 + }, + { + "epoch": 1.7058476077968105, + "grad_norm": 3.6954976996048376, + "learning_rate": 2.6944046768398565e-08, + "logits/chosen": -2.5292959213256836, + "logits/rejected": -2.5336508750915527, + "logps/chosen": -348.8612060546875, + "logps/rejected": -474.258056640625, + "loss": 0.4838, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.3767406940460205, + "rewards/margins": 1.3499197959899902, + "rewards/rejected": -2.7266602516174316, + "step": 361 + }, + { + "epoch": 1.7105729474305966, + "grad_norm": 4.557110158231638, + "learning_rate": 2.608344548787722e-08, + "logits/chosen": -2.493603229522705, + "logits/rejected": -2.6656622886657715, + "logps/chosen": -443.8504638671875, + "logps/rejected": -544.0814819335938, + "loss": 0.4946, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7300480604171753, + "rewards/margins": 1.5911731719970703, + "rewards/rejected": -3.321221113204956, + "step": 362 + }, + { + "epoch": 1.7152982870643827, + "grad_norm": 4.456300726365723, + "learning_rate": 2.523605525311842e-08, + "logits/chosen": -2.5634899139404297, + "logits/rejected": -2.5102734565734863, + "logps/chosen": -380.3689270019531, + "logps/rejected": -480.8353271484375, + "loss": 0.457, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.3825726509094238, + "rewards/margins": 0.8929464221000671, + "rewards/rejected": -2.2755191326141357, + "step": 363 + }, + { + "epoch": 1.7200236266981688, + "grad_norm": 4.384534987558066, + "learning_rate": 2.440192606002889e-08, + "logits/chosen": -2.7241060733795166, + "logits/rejected": -2.700328826904297, + "logps/chosen": -400.6265869140625, + "logps/rejected": -509.753173828125, + "loss": 0.4544, + "rewards/accuracies": 0.609375, + "rewards/chosen": -1.817798137664795, + "rewards/margins": 1.4839262962341309, + "rewards/rejected": -3.3017241954803467, + "step": 364 + }, + { + "epoch": 1.724748966331955, + "grad_norm": 4.665836153801188, + "learning_rate": 2.3581107122115723e-08, + "logits/chosen": -2.7754364013671875, + "logits/rejected": -2.7966537475585938, + "logps/chosen": -398.6808166503906, + "logps/rejected": -486.95263671875, + "loss": 0.4908, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.9782556295394897, + "rewards/margins": 1.371333360671997, + "rewards/rejected": -3.3495888710021973, + "step": 365 + }, + { + "epoch": 1.7294743059657414, + "grad_norm": 4.147614147053133, + "learning_rate": 2.2773646867582763e-08, + "logits/chosen": -2.626425266265869, + "logits/rejected": -2.57529354095459, + "logps/chosen": -398.737548828125, + "logps/rejected": -586.5382690429688, + "loss": 0.488, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.5953752994537354, + "rewards/margins": 1.6417709589004517, + "rewards/rejected": -3.2371463775634766, + "step": 366 + }, + { + "epoch": 1.7341996455995274, + "grad_norm": 4.0997514353208775, + "learning_rate": 2.19795929364735e-08, + "logits/chosen": -2.473259449005127, + "logits/rejected": -2.6184892654418945, + "logps/chosen": -344.3788757324219, + "logps/rejected": -420.8124694824219, + "loss": 0.4917, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3588237762451172, + "rewards/margins": 1.1373924016952515, + "rewards/rejected": -2.496216058731079, + "step": 367 + }, + { + "epoch": 1.7389249852333135, + "grad_norm": 4.374774389574994, + "learning_rate": 2.119899217785995e-08, + "logits/chosen": -2.467965841293335, + "logits/rejected": -2.5698554515838623, + "logps/chosen": -387.9084777832031, + "logps/rejected": -457.1981201171875, + "loss": 0.4716, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.393808364868164, + "rewards/margins": 1.2519505023956299, + "rewards/rejected": -2.645759105682373, + "step": 368 + }, + { + "epoch": 1.7436503248670998, + "grad_norm": 4.049649530225524, + "learning_rate": 2.0431890647079093e-08, + "logits/chosen": -2.407700538635254, + "logits/rejected": -2.458270788192749, + "logps/chosen": -414.94561767578125, + "logps/rejected": -544.7018432617188, + "loss": 0.4652, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8057122230529785, + "rewards/margins": 1.333924412727356, + "rewards/rejected": -3.139636754989624, + "step": 369 + }, + { + "epoch": 1.748375664500886, + "grad_norm": 4.671069391094125, + "learning_rate": 1.967833360301513e-08, + "logits/chosen": -2.637674331665039, + "logits/rejected": -2.722353458404541, + "logps/chosen": -357.71142578125, + "logps/rejected": -436.8525390625, + "loss": 0.4661, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.5406451225280762, + "rewards/margins": 1.1408922672271729, + "rewards/rejected": -2.681537389755249, + "step": 370 + }, + { + "epoch": 1.7531010041346722, + "grad_norm": 4.49257028234618, + "learning_rate": 1.8938365505429544e-08, + "logits/chosen": -2.710331678390503, + "logits/rejected": -2.7880280017852783, + "logps/chosen": -386.74945068359375, + "logps/rejected": -489.5121154785156, + "loss": 0.4942, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6727674007415771, + "rewards/margins": 1.33446204662323, + "rewards/rejected": -3.0072293281555176, + "step": 371 + }, + { + "epoch": 1.7578263437684583, + "grad_norm": 3.5465325285307228, + "learning_rate": 1.8212030012337704e-08, + "logits/chosen": -2.757737874984741, + "logits/rejected": -2.687244415283203, + "logps/chosen": -360.2991943359375, + "logps/rejected": -512.8646240234375, + "loss": 0.4577, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.4896515607833862, + "rewards/margins": 1.5841938257217407, + "rewards/rejected": -3.0738449096679688, + "step": 372 + }, + { + "epoch": 1.7625516834022446, + "grad_norm": 4.417035193749527, + "learning_rate": 1.7499369977433453e-08, + "logits/chosen": -2.6367974281311035, + "logits/rejected": -2.677651882171631, + "logps/chosen": -362.3944396972656, + "logps/rejected": -460.9113464355469, + "loss": 0.4586, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5859841108322144, + "rewards/margins": 1.0494407415390015, + "rewards/rejected": -2.6354243755340576, + "step": 373 + }, + { + "epoch": 1.7672770230360308, + "grad_norm": 4.392962342030237, + "learning_rate": 1.680042744756016e-08, + "logits/chosen": -2.9928336143493652, + "logits/rejected": -2.9165008068084717, + "logps/chosen": -366.8034973144531, + "logps/rejected": -550.0221557617188, + "loss": 0.4769, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5658485889434814, + "rewards/margins": 1.7793083190917969, + "rewards/rejected": -3.3451569080352783, + "step": 374 + }, + { + "epoch": 1.772002362669817, + "grad_norm": 4.856077218434121, + "learning_rate": 1.611524366023062e-08, + "logits/chosen": -2.722025156021118, + "logits/rejected": -2.8585572242736816, + "logps/chosen": -347.9723815917969, + "logps/rejected": -446.63604736328125, + "loss": 0.4742, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4827804565429688, + "rewards/margins": 1.0786786079406738, + "rewards/rejected": -2.5614588260650635, + "step": 375 + }, + { + "epoch": 1.776727702303603, + "grad_norm": 3.752916160722341, + "learning_rate": 1.544385904119344e-08, + "logits/chosen": -2.913771867752075, + "logits/rejected": -2.971179962158203, + "logps/chosen": -360.3616638183594, + "logps/rejected": -423.8080749511719, + "loss": 0.4586, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.482804298400879, + "rewards/margins": 1.2081830501556396, + "rewards/rejected": -2.6909875869750977, + "step": 376 + }, + { + "epoch": 1.7814530419373893, + "grad_norm": 4.12751400895733, + "learning_rate": 1.4786313202048456e-08, + "logits/chosen": -2.6907248497009277, + "logits/rejected": -2.816317319869995, + "logps/chosen": -403.627685546875, + "logps/rejected": -487.9366760253906, + "loss": 0.4929, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.6239163875579834, + "rewards/margins": 1.5215915441513062, + "rewards/rejected": -3.1455078125, + "step": 377 + }, + { + "epoch": 1.7861783815711756, + "grad_norm": 4.590826584263257, + "learning_rate": 1.4142644937909203e-08, + "logits/chosen": -2.6118569374084473, + "logits/rejected": -2.6019818782806396, + "logps/chosen": -341.8358154296875, + "logps/rejected": -419.81829833984375, + "loss": 0.484, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3214576244354248, + "rewards/margins": 0.8901782631874084, + "rewards/rejected": -2.2116360664367676, + "step": 378 + }, + { + "epoch": 1.7909037212049617, + "grad_norm": 4.146292704965862, + "learning_rate": 1.351289222511426e-08, + "logits/chosen": -2.610994577407837, + "logits/rejected": -2.78933048248291, + "logps/chosen": -369.8974609375, + "logps/rejected": -480.7877197265625, + "loss": 0.4694, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5018396377563477, + "rewards/margins": 1.5635069608688354, + "rewards/rejected": -3.0653464794158936, + "step": 379 + }, + { + "epoch": 1.7956290608387477, + "grad_norm": 5.1734179405150895, + "learning_rate": 1.2897092218986716e-08, + "logits/chosen": -2.523732900619507, + "logits/rejected": -2.6424248218536377, + "logps/chosen": -425.2537536621094, + "logps/rejected": -560.3030395507812, + "loss": 0.4854, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.5777177810668945, + "rewards/margins": 1.5381261110305786, + "rewards/rejected": -3.1158437728881836, + "step": 380 + }, + { + "epoch": 1.8003544004725338, + "grad_norm": 3.865564251130276, + "learning_rate": 1.2295281251641698e-08, + "logits/chosen": -2.653510570526123, + "logits/rejected": -2.6763200759887695, + "logps/chosen": -392.48577880859375, + "logps/rejected": -508.2447509765625, + "loss": 0.4676, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.5738812685012817, + "rewards/margins": 1.344365119934082, + "rewards/rejected": -2.918246269226074, + "step": 381 + }, + { + "epoch": 1.80507974010632, + "grad_norm": 3.9007660136930062, + "learning_rate": 1.1707494829843207e-08, + "logits/chosen": -2.4932150840759277, + "logits/rejected": -2.525545835494995, + "logps/chosen": -393.6082763671875, + "logps/rejected": -494.412109375, + "loss": 0.4637, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.6010535955429077, + "rewards/margins": 1.5001673698425293, + "rewards/rejected": -3.1012210845947266, + "step": 382 + }, + { + "epoch": 1.8098050797401064, + "grad_norm": 3.789687566531843, + "learning_rate": 1.1133767632908798e-08, + "logits/chosen": -2.773787021636963, + "logits/rejected": -2.7511181831359863, + "logps/chosen": -374.2195739746094, + "logps/rejected": -590.141357421875, + "loss": 0.5006, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.784470558166504, + "rewards/margins": 1.8250298500061035, + "rewards/rejected": -3.6095001697540283, + "step": 383 + }, + { + "epoch": 1.8145304193738925, + "grad_norm": 4.27781176287506, + "learning_rate": 1.0574133510663747e-08, + "logits/chosen": -2.4717490673065186, + "logits/rejected": -2.580211639404297, + "logps/chosen": -404.38250732421875, + "logps/rejected": -501.7881774902344, + "loss": 0.4858, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4478615522384644, + "rewards/margins": 1.3333170413970947, + "rewards/rejected": -2.7811787128448486, + "step": 384 + }, + { + "epoch": 1.8192557590076786, + "grad_norm": 4.660154426544148, + "learning_rate": 1.0028625481443981e-08, + "logits/chosen": -2.6154394149780273, + "logits/rejected": -2.6079351902008057, + "logps/chosen": -332.8676452636719, + "logps/rejected": -475.0517272949219, + "loss": 0.4503, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.4882500171661377, + "rewards/margins": 1.4799097776412964, + "rewards/rejected": -2.9681599140167236, + "step": 385 + }, + { + "epoch": 1.8239810986414648, + "grad_norm": 4.32107981908328, + "learning_rate": 9.497275730147774e-09, + "logits/chosen": -2.57356333732605, + "logits/rejected": -2.566416025161743, + "logps/chosen": -400.1445007324219, + "logps/rejected": -566.9680786132812, + "loss": 0.4524, + "rewards/accuracies": 0.796875, + "rewards/chosen": -1.724095344543457, + "rewards/margins": 1.8485801219940186, + "rewards/rejected": -3.5726757049560547, + "step": 386 + }, + { + "epoch": 1.8287064382752511, + "grad_norm": 4.382882347588404, + "learning_rate": 8.980115606337046e-09, + "logits/chosen": -2.744180202484131, + "logits/rejected": -2.6595263481140137, + "logps/chosen": -318.4762268066406, + "logps/rejected": -484.2186584472656, + "loss": 0.4836, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4589486122131348, + "rewards/margins": 1.453284502029419, + "rewards/rejected": -2.912233352661133, + "step": 387 + }, + { + "epoch": 1.8287064382752511, + "eval_logits/chosen": -2.637449264526367, + "eval_logits/rejected": -2.6433615684509277, + "eval_logps/chosen": -379.1832275390625, + "eval_logps/rejected": -518.951416015625, + "eval_loss": 0.4717705249786377, + "eval_rewards/accuracies": 0.6420454382896423, + "eval_rewards/chosen": -1.679525375366211, + "eval_rewards/margins": 1.5104897022247314, + "eval_rewards/rejected": -3.1900153160095215, + "eval_runtime": 226.5578, + "eval_samples_per_second": 16.137, + "eval_steps_per_second": 0.291, + "step": 387 + }, + { + "epoch": 1.8334317779090372, + "grad_norm": 4.767778942870907, + "learning_rate": 8.47717562238756e-09, + "logits/chosen": -2.504225254058838, + "logits/rejected": -2.5545616149902344, + "logps/chosen": -355.3597717285156, + "logps/rejected": -519.2000732421875, + "loss": 0.4663, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.5064764022827148, + "rewards/margins": 1.655312418937683, + "rewards/rejected": -3.1617889404296875, + "step": 388 + }, + { + "epoch": 1.8381571175428233, + "grad_norm": 4.59007333084006, + "learning_rate": 7.988485451688815e-09, + "logits/chosen": -2.8325204849243164, + "logits/rejected": -2.8123779296875, + "logps/chosen": -341.4407043457031, + "logps/rejected": -501.8046875, + "loss": 0.4958, + "rewards/accuracies": 0.609375, + "rewards/chosen": -1.6452869176864624, + "rewards/margins": 1.5238325595855713, + "rewards/rejected": -3.1691195964813232, + "step": 389 + }, + { + "epoch": 1.8428824571766096, + "grad_norm": 4.836517410103638, + "learning_rate": 7.514073926893432e-09, + "logits/chosen": -2.441648006439209, + "logits/rejected": -2.528268814086914, + "logps/chosen": -386.49493408203125, + "logps/rejected": -454.25616455078125, + "loss": 0.5151, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.5899585485458374, + "rewards/margins": 0.9383600950241089, + "rewards/rejected": -2.5283186435699463, + "step": 390 + }, + { + "epoch": 1.8476077968103959, + "grad_norm": 4.204493754201481, + "learning_rate": 7.053969038215674e-09, + "logits/chosen": -2.7649574279785156, + "logits/rejected": -2.6198995113372803, + "logps/chosen": -405.90283203125, + "logps/rejected": -581.5423583984375, + "loss": 0.4766, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.7321686744689941, + "rewards/margins": 1.7127902507781982, + "rewards/rejected": -3.4449586868286133, + "step": 391 + }, + { + "epoch": 1.852333136444182, + "grad_norm": 3.805689590990249, + "learning_rate": 6.608197931780496e-09, + "logits/chosen": -2.5429623126983643, + "logits/rejected": -2.4480888843536377, + "logps/chosen": -373.41375732421875, + "logps/rejected": -588.4783935546875, + "loss": 0.4787, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4610140323638916, + "rewards/margins": 1.9005025625228882, + "rewards/rejected": -3.3615164756774902, + "step": 392 + }, + { + "epoch": 1.857058476077968, + "grad_norm": 4.640569841501701, + "learning_rate": 6.176786908021453e-09, + "logits/chosen": -2.661363363265991, + "logits/rejected": -2.734570026397705, + "logps/chosen": -411.58306884765625, + "logps/rejected": -510.5803527832031, + "loss": 0.475, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6014535427093506, + "rewards/margins": 1.3125067949295044, + "rewards/rejected": -2.9139604568481445, + "step": 393 + }, + { + "epoch": 1.8617838157117543, + "grad_norm": 4.140728148044369, + "learning_rate": 5.759761420129322e-09, + "logits/chosen": -2.901158332824707, + "logits/rejected": -2.942783832550049, + "logps/chosen": -331.9306945800781, + "logps/rejected": -466.704833984375, + "loss": 0.4703, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4073690176010132, + "rewards/margins": 1.4787429571151733, + "rewards/rejected": -2.8861119747161865, + "step": 394 + }, + { + "epoch": 1.8665091553455406, + "grad_norm": 3.736333802301315, + "learning_rate": 5.357146072550278e-09, + "logits/chosen": -2.4809322357177734, + "logits/rejected": -2.499831438064575, + "logps/chosen": -393.06109619140625, + "logps/rejected": -467.2709655761719, + "loss": 0.4632, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.567178726196289, + "rewards/margins": 0.7932885885238647, + "rewards/rejected": -2.3604674339294434, + "step": 395 + }, + { + "epoch": 1.8712344949793267, + "grad_norm": 4.263089946197189, + "learning_rate": 4.968964619534138e-09, + "logits/chosen": -2.6070809364318848, + "logits/rejected": -2.5771572589874268, + "logps/chosen": -346.75042724609375, + "logps/rejected": -519.9202880859375, + "loss": 0.443, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.525521993637085, + "rewards/margins": 1.564245581626892, + "rewards/rejected": -3.0897674560546875, + "step": 396 + }, + { + "epoch": 1.8759598346131128, + "grad_norm": 4.104385303806953, + "learning_rate": 4.595239963733011e-09, + "logits/chosen": -2.834834575653076, + "logits/rejected": -2.8623130321502686, + "logps/chosen": -387.9482727050781, + "logps/rejected": -544.7717895507812, + "loss": 0.427, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.8901216983795166, + "rewards/margins": 1.8027396202087402, + "rewards/rejected": -3.6928610801696777, + "step": 397 + }, + { + "epoch": 1.8806851742468988, + "grad_norm": 3.9166519315043407, + "learning_rate": 4.2359941548499035e-09, + "logits/chosen": -2.438992977142334, + "logits/rejected": -2.4675240516662598, + "logps/chosen": -377.0653076171875, + "logps/rejected": -579.7406616210938, + "loss": 0.4621, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.7994550466537476, + "rewards/margins": 1.7710639238357544, + "rewards/rejected": -3.570518970489502, + "step": 398 + }, + { + "epoch": 1.8854105138806851, + "grad_norm": 3.980027730791039, + "learning_rate": 3.891248388337847e-09, + "logits/chosen": -2.461378574371338, + "logits/rejected": -2.3893392086029053, + "logps/chosen": -391.1559753417969, + "logps/rejected": -487.46112060546875, + "loss": 0.4763, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5593876838684082, + "rewards/margins": 1.065595030784607, + "rewards/rejected": -2.6249828338623047, + "step": 399 + }, + { + "epoch": 1.8901358535144714, + "grad_norm": 4.768587791597909, + "learning_rate": 3.5610230041494828e-09, + "logits/chosen": -2.3206558227539062, + "logits/rejected": -2.3519835472106934, + "logps/chosen": -392.8856506347656, + "logps/rejected": -517.7825317382812, + "loss": 0.4687, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5414066314697266, + "rewards/margins": 1.4560898542404175, + "rewards/rejected": -2.9974963665008545, + "step": 400 + }, + { + "epoch": 1.8948611931482575, + "grad_norm": 4.155264459662159, + "learning_rate": 3.2453374855367366e-09, + "logits/chosen": -2.830385684967041, + "logits/rejected": -2.893541097640991, + "logps/chosen": -356.2238464355469, + "logps/rejected": -443.7274169921875, + "loss": 0.4612, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.463138461112976, + "rewards/margins": 1.1921138763427734, + "rewards/rejected": -2.65525221824646, + "step": 401 + }, + { + "epoch": 1.8995865327820436, + "grad_norm": 4.0526022624158955, + "learning_rate": 2.9442104579016356e-09, + "logits/chosen": -2.313565731048584, + "logits/rejected": -2.4396870136260986, + "logps/chosen": -441.73736572265625, + "logps/rejected": -458.2696533203125, + "loss": 0.4724, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.3574057817459106, + "rewards/margins": 0.8890299797058105, + "rewards/rejected": -2.2464358806610107, + "step": 402 + }, + { + "epoch": 1.9043118724158299, + "grad_norm": 3.860540669652651, + "learning_rate": 2.657659687697156e-09, + "logits/chosen": -2.6873679161071777, + "logits/rejected": -2.549762725830078, + "logps/chosen": -317.7322082519531, + "logps/rejected": -429.63787841796875, + "loss": 0.4784, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.5817471742630005, + "rewards/margins": 1.0039292573928833, + "rewards/rejected": -2.5856761932373047, + "step": 403 + }, + { + "epoch": 1.9090372120496162, + "grad_norm": 4.53787076168368, + "learning_rate": 2.385702081379143e-09, + "logits/chosen": -2.3574860095977783, + "logits/rejected": -2.427652359008789, + "logps/chosen": -431.2988586425781, + "logps/rejected": -567.0403442382812, + "loss": 0.4604, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6895103454589844, + "rewards/margins": 1.765432357788086, + "rewards/rejected": -3.4549427032470703, + "step": 404 + }, + { + "epoch": 1.9137625516834023, + "grad_norm": 3.735971197718604, + "learning_rate": 2.1283536844087513e-09, + "logits/chosen": -2.5645933151245117, + "logits/rejected": -2.5676231384277344, + "logps/chosen": -362.2431335449219, + "logps/rejected": -531.4500122070312, + "loss": 0.4731, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.5010064840316772, + "rewards/margins": 1.6250488758087158, + "rewards/rejected": -3.1260552406311035, + "step": 405 + }, + { + "epoch": 1.9184878913171883, + "grad_norm": 3.613782865337248, + "learning_rate": 1.885629680305867e-09, + "logits/chosen": -2.569301128387451, + "logits/rejected": -2.568390130996704, + "logps/chosen": -384.6716613769531, + "logps/rejected": -521.9403076171875, + "loss": 0.4467, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.6642777919769287, + "rewards/margins": 1.619908332824707, + "rewards/rejected": -3.284186363220215, + "step": 406 + }, + { + "epoch": 1.9232132309509746, + "grad_norm": 4.461343319483535, + "learning_rate": 1.6575443897531294e-09, + "logits/chosen": -2.5552725791931152, + "logits/rejected": -2.436856985092163, + "logps/chosen": -367.128662109375, + "logps/rejected": -545.8114624023438, + "loss": 0.4754, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6492211818695068, + "rewards/margins": 1.592346429824829, + "rewards/rejected": -3.241567373275757, + "step": 407 + }, + { + "epoch": 1.927938570584761, + "grad_norm": 4.454271459686257, + "learning_rate": 1.4441112697511638e-09, + "logits/chosen": -2.6292550563812256, + "logits/rejected": -2.6084656715393066, + "logps/chosen": -421.9637756347656, + "logps/rejected": -506.38897705078125, + "loss": 0.484, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6983362436294556, + "rewards/margins": 1.19191312789917, + "rewards/rejected": -2.890249252319336, + "step": 408 + }, + { + "epoch": 1.932663910218547, + "grad_norm": 4.002197751374577, + "learning_rate": 1.2453429128245762e-09, + "logits/chosen": -2.5047454833984375, + "logits/rejected": -2.451660633087158, + "logps/chosen": -431.7708740234375, + "logps/rejected": -614.2373657226562, + "loss": 0.471, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9678634405136108, + "rewards/margins": 1.840644359588623, + "rewards/rejected": -3.8085079193115234, + "step": 409 + }, + { + "epoch": 1.937389249852333, + "grad_norm": 4.283657038062509, + "learning_rate": 1.061251046278938e-09, + "logits/chosen": -2.5947906970977783, + "logits/rejected": -2.700439214706421, + "logps/chosen": -392.77850341796875, + "logps/rejected": -458.53570556640625, + "loss": 0.4785, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.4087834358215332, + "rewards/margins": 1.301395297050476, + "rewards/rejected": -2.7101786136627197, + "step": 410 + }, + { + "epoch": 1.9421145894861194, + "grad_norm": 4.320389688019286, + "learning_rate": 8.918465315088941e-10, + "logits/chosen": -2.529167413711548, + "logits/rejected": -2.595038652420044, + "logps/chosen": -436.0292663574219, + "logps/rejected": -522.2467041015625, + "loss": 0.4691, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.7954521179199219, + "rewards/margins": 1.5213781595230103, + "rewards/rejected": -3.3168301582336426, + "step": 411 + }, + { + "epoch": 1.9468399291199054, + "grad_norm": 3.917488632599818, + "learning_rate": 7.371393633574252e-10, + "logits/chosen": -2.508873224258423, + "logits/rejected": -2.5457823276519775, + "logps/chosen": -384.64923095703125, + "logps/rejected": -475.94134521484375, + "loss": 0.469, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.4391266107559204, + "rewards/margins": 1.211023211479187, + "rewards/rejected": -2.6501498222351074, + "step": 412 + }, + { + "epoch": 1.9515652687536917, + "grad_norm": 3.9351124104785584, + "learning_rate": 5.971386695260705e-10, + "logits/chosen": -2.465684652328491, + "logits/rejected": -2.5066022872924805, + "logps/chosen": -385.0665283203125, + "logps/rejected": -479.13214111328125, + "loss": 0.4643, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.4245293140411377, + "rewards/margins": 1.249526858329773, + "rewards/rejected": -2.6740562915802, + "step": 413 + }, + { + "epoch": 1.9562906083874778, + "grad_norm": 3.704041812352823, + "learning_rate": 4.718527100364134e-10, + "logits/chosen": -2.6545798778533936, + "logits/rejected": -2.6624419689178467, + "logps/chosen": -366.58184814453125, + "logps/rejected": -510.66436767578125, + "loss": 0.4482, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5549781322479248, + "rewards/margins": 1.5164021253585815, + "rewards/rejected": -3.071380376815796, + "step": 414 + }, + { + "epoch": 1.9610159480212639, + "grad_norm": 4.499954109459605, + "learning_rate": 3.6128887674272133e-10, + "logits/chosen": -2.5801494121551514, + "logits/rejected": -2.551297187805176, + "logps/chosen": -360.39935302734375, + "logps/rejected": -539.8038330078125, + "loss": 0.4529, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6478443145751953, + "rewards/margins": 1.7761938571929932, + "rewards/rejected": -3.4240384101867676, + "step": 415 + }, + { + "epoch": 1.9657412876550502, + "grad_norm": 4.539840176257443, + "learning_rate": 2.6545369289587836e-10, + "logits/chosen": -2.506206750869751, + "logits/rejected": -2.729609489440918, + "logps/chosen": -423.645751953125, + "logps/rejected": -465.3453674316406, + "loss": 0.5018, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2937572002410889, + "rewards/margins": 1.3675696849822998, + "rewards/rejected": -2.6613268852233887, + "step": 416 + }, + { + "epoch": 1.9704666272888365, + "grad_norm": 4.977216825161169, + "learning_rate": 1.843528127584981e-10, + "logits/chosen": -2.5777127742767334, + "logits/rejected": -2.611386775970459, + "logps/chosen": -396.95892333984375, + "logps/rejected": -500.09375, + "loss": 0.4951, + "rewards/accuracies": 0.609375, + "rewards/chosen": -1.7503533363342285, + "rewards/margins": 1.2793786525726318, + "rewards/rejected": -3.0297319889068604, + "step": 417 + }, + { + "epoch": 1.9751919669226226, + "grad_norm": 4.452268262277202, + "learning_rate": 1.17991021271302e-10, + "logits/chosen": -2.4709465503692627, + "logits/rejected": -2.399839401245117, + "logps/chosen": -379.194091796875, + "logps/rejected": -520.7940063476562, + "loss": 0.4696, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5808072090148926, + "rewards/margins": 1.5406217575073242, + "rewards/rejected": -3.121428966522217, + "step": 418 + }, + { + "epoch": 1.9799173065564086, + "grad_norm": 4.060313383389935, + "learning_rate": 6.637223377078949e-11, + "logits/chosen": -2.822049856185913, + "logits/rejected": -2.800950765609741, + "logps/chosen": -309.0566711425781, + "logps/rejected": -446.6823425292969, + "loss": 0.4561, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.39582097530365, + "rewards/margins": 1.2124607563018799, + "rewards/rejected": -2.6082818508148193, + "step": 419 + }, + { + "epoch": 1.984642646190195, + "grad_norm": 4.291839752698122, + "learning_rate": 2.949949575833943e-11, + "logits/chosen": -2.625284433364868, + "logits/rejected": -2.6156272888183594, + "logps/chosen": -342.9141845703125, + "logps/rejected": -463.14312744140625, + "loss": 0.5061, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5005218982696533, + "rewards/margins": 1.2724413871765137, + "rewards/rejected": -2.772963047027588, + "step": 420 + }, + { + "epoch": 1.9893679858239812, + "grad_norm": 4.75166895203396, + "learning_rate": 7.374982720326217e-12, + "logits/chosen": -2.7700271606445312, + "logits/rejected": -2.7563564777374268, + "logps/chosen": -397.4788818359375, + "logps/rejected": -582.6298828125, + "loss": 0.4718, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.6806347370147705, + "rewards/margins": 1.8375290632247925, + "rewards/rejected": -3.5181639194488525, + "step": 421 + }, + { + "epoch": 1.9940933254577673, + "grad_norm": 4.42823795907204, + "learning_rate": 0.0, + "logits/chosen": -2.568351984024048, + "logits/rejected": -2.728876829147339, + "logps/chosen": -369.7898254394531, + "logps/rejected": -406.9199523925781, + "loss": 0.4865, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3815803527832031, + "rewards/margins": 0.7950088977813721, + "rewards/rejected": -2.176589250564575, + "step": 422 + }, + { + "epoch": 1.9940933254577673, + "step": 422, + "total_flos": 0.0, + "train_loss": 0.5345145690638872, + "train_runtime": 33183.7631, + "train_samples_per_second": 5.711, + "train_steps_per_second": 0.013 + } + ], + "logging_steps": 1, + "max_steps": 422, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 43, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}