{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 500, "global_step": 318, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.5625e-08, "logits/chosen": -2.8509469032287598, "logits/rejected": -2.833181858062744, "logps/chosen": -133.73171997070312, "logps/pi_response": -47.9057502746582, "logps/ref_response": -47.9057502746582, "logps/rejected": -197.21273803710938, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -2.82759690284729, "logits/rejected": -2.798935890197754, "logps/chosen": -253.7941131591797, "logps/pi_response": -71.65193939208984, "logps/ref_response": -71.66093444824219, "logps/rejected": -215.510009765625, "loss": 0.693, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": 2.8001426471746527e-05, "rewards/margins": -0.00033620299655012786, "rewards/rejected": 0.0003642044321168214, "step": 10 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.7570412158966064, "logits/rejected": -2.760845422744751, "logps/chosen": -217.1673126220703, "logps/pi_response": -74.27923583984375, "logps/ref_response": -74.31416320800781, "logps/rejected": -202.97946166992188, "loss": 0.6917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.005049519240856171, "rewards/margins": 0.0024197015445679426, "rewards/rejected": 0.0026298172306269407, "step": 20 }, { "epoch": 0.09, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -2.7637505531311035, "logits/rejected": -2.7554142475128174, "logps/chosen": -234.3155059814453, "logps/pi_response": -68.43399047851562, "logps/ref_response": -68.40460205078125, "logps/rejected": -203.23374938964844, "loss": 0.6859, "rewards/accuracies": 0.65625, "rewards/chosen": 0.028325339779257774, "rewards/margins": 0.01702897995710373, "rewards/rejected": 0.01129635889083147, "step": 30 }, { "epoch": 0.13, "learning_rate": 4.990353313429303e-07, "logits/chosen": -2.8049890995025635, "logits/rejected": -2.780224084854126, "logps/chosen": -207.7467041015625, "logps/pi_response": -68.51658630371094, "logps/ref_response": -64.04513549804688, "logps/rejected": -203.04983520507812, "loss": 0.6743, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.04143567755818367, "rewards/margins": 0.02963915839791298, "rewards/rejected": 0.011796516366302967, "step": 40 }, { "epoch": 0.16, "learning_rate": 4.951291206355559e-07, "logits/chosen": -2.7561471462249756, "logits/rejected": -2.7407054901123047, "logps/chosen": -256.8748474121094, "logps/pi_response": -92.87406921386719, "logps/ref_response": -74.82122039794922, "logps/rejected": -236.1123504638672, "loss": 0.6633, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0026867527049034834, "rewards/margins": 0.06110120937228203, "rewards/rejected": -0.05841444805264473, "step": 50 }, { "epoch": 0.19, "learning_rate": 4.882681251368548e-07, "logits/chosen": -2.679486036300659, "logits/rejected": -2.681763172149658, "logps/chosen": -266.70977783203125, "logps/pi_response": -106.77949523925781, "logps/ref_response": -76.94175720214844, "logps/rejected": -241.783447265625, "loss": 0.6503, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.05337841436266899, "rewards/margins": 0.11435987800359726, "rewards/rejected": -0.16773828864097595, "step": 60 }, { "epoch": 0.22, "learning_rate": 4.785350472409791e-07, "logits/chosen": -2.6562085151672363, "logits/rejected": -2.638962984085083, "logps/chosen": -262.48358154296875, "logps/pi_response": -129.61695861816406, "logps/ref_response": -82.19517517089844, "logps/rejected": -264.07568359375, "loss": 0.6444, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.20274746417999268, "rewards/margins": 0.09253005683422089, "rewards/rejected": -0.29527753591537476, "step": 70 }, { "epoch": 0.25, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -2.6726343631744385, "logits/rejected": -2.667086601257324, "logps/chosen": -273.891845703125, "logps/pi_response": -138.45387268066406, "logps/ref_response": -72.91886901855469, "logps/rejected": -269.2601318359375, "loss": 0.6287, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.2682049870491028, "rewards/margins": 0.1169251948595047, "rewards/rejected": -0.3851301670074463, "step": 80 }, { "epoch": 0.28, "learning_rate": 4.5095513994085974e-07, "logits/chosen": -2.535644292831421, "logits/rejected": -2.532330274581909, "logps/chosen": -239.97012329101562, "logps/pi_response": -127.95100402832031, "logps/ref_response": -69.45870208740234, "logps/rejected": -257.403564453125, "loss": 0.6109, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.14788806438446045, "rewards/margins": 0.2313682734966278, "rewards/rejected": -0.37925636768341064, "step": 90 }, { "epoch": 0.31, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -2.5136990547180176, "logits/rejected": -2.509000062942505, "logps/chosen": -240.0645294189453, "logps/pi_response": -125.2841796875, "logps/ref_response": -70.416259765625, "logps/rejected": -254.03012084960938, "loss": 0.607, "rewards/accuracies": 0.625, "rewards/chosen": -0.2124701738357544, "rewards/margins": 0.27277401089668274, "rewards/rejected": -0.4852442145347595, "step": 100 }, { "epoch": 0.35, "learning_rate": 4.137151834863213e-07, "logits/chosen": -2.480569362640381, "logits/rejected": -2.443432331085205, "logps/chosen": -275.77630615234375, "logps/pi_response": -136.15298461914062, "logps/ref_response": -71.45120239257812, "logps/rejected": -269.72015380859375, "loss": 0.6071, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2312408983707428, "rewards/margins": 0.23492033779621124, "rewards/rejected": -0.46616125106811523, "step": 110 }, { "epoch": 0.38, "learning_rate": 3.920161866827889e-07, "logits/chosen": -2.439021348953247, "logits/rejected": -2.4340755939483643, "logps/chosen": -268.33770751953125, "logps/pi_response": -167.921630859375, "logps/ref_response": -77.01984405517578, "logps/rejected": -322.13433837890625, "loss": 0.5893, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.35008108615875244, "rewards/margins": 0.42321157455444336, "rewards/rejected": -0.7732926607131958, "step": 120 }, { "epoch": 0.41, "learning_rate": 3.6860532770864005e-07, "logits/chosen": -2.347311496734619, "logits/rejected": -2.335073947906494, "logps/chosen": -315.92718505859375, "logps/pi_response": -209.3319091796875, "logps/ref_response": -67.93054962158203, "logps/rejected": -312.83465576171875, "loss": 0.5915, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7186356782913208, "rewards/margins": 0.3434630036354065, "rewards/rejected": -1.062098741531372, "step": 130 }, { "epoch": 0.44, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -2.2413113117218018, "logits/rejected": -2.211487293243408, "logps/chosen": -371.6416931152344, "logps/pi_response": -265.18096923828125, "logps/ref_response": -74.56938171386719, "logps/rejected": -376.59466552734375, "loss": 0.6044, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1778886318206787, "rewards/margins": 0.32092222571372986, "rewards/rejected": -1.4988110065460205, "step": 140 }, { "epoch": 0.47, "learning_rate": 3.1779403380910425e-07, "logits/chosen": -2.240055561065674, "logits/rejected": -2.1979031562805176, "logps/chosen": -307.1637878417969, "logps/pi_response": -187.8131866455078, "logps/ref_response": -69.4150390625, "logps/rejected": -316.88873291015625, "loss": 0.5816, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5614252090454102, "rewards/margins": 0.41102123260498047, "rewards/rejected": -0.9724465608596802, "step": 150 }, { "epoch": 0.5, "learning_rate": 2.910060778827554e-07, "logits/chosen": -2.2194037437438965, "logits/rejected": -2.2016549110412598, "logps/chosen": -308.1712646484375, "logps/pi_response": -226.4488983154297, "logps/ref_response": -81.69718933105469, "logps/rejected": -343.4349365234375, "loss": 0.5699, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6651563048362732, "rewards/margins": 0.4220353662967682, "rewards/rejected": -1.0871917009353638, "step": 160 }, { "epoch": 0.53, "learning_rate": 2.6372383496608186e-07, "logits/chosen": -2.233020305633545, "logits/rejected": -2.1675045490264893, "logps/chosen": -348.2539978027344, "logps/pi_response": -231.01455688476562, "logps/ref_response": -75.20973205566406, "logps/rejected": -334.9202880859375, "loss": 0.6045, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6902152895927429, "rewards/margins": 0.4886610507965088, "rewards/rejected": -1.1788761615753174, "step": 170 }, { "epoch": 0.57, "learning_rate": 2.3627616503391812e-07, "logits/chosen": -2.137673854827881, "logits/rejected": -2.0975818634033203, "logps/chosen": -286.08172607421875, "logps/pi_response": -202.77284240722656, "logps/ref_response": -60.3970947265625, "logps/rejected": -297.22442626953125, "loss": 0.5776, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6755498051643372, "rewards/margins": 0.44268402457237244, "rewards/rejected": -1.1182337999343872, "step": 180 }, { "epoch": 0.6, "learning_rate": 2.089939221172446e-07, "logits/chosen": -2.1633880138397217, "logits/rejected": -2.1441142559051514, "logps/chosen": -301.385498046875, "logps/pi_response": -249.9862060546875, "logps/ref_response": -72.61909484863281, "logps/rejected": -376.3998718261719, "loss": 0.5583, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8089518547058105, "rewards/margins": 0.643667995929718, "rewards/rejected": -1.4526197910308838, "step": 190 }, { "epoch": 0.63, "learning_rate": 1.8220596619089573e-07, "logits/chosen": -2.21587872505188, "logits/rejected": -2.1797242164611816, "logps/chosen": -383.5548400878906, "logps/pi_response": -243.202880859375, "logps/ref_response": -76.12086486816406, "logps/rejected": -376.36138916015625, "loss": 0.5584, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7832227349281311, "rewards/margins": 0.4175417423248291, "rewards/rejected": -1.2007642984390259, "step": 200 }, { "epoch": 0.66, "learning_rate": 1.562351990976095e-07, "logits/chosen": -2.150066614151001, "logits/rejected": -2.121753692626953, "logps/chosen": -293.81488037109375, "logps/pi_response": -219.0130157470703, "logps/ref_response": -72.12835693359375, "logps/rejected": -308.10919189453125, "loss": 0.575, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8007175326347351, "rewards/margins": 0.48071402311325073, "rewards/rejected": -1.2814315557479858, "step": 210 }, { "epoch": 0.69, "learning_rate": 1.3139467229135998e-07, "logits/chosen": -2.182744026184082, "logits/rejected": -2.1633565425872803, "logps/chosen": -320.0791931152344, "logps/pi_response": -229.01052856445312, "logps/ref_response": -76.02903747558594, "logps/rejected": -319.9397277832031, "loss": 0.5582, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7191491723060608, "rewards/margins": 0.45336204767227173, "rewards/rejected": -1.172511339187622, "step": 220 }, { "epoch": 0.72, "learning_rate": 1.0798381331721107e-07, "logits/chosen": -2.094500780105591, "logits/rejected": -2.094238758087158, "logps/chosen": -340.3441467285156, "logps/pi_response": -242.52029418945312, "logps/ref_response": -78.22608184814453, "logps/rejected": -359.96136474609375, "loss": 0.5815, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8624930381774902, "rewards/margins": 0.3955257534980774, "rewards/rejected": -1.2580187320709229, "step": 230 }, { "epoch": 0.75, "learning_rate": 8.628481651367875e-08, "logits/chosen": -2.1416382789611816, "logits/rejected": -2.107877254486084, "logps/chosen": -334.34051513671875, "logps/pi_response": -253.45559692382812, "logps/ref_response": -71.44368743896484, "logps/rejected": -343.25750732421875, "loss": 0.567, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9990061521530151, "rewards/margins": 0.39658617973327637, "rewards/rejected": -1.3955923318862915, "step": 240 }, { "epoch": 0.78, "learning_rate": 6.655924144404906e-08, "logits/chosen": -2.109405279159546, "logits/rejected": -2.070265769958496, "logps/chosen": -366.1980285644531, "logps/pi_response": -274.1239318847656, "logps/ref_response": -80.24897003173828, "logps/rejected": -391.06976318359375, "loss": 0.5633, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0960283279418945, "rewards/margins": 0.4408624768257141, "rewards/rejected": -1.5368907451629639, "step": 250 }, { "epoch": 0.82, "learning_rate": 4.904486005914027e-08, "logits/chosen": -2.1373915672302246, "logits/rejected": -2.050778865814209, "logps/chosen": -357.5392150878906, "logps/pi_response": -278.73681640625, "logps/ref_response": -74.540283203125, "logps/rejected": -366.91839599609375, "loss": 0.5474, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0771440267562866, "rewards/margins": 0.7008964419364929, "rewards/rejected": -1.7780405282974243, "step": 260 }, { "epoch": 0.85, "learning_rate": 3.3952790595787986e-08, "logits/chosen": -2.031721353530884, "logits/rejected": -2.014037609100342, "logps/chosen": -339.314453125, "logps/pi_response": -293.8365783691406, "logps/ref_response": -78.09291076660156, "logps/rejected": -406.5647888183594, "loss": 0.5568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1859939098358154, "rewards/margins": 0.4465731084346771, "rewards/rejected": -1.6325668096542358, "step": 270 }, { "epoch": 0.88, "learning_rate": 2.1464952759020856e-08, "logits/chosen": -2.1228859424591064, "logits/rejected": -2.0986340045928955, "logps/chosen": -329.19244384765625, "logps/pi_response": -264.7164306640625, "logps/ref_response": -76.1480712890625, "logps/rejected": -405.26287841796875, "loss": 0.5487, "rewards/accuracies": 0.71875, "rewards/chosen": -1.001992106437683, "rewards/margins": 0.5500288009643555, "rewards/rejected": -1.552020788192749, "step": 280 }, { "epoch": 0.91, "learning_rate": 1.1731874863145142e-08, "logits/chosen": -2.1019539833068848, "logits/rejected": -2.051776885986328, "logps/chosen": -354.9820861816406, "logps/pi_response": -268.553466796875, "logps/ref_response": -72.30916595458984, "logps/rejected": -364.39324951171875, "loss": 0.5676, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.99445641040802, "rewards/margins": 0.649712324142456, "rewards/rejected": -1.6441688537597656, "step": 290 }, { "epoch": 0.94, "learning_rate": 4.8708793644441086e-09, "logits/chosen": -2.1944591999053955, "logits/rejected": -2.160240650177002, "logps/chosen": -352.7145080566406, "logps/pi_response": -258.2195129394531, "logps/ref_response": -73.66411590576172, "logps/rejected": -385.5911865234375, "loss": 0.5459, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9718688726425171, "rewards/margins": 0.5499030351638794, "rewards/rejected": -1.521772027015686, "step": 300 }, { "epoch": 0.97, "learning_rate": 9.64668657069706e-10, "logits/chosen": -2.138836622238159, "logits/rejected": -2.0898382663726807, "logps/chosen": -320.83782958984375, "logps/pi_response": -251.64108276367188, "logps/ref_response": -73.73819732666016, "logps/rejected": -360.6897277832031, "loss": 0.5544, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9648464322090149, "rewards/margins": 0.5148458480834961, "rewards/rejected": -1.4796922206878662, "step": 310 }, { "epoch": 1.0, "step": 318, "total_flos": 0.0, "train_loss": 0.5994115610542537, "train_runtime": 8245.2497, "train_samples_per_second": 4.943, "train_steps_per_second": 0.039 } ], "logging_steps": 10, "max_steps": 318, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }