{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 100, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.5625e-08, "logits/chosen": -3.110421895980835, "logits/rejected": -3.134347915649414, "logps/chosen": -514.6908569335938, "logps/rejected": -579.9437255859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -3.101275682449341, "logits/rejected": -3.0809450149536133, "logps/chosen": -515.6033935546875, "logps/rejected": -643.0913696289062, "loss": 0.6927, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": -0.003898666240274906, "rewards/margins": 0.00284082000143826, "rewards/rejected": -0.006739485543221235, "step": 10 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -3.13663387298584, "logits/rejected": -3.1290111541748047, "logps/chosen": -521.3823852539062, "logps/rejected": -703.0851440429688, "loss": 0.6705, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.14891520142555237, "rewards/margins": 0.05075854808092117, "rewards/rejected": -0.19967375695705414, "step": 20 }, { "epoch": 0.1, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -3.032599687576294, "logits/rejected": -3.0084524154663086, "logps/chosen": -572.4000854492188, "logps/rejected": -851.7752075195312, "loss": 0.5831, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.057348370552063, "rewards/margins": 0.3227284550666809, "rewards/rejected": -1.3800770044326782, "step": 30 }, { "epoch": 0.13, "learning_rate": 4.989935734988097e-07, "logits/chosen": -3.5056405067443848, "logits/rejected": -3.5222296714782715, "logps/chosen": -622.5767822265625, "logps/rejected": -858.5099487304688, "loss": 0.5609, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6478599309921265, "rewards/margins": 0.43607082962989807, "rewards/rejected": -2.083930730819702, "step": 40 }, { "epoch": 0.16, "learning_rate": 4.949188496058089e-07, "logits/chosen": -3.6517977714538574, "logits/rejected": -3.6317756175994873, "logps/chosen": -670.474609375, "logps/rejected": -942.6395263671875, "loss": 0.496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0249228477478027, "rewards/margins": 0.6633843779563904, "rewards/rejected": -2.688307762145996, "step": 50 }, { "epoch": 0.19, "learning_rate": 4.877641290737883e-07, "logits/chosen": -3.650251865386963, "logits/rejected": -3.607196807861328, "logps/chosen": -704.299072265625, "logps/rejected": -1045.16259765625, "loss": 0.3914, "rewards/accuracies": 0.84375, "rewards/chosen": -2.3376059532165527, "rewards/margins": 0.9999262690544128, "rewards/rejected": -3.3375325202941895, "step": 60 }, { "epoch": 0.22, "learning_rate": 4.776193866647039e-07, "logits/chosen": -3.519620895385742, "logits/rejected": -3.5314018726348877, "logps/chosen": -829.9962768554688, "logps/rejected": -1192.3013916015625, "loss": 0.3559, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.2941513061523438, "rewards/margins": 1.215731143951416, "rewards/rejected": -4.50988245010376, "step": 70 }, { "epoch": 0.26, "learning_rate": 4.646121984004665e-07, "logits/chosen": -3.396796464920044, "logits/rejected": -3.372447967529297, "logps/chosen": -777.1376342773438, "logps/rejected": -1117.571533203125, "loss": 0.373, "rewards/accuracies": 0.875, "rewards/chosen": -3.0304834842681885, "rewards/margins": 1.2057898044586182, "rewards/rejected": -4.236273288726807, "step": 80 }, { "epoch": 0.29, "learning_rate": 4.489061372204452e-07, "logits/chosen": -3.3173828125, "logits/rejected": -3.3279190063476562, "logps/chosen": -800.7711181640625, "logps/rejected": -1121.0828857421875, "loss": 0.37, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.910794496536255, "rewards/margins": 1.0732452869415283, "rewards/rejected": -3.9840400218963623, "step": 90 }, { "epoch": 0.32, "learning_rate": 4.3069871595684787e-07, "logits/chosen": -3.256371259689331, "logits/rejected": -3.238799571990967, "logps/chosen": -774.020263671875, "logps/rejected": -1094.2913818359375, "loss": 0.3678, "rewards/accuracies": 0.875, "rewards/chosen": -2.9788336753845215, "rewards/margins": 1.1674504280090332, "rewards/rejected": -4.146284103393555, "step": 100 }, { "epoch": 0.32, "eval_logits/chosen": -3.2330105304718018, "eval_logits/rejected": -3.2211368083953857, "eval_logps/chosen": -894.1544799804688, "eval_logps/rejected": -930.3583984375, "eval_loss": 0.8166332244873047, "eval_rewards/accuracies": 0.5703125, "eval_rewards/chosen": -3.5115277767181396, "eval_rewards/margins": 0.11863362044095993, "eval_rewards/rejected": -3.6301612854003906, "eval_runtime": 133.793, "eval_samples_per_second": 7.474, "eval_steps_per_second": 0.239, "step": 100 }, { "epoch": 0.35, "learning_rate": 4.10218903496256e-07, "logits/chosen": -3.049738645553589, "logits/rejected": -3.0541279315948486, "logps/chosen": -837.93798828125, "logps/rejected": -1120.489501953125, "loss": 0.3897, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.197683572769165, "rewards/margins": 0.9859063029289246, "rewards/rejected": -4.183589935302734, "step": 110 }, { "epoch": 0.38, "learning_rate": 3.877242453630256e-07, "logits/chosen": -2.862804889678955, "logits/rejected": -2.847139835357666, "logps/chosen": -793.1903076171875, "logps/rejected": -1099.2779541015625, "loss": 0.3824, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.867206573486328, "rewards/margins": 1.1447758674621582, "rewards/rejected": -4.011982440948486, "step": 120 }, { "epoch": 0.42, "learning_rate": 3.634976249348867e-07, "logits/chosen": -2.801413059234619, "logits/rejected": -2.810880184173584, "logps/chosen": -794.3284912109375, "logps/rejected": -1098.575927734375, "loss": 0.3865, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.982292652130127, "rewards/margins": 1.1373599767684937, "rewards/rejected": -4.11965274810791, "step": 130 }, { "epoch": 0.45, "learning_rate": 3.378437060203357e-07, "logits/chosen": -2.725770950317383, "logits/rejected": -2.6947712898254395, "logps/chosen": -855.1282958984375, "logps/rejected": -1178.1373291015625, "loss": 0.3907, "rewards/accuracies": 0.875, "rewards/chosen": -3.1478772163391113, "rewards/margins": 1.088375449180603, "rewards/rejected": -4.236252784729004, "step": 140 }, { "epoch": 0.48, "learning_rate": 3.110851015344735e-07, "logits/chosen": -2.901252269744873, "logits/rejected": -2.8886735439300537, "logps/chosen": -843.8787841796875, "logps/rejected": -1142.3704833984375, "loss": 0.3919, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.437390089035034, "rewards/margins": 1.1703314781188965, "rewards/rejected": -4.607722282409668, "step": 150 }, { "epoch": 0.51, "learning_rate": 2.8355831645441387e-07, "logits/chosen": -2.8201375007629395, "logits/rejected": -2.809413433074951, "logps/chosen": -826.0286865234375, "logps/rejected": -1086.612060546875, "loss": 0.4576, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.1525731086730957, "rewards/margins": 0.9918072819709778, "rewards/rejected": -4.144380569458008, "step": 160 }, { "epoch": 0.54, "learning_rate": 2.5560951607395126e-07, "logits/chosen": -2.9544239044189453, "logits/rejected": -2.957245349884033, "logps/chosen": -777.9389038085938, "logps/rejected": -1005.6591796875, "loss": 0.4832, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.6697211265563965, "rewards/margins": 0.7971886396408081, "rewards/rejected": -3.466909885406494, "step": 170 }, { "epoch": 0.58, "learning_rate": 2.2759017277414164e-07, "logits/chosen": -2.8567004203796387, "logits/rejected": -2.8372042179107666, "logps/chosen": -779.9255981445312, "logps/rejected": -981.6893310546875, "loss": 0.4884, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.700547456741333, "rewards/margins": 0.7202944755554199, "rewards/rejected": -3.420841932296753, "step": 180 }, { "epoch": 0.61, "learning_rate": 1.998526460541818e-07, "logits/chosen": -2.821296215057373, "logits/rejected": -2.7948215007781982, "logps/chosen": -824.6710205078125, "logps/rejected": -1023.0182495117188, "loss": 0.5075, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.867779493331909, "rewards/margins": 0.6687132120132446, "rewards/rejected": -3.5364928245544434, "step": 190 }, { "epoch": 0.64, "learning_rate": 1.7274575140626315e-07, "logits/chosen": -2.842245101928711, "logits/rejected": -2.822925329208374, "logps/chosen": -775.6517333984375, "logps/rejected": -931.916015625, "loss": 0.5115, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.6696395874023438, "rewards/margins": 0.5579996705055237, "rewards/rejected": -3.2276394367218018, "step": 200 }, { "epoch": 0.64, "eval_logits/chosen": -2.8897347450256348, "eval_logits/rejected": -2.870739221572876, "eval_logps/chosen": -840.2244262695312, "eval_logps/rejected": -875.9758911132812, "eval_loss": 0.7557607293128967, "eval_rewards/accuracies": 0.578125, "eval_rewards/chosen": -2.9722273349761963, "eval_rewards/margins": 0.11410895735025406, "eval_rewards/rejected": -3.086336135864258, "eval_runtime": 133.8945, "eval_samples_per_second": 7.469, "eval_steps_per_second": 0.239, "step": 200 }, { "epoch": 0.67, "learning_rate": 1.4661037375836987e-07, "logits/chosen": -2.8397040367126465, "logits/rejected": -2.827742338180542, "logps/chosen": -825.7728271484375, "logps/rejected": -1029.4368896484375, "loss": 0.5171, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8868887424468994, "rewards/margins": 0.7782662510871887, "rewards/rejected": -3.6651549339294434, "step": 210 }, { "epoch": 0.7, "learning_rate": 1.2177518064852348e-07, "logits/chosen": -2.826509475708008, "logits/rejected": -2.8322367668151855, "logps/chosen": -816.0640869140625, "logps/rejected": -998.5843505859375, "loss": 0.5314, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.879359006881714, "rewards/margins": 0.658044695854187, "rewards/rejected": -3.5374042987823486, "step": 220 }, { "epoch": 0.74, "learning_rate": 9.855248903979505e-08, "logits/chosen": -2.9330477714538574, "logits/rejected": -2.914445161819458, "logps/chosen": -791.6444702148438, "logps/rejected": -906.9136962890625, "loss": 0.5657, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.653576612472534, "rewards/margins": 0.42590421438217163, "rewards/rejected": -3.0794806480407715, "step": 230 }, { "epoch": 0.77, "learning_rate": 7.723433775328384e-08, "logits/chosen": -2.9414100646972656, "logits/rejected": -2.9438042640686035, "logps/chosen": -771.1376953125, "logps/rejected": -921.2554931640625, "loss": 0.5599, "rewards/accuracies": 0.6875, "rewards/chosen": -2.4653267860412598, "rewards/margins": 0.5729144811630249, "rewards/rejected": -3.038240909576416, "step": 240 }, { "epoch": 0.8, "learning_rate": 5.808881491049722e-08, "logits/chosen": -2.931709051132202, "logits/rejected": -2.920923948287964, "logps/chosen": -756.0911865234375, "logps/rejected": -916.9544677734375, "loss": 0.5833, "rewards/accuracies": 0.71875, "rewards/chosen": -2.339541435241699, "rewards/margins": 0.598496675491333, "rewards/rejected": -2.9380381107330322, "step": 250 }, { "epoch": 0.83, "learning_rate": 4.1356686569674335e-08, "logits/chosen": -2.885463237762451, "logits/rejected": -2.881591558456421, "logps/chosen": -859.6702270507812, "logps/rejected": -984.1055908203125, "loss": 0.5944, "rewards/accuracies": 0.625, "rewards/chosen": -2.5945677757263184, "rewards/margins": 0.44614577293395996, "rewards/rejected": -3.0407137870788574, "step": 260 }, { "epoch": 0.86, "learning_rate": 2.724836895290805e-08, "logits/chosen": -2.936840772628784, "logits/rejected": -2.935763359069824, "logps/chosen": -759.699951171875, "logps/rejected": -853.43603515625, "loss": 0.5844, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.392477035522461, "rewards/margins": 0.3930489718914032, "rewards/rejected": -2.7855257987976074, "step": 270 }, { "epoch": 0.9, "learning_rate": 1.5941282340065697e-08, "logits/chosen": -2.9058921337127686, "logits/rejected": -2.907921314239502, "logps/chosen": -818.0262451171875, "logps/rejected": -934.5661010742188, "loss": 0.5967, "rewards/accuracies": 0.65625, "rewards/chosen": -2.4692749977111816, "rewards/margins": 0.43090057373046875, "rewards/rejected": -2.9001753330230713, "step": 280 }, { "epoch": 0.93, "learning_rate": 7.577619905828281e-09, "logits/chosen": -2.9472568035125732, "logits/rejected": -2.9564738273620605, "logps/chosen": -775.2022705078125, "logps/rejected": -874.6551513671875, "loss": 0.6383, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.3951363563537598, "rewards/margins": 0.4553650915622711, "rewards/rejected": -2.850501537322998, "step": 290 }, { "epoch": 0.96, "learning_rate": 2.2625595580163247e-09, "logits/chosen": -2.9190421104431152, "logits/rejected": -2.8831028938293457, "logps/chosen": -839.2587890625, "logps/rejected": -975.5764770507812, "loss": 0.6156, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.542646884918213, "rewards/margins": 0.5185142755508423, "rewards/rejected": -3.0611610412597656, "step": 300 }, { "epoch": 0.96, "eval_logits/chosen": -3.023275852203369, "eval_logits/rejected": -3.008624315261841, "eval_logps/chosen": -781.6083374023438, "eval_logps/rejected": -817.5390014648438, "eval_loss": 0.7196429967880249, "eval_rewards/accuracies": 0.5703125, "eval_rewards/chosen": -2.386066436767578, "eval_rewards/margins": 0.11590027809143066, "eval_rewards/rejected": -2.501966714859009, "eval_runtime": 133.3992, "eval_samples_per_second": 7.496, "eval_steps_per_second": 0.24, "step": 300 }, { "epoch": 0.99, "learning_rate": 6.294126437336733e-11, "logits/chosen": -2.9566540718078613, "logits/rejected": -2.9173121452331543, "logps/chosen": -796.7486572265625, "logps/rejected": -895.9386596679688, "loss": 0.6191, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.3850722312927246, "rewards/margins": 0.37639713287353516, "rewards/rejected": -2.7614693641662598, "step": 310 }, { "epoch": 1.0, "step": 312, "total_flos": 0.0, "train_loss": 0.5059080181213526, "train_runtime": 5445.7158, "train_samples_per_second": 3.672, "train_steps_per_second": 0.057 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }