{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 267, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.19, "grad_norm": 1.7745435237884521, "learning_rate": 1.7777777777777777e-05, "logits/chosen": -76.13655853271484, "logits/rejected": -79.90269470214844, "logps/chosen": -409.5575256347656, "logps/rejected": -37.49543380737305, "loss": 0.6777, "rewards/accuracies": 0.7647058963775635, "rewards/chosen": 0.026076888665556908, "rewards/margins": 0.03172626718878746, "rewards/rejected": -0.005649375729262829, "step": 17 }, { "epoch": 0.38, "grad_norm": 1.5432785749435425, "learning_rate": 2.925e-05, "logits/chosen": -75.53842163085938, "logits/rejected": -79.45289611816406, "logps/chosen": -452.3400573730469, "logps/rejected": -38.20747756958008, "loss": 0.5134, "rewards/accuracies": 1.0, "rewards/chosen": 0.3700771629810333, "rewards/margins": 0.4342119097709656, "rewards/rejected": -0.06413476169109344, "step": 34 }, { "epoch": 0.57, "grad_norm": 1.087545394897461, "learning_rate": 2.7125000000000002e-05, "logits/chosen": -73.8866195678711, "logits/rejected": -77.79715728759766, "logps/chosen": -482.1276550292969, "logps/rejected": -39.81657791137695, "loss": 0.2867, "rewards/accuracies": 1.0, "rewards/chosen": 1.0830751657485962, "rewards/margins": 1.335164189338684, "rewards/rejected": -0.25208911299705505, "step": 51 }, { "epoch": 0.76, "grad_norm": 0.5716714262962341, "learning_rate": 2.5e-05, "logits/chosen": -75.49048614501953, "logits/rejected": -77.57994079589844, "logps/chosen": -390.3035888671875, "logps/rejected": -43.950286865234375, "loss": 0.1289, "rewards/accuracies": 1.0, "rewards/chosen": 1.866355299949646, "rewards/margins": 2.485651731491089, "rewards/rejected": -0.6192965507507324, "step": 68 }, { "epoch": 0.96, "grad_norm": 0.27481791377067566, "learning_rate": 2.2875e-05, "logits/chosen": -76.5324478149414, "logits/rejected": -76.91486358642578, "logps/chosen": -544.392333984375, "logps/rejected": -47.081398010253906, "loss": 0.0693, "rewards/accuracies": 1.0, "rewards/chosen": 2.428267240524292, "rewards/margins": 3.443005323410034, "rewards/rejected": -1.014737844467163, "step": 85 }, { "epoch": 1.15, "grad_norm": 0.19873949885368347, "learning_rate": 2.075e-05, "logits/chosen": -75.65658569335938, "logits/rejected": -77.1497573852539, "logps/chosen": -488.48321533203125, "logps/rejected": -52.546443939208984, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": 2.4470691680908203, "rewards/margins": 3.960081100463867, "rewards/rejected": -1.5130122900009155, "step": 102 }, { "epoch": 1.34, "grad_norm": 0.19984892010688782, "learning_rate": 1.8625000000000002e-05, "logits/chosen": -74.55690002441406, "logits/rejected": -76.0978012084961, "logps/chosen": -458.3366394042969, "logps/rejected": -55.80501937866211, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": 2.5542659759521484, "rewards/margins": 4.403810501098633, "rewards/rejected": -1.849544644355774, "step": 119 }, { "epoch": 1.53, "grad_norm": 0.13584846258163452, "learning_rate": 1.65e-05, "logits/chosen": -76.71270751953125, "logits/rejected": -76.32665252685547, "logps/chosen": -315.4811706542969, "logps/rejected": -59.061004638671875, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": 2.4759371280670166, "rewards/margins": 4.6066389083862305, "rewards/rejected": -2.130701780319214, "step": 136 }, { "epoch": 1.72, "grad_norm": 0.2994050681591034, "learning_rate": 1.4375e-05, "logits/chosen": -75.33349609375, "logits/rejected": -75.40139770507812, "logps/chosen": -512.8544311523438, "logps/rejected": -61.232086181640625, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 2.7437143325805664, "rewards/margins": 5.155636310577393, "rewards/rejected": -2.4119224548339844, "step": 153 }, { "epoch": 1.91, "grad_norm": 0.03395446017384529, "learning_rate": 1.225e-05, "logits/chosen": -77.47164916992188, "logits/rejected": -75.93584442138672, "logps/chosen": -452.4098815917969, "logps/rejected": -63.56547164916992, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 3.0080432891845703, "rewards/margins": 5.618269443511963, "rewards/rejected": -2.6102263927459717, "step": 170 }, { "epoch": 2.1, "grad_norm": 0.10275749117136002, "learning_rate": 1.0125e-05, "logits/chosen": -75.49082946777344, "logits/rejected": -75.35734558105469, "logps/chosen": -393.71875, "logps/rejected": -64.44371795654297, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 2.5345280170440674, "rewards/margins": 5.2231974601745605, "rewards/rejected": -2.688669204711914, "step": 187 }, { "epoch": 2.29, "grad_norm": 0.02322622574865818, "learning_rate": 8e-06, "logits/chosen": -75.02998352050781, "logits/rejected": -75.74195861816406, "logps/chosen": -566.3424072265625, "logps/rejected": -67.69281005859375, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 3.286576986312866, "rewards/margins": 6.311046123504639, "rewards/rejected": -3.024468421936035, "step": 204 }, { "epoch": 2.48, "grad_norm": 0.0772617906332016, "learning_rate": 5.8750000000000005e-06, "logits/chosen": -75.23873138427734, "logits/rejected": -75.26593780517578, "logps/chosen": -395.7027282714844, "logps/rejected": -68.70228576660156, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 2.8116443157196045, "rewards/margins": 5.9091572761535645, "rewards/rejected": -3.097513198852539, "step": 221 }, { "epoch": 2.67, "grad_norm": 0.05107683688402176, "learning_rate": 3.75e-06, "logits/chosen": -76.43243408203125, "logits/rejected": -75.55731964111328, "logps/chosen": -469.6390380859375, "logps/rejected": -69.15847778320312, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 2.7971303462982178, "rewards/margins": 6.020565032958984, "rewards/rejected": -3.2234342098236084, "step": 238 }, { "epoch": 2.87, "grad_norm": 0.046351734548807144, "learning_rate": 1.6250000000000001e-06, "logits/chosen": -77.13455200195312, "logits/rejected": -75.18317413330078, "logps/chosen": -392.20501708984375, "logps/rejected": -69.30036163330078, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 2.826266050338745, "rewards/margins": 6.055777549743652, "rewards/rejected": -3.2295114994049072, "step": 255 } ], "logging_steps": 17, "max_steps": 267, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }