{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9947643979057592, "eval_steps": 100, "global_step": 95, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010471204188481676, "grad_norm": 14.42849432669548, "learning_rate": 2e-08, "logits/chosen": -2.705627918243408, "logits/rejected": -1.8209420442581177, "logps/chosen": -315.2232666015625, "logps/rejected": -333.2189025878906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.10471204188481675, "grad_norm": 13.44691822523609, "learning_rate": 2e-07, "logits/chosen": -2.7048144340515137, "logits/rejected": -2.1536295413970947, "logps/chosen": -277.1604309082031, "logps/rejected": -290.7293701171875, "loss": 0.6923, "rewards/accuracies": 0.5347222089767456, "rewards/chosen": 0.00019832928956020623, "rewards/margins": 0.0016432523261755705, "rewards/rejected": -0.0014449231093749404, "step": 10 }, { "epoch": 0.2094240837696335, "grad_norm": 16.21569144475015, "learning_rate": 1.9324722294043556e-07, "logits/chosen": -2.492572546005249, "logits/rejected": -2.0814006328582764, "logps/chosen": -309.6625061035156, "logps/rejected": -296.83868408203125, "loss": 0.6658, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.003994358237832785, "rewards/margins": 0.057490717619657516, "rewards/rejected": -0.061485081911087036, "step": 20 }, { "epoch": 0.31413612565445026, "grad_norm": 14.182589244162711, "learning_rate": 1.739008917220659e-07, "logits/chosen": -2.387019395828247, "logits/rejected": -1.9367955923080444, "logps/chosen": -299.33404541015625, "logps/rejected": -322.9083251953125, "loss": 0.5937, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.022330567240715027, "rewards/margins": 0.21069249510765076, "rewards/rejected": -0.23302307724952698, "step": 30 }, { "epoch": 0.418848167539267, "grad_norm": 16.048018340744715, "learning_rate": 1.4457383557765383e-07, "logits/chosen": -2.4496796131134033, "logits/rejected": -2.079987049102783, "logps/chosen": -294.8586730957031, "logps/rejected": -345.1488037109375, "loss": 0.5035, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.1000712662935257, "rewards/margins": 0.45323365926742554, "rewards/rejected": -0.55330491065979, "step": 40 }, { "epoch": 0.5235602094240838, "grad_norm": 22.958491853819712, "learning_rate": 1.092268359463302e-07, "logits/chosen": -2.2879467010498047, "logits/rejected": -1.8292083740234375, "logps/chosen": -306.67706298828125, "logps/rejected": -418.6455993652344, "loss": 0.4123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4231874942779541, "rewards/margins": 0.8138816952705383, "rewards/rejected": -1.2370691299438477, "step": 50 }, { "epoch": 0.6282722513089005, "grad_norm": 19.370046611297532, "learning_rate": 7.263370099279171e-08, "logits/chosen": -2.491612672805786, "logits/rejected": -2.067852735519409, "logps/chosen": -442.7276916503906, "logps/rejected": -641.3619384765625, "loss": 0.2439, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.5304675102233887, "rewards/margins": 1.8317034244537354, "rewards/rejected": -3.362171173095703, "step": 60 }, { "epoch": 0.7329842931937173, "grad_norm": 17.60064306931795, "learning_rate": 3.973653636207437e-08, "logits/chosen": -2.3535571098327637, "logits/rejected": -1.9946672916412354, "logps/chosen": -591.0353393554688, "logps/rejected": -910.4153442382812, "loss": 0.1698, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.834195613861084, "rewards/margins": 3.2335205078125, "rewards/rejected": -6.067716598510742, "step": 70 }, { "epoch": 0.837696335078534, "grad_norm": 20.39945478972367, "learning_rate": 1.49782864270386e-08, "logits/chosen": -2.500075101852417, "logits/rejected": -2.16581654548645, "logps/chosen": -597.4744262695312, "logps/rejected": -964.6823120117188, "loss": 0.1431, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.152522563934326, "rewards/margins": 3.5699126720428467, "rewards/rejected": -6.722434997558594, "step": 80 }, { "epoch": 0.9424083769633508, "grad_norm": 27.851116517075948, "learning_rate": 1.7026900316098214e-09, "logits/chosen": -2.4192073345184326, "logits/rejected": -2.111260175704956, "logps/chosen": -618.6544799804688, "logps/rejected": -911.25146484375, "loss": 0.137, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.717777967453003, "rewards/margins": 3.354006290435791, "rewards/rejected": -6.071784019470215, "step": 90 }, { "epoch": 0.9947643979057592, "step": 95, "total_flos": 0.0, "train_loss": 0.38092811358602424, "train_runtime": 1269.7396, "train_samples_per_second": 9.621, "train_steps_per_second": 0.075 } ], "logging_steps": 10, "max_steps": 95, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }