{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9795918367346939, "eval_steps": 500, "global_step": 24, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 417.625, "epoch": 0.04081632653061224, "grad_norm": 0.33383865936017726, "kl": 0.0, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "reward": 1.2666015625, "reward_std": 0.6843482926487923, "rewards/format_reward": 0.0625, "rewards/instruction_follow_reward": 0.2200520858168602, "rewards/tag_count_reward": 0.5439453125, "step": 1 }, { "completion_length": 422.734375, "epoch": 0.08163265306122448, "grad_norm": 0.4356198565702374, "kl": 0.0, "learning_rate": 3.3333333333333333e-06, "loss": 0.0, "reward": 1.1138671934604645, "reward_std": 0.8332342207431793, "rewards/format_reward": 0.06640625, "rewards/instruction_follow_reward": 0.16718750074505806, "rewards/tag_count_reward": 0.5458984375, "step": 2 }, { "completion_length": 402.48828125, "epoch": 0.12244897959183673, "grad_norm": 0.4637419702838537, "kl": 0.0002346038818359375, "learning_rate": 5e-06, "loss": 0.0, "reward": 1.1527343690395355, "reward_std": 0.7186494022607803, "rewards/format_reward": 0.078125, "rewards/instruction_follow_reward": 0.17395833134651184, "rewards/tag_count_reward": 0.552734375, "step": 3 }, { "completion_length": 456.05859375, "epoch": 0.16326530612244897, "grad_norm": 0.3857097027648158, "kl": 0.0002655982971191406, "learning_rate": 4.97486935900654e-06, "loss": 0.0, "reward": 1.0625, "reward_std": 0.7507821768522263, "rewards/format_reward": 0.05859375, "rewards/instruction_follow_reward": 0.15234375, "rewards/tag_count_reward": 0.546875, "step": 4 }, { "completion_length": 413.34375, "epoch": 0.20408163265306123, "grad_norm": 0.466480501029486, "kl": 0.0003476142883300781, "learning_rate": 4.900038813018817e-06, "loss": 0.0, "reward": 1.1712267696857452, "reward_std": 0.8179685026407242, "rewards/format_reward": 0.1328125, "rewards/instruction_follow_reward": 0.1521276757121086, "rewards/tag_count_reward": 0.58203125, "step": 5 }, { "completion_length": 433.4765625, "epoch": 0.24489795918367346, "grad_norm": 0.362256322240001, "kl": 0.0005931854248046875, "learning_rate": 4.777179952780443e-06, "loss": 0.0, "reward": 1.1787109375, "reward_std": 0.7316572219133377, "rewards/format_reward": 0.1484375, "rewards/instruction_follow_reward": 0.1458333283662796, "rewards/tag_count_reward": 0.5927734375, "step": 6 }, { "completion_length": 393.33203125, "epoch": 0.2857142857142857, "grad_norm": 0.41571573133533485, "kl": 0.002559661865234375, "learning_rate": 4.609037242210989e-06, "loss": 0.0001, "reward": 1.212109386920929, "reward_std": 0.84661765396595, "rewards/format_reward": 0.2265625, "rewards/instruction_follow_reward": 0.11757812649011612, "rewards/tag_count_reward": 0.6328125, "step": 7 }, { "completion_length": 417.6015625, "epoch": 0.32653061224489793, "grad_norm": 0.4374737996718517, "kl": 0.006134033203125, "learning_rate": 4.39936671161711e-06, "loss": 0.0002, "reward": 1.6025390625, "reward_std": 0.9893212765455246, "rewards/format_reward": 0.39453125, "rewards/instruction_follow_reward": 0.1650390625, "rewards/tag_count_reward": 0.712890625, "step": 8 }, { "completion_length": 401.41015625, "epoch": 0.3673469387755102, "grad_norm": 0.4343571016630881, "kl": 0.00911712646484375, "learning_rate": 4.152852054182151e-06, "loss": 0.0004, "reward": 1.696484386920929, "reward_std": 1.0057250708341599, "rewards/format_reward": 0.4765625, "rewards/instruction_follow_reward": 0.15273437649011612, "rewards/tag_count_reward": 0.76171875, "step": 9 }, { "completion_length": 368.63671875, "epoch": 0.40816326530612246, "grad_norm": 0.48080292326265134, "kl": 0.021087646484375, "learning_rate": 3.875e-06, "loss": 0.0008, "reward": 1.8870041966438293, "reward_std": 1.0354472994804382, "rewards/format_reward": 0.5859375, "rewards/instruction_follow_reward": 0.1654597371816635, "rewards/tag_count_reward": 0.8046875, "step": 10 }, { "completion_length": 423.52734375, "epoch": 0.4489795918367347, "grad_norm": 0.4924150425528683, "kl": 0.027374267578125, "learning_rate": 3.5720173048243896e-06, "loss": 0.0011, "reward": 2.1278125047683716, "reward_std": 0.8741143345832825, "rewards/format_reward": 0.61328125, "rewards/instruction_follow_reward": 0.23433593660593033, "rewards/tag_count_reward": 0.8115234375, "step": 11 }, { "completion_length": 377.40234375, "epoch": 0.4897959183673469, "grad_norm": 5.854623452977172, "kl": 0.2474365234375, "learning_rate": 3.2506721014017075e-06, "loss": 0.0099, "reward": 2.1224609315395355, "reward_std": 0.955539807677269, "rewards/format_reward": 0.65625, "rewards/instruction_follow_reward": 0.20585937798023224, "rewards/tag_count_reward": 0.8486328125, "step": 12 }, { "completion_length": 413.79296875, "epoch": 0.5306122448979592, "grad_norm": 0.6602522146512088, "kl": 0.042938232421875, "learning_rate": 2.918142710569455e-06, "loss": 0.0017, "reward": 2.029882788658142, "reward_std": 1.0107707530260086, "rewards/format_reward": 0.625, "rewards/instruction_follow_reward": 0.19062500074505806, "rewards/tag_count_reward": 0.8330078125, "step": 13 }, { "completion_length": 421.06640625, "epoch": 0.5714285714285714, "grad_norm": 0.5712567282923191, "kl": 0.025848388671875, "learning_rate": 2.5818572894305453e-06, "loss": 0.001, "reward": 2.166015625, "reward_std": 1.066056489944458, "rewards/format_reward": 0.6328125, "rewards/instruction_follow_reward": 0.236328125, "rewards/tag_count_reward": 0.82421875, "step": 14 }, { "completion_length": 444.87890625, "epoch": 0.6122448979591837, "grad_norm": 0.36194372988033446, "kl": 0.0115203857421875, "learning_rate": 2.2493278985982932e-06, "loss": 0.0005, "reward": 2.072853773832321, "reward_std": 0.8489355742931366, "rewards/format_reward": 0.66796875, "rewards/instruction_follow_reward": 0.18574292585253716, "rewards/tag_count_reward": 0.84765625, "step": 15 }, { "completion_length": 384.046875, "epoch": 0.6530612244897959, "grad_norm": 0.44405131315182733, "kl": 0.014404296875, "learning_rate": 1.9279826951756115e-06, "loss": 0.0006, "reward": 2.0263671875, "reward_std": 1.1294618248939514, "rewards/format_reward": 0.62109375, "rewards/instruction_follow_reward": 0.193359375, "rewards/tag_count_reward": 0.8251953125, "step": 16 }, { "completion_length": 422.83984375, "epoch": 0.6938775510204082, "grad_norm": 0.423027453292725, "kl": 0.0103912353515625, "learning_rate": 1.6250000000000007e-06, "loss": 0.0004, "reward": 1.822265625, "reward_std": 0.994486004114151, "rewards/format_reward": 0.57421875, "rewards/instruction_follow_reward": 0.15234375, "rewards/tag_count_reward": 0.791015625, "step": 17 }, { "completion_length": 406.91015625, "epoch": 0.7346938775510204, "grad_norm": 0.39946864041715185, "kl": 0.0131072998046875, "learning_rate": 1.3471479458178499e-06, "loss": 0.0005, "reward": 2.033278226852417, "reward_std": 0.9287643581628799, "rewards/format_reward": 0.64453125, "rewards/instruction_follow_reward": 0.18426983058452606, "rewards/tag_count_reward": 0.8359375, "step": 18 }, { "completion_length": 337.79296875, "epoch": 0.7755102040816326, "grad_norm": 0.4685607339001439, "kl": 0.0146942138671875, "learning_rate": 1.1006332883828912e-06, "loss": 0.0006, "reward": 2.13671875, "reward_std": 0.9065403789281845, "rewards/format_reward": 0.57421875, "rewards/instruction_follow_reward": 0.25, "rewards/tag_count_reward": 0.8125, "step": 19 }, { "completion_length": 366.734375, "epoch": 0.8163265306122449, "grad_norm": 0.4565162205296167, "kl": 0.0125579833984375, "learning_rate": 8.909627577890121e-07, "loss": 0.0005, "reward": 2.1884765625, "reward_std": 1.1439659893512726, "rewards/format_reward": 0.609375, "rewards/instruction_follow_reward": 0.2500000037252903, "rewards/tag_count_reward": 0.8291015625, "step": 20 }, { "completion_length": 380.3515625, "epoch": 0.8571428571428571, "grad_norm": 0.4354360557045128, "kl": 0.0111846923828125, "learning_rate": 7.228200472195574e-07, "loss": 0.0004, "reward": 1.9033203125, "reward_std": 0.9723567366600037, "rewards/format_reward": 0.6015625, "rewards/instruction_follow_reward": 0.166015625, "rewards/tag_count_reward": 0.8037109375, "step": 21 }, { "completion_length": 357.83984375, "epoch": 0.8979591836734694, "grad_norm": 0.4196659338855581, "kl": 0.0109710693359375, "learning_rate": 5.999611869811834e-07, "loss": 0.0004, "reward": 1.966796875, "reward_std": 0.9246305525302887, "rewards/format_reward": 0.53515625, "rewards/instruction_follow_reward": 0.2109375, "rewards/tag_count_reward": 0.798828125, "step": 22 }, { "completion_length": 371.6953125, "epoch": 0.9387755102040817, "grad_norm": 0.45407932227577535, "kl": 0.0106658935546875, "learning_rate": 5.251306409934609e-07, "loss": 0.0004, "reward": 1.66015625, "reward_std": 0.976933628320694, "rewards/format_reward": 0.54296875, "rewards/instruction_follow_reward": 0.109375, "rewards/tag_count_reward": 0.7890625, "step": 23 }, { "completion_length": 396.734375, "epoch": 0.9795918367346939, "grad_norm": 0.4051011056836698, "kl": 0.01055908203125, "learning_rate": 5.000000000000001e-07, "loss": 0.0004, "reward": 1.942187488079071, "reward_std": 0.9197708517313004, "rewards/format_reward": 0.5546875, "rewards/instruction_follow_reward": 0.20468749850988388, "rewards/tag_count_reward": 0.7734375, "step": 24 }, { "epoch": 0.9795918367346939, "step": 24, "total_flos": 0.0, "train_loss": 0.0008400396373341815, "train_runtime": 3059.8724, "train_samples_per_second": 0.255, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 24, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }