{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9921671018276762, "eval_steps": 500, "global_step": 95, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 6.531942491552821, "learning_rate": 5e-08, "logits/chosen": -2.851747512817383, "logits/rejected": -2.833996534347534, "logps/chosen": -165.70089721679688, "logps/rejected": -198.857666015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "grad_norm": 5.930803989300868, "learning_rate": 2.5e-07, "logits/chosen": -2.770416259765625, "logits/rejected": -2.7731680870056152, "logps/chosen": -171.3281707763672, "logps/rejected": -172.58348083496094, "loss": 0.693, "rewards/accuracies": 0.375, "rewards/chosen": 0.00034834028338082135, "rewards/margins": 4.263037408236414e-05, "rewards/rejected": 0.0003057100111618638, "step": 5 }, { "epoch": 0.1, "grad_norm": 7.205939520530408, "learning_rate": 5e-07, "logits/chosen": -2.785672664642334, "logits/rejected": -2.7945070266723633, "logps/chosen": -189.79400634765625, "logps/rejected": -194.38011169433594, "loss": 0.6923, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.005779535509645939, "rewards/margins": 0.0015561816981062293, "rewards/rejected": 0.004223353695124388, "step": 10 }, { "epoch": 0.16, "grad_norm": 7.119689881451758, "learning_rate": 4.957432749209755e-07, "logits/chosen": -2.841862678527832, "logits/rejected": -2.8522396087646484, "logps/chosen": -196.4453582763672, "logps/rejected": -186.3593292236328, "loss": 0.6891, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0351785309612751, "rewards/margins": 0.009719189256429672, "rewards/rejected": 0.025459343567490578, "step": 15 }, { "epoch": 0.21, "grad_norm": 7.025742204681022, "learning_rate": 4.83118057351089e-07, "logits/chosen": -2.8577423095703125, "logits/rejected": -2.8679168224334717, "logps/chosen": -163.30587768554688, "logps/rejected": -176.16122436523438, "loss": 0.6822, "rewards/accuracies": 0.65625, "rewards/chosen": 0.054369617253541946, "rewards/margins": 0.01927168108522892, "rewards/rejected": 0.035097938030958176, "step": 20 }, { "epoch": 0.26, "grad_norm": 7.047833772227819, "learning_rate": 4.6255428393240354e-07, "logits/chosen": -2.8176944255828857, "logits/rejected": -2.8154852390289307, "logps/chosen": -127.52900695800781, "logps/rejected": -149.99598693847656, "loss": 0.6734, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.039430197328329086, "rewards/margins": 0.044888969510793686, "rewards/rejected": -0.005458767991513014, "step": 25 }, { "epoch": 0.31, "grad_norm": 7.4255717276037405, "learning_rate": 4.3475222930516473e-07, "logits/chosen": -2.781858444213867, "logits/rejected": -2.7814831733703613, "logps/chosen": -161.177734375, "logps/rejected": -173.82421875, "loss": 0.665, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.01698228344321251, "rewards/margins": 0.0684308260679245, "rewards/rejected": -0.08541311323642731, "step": 30 }, { "epoch": 0.37, "grad_norm": 7.891881929971765, "learning_rate": 4.006586590948141e-07, "logits/chosen": -2.848252296447754, "logits/rejected": -2.8431050777435303, "logps/chosen": -192.15963745117188, "logps/rejected": -209.07540893554688, "loss": 0.6544, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.008883295580744743, "rewards/margins": 0.0794510543346405, "rewards/rejected": -0.0883343443274498, "step": 35 }, { "epoch": 0.42, "grad_norm": 9.209834953181781, "learning_rate": 3.614345889441346e-07, "logits/chosen": -2.7681477069854736, "logits/rejected": -2.78022837638855, "logps/chosen": -135.9792022705078, "logps/rejected": -164.3667449951172, "loss": 0.6465, "rewards/accuracies": 0.65625, "rewards/chosen": -0.031201759353280067, "rewards/margins": 0.13447019457817078, "rewards/rejected": -0.16567197442054749, "step": 40 }, { "epoch": 0.47, "grad_norm": 13.835886251568184, "learning_rate": 3.184157475180207e-07, "logits/chosen": -2.7284975051879883, "logits/rejected": -2.7436182498931885, "logps/chosen": -205.8560028076172, "logps/rejected": -212.56710815429688, "loss": 0.6496, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17208269238471985, "rewards/margins": 0.13685402274131775, "rewards/rejected": -0.30893674492836, "step": 45 }, { "epoch": 0.52, "grad_norm": 10.309186722273289, "learning_rate": 2.730670898658255e-07, "logits/chosen": -2.7203848361968994, "logits/rejected": -2.7220139503479004, "logps/chosen": -183.94479370117188, "logps/rejected": -218.1922149658203, "loss": 0.6248, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09618374705314636, "rewards/margins": 0.18668127059936523, "rewards/rejected": -0.2828650176525116, "step": 50 }, { "epoch": 0.57, "grad_norm": 12.04836501966109, "learning_rate": 2.2693291013417452e-07, "logits/chosen": -2.6110920906066895, "logits/rejected": -2.6190452575683594, "logps/chosen": -151.388916015625, "logps/rejected": -174.0006561279297, "loss": 0.6317, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.032760851085186005, "rewards/margins": 0.17945989966392517, "rewards/rejected": -0.2122207134962082, "step": 55 }, { "epoch": 0.63, "grad_norm": 14.818475765214615, "learning_rate": 1.8158425248197928e-07, "logits/chosen": -2.69221568107605, "logits/rejected": -2.689034938812256, "logps/chosen": -181.30128479003906, "logps/rejected": -231.5193634033203, "loss": 0.6122, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.09343220293521881, "rewards/margins": 0.2633873522281647, "rewards/rejected": -0.3568195104598999, "step": 60 }, { "epoch": 0.68, "grad_norm": 15.265728023102268, "learning_rate": 1.3856541105586545e-07, "logits/chosen": -2.7168681621551514, "logits/rejected": -2.7309060096740723, "logps/chosen": -185.16700744628906, "logps/rejected": -220.42764282226562, "loss": 0.6045, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.23089858889579773, "rewards/margins": 0.27521029114723206, "rewards/rejected": -0.5061088800430298, "step": 65 }, { "epoch": 0.73, "grad_norm": 13.52741638941588, "learning_rate": 9.934134090518592e-08, "logits/chosen": -2.6834919452667236, "logits/rejected": -2.6923632621765137, "logps/chosen": -200.9665069580078, "logps/rejected": -217.9497528076172, "loss": 0.6094, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.23829719424247742, "rewards/margins": 0.2035256326198578, "rewards/rejected": -0.4418228268623352, "step": 70 }, { "epoch": 0.78, "grad_norm": 14.871873879280589, "learning_rate": 6.524777069483525e-08, "logits/chosen": -2.6725871562957764, "logits/rejected": -2.6699538230895996, "logps/chosen": -185.2981719970703, "logps/rejected": -229.42092895507812, "loss": 0.5985, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.13438589870929718, "rewards/margins": 0.32371044158935547, "rewards/rejected": -0.45809632539749146, "step": 75 }, { "epoch": 0.84, "grad_norm": 12.313204564006284, "learning_rate": 3.74457160675965e-08, "logits/chosen": -2.6488523483276367, "logits/rejected": -2.6512537002563477, "logps/chosen": -177.8891143798828, "logps/rejected": -211.4371795654297, "loss": 0.6019, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.09899892657995224, "rewards/margins": 0.26206719875335693, "rewards/rejected": -0.36106616258621216, "step": 80 }, { "epoch": 0.89, "grad_norm": 17.242389025181602, "learning_rate": 1.6881942648911074e-08, "logits/chosen": -2.6852784156799316, "logits/rejected": -2.6899216175079346, "logps/chosen": -171.39414978027344, "logps/rejected": -207.66738891601562, "loss": 0.6214, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.11210503429174423, "rewards/margins": 0.2755950093269348, "rewards/rejected": -0.38770005106925964, "step": 85 }, { "epoch": 0.94, "grad_norm": 13.932688124952723, "learning_rate": 4.256725079024553e-09, "logits/chosen": -2.6324477195739746, "logits/rejected": -2.6469483375549316, "logps/chosen": -181.08218383789062, "logps/rejected": -215.79953002929688, "loss": 0.6081, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12142710387706757, "rewards/margins": 0.2807455062866211, "rewards/rejected": -0.40217262506484985, "step": 90 }, { "epoch": 0.99, "grad_norm": 15.247505163019246, "learning_rate": 0.0, "logits/chosen": -2.682211399078369, "logits/rejected": -2.697298765182495, "logps/chosen": -198.21182250976562, "logps/rejected": -223.2611541748047, "loss": 0.6054, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12603162229061127, "rewards/margins": 0.2787989377975464, "rewards/rejected": -0.40483060479164124, "step": 95 }, { "epoch": 0.99, "step": 95, "total_flos": 0.0, "train_loss": 0.6401761331056294, "train_runtime": 2555.4095, "train_samples_per_second": 4.785, "train_steps_per_second": 0.037 } ], "logging_steps": 5, "max_steps": 95, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }