{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9921671018276762, "eval_steps": 500, "global_step": 95, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 6.531803428424505, "learning_rate": 1.5000000000000002e-07, "logits/chosen": -2.851747512817383, "logits/rejected": -2.833996534347534, "logps/chosen": -165.70089721679688, "logps/rejected": -198.857666015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "grad_norm": 5.945146082388948, "learning_rate": 7.5e-07, "logits/chosen": -2.7703874111175537, "logits/rejected": -2.773144006729126, "logps/chosen": -171.2422332763672, "logps/rejected": -172.52732849121094, "loss": 0.6929, "rewards/accuracies": 0.3984375, "rewards/chosen": 0.001207518856972456, "rewards/margins": 0.00034020940074697137, "rewards/rejected": 0.0008673094562254846, "step": 5 }, { "epoch": 0.1, "grad_norm": 7.177620483727426, "learning_rate": 1.5e-06, "logits/chosen": -2.7866053581237793, "logits/rejected": -2.7954766750335693, "logps/chosen": -186.5630340576172, "logps/rejected": -192.17149353027344, "loss": 0.6878, "rewards/accuracies": 0.625, "rewards/chosen": 0.038089457899332047, "rewards/margins": 0.011779792606830597, "rewards/rejected": 0.02630966529250145, "step": 10 }, { "epoch": 0.16, "grad_norm": 8.417917120076636, "learning_rate": 1.4872298247629265e-06, "logits/chosen": -2.824795722961426, "logits/rejected": -2.8347983360290527, "logps/chosen": -197.54190063476562, "logps/rejected": -192.82992553710938, "loss": 0.6676, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.02421323023736477, "rewards/margins": 0.06345993280410767, "rewards/rejected": -0.03924670070409775, "step": 15 }, { "epoch": 0.21, "grad_norm": 9.10298055209343, "learning_rate": 1.4493541720532668e-06, "logits/chosen": -2.7853965759277344, "logits/rejected": -2.7954306602478027, "logps/chosen": -172.51913452148438, "logps/rejected": -199.7716522216797, "loss": 0.6292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0377628356218338, "rewards/margins": 0.16324351727962494, "rewards/rejected": -0.20100633800029755, "step": 20 }, { "epoch": 0.26, "grad_norm": 10.051372999934031, "learning_rate": 1.3876628517972108e-06, "logits/chosen": -2.7528138160705566, "logits/rejected": -2.7506392002105713, "logps/chosen": -147.0410919189453, "logps/rejected": -184.7159423828125, "loss": 0.6228, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.15569067001342773, "rewards/margins": 0.19696763157844543, "rewards/rejected": -0.3526582419872284, "step": 25 }, { "epoch": 0.31, "grad_norm": 11.47609664425213, "learning_rate": 1.3042566879154944e-06, "logits/chosen": -2.77292799949646, "logits/rejected": -2.770139217376709, "logps/chosen": -189.57742309570312, "logps/rejected": -228.9635467529297, "loss": 0.6066, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.3009790778160095, "rewards/margins": 0.33582720160484314, "rewards/rejected": -0.6368061900138855, "step": 30 }, { "epoch": 0.37, "grad_norm": 14.051983189813676, "learning_rate": 1.2019759772844424e-06, "logits/chosen": -2.8096296787261963, "logits/rejected": -2.8017215728759766, "logps/chosen": -245.0883026123047, "logps/rejected": -281.6932067871094, "loss": 0.602, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5381699800491333, "rewards/margins": 0.27634209394454956, "rewards/rejected": -0.8145120739936829, "step": 35 }, { "epoch": 0.42, "grad_norm": 16.49665781047947, "learning_rate": 1.0843037668324039e-06, "logits/chosen": -2.690340042114258, "logits/rejected": -2.703282356262207, "logps/chosen": -204.72268676757812, "logps/rejected": -264.89361572265625, "loss": 0.5792, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7186366319656372, "rewards/margins": 0.4523038864135742, "rewards/rejected": -1.1709405183792114, "step": 40 }, { "epoch": 0.47, "grad_norm": 18.566501994693994, "learning_rate": 9.552472425540623e-07, "logits/chosen": -2.699636936187744, "logits/rejected": -2.7163338661193848, "logps/chosen": -291.1601257324219, "logps/rejected": -333.49310302734375, "loss": 0.6088, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0251235961914062, "rewards/margins": 0.493073046207428, "rewards/rejected": -1.518196702003479, "step": 45 }, { "epoch": 0.52, "grad_norm": 20.128869617238305, "learning_rate": 8.192012695974766e-07, "logits/chosen": -2.689870595932007, "logits/rejected": -2.6908297538757324, "logps/chosen": -252.3955078125, "logps/rejected": -331.952880859375, "loss": 0.577, "rewards/accuracies": 0.71875, "rewards/chosen": -0.780690610408783, "rewards/margins": 0.6397809982299805, "rewards/rejected": -1.4204715490341187, "step": 50 }, { "epoch": 0.57, "grad_norm": 17.347634723380718, "learning_rate": 6.807987304025236e-07, "logits/chosen": -2.582890748977661, "logits/rejected": -2.5895731449127197, "logps/chosen": -216.3740234375, "logps/rejected": -272.9044494628906, "loss": 0.5676, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6826119422912598, "rewards/margins": 0.5186464190483093, "rewards/rejected": -1.2012584209442139, "step": 55 }, { "epoch": 0.63, "grad_norm": 18.207607616337743, "learning_rate": 5.447527574459379e-07, "logits/chosen": -2.615662097930908, "logits/rejected": -2.612374782562256, "logps/chosen": -263.39892578125, "logps/rejected": -343.30780029296875, "loss": 0.556, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9144086837768555, "rewards/margins": 0.5602952241897583, "rewards/rejected": -1.4747039079666138, "step": 60 }, { "epoch": 0.68, "grad_norm": 17.920968395292732, "learning_rate": 4.1569623316759636e-07, "logits/chosen": -2.603685140609741, "logits/rejected": -2.6156582832336426, "logps/chosen": -278.8509216308594, "logps/rejected": -348.4021911621094, "loss": 0.5707, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1677374839782715, "rewards/margins": 0.6181171536445618, "rewards/rejected": -1.7858545780181885, "step": 65 }, { "epoch": 0.73, "grad_norm": 21.60317231649874, "learning_rate": 2.980240227155578e-07, "logits/chosen": -2.530977725982666, "logits/rejected": -2.538461208343506, "logps/chosen": -298.65631103515625, "logps/rejected": -345.4963684082031, "loss": 0.5603, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2151950597763062, "rewards/margins": 0.5020937323570251, "rewards/rejected": -1.7172887325286865, "step": 70 }, { "epoch": 0.78, "grad_norm": 23.519516498457133, "learning_rate": 1.9574331208450578e-07, "logits/chosen": -2.4647083282470703, "logits/rejected": -2.457724094390869, "logps/chosen": -297.4154052734375, "logps/rejected": -408.7651062011719, "loss": 0.5333, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2555583715438843, "rewards/margins": 0.9959794282913208, "rewards/rejected": -2.251537799835205, "step": 75 }, { "epoch": 0.84, "grad_norm": 24.189874157051893, "learning_rate": 1.1233714820278951e-07, "logits/chosen": -2.415569305419922, "logits/rejected": -2.409623384475708, "logps/chosen": -286.6866455078125, "logps/rejected": -374.3118591308594, "loss": 0.5319, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1869741678237915, "rewards/margins": 0.8028388023376465, "rewards/rejected": -1.9898128509521484, "step": 80 }, { "epoch": 0.89, "grad_norm": 27.1421302316283, "learning_rate": 5.064582794673323e-08, "logits/chosen": -2.463069438934326, "logits/rejected": -2.4618234634399414, "logps/chosen": -291.75006103515625, "logps/rejected": -380.35186767578125, "loss": 0.5565, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.315664529800415, "rewards/margins": 0.7988798022270203, "rewards/rejected": -2.11454439163208, "step": 85 }, { "epoch": 0.94, "grad_norm": 24.52916506686646, "learning_rate": 1.2770175237073662e-08, "logits/chosen": -2.391339063644409, "logits/rejected": -2.3986663818359375, "logps/chosen": -315.0043640136719, "logps/rejected": -409.3547668457031, "loss": 0.552, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4606488943099976, "rewards/margins": 0.8770757913589478, "rewards/rejected": -2.337724447250366, "step": 90 }, { "epoch": 0.99, "grad_norm": 31.400140337371543, "learning_rate": 0.0, "logits/chosen": -2.4539380073547363, "logits/rejected": -2.4629311561584473, "logps/chosen": -328.31024169921875, "logps/rejected": -411.2120056152344, "loss": 0.5588, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4270151853561401, "rewards/margins": 0.8573243021965027, "rewards/rejected": -2.284339666366577, "step": 95 }, { "epoch": 0.99, "step": 95, "total_flos": 0.0, "train_loss": 0.5926835913407175, "train_runtime": 2430.0992, "train_samples_per_second": 5.031, "train_steps_per_second": 0.039 } ], "logging_steps": 5, "max_steps": 95, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }