{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 54.7275276184082, "learning_rate": 3.3333333333333334e-09, "logps/chosen": -12.533590316772461, "logps/rejected": -31.803932189941406, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 0.26103636622428894, "losses/total": 0.6931471824645996, "ref_logps/chosen": -12.533590316772461, "ref_logps/rejected": -31.803932189941406, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 54.985111236572266, "learning_rate": 6.666666666666667e-09, "logps/chosen": -10.713068962097168, "logps/rejected": -33.42286682128906, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 0.2067307084798813, "losses/total": 0.6931471824645996, "ref_logps/chosen": -10.713068962097168, "ref_logps/rejected": -33.42286682128906, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.01, "grad_norm": 63.79972839355469, "learning_rate": 1e-08, "logps/chosen": -18.444631576538086, "logps/rejected": -40.561065673828125, "loss": 0.6898, "losses/dpo": 0.6951494216918945, "losses/sft": 0.3127816319465637, "losses/total": 0.6951494216918945, "ref_logps/chosen": -18.396644592285156, "ref_logps/rejected": -40.443275451660156, "rewards/accuracies": 0.625, "rewards/chosen": -0.004798633046448231, "rewards/margins": 0.006980050355195999, "rewards/rejected": -0.011778682470321655, "step": 3 }, { "epoch": 0.01, "grad_norm": 62.5953254699707, "learning_rate": 1.3333333333333334e-08, "logps/chosen": -14.227697372436523, "logps/rejected": -48.33661651611328, "loss": 0.6971, "losses/dpo": 0.6941574811935425, "losses/sft": 0.3062475621700287, "losses/total": 0.6941574811935425, "ref_logps/chosen": -14.194951057434082, "ref_logps/rejected": -48.378318786621094, "rewards/accuracies": 0.4375, "rewards/chosen": -0.003274601884186268, "rewards/margins": -0.007445037364959717, "rewards/rejected": 0.004170434549450874, "step": 4 }, { "epoch": 0.01, "grad_norm": 55.13102722167969, "learning_rate": 1.6666666666666667e-08, "logps/chosen": -13.59489631652832, "logps/rejected": -30.49202537536621, "loss": 0.6868, "losses/dpo": 0.681822657585144, "losses/sft": 0.2918586730957031, "losses/total": 0.681822657585144, "ref_logps/chosen": -13.620017051696777, "ref_logps/rejected": -30.386978149414062, "rewards/accuracies": 0.6875, "rewards/chosen": 0.002512148581445217, "rewards/margins": 0.01301683858036995, "rewards/rejected": -0.010504689998924732, "step": 5 }, { "epoch": 0.01, "grad_norm": 63.41639709472656, "learning_rate": 2e-08, "logps/chosen": -21.938560485839844, "logps/rejected": -45.47388458251953, "loss": 0.6968, "losses/dpo": 0.7057414054870605, "losses/sft": 0.23792970180511475, "losses/total": 0.7057414054870605, "ref_logps/chosen": -21.881378173828125, "ref_logps/rejected": -45.48707580566406, "rewards/accuracies": 0.5, "rewards/chosen": -0.0057183862663805485, "rewards/margins": -0.007037466391921043, "rewards/rejected": 0.0013190805912017822, "step": 6 }, { "epoch": 0.01, "grad_norm": 78.54264068603516, "learning_rate": 2.3333333333333334e-08, "logps/chosen": -18.155752182006836, "logps/rejected": -56.19256591796875, "loss": 0.6961, "losses/dpo": 0.6858953237533569, "losses/sft": 0.36166518926620483, "losses/total": 0.6858953237533569, "ref_logps/chosen": -18.145042419433594, "ref_logps/rejected": -56.23862838745117, "rewards/accuracies": 0.4375, "rewards/chosen": -0.001071083708666265, "rewards/margins": -0.005677402019500732, "rewards/rejected": 0.004606318660080433, "step": 7 }, { "epoch": 0.02, "grad_norm": 67.39810180664062, "learning_rate": 2.6666666666666667e-08, "logps/chosen": -18.465206146240234, "logps/rejected": -49.916664123535156, "loss": 0.6982, "losses/dpo": 0.6833201050758362, "losses/sft": 0.32572177052497864, "losses/total": 0.6833201050758362, "ref_logps/chosen": -18.501081466674805, "ref_logps/rejected": -50.047977447509766, "rewards/accuracies": 0.5, "rewards/chosen": 0.0035875202156603336, "rewards/margins": -0.009543540887534618, "rewards/rejected": 0.013131062500178814, "step": 8 }, { "epoch": 0.02, "grad_norm": 53.30474090576172, "learning_rate": 3e-08, "logps/chosen": -11.304487228393555, "logps/rejected": -35.22385787963867, "loss": 0.6868, "losses/dpo": 0.6948688626289368, "losses/sft": 0.2951069474220276, "losses/total": 0.6948688626289368, "ref_logps/chosen": -11.316177368164062, "ref_logps/rejected": -35.10341262817383, "rewards/accuracies": 0.75, "rewards/chosen": 0.0011690347455441952, "rewards/margins": 0.013213572092354298, "rewards/rejected": -0.01204453781247139, "step": 9 }, { "epoch": 0.02, "grad_norm": 62.466922760009766, "learning_rate": 3.3333333333333334e-08, "logps/chosen": -18.549427032470703, "logps/rejected": -45.951873779296875, "loss": 0.6818, "losses/dpo": 0.6670930981636047, "losses/sft": 0.2927955687046051, "losses/total": 0.6670930981636047, "ref_logps/chosen": -18.618999481201172, "ref_logps/rejected": -45.7880973815918, "rewards/accuracies": 0.625, "rewards/chosen": 0.006957197096198797, "rewards/margins": 0.023334600031375885, "rewards/rejected": -0.01637740060687065, "step": 10 }, { "epoch": 0.02, "grad_norm": 52.64626693725586, "learning_rate": 3.6666666666666664e-08, "logps/chosen": -11.861335754394531, "logps/rejected": -42.37664794921875, "loss": 0.6953, "losses/dpo": 0.6928779482841492, "losses/sft": 0.2991476058959961, "losses/total": 0.6928779482841492, "ref_logps/chosen": -11.865274429321289, "ref_logps/rejected": -42.42080307006836, "rewards/accuracies": 0.375, "rewards/chosen": 0.0003938704030588269, "rewards/margins": -0.004021647851914167, "rewards/rejected": 0.004415517672896385, "step": 11 }, { "epoch": 0.02, "grad_norm": 60.31581497192383, "learning_rate": 4e-08, "logps/chosen": -10.655393600463867, "logps/rejected": -45.220428466796875, "loss": 0.6907, "losses/dpo": 0.6967537999153137, "losses/sft": 0.32602399587631226, "losses/total": 0.6967537999153137, "ref_logps/chosen": -10.646404266357422, "ref_logps/rejected": -45.16096878051758, "rewards/accuracies": 0.5, "rewards/chosen": -0.0008988277986645699, "rewards/margins": 0.005047045648097992, "rewards/rejected": -0.005945873446762562, "step": 12 }, { "epoch": 0.03, "grad_norm": 67.63394927978516, "learning_rate": 4.333333333333333e-08, "logps/chosen": -14.832448959350586, "logps/rejected": -42.32299041748047, "loss": 0.6898, "losses/dpo": 0.686829149723053, "losses/sft": 0.2940187454223633, "losses/total": 0.686829149723053, "ref_logps/chosen": -14.852863311767578, "ref_logps/rejected": -42.27099609375, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0020414486061781645, "rewards/margins": 0.007241221610456705, "rewards/rejected": -0.005199772771447897, "step": 13 }, { "epoch": 0.03, "grad_norm": 68.29341125488281, "learning_rate": 4.666666666666667e-08, "logps/chosen": -11.122224807739258, "logps/rejected": -44.558685302734375, "loss": 0.6936, "losses/dpo": 0.6977905035018921, "losses/sft": 0.3477708101272583, "losses/total": 0.6977905035018921, "ref_logps/chosen": -11.04557991027832, "ref_logps/rejected": -44.489044189453125, "rewards/accuracies": 0.5, "rewards/chosen": -0.007664448581635952, "rewards/margins": -0.0007005957886576653, "rewards/rejected": -0.006963852792978287, "step": 14 }, { "epoch": 0.03, "grad_norm": 64.83494567871094, "learning_rate": 5e-08, "logps/chosen": -20.65314483642578, "logps/rejected": -48.80792236328125, "loss": 0.6906, "losses/dpo": 0.6923655867576599, "losses/sft": 0.3701089918613434, "losses/total": 0.6923655867576599, "ref_logps/chosen": -20.716949462890625, "ref_logps/rejected": -48.81908416748047, "rewards/accuracies": 0.5, "rewards/chosen": 0.006380443461239338, "rewards/margins": 0.00526385335251689, "rewards/rejected": 0.0011165902251377702, "step": 15 }, { "epoch": 0.03, "grad_norm": 66.92870330810547, "learning_rate": 5.3333333333333334e-08, "logps/chosen": -11.5431489944458, "logps/rejected": -49.763465881347656, "loss": 0.6928, "losses/dpo": 0.6806471347808838, "losses/sft": 0.26024022698402405, "losses/total": 0.6806471347808838, "ref_logps/chosen": -11.589057922363281, "ref_logps/rejected": -49.79829406738281, "rewards/accuracies": 0.5, "rewards/chosen": 0.004590884782373905, "rewards/margins": 0.0011078307870775461, "rewards/rejected": 0.0034830542281270027, "step": 16 }, { "epoch": 0.03, "grad_norm": 71.3647689819336, "learning_rate": 5.666666666666666e-08, "logps/chosen": -18.725505828857422, "logps/rejected": -53.86628341674805, "loss": 0.6869, "losses/dpo": 0.6923279762268066, "losses/sft": 0.3113042712211609, "losses/total": 0.6923279762268066, "ref_logps/chosen": -18.76999282836914, "ref_logps/rejected": -53.78404998779297, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0044487896375358105, "rewards/margins": 0.012671994976699352, "rewards/rejected": -0.008223205804824829, "step": 17 }, { "epoch": 0.04, "grad_norm": 54.42089080810547, "learning_rate": 6e-08, "logps/chosen": -15.599297523498535, "logps/rejected": -35.98277282714844, "loss": 0.686, "losses/dpo": 0.6924772262573242, "losses/sft": 0.28760266304016113, "losses/total": 0.6924772262573242, "ref_logps/chosen": -15.625329971313477, "ref_logps/rejected": -35.86233139038086, "rewards/accuracies": 0.6875, "rewards/chosen": 0.002603264059871435, "rewards/margins": 0.014647157862782478, "rewards/rejected": -0.012043893337249756, "step": 18 }, { "epoch": 0.04, "grad_norm": 84.1619644165039, "learning_rate": 6.333333333333333e-08, "logps/chosen": -19.605751037597656, "logps/rejected": -62.37677764892578, "loss": 0.686, "losses/dpo": 0.689681351184845, "losses/sft": 0.29873067140579224, "losses/total": 0.689681351184845, "ref_logps/chosen": -19.670486450195312, "ref_logps/rejected": -62.29419708251953, "rewards/accuracies": 0.6875, "rewards/chosen": 0.006473721005022526, "rewards/margins": 0.014731885865330696, "rewards/rejected": -0.008258162997663021, "step": 19 }, { "epoch": 0.04, "grad_norm": 67.69758605957031, "learning_rate": 6.666666666666667e-08, "logps/chosen": -12.570611953735352, "logps/rejected": -56.44734191894531, "loss": 0.6914, "losses/dpo": 0.6809213161468506, "losses/sft": 0.17609833180904388, "losses/total": 0.6809213161468506, "ref_logps/chosen": -12.528483390808105, "ref_logps/rejected": -56.36581039428711, "rewards/accuracies": 0.5, "rewards/chosen": -0.004212804604321718, "rewards/margins": 0.003940396010875702, "rewards/rejected": -0.008153200149536133, "step": 20 }, { "epoch": 0.04, "grad_norm": 59.63766860961914, "learning_rate": 7e-08, "logps/chosen": -12.914083480834961, "logps/rejected": -40.84098815917969, "loss": 0.6877, "losses/dpo": 0.6884068250656128, "losses/sft": 0.2242521047592163, "losses/total": 0.6884068250656128, "ref_logps/chosen": -12.980051040649414, "ref_logps/rejected": -40.794857025146484, "rewards/accuracies": 0.6875, "rewards/chosen": 0.006596784107387066, "rewards/margins": 0.011209950782358646, "rewards/rejected": -0.0046131666749715805, "step": 21 }, { "epoch": 0.04, "grad_norm": 56.689208984375, "learning_rate": 7.333333333333333e-08, "logps/chosen": -13.277399063110352, "logps/rejected": -42.286441802978516, "loss": 0.6838, "losses/dpo": 0.6860532760620117, "losses/sft": 0.2654157280921936, "losses/total": 0.6860532760620117, "ref_logps/chosen": -13.287601470947266, "ref_logps/rejected": -42.10773468017578, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0010203286074101925, "rewards/margins": 0.018891172483563423, "rewards/rejected": -0.017870843410491943, "step": 22 }, { "epoch": 0.05, "grad_norm": 56.51724624633789, "learning_rate": 7.666666666666665e-08, "logps/chosen": -14.184118270874023, "logps/rejected": -38.65294647216797, "loss": 0.6937, "losses/dpo": 0.7033101320266724, "losses/sft": 0.22815537452697754, "losses/total": 0.7033101320266724, "ref_logps/chosen": -14.229755401611328, "ref_logps/rejected": -38.708251953125, "rewards/accuracies": 0.5, "rewards/chosen": 0.004563881549984217, "rewards/margins": -0.0009667248232290149, "rewards/rejected": 0.005530606489628553, "step": 23 }, { "epoch": 0.05, "grad_norm": 54.5797119140625, "learning_rate": 8e-08, "logps/chosen": -13.569002151489258, "logps/rejected": -36.74486541748047, "loss": 0.6955, "losses/dpo": 0.6941298246383667, "losses/sft": 0.2619819641113281, "losses/total": 0.6941298246383667, "ref_logps/chosen": -13.519054412841797, "ref_logps/rejected": -36.73900604248047, "rewards/accuracies": 0.4375, "rewards/chosen": -0.004994727671146393, "rewards/margins": -0.004408624954521656, "rewards/rejected": -0.0005861027166247368, "step": 24 }, { "epoch": 0.05, "grad_norm": 60.37660217285156, "learning_rate": 8.333333333333333e-08, "logps/chosen": -11.180171012878418, "logps/rejected": -45.01734161376953, "loss": 0.6862, "losses/dpo": 0.6880084872245789, "losses/sft": 0.2977868318557739, "losses/total": 0.6880084872245789, "ref_logps/chosen": -11.221393585205078, "ref_logps/rejected": -44.918087005615234, "rewards/accuracies": 0.6875, "rewards/chosen": 0.004122227430343628, "rewards/margins": 0.014047539792954922, "rewards/rejected": -0.009925312362611294, "step": 25 }, { "epoch": 0.05, "grad_norm": 70.93061828613281, "learning_rate": 8.666666666666666e-08, "logps/chosen": -18.109363555908203, "logps/rejected": -61.71974563598633, "loss": 0.6808, "losses/dpo": 0.6811847686767578, "losses/sft": 0.24314402043819427, "losses/total": 0.6811847686767578, "ref_logps/chosen": -18.21930503845215, "ref_logps/rejected": -61.57814025878906, "rewards/accuracies": 0.75, "rewards/chosen": 0.010994033887982368, "rewards/margins": 0.025154881179332733, "rewards/rejected": -0.01416084822267294, "step": 26 }, { "epoch": 0.05, "grad_norm": 69.38543701171875, "learning_rate": 9e-08, "logps/chosen": -15.200861930847168, "logps/rejected": -43.106510162353516, "loss": 0.6763, "losses/dpo": 0.6658411026000977, "losses/sft": 0.25579920411109924, "losses/total": 0.6658411026000977, "ref_logps/chosen": -15.397557258605957, "ref_logps/rejected": -42.95894241333008, "rewards/accuracies": 0.8125, "rewards/chosen": 0.019669612869620323, "rewards/margins": 0.0344264879822731, "rewards/rejected": -0.01475687325000763, "step": 27 }, { "epoch": 0.06, "grad_norm": 49.01519012451172, "learning_rate": 9.333333333333334e-08, "logps/chosen": -10.879910469055176, "logps/rejected": -34.23360061645508, "loss": 0.6912, "losses/dpo": 0.704246997833252, "losses/sft": 0.2846185266971588, "losses/total": 0.704246997833252, "ref_logps/chosen": -10.923196792602539, "ref_logps/rejected": -34.23560333251953, "rewards/accuracies": 0.375, "rewards/chosen": 0.0043286713771522045, "rewards/margins": 0.004128447733819485, "rewards/rejected": 0.00020022434182465076, "step": 28 }, { "epoch": 0.06, "grad_norm": 55.75774002075195, "learning_rate": 9.666666666666666e-08, "logps/chosen": -13.540484428405762, "logps/rejected": -37.22199249267578, "loss": 0.6721, "losses/dpo": 0.670184850692749, "losses/sft": 0.24113543331623077, "losses/total": 0.670184850692749, "ref_logps/chosen": -13.689138412475586, "ref_logps/rejected": -36.942596435546875, "rewards/accuracies": 0.9375, "rewards/chosen": 0.014865398406982422, "rewards/margins": 0.042804695665836334, "rewards/rejected": -0.027939295396208763, "step": 29 }, { "epoch": 0.06, "grad_norm": 65.36017608642578, "learning_rate": 1e-07, "logps/chosen": -16.502321243286133, "logps/rejected": -63.630531311035156, "loss": 0.664, "losses/dpo": 0.6740528345108032, "losses/sft": 0.2862235903739929, "losses/total": 0.6740528345108032, "ref_logps/chosen": -16.616275787353516, "ref_logps/rejected": -63.14226531982422, "rewards/accuracies": 0.75, "rewards/chosen": 0.01139531098306179, "rewards/margins": 0.060221925377845764, "rewards/rejected": -0.048826612532138824, "step": 30 }, { "epoch": 0.06, "grad_norm": 66.14002227783203, "learning_rate": 1.0333333333333333e-07, "logps/chosen": -11.186877250671387, "logps/rejected": -48.64232635498047, "loss": 0.6673, "losses/dpo": 0.6822100281715393, "losses/sft": 0.2335319221019745, "losses/total": 0.6822100281715393, "ref_logps/chosen": -11.297755241394043, "ref_logps/rejected": -48.220863342285156, "rewards/accuracies": 0.8125, "rewards/chosen": 0.01108776405453682, "rewards/margins": 0.05323418974876404, "rewards/rejected": -0.04214642941951752, "step": 31 }, { "epoch": 0.06, "grad_norm": 64.11575317382812, "learning_rate": 1.0666666666666667e-07, "logps/chosen": -14.923648834228516, "logps/rejected": -47.85265350341797, "loss": 0.6695, "losses/dpo": 0.6915856599807739, "losses/sft": 0.2507500648498535, "losses/total": 0.6915856599807739, "ref_logps/chosen": -15.00050163269043, "ref_logps/rejected": -47.44062042236328, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0076853265054523945, "rewards/margins": 0.04888825863599777, "rewards/rejected": -0.04120292887091637, "step": 32 }, { "epoch": 0.07, "grad_norm": 67.46519470214844, "learning_rate": 1.0999999999999999e-07, "logps/chosen": -15.70111083984375, "logps/rejected": -51.965126037597656, "loss": 0.6681, "losses/dpo": 0.6695447564125061, "losses/sft": 0.30792540311813354, "losses/total": 0.6695447564125061, "ref_logps/chosen": -15.807563781738281, "ref_logps/rejected": -51.55677032470703, "rewards/accuracies": 0.8125, "rewards/chosen": 0.010645204223692417, "rewards/margins": 0.05148132145404816, "rewards/rejected": -0.040836118161678314, "step": 33 }, { "epoch": 0.07, "grad_norm": 50.910823822021484, "learning_rate": 1.1333333333333332e-07, "logps/chosen": -11.442159652709961, "logps/rejected": -32.383033752441406, "loss": 0.6783, "losses/dpo": 0.6662068367004395, "losses/sft": 0.30792683362960815, "losses/total": 0.6662068367004395, "ref_logps/chosen": -11.554034233093262, "ref_logps/rejected": -32.18949890136719, "rewards/accuracies": 0.6875, "rewards/chosen": 0.011187402531504631, "rewards/margins": 0.03054075315594673, "rewards/rejected": -0.01935334876179695, "step": 34 }, { "epoch": 0.07, "grad_norm": 53.7436637878418, "learning_rate": 1.1666666666666667e-07, "logps/chosen": -14.998608589172363, "logps/rejected": -31.25254249572754, "loss": 0.6719, "losses/dpo": 0.6867252588272095, "losses/sft": 0.2634373605251312, "losses/total": 0.6867252588272095, "ref_logps/chosen": -15.171464920043945, "ref_logps/rejected": -30.990324020385742, "rewards/accuracies": 0.8125, "rewards/chosen": 0.017285751178860664, "rewards/margins": 0.04350760579109192, "rewards/rejected": -0.026221856474876404, "step": 35 }, { "epoch": 0.07, "grad_norm": 50.0301513671875, "learning_rate": 1.2e-07, "logps/chosen": -10.806026458740234, "logps/rejected": -37.376033782958984, "loss": 0.6672, "losses/dpo": 0.6568068265914917, "losses/sft": 0.25919607281684875, "losses/total": 0.6568068265914917, "ref_logps/chosen": -10.89212417602539, "ref_logps/rejected": -36.92989730834961, "rewards/accuracies": 0.875, "rewards/chosen": 0.008609759621322155, "rewards/margins": 0.05322342365980148, "rewards/rejected": -0.044613663107156754, "step": 36 }, { "epoch": 0.07, "grad_norm": 62.99734115600586, "learning_rate": 1.2333333333333333e-07, "logps/chosen": -9.172164916992188, "logps/rejected": -48.006935119628906, "loss": 0.6505, "losses/dpo": 0.6659780144691467, "losses/sft": 0.19942894577980042, "losses/total": 0.6659780144691467, "ref_logps/chosen": -9.280783653259277, "ref_logps/rejected": -47.235198974609375, "rewards/accuracies": 1.0, "rewards/chosen": 0.010861923918128014, "rewards/margins": 0.08803565055131912, "rewards/rejected": -0.07717373222112656, "step": 37 }, { "epoch": 0.08, "grad_norm": 69.74348449707031, "learning_rate": 1.2666666666666666e-07, "logps/chosen": -20.615678787231445, "logps/rejected": -54.96332931518555, "loss": 0.6384, "losses/dpo": 0.6584606766700745, "losses/sft": 0.2419307678937912, "losses/total": 0.6584606766700745, "ref_logps/chosen": -21.108688354492188, "ref_logps/rejected": -54.317012786865234, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04930093139410019, "rewards/margins": 0.11393265426158905, "rewards/rejected": -0.06463172286748886, "step": 38 }, { "epoch": 0.08, "grad_norm": 72.3482894897461, "learning_rate": 1.3e-07, "logps/chosen": -13.5218505859375, "logps/rejected": -59.38899230957031, "loss": 0.612, "losses/dpo": 0.6105036735534668, "losses/sft": 0.34414657950401306, "losses/total": 0.6105036735534668, "ref_logps/chosen": -13.771303176879883, "ref_logps/rejected": -57.91090393066406, "rewards/accuracies": 1.0, "rewards/chosen": 0.024945255368947983, "rewards/margins": 0.17275384068489075, "rewards/rejected": -0.14780858159065247, "step": 39 }, { "epoch": 0.08, "grad_norm": 66.04383087158203, "learning_rate": 1.3333333333333334e-07, "logps/chosen": -16.882469177246094, "logps/rejected": -59.21009826660156, "loss": 0.6179, "losses/dpo": 0.6421835422515869, "losses/sft": 0.3301094174385071, "losses/total": 0.6421835422515869, "ref_logps/chosen": -17.26021385192871, "ref_logps/rejected": -57.98883056640625, "rewards/accuracies": 1.0, "rewards/chosen": 0.037774428725242615, "rewards/margins": 0.15990111231803894, "rewards/rejected": -0.12212669849395752, "step": 40 }, { "epoch": 0.08, "grad_norm": 67.81195068359375, "learning_rate": 1.3666666666666665e-07, "logps/chosen": -14.069759368896484, "logps/rejected": -53.27463912963867, "loss": 0.6137, "losses/dpo": 0.6028338670730591, "losses/sft": 0.26799964904785156, "losses/total": 0.6028338670730591, "ref_logps/chosen": -14.432823181152344, "ref_logps/rejected": -51.95939636230469, "rewards/accuracies": 1.0, "rewards/chosen": 0.03630626201629639, "rewards/margins": 0.16783034801483154, "rewards/rejected": -0.13152408599853516, "step": 41 }, { "epoch": 0.08, "grad_norm": 60.39308166503906, "learning_rate": 1.4e-07, "logps/chosen": -16.07038116455078, "logps/rejected": -51.06721115112305, "loss": 0.624, "losses/dpo": 0.6596254110336304, "losses/sft": 0.2904399037361145, "losses/total": 0.6596254110336304, "ref_logps/chosen": -16.394489288330078, "ref_logps/rejected": -49.92795181274414, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03241092711687088, "rewards/margins": 0.14633695781230927, "rewards/rejected": -0.11392602324485779, "step": 42 }, { "epoch": 0.09, "grad_norm": 55.89990997314453, "learning_rate": 1.4333333333333335e-07, "logps/chosen": -17.65834617614746, "logps/rejected": -49.329383850097656, "loss": 0.6355, "losses/dpo": 0.6224067211151123, "losses/sft": 0.2809451222419739, "losses/total": 0.6224067211151123, "ref_logps/chosen": -17.987916946411133, "ref_logps/rejected": -48.46190643310547, "rewards/accuracies": 1.0, "rewards/chosen": 0.03295706957578659, "rewards/margins": 0.11970466375350952, "rewards/rejected": -0.08674759417772293, "step": 43 }, { "epoch": 0.09, "grad_norm": 51.306312561035156, "learning_rate": 1.4666666666666666e-07, "logps/chosen": -10.183408737182617, "logps/rejected": -32.69266128540039, "loss": 0.6226, "losses/dpo": 0.5917978286743164, "losses/sft": 0.25666502118110657, "losses/total": 0.5917978286743164, "ref_logps/chosen": -10.467521667480469, "ref_logps/rejected": -31.477275848388672, "rewards/accuracies": 0.9375, "rewards/chosen": 0.028411362320184708, "rewards/margins": 0.1499500572681427, "rewards/rejected": -0.12153870612382889, "step": 44 }, { "epoch": 0.09, "grad_norm": 66.90591430664062, "learning_rate": 1.5e-07, "logps/chosen": -14.379773139953613, "logps/rejected": -65.27030944824219, "loss": 0.5908, "losses/dpo": 0.5828587412834167, "losses/sft": 0.2970637381076813, "losses/total": 0.5828587412834167, "ref_logps/chosen": -14.763320922851562, "ref_logps/rejected": -63.37145233154297, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03835476189851761, "rewards/margins": 0.22824007272720337, "rewards/rejected": -0.18988531827926636, "step": 45 }, { "epoch": 0.09, "grad_norm": 52.453800201416016, "learning_rate": 1.533333333333333e-07, "logps/chosen": -13.01441764831543, "logps/rejected": -42.7321662902832, "loss": 0.6133, "losses/dpo": 0.6588992476463318, "losses/sft": 0.3094305992126465, "losses/total": 0.6588992476463318, "ref_logps/chosen": -13.52122688293457, "ref_logps/rejected": -41.48785400390625, "rewards/accuracies": 1.0, "rewards/chosen": 0.05068095400929451, "rewards/margins": 0.17511233687400818, "rewards/rejected": -0.12443137168884277, "step": 46 }, { "epoch": 0.09, "grad_norm": 60.28200149536133, "learning_rate": 1.5666666666666667e-07, "logps/chosen": -11.062196731567383, "logps/rejected": -43.15785217285156, "loss": 0.5899, "losses/dpo": 0.6494508385658264, "losses/sft": 0.28766411542892456, "losses/total": 0.6494508385658264, "ref_logps/chosen": -11.557550430297852, "ref_logps/rejected": -41.409461975097656, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04953545704483986, "rewards/margins": 0.22437459230422974, "rewards/rejected": -0.17483913898468018, "step": 47 }, { "epoch": 0.1, "grad_norm": 44.662635803222656, "learning_rate": 1.6e-07, "logps/chosen": -13.756662368774414, "logps/rejected": -31.407695770263672, "loss": 0.6274, "losses/dpo": 0.6444739103317261, "losses/sft": 0.24791499972343445, "losses/total": 0.6444739103317261, "ref_logps/chosen": -14.214315414428711, "ref_logps/rejected": -30.471614837646484, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04576535150408745, "rewards/margins": 0.13937321305274963, "rewards/rejected": -0.09360785037279129, "step": 48 }, { "epoch": 0.1, "grad_norm": 51.2293701171875, "learning_rate": 1.6333333333333331e-07, "logps/chosen": -12.181818008422852, "logps/rejected": -39.64811706542969, "loss": 0.6029, "losses/dpo": 0.5977880954742432, "losses/sft": 0.24987655878067017, "losses/total": 0.5977880954742432, "ref_logps/chosen": -12.48669719696045, "ref_logps/rejected": -37.98247528076172, "rewards/accuracies": 0.9375, "rewards/chosen": 0.030487842857837677, "rewards/margins": 0.19705218076705933, "rewards/rejected": -0.16656433045864105, "step": 49 }, { "epoch": 0.1, "grad_norm": 61.242591857910156, "learning_rate": 1.6666666666666665e-07, "logps/chosen": -17.916460037231445, "logps/rejected": -51.87417984008789, "loss": 0.5833, "losses/dpo": 0.5518717765808105, "losses/sft": 0.334248423576355, "losses/total": 0.5518717765808105, "ref_logps/chosen": -18.505146026611328, "ref_logps/rejected": -50.02173614501953, "rewards/accuracies": 1.0, "rewards/chosen": 0.058868564665317535, "rewards/margins": 0.24411310255527496, "rewards/rejected": -0.18524454534053802, "step": 50 }, { "epoch": 0.1, "grad_norm": 54.873069763183594, "learning_rate": 1.7000000000000001e-07, "logps/chosen": -15.96760082244873, "logps/rejected": -45.260826110839844, "loss": 0.5708, "losses/dpo": 0.5711266994476318, "losses/sft": 0.2405555695295334, "losses/total": 0.5711266994476318, "ref_logps/chosen": -16.65478515625, "ref_logps/rejected": -43.300804138183594, "rewards/accuracies": 1.0, "rewards/chosen": 0.06871844083070755, "rewards/margins": 0.2647208869457245, "rewards/rejected": -0.19600245356559753, "step": 51 }, { "epoch": 0.1, "grad_norm": 61.774105072021484, "learning_rate": 1.7333333333333332e-07, "logps/chosen": -15.063488006591797, "logps/rejected": -52.22322082519531, "loss": 0.5067, "losses/dpo": 0.5486783981323242, "losses/sft": 0.2762540280818939, "losses/total": 0.5486783981323242, "ref_logps/chosen": -15.942670822143555, "ref_logps/rejected": -48.783878326416016, "rewards/accuracies": 1.0, "rewards/chosen": 0.08791828900575638, "rewards/margins": 0.43185263872146606, "rewards/rejected": -0.3439343571662903, "step": 52 }, { "epoch": 0.11, "grad_norm": 51.12025833129883, "learning_rate": 1.7666666666666666e-07, "logps/chosen": -12.447774887084961, "logps/rejected": -49.730525970458984, "loss": 0.5292, "losses/dpo": 0.5261654853820801, "losses/sft": 0.3010826110839844, "losses/total": 0.5261654853820801, "ref_logps/chosen": -12.847427368164062, "ref_logps/rejected": -46.31183624267578, "rewards/accuracies": 1.0, "rewards/chosen": 0.039965298026800156, "rewards/margins": 0.38183388113975525, "rewards/rejected": -0.3418685793876648, "step": 53 }, { "epoch": 0.11, "grad_norm": 56.890716552734375, "learning_rate": 1.8e-07, "logps/chosen": -15.07343864440918, "logps/rejected": -47.3963508605957, "loss": 0.5003, "losses/dpo": 0.4316698908805847, "losses/sft": 0.25664612650871277, "losses/total": 0.4316698908805847, "ref_logps/chosen": -15.735147476196289, "ref_logps/rejected": -43.51297378540039, "rewards/accuracies": 1.0, "rewards/chosen": 0.06617091596126556, "rewards/margins": 0.45450854301452637, "rewards/rejected": -0.3883376121520996, "step": 54 }, { "epoch": 0.11, "grad_norm": 47.778656005859375, "learning_rate": 1.833333333333333e-07, "logps/chosen": -12.364097595214844, "logps/rejected": -40.489898681640625, "loss": 0.5156, "losses/dpo": 0.5084203481674194, "losses/sft": 0.19767522811889648, "losses/total": 0.5084203481674194, "ref_logps/chosen": -12.910322189331055, "ref_logps/rejected": -36.8597412109375, "rewards/accuracies": 1.0, "rewards/chosen": 0.05462249368429184, "rewards/margins": 0.41763830184936523, "rewards/rejected": -0.3630157709121704, "step": 55 }, { "epoch": 0.11, "grad_norm": 48.53829574584961, "learning_rate": 1.8666666666666667e-07, "logps/chosen": -15.324739456176758, "logps/rejected": -55.346656799316406, "loss": 0.4816, "losses/dpo": 0.4806634485721588, "losses/sft": 0.31916913390159607, "losses/total": 0.4806634485721588, "ref_logps/chosen": -16.457077026367188, "ref_logps/rejected": -51.29027557373047, "rewards/accuracies": 1.0, "rewards/chosen": 0.11323380470275879, "rewards/margins": 0.5188711881637573, "rewards/rejected": -0.4056374132633209, "step": 56 }, { "epoch": 0.11, "grad_norm": 50.436275482177734, "learning_rate": 1.8999999999999998e-07, "logps/chosen": -18.092538833618164, "logps/rejected": -51.386226654052734, "loss": 0.4638, "losses/dpo": 0.48723289370536804, "losses/sft": 0.24147015810012817, "losses/total": 0.48723289370536804, "ref_logps/chosen": -19.495101928710938, "ref_logps/rejected": -47.26172637939453, "rewards/accuracies": 1.0, "rewards/chosen": 0.1402563750743866, "rewards/margins": 0.5527061223983765, "rewards/rejected": -0.41244977712631226, "step": 57 }, { "epoch": 0.12, "grad_norm": 49.289188385009766, "learning_rate": 1.9333333333333332e-07, "logps/chosen": -19.1153621673584, "logps/rejected": -59.98572540283203, "loss": 0.4424, "losses/dpo": 0.49887678027153015, "losses/sft": 0.30951380729675293, "losses/total": 0.49887678027153015, "ref_logps/chosen": -20.371097564697266, "ref_logps/rejected": -54.775394439697266, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12557338178157806, "rewards/margins": 0.6466065049171448, "rewards/rejected": -0.5210331082344055, "step": 58 }, { "epoch": 0.12, "grad_norm": 45.76652526855469, "learning_rate": 1.9666666666666665e-07, "logps/chosen": -13.932234764099121, "logps/rejected": -38.268592834472656, "loss": 0.5087, "losses/dpo": 0.42856013774871826, "losses/sft": 0.2895386219024658, "losses/total": 0.42856013774871826, "ref_logps/chosen": -14.634007453918457, "ref_logps/rejected": -34.70841979980469, "rewards/accuracies": 1.0, "rewards/chosen": 0.07017731666564941, "rewards/margins": 0.42619505524635315, "rewards/rejected": -0.3560177683830261, "step": 59 }, { "epoch": 0.12, "grad_norm": 44.962379455566406, "learning_rate": 2e-07, "logps/chosen": -15.4169282913208, "logps/rejected": -39.44453811645508, "loss": 0.5122, "losses/dpo": 0.4593808650970459, "losses/sft": 0.29917794466018677, "losses/total": 0.4593808650970459, "ref_logps/chosen": -16.19096565246582, "ref_logps/rejected": -35.855220794677734, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07740387320518494, "rewards/margins": 0.4363355338573456, "rewards/rejected": -0.35893166065216064, "step": 60 }, { "epoch": 0.12, "grad_norm": 41.11608123779297, "learning_rate": 2.0333333333333333e-07, "logps/chosen": -14.037020683288574, "logps/rejected": -50.51776123046875, "loss": 0.4495, "losses/dpo": 0.4573870301246643, "losses/sft": 0.3381502628326416, "losses/total": 0.4573870301246643, "ref_logps/chosen": -14.76598834991455, "ref_logps/rejected": -44.90858840942383, "rewards/accuracies": 1.0, "rewards/chosen": 0.07289674878120422, "rewards/margins": 0.633813738822937, "rewards/rejected": -0.5609170198440552, "step": 61 }, { "epoch": 0.12, "grad_norm": 45.58005142211914, "learning_rate": 2.0666666666666666e-07, "logps/chosen": -18.84115982055664, "logps/rejected": -54.45577621459961, "loss": 0.4451, "losses/dpo": 0.4381011128425598, "losses/sft": 0.26262062788009644, "losses/total": 0.4381011128425598, "ref_logps/chosen": -20.094820022583008, "ref_logps/rejected": -49.11144256591797, "rewards/accuracies": 1.0, "rewards/chosen": 0.12536601722240448, "rewards/margins": 0.6597994565963745, "rewards/rejected": -0.5344335436820984, "step": 62 }, { "epoch": 0.13, "grad_norm": 37.02690505981445, "learning_rate": 2.0999999999999997e-07, "logps/chosen": -13.949782371520996, "logps/rejected": -37.66564178466797, "loss": 0.5103, "losses/dpo": 0.4280434548854828, "losses/sft": 0.24366047978401184, "losses/total": 0.4280434548854828, "ref_logps/chosen": -15.224845886230469, "ref_logps/rejected": -34.20655059814453, "rewards/accuracies": 0.875, "rewards/chosen": 0.1275063157081604, "rewards/margins": 0.4734152853488922, "rewards/rejected": -0.3459089994430542, "step": 63 }, { "epoch": 0.13, "grad_norm": 44.19191360473633, "learning_rate": 2.1333333333333334e-07, "logps/chosen": -9.342233657836914, "logps/rejected": -40.536136627197266, "loss": 0.4482, "losses/dpo": 0.5243103504180908, "losses/sft": 0.1963924765586853, "losses/total": 0.5243103504180908, "ref_logps/chosen": -9.877543449401855, "ref_logps/rejected": -34.941558837890625, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05353102833032608, "rewards/margins": 0.6129887104034424, "rewards/rejected": -0.5594576597213745, "step": 64 }, { "epoch": 0.13, "grad_norm": 43.218048095703125, "learning_rate": 2.1666666666666667e-07, "logps/chosen": -24.74606704711914, "logps/rejected": -65.12860107421875, "loss": 0.4176, "losses/dpo": 0.537351667881012, "losses/sft": 0.13678902387619019, "losses/total": 0.537351667881012, "ref_logps/chosen": -26.11852264404297, "ref_logps/rejected": -58.860374450683594, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13724543154239655, "rewards/margins": 0.7640678286552429, "rewards/rejected": -0.6268223524093628, "step": 65 }, { "epoch": 0.13, "grad_norm": 39.85637283325195, "learning_rate": 2.1999999999999998e-07, "logps/chosen": -14.346301078796387, "logps/rejected": -39.72654724121094, "loss": 0.4504, "losses/dpo": 0.5213490724563599, "losses/sft": 0.2660483121871948, "losses/total": 0.5213490724563599, "ref_logps/chosen": -15.04425048828125, "ref_logps/rejected": -33.927894592285156, "rewards/accuracies": 0.9375, "rewards/chosen": 0.06979489326477051, "rewards/margins": 0.6496601700782776, "rewards/rejected": -0.5798653364181519, "step": 66 }, { "epoch": 0.13, "grad_norm": 38.80109786987305, "learning_rate": 2.2333333333333332e-07, "logps/chosen": -11.50251293182373, "logps/rejected": -40.8316535949707, "loss": 0.4839, "losses/dpo": 0.4850131869316101, "losses/sft": 0.22737279534339905, "losses/total": 0.4850131869316101, "ref_logps/chosen": -11.721717834472656, "ref_logps/rejected": -35.50334930419922, "rewards/accuracies": 0.9375, "rewards/chosen": 0.021920526400208473, "rewards/margins": 0.554750919342041, "rewards/rejected": -0.5328303575515747, "step": 67 }, { "epoch": 0.14, "grad_norm": 41.928707122802734, "learning_rate": 2.2666666666666663e-07, "logps/chosen": -15.948335647583008, "logps/rejected": -58.71236801147461, "loss": 0.3387, "losses/dpo": 0.4127328097820282, "losses/sft": 0.2764541506767273, "losses/total": 0.4127328097820282, "ref_logps/chosen": -17.22922134399414, "ref_logps/rejected": -50.34320831298828, "rewards/accuracies": 1.0, "rewards/chosen": 0.12808847427368164, "rewards/margins": 0.9650048613548279, "rewards/rejected": -0.8369163870811462, "step": 68 }, { "epoch": 0.14, "grad_norm": 37.16038131713867, "learning_rate": 2.3e-07, "logps/chosen": -13.122339248657227, "logps/rejected": -48.92229461669922, "loss": 0.4026, "losses/dpo": 0.36682942509651184, "losses/sft": 0.24337750673294067, "losses/total": 0.36682942509651184, "ref_logps/chosen": -13.790239334106445, "ref_logps/rejected": -41.1927490234375, "rewards/accuracies": 0.9375, "rewards/chosen": 0.06679011881351471, "rewards/margins": 0.8397451639175415, "rewards/rejected": -0.772955060005188, "step": 69 }, { "epoch": 0.14, "grad_norm": 43.80226135253906, "learning_rate": 2.3333333333333333e-07, "logps/chosen": -16.744388580322266, "logps/rejected": -63.02336883544922, "loss": 0.371, "losses/dpo": 0.4458756148815155, "losses/sft": 0.2844783663749695, "losses/total": 0.4458756148815155, "ref_logps/chosen": -17.103076934814453, "ref_logps/rejected": -54.47646713256836, "rewards/accuracies": 1.0, "rewards/chosen": 0.035868849605321884, "rewards/margins": 0.8905590176582336, "rewards/rejected": -0.8546901941299438, "step": 70 }, { "epoch": 0.14, "grad_norm": 37.900611877441406, "learning_rate": 2.3666666666666664e-07, "logps/chosen": -15.185718536376953, "logps/rejected": -57.733131408691406, "loss": 0.3906, "losses/dpo": 0.36287635564804077, "losses/sft": 0.26010823249816895, "losses/total": 0.36287635564804077, "ref_logps/chosen": -15.726322174072266, "ref_logps/rejected": -49.109825134277344, "rewards/accuracies": 1.0, "rewards/chosen": 0.05406036227941513, "rewards/margins": 0.9163906574249268, "rewards/rejected": -0.8623303174972534, "step": 71 }, { "epoch": 0.14, "grad_norm": 29.51009750366211, "learning_rate": 2.4e-07, "logps/chosen": -10.109313011169434, "logps/rejected": -70.4901351928711, "loss": 0.2932, "losses/dpo": 0.3870830535888672, "losses/sft": 0.26137688755989075, "losses/total": 0.3870830535888672, "ref_logps/chosen": -10.630828857421875, "ref_logps/rejected": -57.545108795166016, "rewards/accuracies": 1.0, "rewards/chosen": 0.05215153098106384, "rewards/margins": 1.3466542959213257, "rewards/rejected": -1.2945027351379395, "step": 72 }, { "epoch": 0.15, "grad_norm": 35.36587905883789, "learning_rate": 2.433333333333333e-07, "logps/chosen": -14.218860626220703, "logps/rejected": -51.722843170166016, "loss": 0.2982, "losses/dpo": 0.25439390540122986, "losses/sft": 0.30609095096588135, "losses/total": 0.25439390540122986, "ref_logps/chosen": -14.53840446472168, "ref_logps/rejected": -40.36719512939453, "rewards/accuracies": 1.0, "rewards/chosen": 0.0319543182849884, "rewards/margins": 1.1675193309783936, "rewards/rejected": -1.1355650424957275, "step": 73 }, { "epoch": 0.15, "grad_norm": 34.00502395629883, "learning_rate": 2.4666666666666665e-07, "logps/chosen": -14.362863540649414, "logps/rejected": -57.53142166137695, "loss": 0.2736, "losses/dpo": 0.281907856464386, "losses/sft": 0.2517828643321991, "losses/total": 0.281907856464386, "ref_logps/chosen": -14.883176803588867, "ref_logps/rejected": -44.662784576416016, "rewards/accuracies": 1.0, "rewards/chosen": 0.052031371742486954, "rewards/margins": 1.3388949632644653, "rewards/rejected": -1.2868635654449463, "step": 74 }, { "epoch": 0.15, "grad_norm": 33.48396301269531, "learning_rate": 2.5e-07, "logps/chosen": -15.098970413208008, "logps/rejected": -64.7098159790039, "loss": 0.2486, "losses/dpo": 0.24968230724334717, "losses/sft": 0.2866656184196472, "losses/total": 0.24968230724334717, "ref_logps/chosen": -15.477853775024414, "ref_logps/rejected": -50.06144714355469, "rewards/accuracies": 1.0, "rewards/chosen": 0.03788831830024719, "rewards/margins": 1.50272536277771, "rewards/rejected": -1.4648370742797852, "step": 75 }, { "epoch": 0.15, "grad_norm": 37.191131591796875, "learning_rate": 2.533333333333333e-07, "logps/chosen": -14.241877555847168, "logps/rejected": -69.87599182128906, "loss": 0.2465, "losses/dpo": 0.23917606472969055, "losses/sft": 0.27395009994506836, "losses/total": 0.23917606472969055, "ref_logps/chosen": -12.860979080200195, "ref_logps/rejected": -53.578670501708984, "rewards/accuracies": 1.0, "rewards/chosen": -0.13808982074260712, "rewards/margins": 1.4916424751281738, "rewards/rejected": -1.629732370376587, "step": 76 }, { "epoch": 0.15, "grad_norm": 26.611677169799805, "learning_rate": 2.5666666666666666e-07, "logps/chosen": -14.91260814666748, "logps/rejected": -80.22364044189453, "loss": 0.2541, "losses/dpo": 0.3971107602119446, "losses/sft": 0.3323269486427307, "losses/total": 0.3971107602119446, "ref_logps/chosen": -13.36182975769043, "ref_logps/rejected": -58.73191833496094, "rewards/accuracies": 0.875, "rewards/chosen": -0.15507787466049194, "rewards/margins": 1.994094729423523, "rewards/rejected": -2.149172782897949, "step": 77 }, { "epoch": 0.16, "grad_norm": 28.33013343811035, "learning_rate": 2.6e-07, "logps/chosen": -15.90027141571045, "logps/rejected": -66.85693359375, "loss": 0.2514, "losses/dpo": 0.18611454963684082, "losses/sft": 0.28300029039382935, "losses/total": 0.18611454963684082, "ref_logps/chosen": -13.217735290527344, "ref_logps/rejected": -49.22061538696289, "rewards/accuracies": 1.0, "rewards/chosen": -0.26825347542762756, "rewards/margins": 1.495378851890564, "rewards/rejected": -1.7636322975158691, "step": 78 }, { "epoch": 0.16, "grad_norm": 37.32158279418945, "learning_rate": 2.633333333333333e-07, "logps/chosen": -15.713750839233398, "logps/rejected": -53.1090087890625, "loss": 0.3564, "losses/dpo": 0.5421111583709717, "losses/sft": 0.3495892286300659, "losses/total": 0.5421111583709717, "ref_logps/chosen": -13.422369003295898, "ref_logps/rejected": -37.21599578857422, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2291383147239685, "rewards/margins": 1.3601632118225098, "rewards/rejected": -1.589301586151123, "step": 79 }, { "epoch": 0.16, "grad_norm": 33.47981643676758, "learning_rate": 2.6666666666666667e-07, "logps/chosen": -16.27362060546875, "logps/rejected": -57.29899978637695, "loss": 0.2627, "losses/dpo": 0.19374717772006989, "losses/sft": 0.32004982233047485, "losses/total": 0.19374717772006989, "ref_logps/chosen": -13.620620727539062, "ref_logps/rejected": -38.440185546875, "rewards/accuracies": 0.9375, "rewards/chosen": -0.26529988646507263, "rewards/margins": 1.620581865310669, "rewards/rejected": -1.8858816623687744, "step": 80 }, { "epoch": 0.16, "grad_norm": 35.3527946472168, "learning_rate": 2.7e-07, "logps/chosen": -20.024314880371094, "logps/rejected": -85.74209594726562, "loss": 0.2311, "losses/dpo": 0.19971030950546265, "losses/sft": 0.2853155732154846, "losses/total": 0.19971030950546265, "ref_logps/chosen": -18.787288665771484, "ref_logps/rejected": -65.08699035644531, "rewards/accuracies": 0.875, "rewards/chosen": -0.12370243668556213, "rewards/margins": 1.9418089389801025, "rewards/rejected": -2.0655112266540527, "step": 81 }, { "epoch": 0.16, "grad_norm": 28.21786117553711, "learning_rate": 2.733333333333333e-07, "logps/chosen": -16.94407081604004, "logps/rejected": -79.70083618164062, "loss": 0.2106, "losses/dpo": 0.22653785347938538, "losses/sft": 0.31818974018096924, "losses/total": 0.22653785347938538, "ref_logps/chosen": -15.551984786987305, "ref_logps/rejected": -55.47174072265625, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13920865952968597, "rewards/margins": 2.283700704574585, "rewards/rejected": -2.4229092597961426, "step": 82 }, { "epoch": 0.17, "grad_norm": 23.01938247680664, "learning_rate": 2.766666666666667e-07, "logps/chosen": -15.447221755981445, "logps/rejected": -83.06011199951172, "loss": 0.1544, "losses/dpo": 0.13929356634616852, "losses/sft": 0.33886247873306274, "losses/total": 0.13929356634616852, "ref_logps/chosen": -12.357421875, "ref_logps/rejected": -56.69266891479492, "rewards/accuracies": 1.0, "rewards/chosen": -0.3089800477027893, "rewards/margins": 2.3277645111083984, "rewards/rejected": -2.636744499206543, "step": 83 }, { "epoch": 0.17, "grad_norm": 30.944183349609375, "learning_rate": 2.8e-07, "logps/chosen": -19.864492416381836, "logps/rejected": -59.14112091064453, "loss": 0.2418, "losses/dpo": 0.33125755190849304, "losses/sft": 0.23029811680316925, "losses/total": 0.33125755190849304, "ref_logps/chosen": -16.55842399597168, "ref_logps/rejected": -39.68665313720703, "rewards/accuracies": 0.875, "rewards/chosen": -0.3306068181991577, "rewards/margins": 1.6148401498794556, "rewards/rejected": -1.9454468488693237, "step": 84 }, { "epoch": 0.17, "grad_norm": 26.384204864501953, "learning_rate": 2.833333333333333e-07, "logps/chosen": -16.929367065429688, "logps/rejected": -87.05389404296875, "loss": 0.1536, "losses/dpo": 0.18227216601371765, "losses/sft": 0.3527926206588745, "losses/total": 0.18227216601371765, "ref_logps/chosen": -14.183502197265625, "ref_logps/rejected": -56.431129455566406, "rewards/accuracies": 1.0, "rewards/chosen": -0.27458658814430237, "rewards/margins": 2.787689685821533, "rewards/rejected": -3.0622763633728027, "step": 85 }, { "epoch": 0.17, "grad_norm": 38.91211700439453, "learning_rate": 2.866666666666667e-07, "logps/chosen": -17.564605712890625, "logps/rejected": -52.08045196533203, "loss": 0.2764, "losses/dpo": 0.1875544637441635, "losses/sft": 0.3149993121623993, "losses/total": 0.1875544637441635, "ref_logps/chosen": -13.960559844970703, "ref_logps/rejected": -32.2154426574707, "rewards/accuracies": 1.0, "rewards/chosen": -0.3604046702384949, "rewards/margins": 1.626096487045288, "rewards/rejected": -1.9865012168884277, "step": 86 }, { "epoch": 0.17, "grad_norm": 54.64894485473633, "learning_rate": 2.9e-07, "logps/chosen": -24.219192504882812, "logps/rejected": -59.99394989013672, "loss": 0.3631, "losses/dpo": 0.3719308376312256, "losses/sft": 0.49110162258148193, "losses/total": 0.3719308376312256, "ref_logps/chosen": -15.149127960205078, "ref_logps/rejected": -34.75643539428711, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9070063829421997, "rewards/margins": 1.6167452335357666, "rewards/rejected": -2.523751735687256, "step": 87 }, { "epoch": 0.18, "grad_norm": 67.03410339355469, "learning_rate": 2.933333333333333e-07, "logps/chosen": -15.183414459228516, "logps/rejected": -64.70655059814453, "loss": 0.3215, "losses/dpo": 0.4381785988807678, "losses/sft": 0.40025150775909424, "losses/total": 0.4381785988807678, "ref_logps/chosen": -11.338159561157227, "ref_logps/rejected": -38.85139083862305, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3845253884792328, "rewards/margins": 2.200990676879883, "rewards/rejected": -2.5855159759521484, "step": 88 }, { "epoch": 0.18, "grad_norm": 37.209999084472656, "learning_rate": 2.966666666666667e-07, "logps/chosen": -20.264392852783203, "logps/rejected": -64.114013671875, "loss": 0.304, "losses/dpo": 0.20819611847400665, "losses/sft": 0.3846052885055542, "losses/total": 0.20819611847400665, "ref_logps/chosen": -13.015954971313477, "ref_logps/rejected": -37.71721649169922, "rewards/accuracies": 0.875, "rewards/chosen": -0.7248438596725464, "rewards/margins": 1.9148359298706055, "rewards/rejected": -2.6396799087524414, "step": 89 }, { "epoch": 0.18, "grad_norm": 28.591053009033203, "learning_rate": 3e-07, "logps/chosen": -25.06616973876953, "logps/rejected": -81.93843841552734, "loss": 0.2055, "losses/dpo": 0.31712400913238525, "losses/sft": 0.31778547167778015, "losses/total": 0.31712400913238525, "ref_logps/chosen": -21.52896499633789, "ref_logps/rejected": -53.268577575683594, "rewards/accuracies": 0.875, "rewards/chosen": -0.3537205457687378, "rewards/margins": 2.51326584815979, "rewards/rejected": -2.8669862747192383, "step": 90 }, { "epoch": 0.18, "grad_norm": 38.66221618652344, "learning_rate": 3.033333333333333e-07, "logps/chosen": -24.031814575195312, "logps/rejected": -61.21632385253906, "loss": 0.246, "losses/dpo": 0.18292659521102905, "losses/sft": 0.4320409297943115, "losses/total": 0.18292659521102905, "ref_logps/chosen": -17.206039428710938, "ref_logps/rejected": -38.700233459472656, "rewards/accuracies": 0.9375, "rewards/chosen": -0.68257737159729, "rewards/margins": 1.5690321922302246, "rewards/rejected": -2.2516093254089355, "step": 91 }, { "epoch": 0.18, "grad_norm": 39.619354248046875, "learning_rate": 3.066666666666666e-07, "logps/chosen": -19.91756820678711, "logps/rejected": -62.49524688720703, "loss": 0.2167, "losses/dpo": 0.22770985960960388, "losses/sft": 0.4464987814426422, "losses/total": 0.22770985960960388, "ref_logps/chosen": -12.815778732299805, "ref_logps/rejected": -34.96184539794922, "rewards/accuracies": 1.0, "rewards/chosen": -0.7101789712905884, "rewards/margins": 2.043161392211914, "rewards/rejected": -2.753340482711792, "step": 92 }, { "epoch": 0.19, "grad_norm": 36.227333068847656, "learning_rate": 3.1e-07, "logps/chosen": -25.785503387451172, "logps/rejected": -64.98928833007812, "loss": 0.2481, "losses/dpo": 0.220164492726326, "losses/sft": 0.3677349090576172, "losses/total": 0.220164492726326, "ref_logps/chosen": -21.26690673828125, "ref_logps/rejected": -37.80683898925781, "rewards/accuracies": 1.0, "rewards/chosen": -0.45185956358909607, "rewards/margins": 2.266386032104492, "rewards/rejected": -2.7182457447052, "step": 93 }, { "epoch": 0.19, "grad_norm": 37.962623596191406, "learning_rate": 3.1333333333333333e-07, "logps/chosen": -24.163372039794922, "logps/rejected": -66.34964752197266, "loss": 0.2241, "losses/dpo": 0.310378760099411, "losses/sft": 0.4730556607246399, "losses/total": 0.310378760099411, "ref_logps/chosen": -14.319221496582031, "ref_logps/rejected": -38.69602966308594, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9844151139259338, "rewards/margins": 1.7809470891952515, "rewards/rejected": -2.76536226272583, "step": 94 }, { "epoch": 0.19, "grad_norm": 18.532638549804688, "learning_rate": 3.166666666666666e-07, "logps/chosen": -20.102611541748047, "logps/rejected": -83.15855407714844, "loss": 0.1204, "losses/dpo": 0.039735302329063416, "losses/sft": 0.36929309368133545, "losses/total": 0.039735302329063416, "ref_logps/chosen": -12.35050106048584, "ref_logps/rejected": -45.502262115478516, "rewards/accuracies": 1.0, "rewards/chosen": -0.775210976600647, "rewards/margins": 2.9904184341430664, "rewards/rejected": -3.765629291534424, "step": 95 }, { "epoch": 0.19, "grad_norm": 47.28300476074219, "learning_rate": 3.2e-07, "logps/chosen": -30.591978073120117, "logps/rejected": -79.0492172241211, "loss": 0.2275, "losses/dpo": 0.14131276309490204, "losses/sft": 0.34366729855537415, "losses/total": 0.14131276309490204, "ref_logps/chosen": -20.450027465820312, "ref_logps/rejected": -51.264705657958984, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0141950845718384, "rewards/margins": 1.7642561197280884, "rewards/rejected": -2.7784509658813477, "step": 96 }, { "epoch": 0.19, "grad_norm": 22.053281784057617, "learning_rate": 3.233333333333333e-07, "logps/chosen": -23.116226196289062, "logps/rejected": -105.54429626464844, "loss": 0.0763, "losses/dpo": 0.08986547589302063, "losses/sft": 0.47947466373443604, "losses/total": 0.08986547589302063, "ref_logps/chosen": -15.21923542022705, "ref_logps/rejected": -61.29276657104492, "rewards/accuracies": 1.0, "rewards/chosen": -0.7896990180015564, "rewards/margins": 3.6354546546936035, "rewards/rejected": -4.425153732299805, "step": 97 }, { "epoch": 0.2, "grad_norm": 28.593852996826172, "learning_rate": 3.2666666666666663e-07, "logps/chosen": -17.118698120117188, "logps/rejected": -88.55982971191406, "loss": 0.1363, "losses/dpo": 0.07273420691490173, "losses/sft": 0.4799639582633972, "losses/total": 0.07273420691490173, "ref_logps/chosen": -12.235620498657227, "ref_logps/rejected": -49.14242935180664, "rewards/accuracies": 1.0, "rewards/chosen": -0.4883077144622803, "rewards/margins": 3.453432559967041, "rewards/rejected": -3.9417405128479004, "step": 98 }, { "epoch": 0.2, "grad_norm": 28.878681182861328, "learning_rate": 3.3e-07, "logps/chosen": -24.96469497680664, "logps/rejected": -89.02628326416016, "loss": 0.1801, "losses/dpo": 0.21059244871139526, "losses/sft": 0.4723120331764221, "losses/total": 0.21059244871139526, "ref_logps/chosen": -15.973878860473633, "ref_logps/rejected": -47.815399169921875, "rewards/accuracies": 0.875, "rewards/chosen": -0.8990815877914429, "rewards/margins": 3.2220067977905273, "rewards/rejected": -4.12108850479126, "step": 99 }, { "epoch": 0.2, "grad_norm": 29.858488082885742, "learning_rate": 3.333333333333333e-07, "logps/chosen": -23.120346069335938, "logps/rejected": -78.31523895263672, "loss": 0.1001, "losses/dpo": 0.2422274351119995, "losses/sft": 0.527462899684906, "losses/total": 0.2422274351119995, "ref_logps/chosen": -14.651782035827637, "ref_logps/rejected": -38.066070556640625, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8468563556671143, "rewards/margins": 3.178060531616211, "rewards/rejected": -4.024916648864746, "step": 100 }, { "epoch": 0.2, "grad_norm": 40.085304260253906, "learning_rate": 3.3666666666666664e-07, "logps/chosen": -17.66061019897461, "logps/rejected": -55.92891311645508, "loss": 0.3245, "losses/dpo": 0.38317328691482544, "losses/sft": 0.43344035744667053, "losses/total": 0.38317328691482544, "ref_logps/chosen": -11.103376388549805, "ref_logps/rejected": -33.681671142578125, "rewards/accuracies": 0.875, "rewards/chosen": -0.655723512172699, "rewards/margins": 1.5690006017684937, "rewards/rejected": -2.224724054336548, "step": 101 }, { "epoch": 0.2, "grad_norm": 37.162322998046875, "learning_rate": 3.4000000000000003e-07, "logps/chosen": -20.915874481201172, "logps/rejected": -70.4129638671875, "loss": 0.2303, "losses/dpo": 0.40623629093170166, "losses/sft": 0.2984255254268646, "losses/total": 0.40623629093170166, "ref_logps/chosen": -13.981635093688965, "ref_logps/rejected": -39.540313720703125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6934242248535156, "rewards/margins": 2.393840789794922, "rewards/rejected": -3.0872647762298584, "step": 102 }, { "epoch": 0.21, "grad_norm": 52.18952178955078, "learning_rate": 3.433333333333333e-07, "logps/chosen": -23.943897247314453, "logps/rejected": -86.97280883789062, "loss": 0.2605, "losses/dpo": 0.36269426345825195, "losses/sft": 0.4030131697654724, "losses/total": 0.36269426345825195, "ref_logps/chosen": -14.55135726928711, "ref_logps/rejected": -48.15530776977539, "rewards/accuracies": 0.875, "rewards/chosen": -0.9392539262771606, "rewards/margins": 2.9424962997436523, "rewards/rejected": -3.8817505836486816, "step": 103 }, { "epoch": 0.21, "grad_norm": 27.642332077026367, "learning_rate": 3.4666666666666665e-07, "logps/chosen": -25.413158416748047, "logps/rejected": -79.21366119384766, "loss": 0.0897, "losses/dpo": 0.05271019786596298, "losses/sft": 0.547292172908783, "losses/total": 0.05271019786596298, "ref_logps/chosen": -17.499645233154297, "ref_logps/rejected": -39.83184051513672, "rewards/accuracies": 1.0, "rewards/chosen": -0.7913513779640198, "rewards/margins": 3.1468305587768555, "rewards/rejected": -3.9381821155548096, "step": 104 }, { "epoch": 0.21, "grad_norm": 26.87654685974121, "learning_rate": 3.5e-07, "logps/chosen": -20.214862823486328, "logps/rejected": -87.76766967773438, "loss": 0.1645, "losses/dpo": 0.20824576914310455, "losses/sft": 0.33288073539733887, "losses/total": 0.20824576914310455, "ref_logps/chosen": -11.795272827148438, "ref_logps/rejected": -48.254093170166016, "rewards/accuracies": 1.0, "rewards/chosen": -0.8419591188430786, "rewards/margins": 3.10939884185791, "rewards/rejected": -3.951357841491699, "step": 105 }, { "epoch": 0.21, "grad_norm": 64.75567626953125, "learning_rate": 3.533333333333333e-07, "logps/chosen": -30.353635787963867, "logps/rejected": -72.8525390625, "loss": 0.3244, "losses/dpo": 0.3965432345867157, "losses/sft": 0.5763383507728577, "losses/total": 0.3965432345867157, "ref_logps/chosen": -15.887895584106445, "ref_logps/rejected": -36.173828125, "rewards/accuracies": 0.875, "rewards/chosen": -1.4465739727020264, "rewards/margins": 2.221297025680542, "rewards/rejected": -3.6678709983825684, "step": 106 }, { "epoch": 0.21, "grad_norm": 70.32903289794922, "learning_rate": 3.5666666666666666e-07, "logps/chosen": -27.496435165405273, "logps/rejected": -96.47198486328125, "loss": 0.1491, "losses/dpo": 0.14805351197719574, "losses/sft": 0.48672235012054443, "losses/total": 0.14805351197719574, "ref_logps/chosen": -14.464235305786133, "ref_logps/rejected": -50.544700622558594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3032200336456299, "rewards/margins": 3.2895092964172363, "rewards/rejected": -4.592729091644287, "step": 107 }, { "epoch": 0.22, "grad_norm": 19.775211334228516, "learning_rate": 3.6e-07, "logps/chosen": -26.703758239746094, "logps/rejected": -95.46380615234375, "loss": 0.0968, "losses/dpo": 0.046378664672374725, "losses/sft": 0.40180331468582153, "losses/total": 0.046378664672374725, "ref_logps/chosen": -15.243803024291992, "ref_logps/rejected": -47.9715576171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1459956169128418, "rewards/margins": 3.60322904586792, "rewards/rejected": -4.749224662780762, "step": 108 }, { "epoch": 0.22, "grad_norm": 34.58012390136719, "learning_rate": 3.6333333333333333e-07, "logps/chosen": -25.720474243164062, "logps/rejected": -101.4200439453125, "loss": 0.1469, "losses/dpo": 0.16447117924690247, "losses/sft": 0.49221622943878174, "losses/total": 0.16447117924690247, "ref_logps/chosen": -16.14147186279297, "ref_logps/rejected": -54.09002685546875, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9579001665115356, "rewards/margins": 3.7751007080078125, "rewards/rejected": -4.733000755310059, "step": 109 }, { "epoch": 0.22, "grad_norm": 40.38216018676758, "learning_rate": 3.666666666666666e-07, "logps/chosen": -26.010862350463867, "logps/rejected": -78.31238555908203, "loss": 0.2122, "losses/dpo": 0.38288456201553345, "losses/sft": 0.5134553909301758, "losses/total": 0.38288456201553345, "ref_logps/chosen": -12.394744873046875, "ref_logps/rejected": -39.01166915893555, "rewards/accuracies": 0.875, "rewards/chosen": -1.3616118431091309, "rewards/margins": 2.568459987640381, "rewards/rejected": -3.9300713539123535, "step": 110 }, { "epoch": 0.22, "grad_norm": 34.98234939575195, "learning_rate": 3.7e-07, "logps/chosen": -33.29746627807617, "logps/rejected": -82.07467651367188, "loss": 0.1525, "losses/dpo": 0.1822669804096222, "losses/sft": 0.49815496802330017, "losses/total": 0.1822669804096222, "ref_logps/chosen": -21.91498374938965, "ref_logps/rejected": -41.01658248901367, "rewards/accuracies": 1.0, "rewards/chosen": -1.1382479667663574, "rewards/margins": 2.9675610065460205, "rewards/rejected": -4.105809211730957, "step": 111 }, { "epoch": 0.22, "grad_norm": 33.530418395996094, "learning_rate": 3.7333333333333334e-07, "logps/chosen": -23.67328643798828, "logps/rejected": -92.13876342773438, "loss": 0.0961, "losses/dpo": 0.008127570152282715, "losses/sft": 0.21549856662750244, "losses/total": 0.008127570152282715, "ref_logps/chosen": -13.349861145019531, "ref_logps/rejected": -43.54875946044922, "rewards/accuracies": 1.0, "rewards/chosen": -1.0323424339294434, "rewards/margins": 3.8266587257385254, "rewards/rejected": -4.859001159667969, "step": 112 }, { "epoch": 0.23, "grad_norm": 38.63556671142578, "learning_rate": 3.766666666666666e-07, "logps/chosen": -30.415813446044922, "logps/rejected": -84.25430297851562, "loss": 0.1936, "losses/dpo": 0.12123291194438934, "losses/sft": 0.20419417321681976, "losses/total": 0.12123291194438934, "ref_logps/chosen": -19.41822052001953, "ref_logps/rejected": -45.32547378540039, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0997594594955444, "rewards/margins": 2.793123483657837, "rewards/rejected": -3.892883062362671, "step": 113 }, { "epoch": 0.23, "grad_norm": 23.571645736694336, "learning_rate": 3.7999999999999996e-07, "logps/chosen": -26.51279067993164, "logps/rejected": -94.08573913574219, "loss": 0.1537, "losses/dpo": 0.14313971996307373, "losses/sft": 0.36151280999183655, "losses/total": 0.14313971996307373, "ref_logps/chosen": -18.325164794921875, "ref_logps/rejected": -49.47245788574219, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8187628984451294, "rewards/margins": 3.6425652503967285, "rewards/rejected": -4.461328029632568, "step": 114 }, { "epoch": 0.23, "grad_norm": 23.784942626953125, "learning_rate": 3.8333333333333335e-07, "logps/chosen": -28.95541763305664, "logps/rejected": -91.38735961914062, "loss": 0.0878, "losses/dpo": 0.0992375910282135, "losses/sft": 0.2555049955844879, "losses/total": 0.0992375910282135, "ref_logps/chosen": -16.020353317260742, "ref_logps/rejected": -43.40199279785156, "rewards/accuracies": 1.0, "rewards/chosen": -1.2935062646865845, "rewards/margins": 3.505030632019043, "rewards/rejected": -4.798537254333496, "step": 115 }, { "epoch": 0.23, "grad_norm": 67.53050994873047, "learning_rate": 3.8666666666666664e-07, "logps/chosen": -25.763362884521484, "logps/rejected": -83.27964782714844, "loss": 0.2907, "losses/dpo": 0.1631295382976532, "losses/sft": 0.5447544455528259, "losses/total": 0.1631295382976532, "ref_logps/chosen": -11.275564193725586, "ref_logps/rejected": -42.068965911865234, "rewards/accuracies": 0.75, "rewards/chosen": -1.448779582977295, "rewards/margins": 2.672288179397583, "rewards/rejected": -4.121068000793457, "step": 116 }, { "epoch": 0.23, "grad_norm": 35.78622055053711, "learning_rate": 3.8999999999999997e-07, "logps/chosen": -24.083629608154297, "logps/rejected": -84.81612396240234, "loss": 0.1676, "losses/dpo": 0.24396544694900513, "losses/sft": 0.6446655988693237, "losses/total": 0.24396544694900513, "ref_logps/chosen": -12.8930025100708, "ref_logps/rejected": -37.470420837402344, "rewards/accuracies": 0.875, "rewards/chosen": -1.1190627813339233, "rewards/margins": 3.615507125854492, "rewards/rejected": -4.734570503234863, "step": 117 }, { "epoch": 0.24, "grad_norm": 24.79482650756836, "learning_rate": 3.933333333333333e-07, "logps/chosen": -24.10096549987793, "logps/rejected": -91.7545394897461, "loss": 0.1411, "losses/dpo": 0.07791762053966522, "losses/sft": 0.5899461507797241, "losses/total": 0.07791762053966522, "ref_logps/chosen": -13.302517890930176, "ref_logps/rejected": -47.74324035644531, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0798447132110596, "rewards/margins": 3.3212857246398926, "rewards/rejected": -4.401130199432373, "step": 118 }, { "epoch": 0.24, "grad_norm": 35.7785530090332, "learning_rate": 3.9666666666666665e-07, "logps/chosen": -31.792556762695312, "logps/rejected": -106.40116882324219, "loss": 0.1401, "losses/dpo": 0.4013972878456116, "losses/sft": 0.6858751177787781, "losses/total": 0.4013972878456116, "ref_logps/chosen": -13.898950576782227, "ref_logps/rejected": -49.06494140625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.789360523223877, "rewards/margins": 3.9442622661590576, "rewards/rejected": -5.7336225509643555, "step": 119 }, { "epoch": 0.24, "grad_norm": 53.136749267578125, "learning_rate": 4e-07, "logps/chosen": -29.701318740844727, "logps/rejected": -108.02490997314453, "loss": 0.2095, "losses/dpo": 0.6063670516014099, "losses/sft": 0.5956507325172424, "losses/total": 0.6063670516014099, "ref_logps/chosen": -10.965313911437988, "ref_logps/rejected": -52.60283279418945, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8736006021499634, "rewards/margins": 3.668607711791992, "rewards/rejected": -5.542208194732666, "step": 120 }, { "epoch": 0.24, "grad_norm": 32.23996353149414, "learning_rate": 4.033333333333333e-07, "logps/chosen": -33.582359313964844, "logps/rejected": -113.1934814453125, "loss": 0.1153, "losses/dpo": 0.05961308628320694, "losses/sft": 0.45472973585128784, "losses/total": 0.05961308628320694, "ref_logps/chosen": -18.625728607177734, "ref_logps/rejected": -56.13653564453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4956634044647217, "rewards/margins": 4.210031509399414, "rewards/rejected": -5.705695152282715, "step": 121 }, { "epoch": 0.24, "grad_norm": 55.00533676147461, "learning_rate": 4.0666666666666666e-07, "logps/chosen": -26.204700469970703, "logps/rejected": -103.64109802246094, "loss": 0.1689, "losses/dpo": 0.274069219827652, "losses/sft": 0.598950207233429, "losses/total": 0.274069219827652, "ref_logps/chosen": -10.38711929321289, "ref_logps/rejected": -48.82931137084961, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5817580223083496, "rewards/margins": 3.8994200229644775, "rewards/rejected": -5.481178283691406, "step": 122 }, { "epoch": 0.25, "grad_norm": 28.600406646728516, "learning_rate": 4.0999999999999994e-07, "logps/chosen": -27.372495651245117, "logps/rejected": -88.63994598388672, "loss": 0.1469, "losses/dpo": 0.06027643382549286, "losses/sft": 0.5721110105514526, "losses/total": 0.06027643382549286, "ref_logps/chosen": -10.918659210205078, "ref_logps/rejected": -36.75074005126953, "rewards/accuracies": 1.0, "rewards/chosen": -1.645383596420288, "rewards/margins": 3.54353666305542, "rewards/rejected": -5.188920497894287, "step": 123 }, { "epoch": 0.25, "grad_norm": 80.9706802368164, "learning_rate": 4.1333333333333333e-07, "logps/chosen": -35.911651611328125, "logps/rejected": -91.64453125, "loss": 0.3585, "losses/dpo": 0.6168155074119568, "losses/sft": 0.8081568479537964, "losses/total": 0.6168155074119568, "ref_logps/chosen": -12.966390609741211, "ref_logps/rejected": -45.097755432128906, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2945261001586914, "rewards/margins": 2.3601508140563965, "rewards/rejected": -4.654676914215088, "step": 124 }, { "epoch": 0.25, "grad_norm": 58.33188247680664, "learning_rate": 4.1666666666666667e-07, "logps/chosen": -33.37420654296875, "logps/rejected": -100.40385437011719, "loss": 0.2055, "losses/dpo": 0.35650166869163513, "losses/sft": 0.651277482509613, "losses/total": 0.35650166869163513, "ref_logps/chosen": -14.553751945495605, "ref_logps/rejected": -42.69612503051758, "rewards/accuracies": 0.875, "rewards/chosen": -1.8820456266403198, "rewards/margins": 3.8887269496917725, "rewards/rejected": -5.770772933959961, "step": 125 }, { "epoch": 0.25, "grad_norm": 63.172340393066406, "learning_rate": 4.1999999999999995e-07, "logps/chosen": -34.846107482910156, "logps/rejected": -123.60401153564453, "loss": 0.2306, "losses/dpo": 0.06216158717870712, "losses/sft": 0.736182451248169, "losses/total": 0.06216158717870712, "ref_logps/chosen": -16.453075408935547, "ref_logps/rejected": -64.68106842041016, "rewards/accuracies": 0.875, "rewards/chosen": -1.8393032550811768, "rewards/margins": 4.052990913391113, "rewards/rejected": -5.892294406890869, "step": 126 }, { "epoch": 0.25, "grad_norm": 31.807851791381836, "learning_rate": 4.2333333333333334e-07, "logps/chosen": -30.268192291259766, "logps/rejected": -88.67607116699219, "loss": 0.1248, "losses/dpo": 0.24264563620090485, "losses/sft": 0.5180367827415466, "losses/total": 0.24264563620090485, "ref_logps/chosen": -17.16900062561035, "ref_logps/rejected": -38.45530319213867, "rewards/accuracies": 1.0, "rewards/chosen": -1.3099192380905151, "rewards/margins": 3.712158203125, "rewards/rejected": -5.022077560424805, "step": 127 }, { "epoch": 0.26, "grad_norm": 24.525039672851562, "learning_rate": 4.266666666666667e-07, "logps/chosen": -39.662452697753906, "logps/rejected": -110.11405944824219, "loss": 0.0984, "losses/dpo": 0.17816494405269623, "losses/sft": 0.6083031296730042, "losses/total": 0.17816494405269623, "ref_logps/chosen": -17.88237762451172, "ref_logps/rejected": -45.97468566894531, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1780076026916504, "rewards/margins": 4.235930442810059, "rewards/rejected": -6.413937568664551, "step": 128 }, { "epoch": 0.26, "grad_norm": 36.77156448364258, "learning_rate": 4.2999999999999996e-07, "logps/chosen": -33.76113510131836, "logps/rejected": -85.31007385253906, "loss": 0.1854, "losses/dpo": 0.09161588549613953, "losses/sft": 0.4901028275489807, "losses/total": 0.09161588549613953, "ref_logps/chosen": -19.187625885009766, "ref_logps/rejected": -41.31425857543945, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4573508501052856, "rewards/margins": 2.942230463027954, "rewards/rejected": -4.399581432342529, "step": 129 }, { "epoch": 0.26, "grad_norm": 14.954230308532715, "learning_rate": 4.3333333333333335e-07, "logps/chosen": -26.31670379638672, "logps/rejected": -95.92216491699219, "loss": 0.0717, "losses/dpo": 0.22551429271697998, "losses/sft": 0.5335481762886047, "losses/total": 0.22551429271697998, "ref_logps/chosen": -14.930686950683594, "ref_logps/rejected": -44.4586181640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.1386016607284546, "rewards/margins": 4.007754325866699, "rewards/rejected": -5.146356105804443, "step": 130 }, { "epoch": 0.26, "grad_norm": 18.751760482788086, "learning_rate": 4.3666666666666663e-07, "logps/chosen": -27.520706176757812, "logps/rejected": -87.16500854492188, "loss": 0.0965, "losses/dpo": 0.062105268239974976, "losses/sft": 0.48621320724487305, "losses/total": 0.062105268239974976, "ref_logps/chosen": -12.407279968261719, "ref_logps/rejected": -39.634822845458984, "rewards/accuracies": 1.0, "rewards/chosen": -1.5113425254821777, "rewards/margins": 3.241675853729248, "rewards/rejected": -4.753018379211426, "step": 131 }, { "epoch": 0.26, "grad_norm": 43.365806579589844, "learning_rate": 4.3999999999999997e-07, "logps/chosen": -30.080524444580078, "logps/rejected": -100.35038757324219, "loss": 0.1902, "losses/dpo": 0.5356749296188354, "losses/sft": 0.5842803716659546, "losses/total": 0.5356749296188354, "ref_logps/chosen": -12.293049812316895, "ref_logps/rejected": -47.880836486816406, "rewards/accuracies": 0.875, "rewards/chosen": -1.778747320175171, "rewards/margins": 3.468207836151123, "rewards/rejected": -5.246955394744873, "step": 132 }, { "epoch": 0.27, "grad_norm": 27.982099533081055, "learning_rate": 4.4333333333333336e-07, "logps/chosen": -31.108388900756836, "logps/rejected": -93.52197265625, "loss": 0.0911, "losses/dpo": 0.08737071603536606, "losses/sft": 0.6428839564323425, "losses/total": 0.08737071603536606, "ref_logps/chosen": -15.797719955444336, "ref_logps/rejected": -40.723243713378906, "rewards/accuracies": 1.0, "rewards/chosen": -1.531067132949829, "rewards/margins": 3.7488057613372803, "rewards/rejected": -5.279872417449951, "step": 133 }, { "epoch": 0.27, "grad_norm": 40.31496047973633, "learning_rate": 4.4666666666666664e-07, "logps/chosen": -25.789161682128906, "logps/rejected": -110.07024383544922, "loss": 0.124, "losses/dpo": 0.4045230448246002, "losses/sft": 0.4008540213108063, "losses/total": 0.4045230448246002, "ref_logps/chosen": -15.012077331542969, "ref_logps/rejected": -47.305419921875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.07770836353302, "rewards/margins": 5.1987738609313965, "rewards/rejected": -6.276482105255127, "step": 134 }, { "epoch": 0.27, "grad_norm": 46.73427200317383, "learning_rate": 4.5e-07, "logps/chosen": -33.74063491821289, "logps/rejected": -86.67448425292969, "loss": 0.1774, "losses/dpo": 0.23459814488887787, "losses/sft": 0.5259028673171997, "losses/total": 0.23459814488887787, "ref_logps/chosen": -15.16489028930664, "ref_logps/rejected": -37.2421875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.857574462890625, "rewards/margins": 3.085655689239502, "rewards/rejected": -4.943229675292969, "step": 135 }, { "epoch": 0.27, "grad_norm": 65.31625366210938, "learning_rate": 4.5333333333333326e-07, "logps/chosen": -23.63500213623047, "logps/rejected": -81.20355224609375, "loss": 0.134, "losses/dpo": 0.3590734004974365, "losses/sft": 0.5037756562232971, "losses/total": 0.3590734004974365, "ref_logps/chosen": -11.519001960754395, "ref_logps/rejected": -35.56211853027344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2116000652313232, "rewards/margins": 3.352543354034424, "rewards/rejected": -4.564143180847168, "step": 136 }, { "epoch": 0.27, "grad_norm": 26.6831111907959, "learning_rate": 4.5666666666666665e-07, "logps/chosen": -30.455238342285156, "logps/rejected": -90.72950744628906, "loss": 0.1094, "losses/dpo": 0.08296354115009308, "losses/sft": 0.6522700786590576, "losses/total": 0.08296354115009308, "ref_logps/chosen": -14.431440353393555, "ref_logps/rejected": -37.58430480957031, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6023797988891602, "rewards/margins": 3.7121400833129883, "rewards/rejected": -5.314520359039307, "step": 137 }, { "epoch": 0.28, "grad_norm": 44.44517517089844, "learning_rate": 4.6e-07, "logps/chosen": -26.478111267089844, "logps/rejected": -86.48759460449219, "loss": 0.1435, "losses/dpo": 0.25203052163124084, "losses/sft": 0.5533711910247803, "losses/total": 0.25203052163124084, "ref_logps/chosen": -15.576150894165039, "ref_logps/rejected": -38.519466400146484, "rewards/accuracies": 0.9375, "rewards/chosen": -1.090196132659912, "rewards/margins": 3.7066164016723633, "rewards/rejected": -4.796812057495117, "step": 138 }, { "epoch": 0.28, "grad_norm": 33.14357376098633, "learning_rate": 4.633333333333333e-07, "logps/chosen": -26.719112396240234, "logps/rejected": -92.25952911376953, "loss": 0.1509, "losses/dpo": 0.11920268833637238, "losses/sft": 0.6446678638458252, "losses/total": 0.11920268833637238, "ref_logps/chosen": -11.331499099731445, "ref_logps/rejected": -35.657508850097656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5387613773345947, "rewards/margins": 4.121440410614014, "rewards/rejected": -5.6602020263671875, "step": 139 }, { "epoch": 0.28, "grad_norm": 38.349788665771484, "learning_rate": 4.6666666666666666e-07, "logps/chosen": -26.575002670288086, "logps/rejected": -93.16529083251953, "loss": 0.1955, "losses/dpo": 0.27706673741340637, "losses/sft": 0.6116777658462524, "losses/total": 0.27706673741340637, "ref_logps/chosen": -13.48236083984375, "ref_logps/rejected": -40.799232482910156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.309264063835144, "rewards/margins": 3.9273412227630615, "rewards/rejected": -5.236605167388916, "step": 140 }, { "epoch": 0.28, "grad_norm": 19.368520736694336, "learning_rate": 4.6999999999999995e-07, "logps/chosen": -26.615325927734375, "logps/rejected": -99.11043548583984, "loss": 0.126, "losses/dpo": 0.1754596084356308, "losses/sft": 0.49534112215042114, "losses/total": 0.1754596084356308, "ref_logps/chosen": -13.058014869689941, "ref_logps/rejected": -45.73929977416992, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3557310104370117, "rewards/margins": 3.9813828468322754, "rewards/rejected": -5.337113380432129, "step": 141 }, { "epoch": 0.28, "grad_norm": 41.14610290527344, "learning_rate": 4.733333333333333e-07, "logps/chosen": -32.56314468383789, "logps/rejected": -93.21540069580078, "loss": 0.201, "losses/dpo": 0.4042896628379822, "losses/sft": 0.9554309844970703, "losses/total": 0.4042896628379822, "ref_logps/chosen": -12.147167205810547, "ref_logps/rejected": -40.382930755615234, "rewards/accuracies": 0.875, "rewards/chosen": -2.041597604751587, "rewards/margins": 3.241649866104126, "rewards/rejected": -5.283247470855713, "step": 142 }, { "epoch": 0.29, "grad_norm": 51.96454620361328, "learning_rate": 4.7666666666666667e-07, "logps/chosen": -26.63742446899414, "logps/rejected": -133.12020874023438, "loss": 0.145, "losses/dpo": 0.0019743088632822037, "losses/sft": 0.6446032524108887, "losses/total": 0.0019743088632822037, "ref_logps/chosen": -10.237613677978516, "ref_logps/rejected": -61.997535705566406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6399810314178467, "rewards/margins": 5.472287178039551, "rewards/rejected": -7.112268447875977, "step": 143 }, { "epoch": 0.29, "grad_norm": 37.46514129638672, "learning_rate": 4.8e-07, "logps/chosen": -37.32117462158203, "logps/rejected": -96.15669250488281, "loss": 0.1903, "losses/dpo": 0.11820630729198456, "losses/sft": 0.6547752618789673, "losses/total": 0.11820630729198456, "ref_logps/chosen": -16.876934051513672, "ref_logps/rejected": -38.996849060058594, "rewards/accuracies": 0.875, "rewards/chosen": -2.044424057006836, "rewards/margins": 3.6715593338012695, "rewards/rejected": -5.7159833908081055, "step": 144 }, { "epoch": 0.29, "grad_norm": 51.414974212646484, "learning_rate": 4.833333333333333e-07, "logps/chosen": -33.48571014404297, "logps/rejected": -100.97714233398438, "loss": 0.2538, "losses/dpo": 0.30546796321868896, "losses/sft": 0.6166024804115295, "losses/total": 0.30546796321868896, "ref_logps/chosen": -14.59931755065918, "ref_logps/rejected": -47.38710021972656, "rewards/accuracies": 0.875, "rewards/chosen": -1.888639211654663, "rewards/margins": 3.470364570617676, "rewards/rejected": -5.359004020690918, "step": 145 }, { "epoch": 0.29, "grad_norm": 45.47442626953125, "learning_rate": 4.866666666666666e-07, "logps/chosen": -40.079994201660156, "logps/rejected": -128.56350708007812, "loss": 0.1701, "losses/dpo": 0.022961853072047234, "losses/sft": 0.595543384552002, "losses/total": 0.022961853072047234, "ref_logps/chosen": -18.27761459350586, "ref_logps/rejected": -60.02783966064453, "rewards/accuracies": 0.875, "rewards/chosen": -2.1802380084991455, "rewards/margins": 4.673327445983887, "rewards/rejected": -6.853565216064453, "step": 146 }, { "epoch": 0.29, "grad_norm": 26.409053802490234, "learning_rate": 4.9e-07, "logps/chosen": -33.39720153808594, "logps/rejected": -109.74790954589844, "loss": 0.0791, "losses/dpo": 0.15374058485031128, "losses/sft": 0.4255408048629761, "losses/total": 0.15374058485031128, "ref_logps/chosen": -20.84080696105957, "ref_logps/rejected": -52.04368591308594, "rewards/accuracies": 1.0, "rewards/chosen": -1.2556393146514893, "rewards/margins": 4.5147833824157715, "rewards/rejected": -5.770422458648682, "step": 147 }, { "epoch": 0.3, "grad_norm": 40.29155349731445, "learning_rate": 4.933333333333333e-07, "logps/chosen": -25.50351333618164, "logps/rejected": -97.32110595703125, "loss": 0.1404, "losses/dpo": 0.05997714027762413, "losses/sft": 0.4540533721446991, "losses/total": 0.05997714027762413, "ref_logps/chosen": -11.617637634277344, "ref_logps/rejected": -42.73888397216797, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3885875940322876, "rewards/margins": 4.069634914398193, "rewards/rejected": -5.45822286605835, "step": 148 }, { "epoch": 0.3, "grad_norm": 22.356138229370117, "learning_rate": 4.966666666666666e-07, "logps/chosen": -36.5714111328125, "logps/rejected": -119.6854248046875, "loss": 0.0701, "losses/dpo": 0.095610611140728, "losses/sft": 0.5715881586074829, "losses/total": 0.095610611140728, "ref_logps/chosen": -15.912175178527832, "ref_logps/rejected": -53.39912414550781, "rewards/accuracies": 1.0, "rewards/chosen": -2.0659236907958984, "rewards/margins": 4.562705993652344, "rewards/rejected": -6.628629684448242, "step": 149 }, { "epoch": 0.3, "grad_norm": 52.81399917602539, "learning_rate": 5e-07, "logps/chosen": -30.146623611450195, "logps/rejected": -95.46763610839844, "loss": 0.1872, "losses/dpo": 0.35411715507507324, "losses/sft": 0.7511922121047974, "losses/total": 0.35411715507507324, "ref_logps/chosen": -11.182028770446777, "ref_logps/rejected": -40.55536651611328, "rewards/accuracies": 0.875, "rewards/chosen": -1.8964595794677734, "rewards/margins": 3.5947678089141846, "rewards/rejected": -5.491227149963379, "step": 150 }, { "epoch": 0.3, "grad_norm": 51.27703857421875, "learning_rate": 4.996296296296296e-07, "logps/chosen": -26.03563690185547, "logps/rejected": -85.95819091796875, "loss": 0.1315, "losses/dpo": 0.04389333724975586, "losses/sft": 0.5486029386520386, "losses/total": 0.04389333724975586, "ref_logps/chosen": -10.959346771240234, "ref_logps/rejected": -36.31953430175781, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5076290369033813, "rewards/margins": 3.456235647201538, "rewards/rejected": -4.963865280151367, "step": 151 }, { "epoch": 0.3, "grad_norm": 28.76018714904785, "learning_rate": 4.992592592592593e-07, "logps/chosen": -28.23834991455078, "logps/rejected": -113.56182098388672, "loss": 0.1036, "losses/dpo": 0.011073265224695206, "losses/sft": 0.3654525876045227, "losses/total": 0.011073265224695206, "ref_logps/chosen": -16.015453338623047, "ref_logps/rejected": -50.925968170166016, "rewards/accuracies": 0.9375, "rewards/chosen": -1.222289800643921, "rewards/margins": 5.041295528411865, "rewards/rejected": -6.263585567474365, "step": 152 }, { "epoch": 0.31, "grad_norm": 25.341352462768555, "learning_rate": 4.988888888888889e-07, "logps/chosen": -22.11452865600586, "logps/rejected": -83.98039245605469, "loss": 0.1078, "losses/dpo": 0.18434298038482666, "losses/sft": 0.3981594741344452, "losses/total": 0.18434298038482666, "ref_logps/chosen": -12.649343490600586, "ref_logps/rejected": -36.79124450683594, "rewards/accuracies": 1.0, "rewards/chosen": -0.9465184211730957, "rewards/margins": 3.7723965644836426, "rewards/rejected": -4.718914985656738, "step": 153 }, { "epoch": 0.31, "grad_norm": 48.10272216796875, "learning_rate": 4.985185185185185e-07, "logps/chosen": -24.186851501464844, "logps/rejected": -97.55790710449219, "loss": 0.1649, "losses/dpo": 0.20018045604228973, "losses/sft": 0.4684436321258545, "losses/total": 0.20018045604228973, "ref_logps/chosen": -13.064205169677734, "ref_logps/rejected": -51.07586669921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1122647523880005, "rewards/margins": 3.5359394550323486, "rewards/rejected": -4.6482038497924805, "step": 154 }, { "epoch": 0.31, "grad_norm": 40.72660827636719, "learning_rate": 4.981481481481482e-07, "logps/chosen": -25.070098876953125, "logps/rejected": -96.81678009033203, "loss": 0.1761, "losses/dpo": 0.32067233324050903, "losses/sft": 0.47111189365386963, "losses/total": 0.32067233324050903, "ref_logps/chosen": -13.66195011138916, "ref_logps/rejected": -52.27348327636719, "rewards/accuracies": 0.9375, "rewards/chosen": -1.140815019607544, "rewards/margins": 3.3135147094726562, "rewards/rejected": -4.454329967498779, "step": 155 }, { "epoch": 0.31, "grad_norm": 60.72633743286133, "learning_rate": 4.977777777777777e-07, "logps/chosen": -25.260963439941406, "logps/rejected": -120.26380920410156, "loss": 0.1332, "losses/dpo": 0.0010022318456321955, "losses/sft": 0.4159192442893982, "losses/total": 0.0010022318456321955, "ref_logps/chosen": -11.787809371948242, "ref_logps/rejected": -55.686058044433594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3473155498504639, "rewards/margins": 5.11046028137207, "rewards/rejected": -6.457776069641113, "step": 156 }, { "epoch": 0.31, "grad_norm": 13.246103286743164, "learning_rate": 4.974074074074074e-07, "logps/chosen": -32.20502853393555, "logps/rejected": -99.13482666015625, "loss": 0.043, "losses/dpo": 0.06591884046792984, "losses/sft": 0.3959016799926758, "losses/total": 0.06591884046792984, "ref_logps/chosen": -18.83717155456543, "ref_logps/rejected": -42.83684539794922, "rewards/accuracies": 1.0, "rewards/chosen": -1.3367857933044434, "rewards/margins": 4.293012619018555, "rewards/rejected": -5.629798412322998, "step": 157 }, { "epoch": 0.32, "grad_norm": 51.342262268066406, "learning_rate": 4.97037037037037e-07, "logps/chosen": -34.90985107421875, "logps/rejected": -100.9737548828125, "loss": 0.1856, "losses/dpo": 0.3706285357475281, "losses/sft": 0.6656729578971863, "losses/total": 0.3706285357475281, "ref_logps/chosen": -14.48876953125, "ref_logps/rejected": -45.89849853515625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0421080589294434, "rewards/margins": 3.4654178619384766, "rewards/rejected": -5.50752592086792, "step": 158 }, { "epoch": 0.32, "grad_norm": 27.331863403320312, "learning_rate": 4.966666666666666e-07, "logps/chosen": -26.92082405090332, "logps/rejected": -89.95733642578125, "loss": 0.1194, "losses/dpo": 0.01959105022251606, "losses/sft": 0.614719033241272, "losses/total": 0.01959105022251606, "ref_logps/chosen": -11.247827529907227, "ref_logps/rejected": -35.469085693359375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5672996044158936, "rewards/margins": 3.8815250396728516, "rewards/rejected": -5.448824882507324, "step": 159 }, { "epoch": 0.32, "grad_norm": 29.60675048828125, "learning_rate": 4.962962962962963e-07, "logps/chosen": -28.120311737060547, "logps/rejected": -109.587646484375, "loss": 0.0785, "losses/dpo": 0.07884176820516586, "losses/sft": 0.4254249334335327, "losses/total": 0.07884176820516586, "ref_logps/chosen": -12.621431350708008, "ref_logps/rejected": -48.657039642333984, "rewards/accuracies": 1.0, "rewards/chosen": -1.549888014793396, "rewards/margins": 4.543172836303711, "rewards/rejected": -6.093061447143555, "step": 160 }, { "epoch": 0.32, "grad_norm": 38.52204513549805, "learning_rate": 4.959259259259259e-07, "logps/chosen": -34.612762451171875, "logps/rejected": -106.73829650878906, "loss": 0.096, "losses/dpo": 0.0721823126077652, "losses/sft": 0.6061477065086365, "losses/total": 0.0721823126077652, "ref_logps/chosen": -17.091495513916016, "ref_logps/rejected": -50.930519104003906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.752126693725586, "rewards/margins": 3.828650712966919, "rewards/rejected": -5.580777645111084, "step": 161 }, { "epoch": 0.32, "grad_norm": 17.7041072845459, "learning_rate": 4.955555555555556e-07, "logps/chosen": -36.67155075073242, "logps/rejected": -109.15782165527344, "loss": 0.0632, "losses/dpo": 0.11136096715927124, "losses/sft": 0.587780773639679, "losses/total": 0.11136096715927124, "ref_logps/chosen": -20.507972717285156, "ref_logps/rejected": -48.61429977416992, "rewards/accuracies": 1.0, "rewards/chosen": -1.616357684135437, "rewards/margins": 4.437994003295898, "rewards/rejected": -6.054351806640625, "step": 162 }, { "epoch": 0.33, "grad_norm": 21.67244529724121, "learning_rate": 4.951851851851851e-07, "logps/chosen": -29.562240600585938, "logps/rejected": -101.33702087402344, "loss": 0.1004, "losses/dpo": 0.010235275141894817, "losses/sft": 0.5660809278488159, "losses/total": 0.010235275141894817, "ref_logps/chosen": -13.207432746887207, "ref_logps/rejected": -45.143585205078125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6354806423187256, "rewards/margins": 3.983863115310669, "rewards/rejected": -5.619344234466553, "step": 163 }, { "epoch": 0.33, "grad_norm": 28.07857322692871, "learning_rate": 4.948148148148148e-07, "logps/chosen": -30.570457458496094, "logps/rejected": -112.83328247070312, "loss": 0.0717, "losses/dpo": 0.06467992067337036, "losses/sft": 0.45188504457473755, "losses/total": 0.06467992067337036, "ref_logps/chosen": -15.896978378295898, "ref_logps/rejected": -52.206260681152344, "rewards/accuracies": 1.0, "rewards/chosen": -1.4673478603363037, "rewards/margins": 4.595355033874512, "rewards/rejected": -6.0627031326293945, "step": 164 }, { "epoch": 0.33, "grad_norm": 26.712987899780273, "learning_rate": 4.944444444444445e-07, "logps/chosen": -23.911460876464844, "logps/rejected": -107.57379150390625, "loss": 0.1045, "losses/dpo": 0.15704016387462616, "losses/sft": 0.585786759853363, "losses/total": 0.15704016387462616, "ref_logps/chosen": -13.260459899902344, "ref_logps/rejected": -53.76809310913086, "rewards/accuracies": 1.0, "rewards/chosen": -1.0651001930236816, "rewards/margins": 4.315468788146973, "rewards/rejected": -5.3805694580078125, "step": 165 }, { "epoch": 0.33, "grad_norm": 16.83503532409668, "learning_rate": 4.94074074074074e-07, "logps/chosen": -40.63855743408203, "logps/rejected": -105.12744903564453, "loss": 0.0802, "losses/dpo": 0.1741233468055725, "losses/sft": 0.2261582911014557, "losses/total": 0.1741233468055725, "ref_logps/chosen": -24.93436050415039, "ref_logps/rejected": -50.1585578918457, "rewards/accuracies": 1.0, "rewards/chosen": -1.5704195499420166, "rewards/margins": 3.926469564437866, "rewards/rejected": -5.496889114379883, "step": 166 }, { "epoch": 0.33, "grad_norm": 43.83147430419922, "learning_rate": 4.937037037037037e-07, "logps/chosen": -30.703670501708984, "logps/rejected": -97.72269439697266, "loss": 0.1269, "losses/dpo": 0.1549128293991089, "losses/sft": 0.6281462907791138, "losses/total": 0.1549128293991089, "ref_logps/chosen": -13.337878227233887, "ref_logps/rejected": -40.41243362426758, "rewards/accuracies": 0.9375, "rewards/chosen": -1.736579179763794, "rewards/margins": 3.9944469928741455, "rewards/rejected": -5.731026649475098, "step": 167 }, { "epoch": 0.34, "grad_norm": 28.505626678466797, "learning_rate": 4.933333333333333e-07, "logps/chosen": -33.72929382324219, "logps/rejected": -133.2057647705078, "loss": 0.1079, "losses/dpo": 0.04674118757247925, "losses/sft": 0.49377191066741943, "losses/total": 0.04674118757247925, "ref_logps/chosen": -15.563924789428711, "ref_logps/rejected": -61.7348518371582, "rewards/accuracies": 1.0, "rewards/chosen": -1.8165371417999268, "rewards/margins": 5.330554962158203, "rewards/rejected": -7.147091865539551, "step": 168 }, { "epoch": 0.34, "grad_norm": 40.403419494628906, "learning_rate": 4.929629629629629e-07, "logps/chosen": -33.252532958984375, "logps/rejected": -105.06487274169922, "loss": 0.1667, "losses/dpo": 0.2247321456670761, "losses/sft": 0.6353814601898193, "losses/total": 0.2247321456670761, "ref_logps/chosen": -14.608097076416016, "ref_logps/rejected": -41.214820861816406, "rewards/accuracies": 0.875, "rewards/chosen": -1.8644435405731201, "rewards/margins": 4.520562171936035, "rewards/rejected": -6.385005474090576, "step": 169 }, { "epoch": 0.34, "grad_norm": 50.592559814453125, "learning_rate": 4.925925925925926e-07, "logps/chosen": -31.340618133544922, "logps/rejected": -106.16822052001953, "loss": 0.1412, "losses/dpo": 0.23204943537712097, "losses/sft": 0.6205928325653076, "losses/total": 0.23204943537712097, "ref_logps/chosen": -14.412965774536133, "ref_logps/rejected": -41.494407653808594, "rewards/accuracies": 1.0, "rewards/chosen": -1.692765235900879, "rewards/margins": 4.77461576461792, "rewards/rejected": -6.467381477355957, "step": 170 }, { "epoch": 0.34, "grad_norm": 34.329734802246094, "learning_rate": 4.922222222222222e-07, "logps/chosen": -33.6573371887207, "logps/rejected": -77.05763244628906, "loss": 0.1371, "losses/dpo": 0.15634378790855408, "losses/sft": 0.6816786527633667, "losses/total": 0.15634378790855408, "ref_logps/chosen": -15.983509063720703, "ref_logps/rejected": -30.413066864013672, "rewards/accuracies": 1.0, "rewards/chosen": -1.7673827409744263, "rewards/margins": 2.8970742225646973, "rewards/rejected": -4.664457321166992, "step": 171 }, { "epoch": 0.34, "grad_norm": 37.6192741394043, "learning_rate": 4.918518518518519e-07, "logps/chosen": -36.66886520385742, "logps/rejected": -85.15867614746094, "loss": 0.1536, "losses/dpo": 0.21989810466766357, "losses/sft": 0.596770703792572, "losses/total": 0.21989810466766357, "ref_logps/chosen": -20.18588638305664, "ref_logps/rejected": -34.98530197143555, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6482977867126465, "rewards/margins": 3.3690390586853027, "rewards/rejected": -5.017336845397949, "step": 172 }, { "epoch": 0.35, "grad_norm": 25.663639068603516, "learning_rate": 4.914814814814814e-07, "logps/chosen": -35.823875427246094, "logps/rejected": -106.20530700683594, "loss": 0.0575, "losses/dpo": 0.04412658512592316, "losses/sft": 0.3898341655731201, "losses/total": 0.04412658512592316, "ref_logps/chosen": -21.750324249267578, "ref_logps/rejected": -43.821144104003906, "rewards/accuracies": 1.0, "rewards/chosen": -1.4073551893234253, "rewards/margins": 4.831061363220215, "rewards/rejected": -6.23841667175293, "step": 173 }, { "epoch": 0.35, "grad_norm": 35.56593704223633, "learning_rate": 4.91111111111111e-07, "logps/chosen": -32.283058166503906, "logps/rejected": -117.61260986328125, "loss": 0.0963, "losses/dpo": 0.033313900232315063, "losses/sft": 0.6720283031463623, "losses/total": 0.033313900232315063, "ref_logps/chosen": -18.36054039001465, "ref_logps/rejected": -54.971343994140625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.392251968383789, "rewards/margins": 4.8718743324279785, "rewards/rejected": -6.264126300811768, "step": 174 }, { "epoch": 0.35, "grad_norm": 43.761077880859375, "learning_rate": 4.907407407407407e-07, "logps/chosen": -34.093841552734375, "logps/rejected": -100.14768981933594, "loss": 0.1455, "losses/dpo": 0.13595961034297943, "losses/sft": 0.5358244180679321, "losses/total": 0.13595961034297943, "ref_logps/chosen": -16.00359344482422, "ref_logps/rejected": -42.43475341796875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.809024691581726, "rewards/margins": 3.962268829345703, "rewards/rejected": -5.771293640136719, "step": 175 }, { "epoch": 0.35, "grad_norm": 24.053037643432617, "learning_rate": 4.903703703703703e-07, "logps/chosen": -24.640159606933594, "logps/rejected": -116.64846801757812, "loss": 0.0634, "losses/dpo": 0.15330170094966888, "losses/sft": 0.496634840965271, "losses/total": 0.15330170094966888, "ref_logps/chosen": -12.316726684570312, "ref_logps/rejected": -49.52324295043945, "rewards/accuracies": 1.0, "rewards/chosen": -1.2323435544967651, "rewards/margins": 5.480178356170654, "rewards/rejected": -6.712522506713867, "step": 176 }, { "epoch": 0.35, "grad_norm": 18.6986141204834, "learning_rate": 4.9e-07, "logps/chosen": -25.20108413696289, "logps/rejected": -98.72869873046875, "loss": 0.0739, "losses/dpo": 0.09845062345266342, "losses/sft": 0.7194775342941284, "losses/total": 0.09845062345266342, "ref_logps/chosen": -11.484321594238281, "ref_logps/rejected": -40.09552764892578, "rewards/accuracies": 1.0, "rewards/chosen": -1.3716762065887451, "rewards/margins": 4.491641044616699, "rewards/rejected": -5.863317489624023, "step": 177 }, { "epoch": 0.36, "grad_norm": 25.084514617919922, "learning_rate": 4.896296296296296e-07, "logps/chosen": -28.90692138671875, "logps/rejected": -109.8388671875, "loss": 0.1246, "losses/dpo": 0.1252589076757431, "losses/sft": 0.48763686418533325, "losses/total": 0.1252589076757431, "ref_logps/chosen": -13.82424545288086, "ref_logps/rejected": -44.95261001586914, "rewards/accuracies": 1.0, "rewards/chosen": -1.5082676410675049, "rewards/margins": 4.980358123779297, "rewards/rejected": -6.488625526428223, "step": 178 }, { "epoch": 0.36, "grad_norm": 25.185590744018555, "learning_rate": 4.892592592592592e-07, "logps/chosen": -35.07394027709961, "logps/rejected": -107.2217025756836, "loss": 0.0935, "losses/dpo": 0.06011039763689041, "losses/sft": 0.6677021980285645, "losses/total": 0.06011039763689041, "ref_logps/chosen": -15.76345443725586, "ref_logps/rejected": -44.287784576416016, "rewards/accuracies": 1.0, "rewards/chosen": -1.9310487508773804, "rewards/margins": 4.362342834472656, "rewards/rejected": -6.293392181396484, "step": 179 }, { "epoch": 0.36, "grad_norm": 33.190589904785156, "learning_rate": 4.888888888888889e-07, "logps/chosen": -27.996540069580078, "logps/rejected": -84.33702087402344, "loss": 0.151, "losses/dpo": 0.2758825123310089, "losses/sft": 0.4535401463508606, "losses/total": 0.2758825123310089, "ref_logps/chosen": -13.512284278869629, "ref_logps/rejected": -37.30048751831055, "rewards/accuracies": 1.0, "rewards/chosen": -1.4484257698059082, "rewards/margins": 3.25522780418396, "rewards/rejected": -4.703653335571289, "step": 180 }, { "epoch": 0.36, "grad_norm": 52.29859924316406, "learning_rate": 4.885185185185185e-07, "logps/chosen": -33.77317810058594, "logps/rejected": -103.15384674072266, "loss": 0.1567, "losses/dpo": 0.11710235476493835, "losses/sft": 0.45147156715393066, "losses/total": 0.11710235476493835, "ref_logps/chosen": -20.137229919433594, "ref_logps/rejected": -41.28327178955078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3635950088500977, "rewards/margins": 4.823462963104248, "rewards/rejected": -6.187057971954346, "step": 181 }, { "epoch": 0.36, "grad_norm": 70.58116149902344, "learning_rate": 4.881481481481482e-07, "logps/chosen": -38.54304504394531, "logps/rejected": -84.55474090576172, "loss": 0.2789, "losses/dpo": 0.24431732296943665, "losses/sft": 0.6288025975227356, "losses/total": 0.24431732296943665, "ref_logps/chosen": -17.36640167236328, "ref_logps/rejected": -35.1982536315918, "rewards/accuracies": 0.875, "rewards/chosen": -2.1176645755767822, "rewards/margins": 2.8179845809936523, "rewards/rejected": -4.935649394989014, "step": 182 }, { "epoch": 0.37, "grad_norm": 37.199989318847656, "learning_rate": 4.877777777777777e-07, "logps/chosen": -31.085803985595703, "logps/rejected": -92.9924545288086, "loss": 0.171, "losses/dpo": 0.18612107634544373, "losses/sft": 0.5101003646850586, "losses/total": 0.18612107634544373, "ref_logps/chosen": -13.339794158935547, "ref_logps/rejected": -39.03697204589844, "rewards/accuracies": 1.0, "rewards/chosen": -1.7746011018753052, "rewards/margins": 3.6209468841552734, "rewards/rejected": -5.395547866821289, "step": 183 }, { "epoch": 0.37, "grad_norm": 31.927581787109375, "learning_rate": 4.874074074074073e-07, "logps/chosen": -26.326370239257812, "logps/rejected": -101.94358825683594, "loss": 0.1419, "losses/dpo": 0.18461284041404724, "losses/sft": 0.7296566367149353, "losses/total": 0.18461284041404724, "ref_logps/chosen": -12.85525131225586, "ref_logps/rejected": -45.30030822753906, "rewards/accuracies": 1.0, "rewards/chosen": -1.347111701965332, "rewards/margins": 4.317215919494629, "rewards/rejected": -5.664327621459961, "step": 184 }, { "epoch": 0.37, "grad_norm": 37.610389709472656, "learning_rate": 4.87037037037037e-07, "logps/chosen": -25.972145080566406, "logps/rejected": -91.00416564941406, "loss": 0.1407, "losses/dpo": 0.19254301488399506, "losses/sft": 0.6111183166503906, "losses/total": 0.19254301488399506, "ref_logps/chosen": -13.008270263671875, "ref_logps/rejected": -42.0411491394043, "rewards/accuracies": 1.0, "rewards/chosen": -1.2963876724243164, "rewards/margins": 3.599914073944092, "rewards/rejected": -4.896301746368408, "step": 185 }, { "epoch": 0.37, "grad_norm": 46.32947540283203, "learning_rate": 4.866666666666666e-07, "logps/chosen": -30.488025665283203, "logps/rejected": -84.3827896118164, "loss": 0.2015, "losses/dpo": 0.2830401360988617, "losses/sft": 0.6334186792373657, "losses/total": 0.2830401360988617, "ref_logps/chosen": -13.258672714233398, "ref_logps/rejected": -34.712615966796875, "rewards/accuracies": 0.875, "rewards/chosen": -1.722935438156128, "rewards/margins": 3.244082450866699, "rewards/rejected": -4.967017650604248, "step": 186 }, { "epoch": 0.37, "grad_norm": 43.799713134765625, "learning_rate": 4.862962962962963e-07, "logps/chosen": -38.89411163330078, "logps/rejected": -103.1797103881836, "loss": 0.1821, "losses/dpo": 0.47093939781188965, "losses/sft": 0.7258151769638062, "losses/total": 0.47093939781188965, "ref_logps/chosen": -16.819536209106445, "ref_logps/rejected": -42.68255615234375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2074575424194336, "rewards/margins": 3.8422579765319824, "rewards/rejected": -6.049715518951416, "step": 187 }, { "epoch": 0.38, "grad_norm": 50.88268280029297, "learning_rate": 4.859259259259259e-07, "logps/chosen": -35.74350357055664, "logps/rejected": -76.9640121459961, "loss": 0.2391, "losses/dpo": 0.25321850180625916, "losses/sft": 0.5408048033714294, "losses/total": 0.25321850180625916, "ref_logps/chosen": -17.249225616455078, "ref_logps/rejected": -31.546180725097656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8494281768798828, "rewards/margins": 2.6923556327819824, "rewards/rejected": -4.541783809661865, "step": 188 }, { "epoch": 0.38, "grad_norm": 30.711532592773438, "learning_rate": 4.855555555555556e-07, "logps/chosen": -34.859683990478516, "logps/rejected": -98.89837646484375, "loss": 0.1398, "losses/dpo": 0.24823901057243347, "losses/sft": 0.6124401688575745, "losses/total": 0.24823901057243347, "ref_logps/chosen": -14.467687606811523, "ref_logps/rejected": -38.222923278808594, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0391998291015625, "rewards/margins": 4.028346061706543, "rewards/rejected": -6.0675458908081055, "step": 189 }, { "epoch": 0.38, "grad_norm": 62.155982971191406, "learning_rate": 4.851851851851852e-07, "logps/chosen": -35.63481140136719, "logps/rejected": -111.28214263916016, "loss": 0.2023, "losses/dpo": 0.2405925989151001, "losses/sft": 0.816167950630188, "losses/total": 0.2405925989151001, "ref_logps/chosen": -13.084310531616211, "ref_logps/rejected": -44.44044494628906, "rewards/accuracies": 0.875, "rewards/chosen": -2.255049705505371, "rewards/margins": 4.4291205406188965, "rewards/rejected": -6.684170722961426, "step": 190 }, { "epoch": 0.38, "grad_norm": 20.9333553314209, "learning_rate": 4.848148148148148e-07, "logps/chosen": -28.955718994140625, "logps/rejected": -106.60125732421875, "loss": 0.1037, "losses/dpo": 0.17893077433109283, "losses/sft": 0.5490385890007019, "losses/total": 0.17893077433109283, "ref_logps/chosen": -14.494391441345215, "ref_logps/rejected": -42.58512878417969, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4461328983306885, "rewards/margins": 4.955480098724365, "rewards/rejected": -6.401612758636475, "step": 191 }, { "epoch": 0.38, "grad_norm": 37.37415313720703, "learning_rate": 4.844444444444445e-07, "logps/chosen": -33.32417297363281, "logps/rejected": -103.1170654296875, "loss": 0.2031, "losses/dpo": 0.3571177124977112, "losses/sft": 0.5955436825752258, "losses/total": 0.3571177124977112, "ref_logps/chosen": -17.138912200927734, "ref_logps/rejected": -44.96180725097656, "rewards/accuracies": 0.875, "rewards/chosen": -1.6185261011123657, "rewards/margins": 4.1969990730285645, "rewards/rejected": -5.815525531768799, "step": 192 }, { "epoch": 0.39, "grad_norm": 17.647977828979492, "learning_rate": 4.840740740740741e-07, "logps/chosen": -27.43695068359375, "logps/rejected": -98.47382354736328, "loss": 0.0697, "losses/dpo": 0.06315574049949646, "losses/sft": 0.4831572473049164, "losses/total": 0.06315574049949646, "ref_logps/chosen": -13.393509864807129, "ref_logps/rejected": -38.59257888793945, "rewards/accuracies": 1.0, "rewards/chosen": -1.404344081878662, "rewards/margins": 4.583780288696289, "rewards/rejected": -5.988123893737793, "step": 193 }, { "epoch": 0.39, "grad_norm": 46.62928771972656, "learning_rate": 4.837037037037037e-07, "logps/chosen": -29.626968383789062, "logps/rejected": -91.1700439453125, "loss": 0.2158, "losses/dpo": 0.49264955520629883, "losses/sft": 0.6701329350471497, "losses/total": 0.49264955520629883, "ref_logps/chosen": -11.046613693237305, "ref_logps/rejected": -39.36475372314453, "rewards/accuracies": 0.875, "rewards/chosen": -1.8580358028411865, "rewards/margins": 3.322493076324463, "rewards/rejected": -5.18052864074707, "step": 194 }, { "epoch": 0.39, "grad_norm": 18.034120559692383, "learning_rate": 4.833333333333333e-07, "logps/chosen": -29.973468780517578, "logps/rejected": -105.07539367675781, "loss": 0.0674, "losses/dpo": 0.08304838836193085, "losses/sft": 0.6566027402877808, "losses/total": 0.08304838836193085, "ref_logps/chosen": -14.975432395935059, "ref_logps/rejected": -43.71247863769531, "rewards/accuracies": 1.0, "rewards/chosen": -1.4998037815093994, "rewards/margins": 4.6364874839782715, "rewards/rejected": -6.13629150390625, "step": 195 }, { "epoch": 0.39, "grad_norm": 54.77147674560547, "learning_rate": 4.829629629629629e-07, "logps/chosen": -32.78771209716797, "logps/rejected": -109.28285217285156, "loss": 0.2141, "losses/dpo": 0.3265150189399719, "losses/sft": 0.8198443651199341, "losses/total": 0.3265150189399719, "ref_logps/chosen": -11.376585006713867, "ref_logps/rejected": -46.43901062011719, "rewards/accuracies": 0.875, "rewards/chosen": -2.141112804412842, "rewards/margins": 4.143270492553711, "rewards/rejected": -6.284383773803711, "step": 196 }, { "epoch": 0.39, "grad_norm": 48.40336227416992, "learning_rate": 4.825925925925926e-07, "logps/chosen": -29.586572647094727, "logps/rejected": -100.91725158691406, "loss": 0.1325, "losses/dpo": 0.1769840270280838, "losses/sft": 0.528570294380188, "losses/total": 0.1769840270280838, "ref_logps/chosen": -10.439521789550781, "ref_logps/rejected": -40.107704162597656, "rewards/accuracies": 1.0, "rewards/chosen": -1.9147053956985474, "rewards/margins": 4.166248798370361, "rewards/rejected": -6.080954551696777, "step": 197 }, { "epoch": 0.4, "grad_norm": 19.345624923706055, "learning_rate": 4.822222222222222e-07, "logps/chosen": -33.93301773071289, "logps/rejected": -112.19253540039062, "loss": 0.0755, "losses/dpo": 0.2622387409210205, "losses/sft": 0.5382803678512573, "losses/total": 0.2622387409210205, "ref_logps/chosen": -15.570449829101562, "ref_logps/rejected": -42.25779724121094, "rewards/accuracies": 1.0, "rewards/chosen": -1.8362568616867065, "rewards/margins": 5.157217025756836, "rewards/rejected": -6.993474006652832, "step": 198 }, { "epoch": 0.4, "grad_norm": 8.654232025146484, "learning_rate": 4.818518518518519e-07, "logps/chosen": -32.183082580566406, "logps/rejected": -139.59701538085938, "loss": 0.0178, "losses/dpo": 0.0051182028837502, "losses/sft": 0.29134008288383484, "losses/total": 0.0051182028837502, "ref_logps/chosen": -19.9312686920166, "ref_logps/rejected": -55.7081298828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.225181221961975, "rewards/margins": 7.163707733154297, "rewards/rejected": -8.38888931274414, "step": 199 }, { "epoch": 0.4, "grad_norm": 19.976350784301758, "learning_rate": 4.814814814814814e-07, "logps/chosen": -30.71693229675293, "logps/rejected": -122.64790344238281, "loss": 0.0996, "losses/dpo": 0.04794596508145332, "losses/sft": 0.655545175075531, "losses/total": 0.04794596508145332, "ref_logps/chosen": -16.344539642333984, "ref_logps/rejected": -52.21672058105469, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4372395277023315, "rewards/margins": 5.605879783630371, "rewards/rejected": -7.04311990737915, "step": 200 }, { "epoch": 0.4, "grad_norm": 9.067500114440918, "learning_rate": 4.81111111111111e-07, "logps/chosen": -24.73886489868164, "logps/rejected": -113.34480285644531, "loss": 0.07, "losses/dpo": 0.2013578712940216, "losses/sft": 0.6681497693061829, "losses/total": 0.2013578712940216, "ref_logps/chosen": -10.942086219787598, "ref_logps/rejected": -45.542938232421875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3796777725219727, "rewards/margins": 5.400509834289551, "rewards/rejected": -6.780187606811523, "step": 201 }, { "epoch": 0.4, "grad_norm": 20.155838012695312, "learning_rate": 4.807407407407407e-07, "logps/chosen": -34.64628219604492, "logps/rejected": -109.43836212158203, "loss": 0.0937, "losses/dpo": 0.26625820994377136, "losses/sft": 0.5927165150642395, "losses/total": 0.26625820994377136, "ref_logps/chosen": -14.607067108154297, "ref_logps/rejected": -45.044769287109375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0039215087890625, "rewards/margins": 4.4354376792907715, "rewards/rejected": -6.439358711242676, "step": 202 }, { "epoch": 0.41, "grad_norm": 45.13740158081055, "learning_rate": 4.803703703703704e-07, "logps/chosen": -34.342552185058594, "logps/rejected": -115.29844665527344, "loss": 0.16, "losses/dpo": 0.19734561443328857, "losses/sft": 0.9053852558135986, "losses/total": 0.19734561443328857, "ref_logps/chosen": -12.908370018005371, "ref_logps/rejected": -47.28173065185547, "rewards/accuracies": 0.9375, "rewards/chosen": -2.143418073654175, "rewards/margins": 4.6582536697387695, "rewards/rejected": -6.801671981811523, "step": 203 }, { "epoch": 0.41, "grad_norm": 47.24686813354492, "learning_rate": 4.8e-07, "logps/chosen": -37.38880157470703, "logps/rejected": -108.8377456665039, "loss": 0.129, "losses/dpo": 0.08070208877325058, "losses/sft": 0.9097791314125061, "losses/total": 0.08070208877325058, "ref_logps/chosen": -15.263628959655762, "ref_logps/rejected": -41.726402282714844, "rewards/accuracies": 0.9375, "rewards/chosen": -2.212517261505127, "rewards/margins": 4.498617172241211, "rewards/rejected": -6.71113395690918, "step": 204 }, { "epoch": 0.41, "grad_norm": 8.513498306274414, "learning_rate": 4.796296296296296e-07, "logps/chosen": -34.856285095214844, "logps/rejected": -127.9891357421875, "loss": 0.0201, "losses/dpo": 0.0579579658806324, "losses/sft": 0.6573350429534912, "losses/total": 0.0579579658806324, "ref_logps/chosen": -12.282885551452637, "ref_logps/rejected": -46.556800842285156, "rewards/accuracies": 1.0, "rewards/chosen": -2.2573397159576416, "rewards/margins": 5.885893821716309, "rewards/rejected": -8.143234252929688, "step": 205 }, { "epoch": 0.41, "grad_norm": 35.727291107177734, "learning_rate": 4.792592592592592e-07, "logps/chosen": -36.47418975830078, "logps/rejected": -120.59506225585938, "loss": 0.106, "losses/dpo": 0.17767032980918884, "losses/sft": 0.5659396648406982, "losses/total": 0.17767032980918884, "ref_logps/chosen": -14.854631423950195, "ref_logps/rejected": -54.54015350341797, "rewards/accuracies": 1.0, "rewards/chosen": -2.1619560718536377, "rewards/margins": 4.443534851074219, "rewards/rejected": -6.605490684509277, "step": 206 }, { "epoch": 0.41, "grad_norm": 53.7916374206543, "learning_rate": 4.788888888888889e-07, "logps/chosen": -31.674482345581055, "logps/rejected": -111.17448425292969, "loss": 0.1663, "losses/dpo": 0.4532434344291687, "losses/sft": 0.7202601432800293, "losses/total": 0.4532434344291687, "ref_logps/chosen": -13.298295021057129, "ref_logps/rejected": -40.8799934387207, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8376187086105347, "rewards/margins": 5.191830635070801, "rewards/rejected": -7.029449462890625, "step": 207 }, { "epoch": 0.42, "grad_norm": 40.02408981323242, "learning_rate": 4.785185185185185e-07, "logps/chosen": -34.69915008544922, "logps/rejected": -99.4919204711914, "loss": 0.2046, "losses/dpo": 0.6538295745849609, "losses/sft": 0.660841703414917, "losses/total": 0.6538295745849609, "ref_logps/chosen": -14.765646934509277, "ref_logps/rejected": -39.34600830078125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9933503866195679, "rewards/margins": 4.021241188049316, "rewards/rejected": -6.014591693878174, "step": 208 }, { "epoch": 0.42, "grad_norm": 61.37720489501953, "learning_rate": 4.781481481481482e-07, "logps/chosen": -47.08232879638672, "logps/rejected": -118.26411437988281, "loss": 0.1965, "losses/dpo": 0.3133441209793091, "losses/sft": 0.6915695667266846, "losses/total": 0.3133441209793091, "ref_logps/chosen": -16.830141067504883, "ref_logps/rejected": -45.98204040527344, "rewards/accuracies": 0.875, "rewards/chosen": -3.025218963623047, "rewards/margins": 4.202988624572754, "rewards/rejected": -7.228207588195801, "step": 209 }, { "epoch": 0.42, "grad_norm": 31.95800018310547, "learning_rate": 4.777777777777778e-07, "logps/chosen": -36.95540237426758, "logps/rejected": -123.79988098144531, "loss": 0.0974, "losses/dpo": 0.09307215362787247, "losses/sft": 0.7482750415802002, "losses/total": 0.09307215362787247, "ref_logps/chosen": -14.497810363769531, "ref_logps/rejected": -52.44733428955078, "rewards/accuracies": 1.0, "rewards/chosen": -2.2457590103149414, "rewards/margins": 4.889495849609375, "rewards/rejected": -7.135254859924316, "step": 210 }, { "epoch": 0.42, "grad_norm": 56.4405632019043, "learning_rate": 4.774074074074073e-07, "logps/chosen": -26.164018630981445, "logps/rejected": -125.44146728515625, "loss": 0.2563, "losses/dpo": 0.01354515552520752, "losses/sft": 0.4640093445777893, "losses/total": 0.01354515552520752, "ref_logps/chosen": -10.535161018371582, "ref_logps/rejected": -49.800575256347656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5628857612609863, "rewards/margins": 6.001203536987305, "rewards/rejected": -7.564088821411133, "step": 211 }, { "epoch": 0.42, "grad_norm": 36.40195846557617, "learning_rate": 4.77037037037037e-07, "logps/chosen": -29.295894622802734, "logps/rejected": -79.89122009277344, "loss": 0.1645, "losses/dpo": 0.19777898490428925, "losses/sft": 0.5077103972434998, "losses/total": 0.19777898490428925, "ref_logps/chosen": -11.7691068649292, "ref_logps/rejected": -33.79277038574219, "rewards/accuracies": 0.9375, "rewards/chosen": -1.752678632736206, "rewards/margins": 2.8571667671203613, "rewards/rejected": -4.609845161437988, "step": 212 }, { "epoch": 0.43, "grad_norm": 70.0155029296875, "learning_rate": 4.7666666666666667e-07, "logps/chosen": -30.167865753173828, "logps/rejected": -88.53804016113281, "loss": 0.2756, "losses/dpo": 0.18790437281131744, "losses/sft": 0.6706699132919312, "losses/total": 0.18790437281131744, "ref_logps/chosen": -11.840088844299316, "ref_logps/rejected": -32.273094177246094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.832777500152588, "rewards/margins": 3.7937171459198, "rewards/rejected": -5.626494884490967, "step": 213 }, { "epoch": 0.43, "grad_norm": 43.36481857299805, "learning_rate": 4.7629629629629626e-07, "logps/chosen": -34.01522445678711, "logps/rejected": -145.6756591796875, "loss": 0.1293, "losses/dpo": 0.005147114396095276, "losses/sft": 0.7248036861419678, "losses/total": 0.005147114396095276, "ref_logps/chosen": -14.497140884399414, "ref_logps/rejected": -54.134281158447266, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9518084526062012, "rewards/margins": 7.202329635620117, "rewards/rejected": -9.154138565063477, "step": 214 }, { "epoch": 0.43, "grad_norm": 15.9217529296875, "learning_rate": 4.759259259259259e-07, "logps/chosen": -39.08689880371094, "logps/rejected": -118.49710845947266, "loss": 0.0408, "losses/dpo": 0.05637574940919876, "losses/sft": 0.6518598794937134, "losses/total": 0.05637574940919876, "ref_logps/chosen": -16.12924575805664, "ref_logps/rejected": -45.19512939453125, "rewards/accuracies": 1.0, "rewards/chosen": -2.295764923095703, "rewards/margins": 5.034433841705322, "rewards/rejected": -7.330198287963867, "step": 215 }, { "epoch": 0.43, "grad_norm": 28.412574768066406, "learning_rate": 4.7555555555555554e-07, "logps/chosen": -40.055641174316406, "logps/rejected": -119.33856201171875, "loss": 0.066, "losses/dpo": 0.16118040680885315, "losses/sft": 0.7415938377380371, "losses/total": 0.16118040680885315, "ref_logps/chosen": -18.70709991455078, "ref_logps/rejected": -47.00398635864258, "rewards/accuracies": 1.0, "rewards/chosen": -2.1348538398742676, "rewards/margins": 5.098602771759033, "rewards/rejected": -7.233456611633301, "step": 216 }, { "epoch": 0.43, "grad_norm": 30.004011154174805, "learning_rate": 4.751851851851852e-07, "logps/chosen": -31.2343692779541, "logps/rejected": -152.04202270507812, "loss": 0.0871, "losses/dpo": 0.2913620173931122, "losses/sft": 0.5769238471984863, "losses/total": 0.2913620173931122, "ref_logps/chosen": -11.397944450378418, "ref_logps/rejected": -56.8441162109375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.983642339706421, "rewards/margins": 7.536149024963379, "rewards/rejected": -9.519791603088379, "step": 217 }, { "epoch": 0.44, "grad_norm": 49.84681701660156, "learning_rate": 4.7481481481481477e-07, "logps/chosen": -38.713043212890625, "logps/rejected": -169.45138549804688, "loss": 0.0497, "losses/dpo": 0.021128835156559944, "losses/sft": 0.6783910989761353, "losses/total": 0.021128835156559944, "ref_logps/chosen": -13.662879943847656, "ref_logps/rejected": -70.60210418701172, "rewards/accuracies": 1.0, "rewards/chosen": -2.505016326904297, "rewards/margins": 7.379911422729492, "rewards/rejected": -9.884927749633789, "step": 218 }, { "epoch": 0.44, "grad_norm": 42.746803283691406, "learning_rate": 4.744444444444444e-07, "logps/chosen": -40.39366912841797, "logps/rejected": -112.6432113647461, "loss": 0.0885, "losses/dpo": 0.092128686606884, "losses/sft": 0.5247994661331177, "losses/total": 0.092128686606884, "ref_logps/chosen": -17.325380325317383, "ref_logps/rejected": -42.825828552246094, "rewards/accuracies": 1.0, "rewards/chosen": -2.3068289756774902, "rewards/margins": 4.6749091148376465, "rewards/rejected": -6.9817376136779785, "step": 219 }, { "epoch": 0.44, "grad_norm": 20.136625289916992, "learning_rate": 4.7407407407407405e-07, "logps/chosen": -33.28211975097656, "logps/rejected": -112.59181213378906, "loss": 0.0883, "losses/dpo": 0.20460954308509827, "losses/sft": 0.5208683013916016, "losses/total": 0.20460954308509827, "ref_logps/chosen": -15.455129623413086, "ref_logps/rejected": -40.72985076904297, "rewards/accuracies": 1.0, "rewards/chosen": -1.7826988697052002, "rewards/margins": 5.403497695922852, "rewards/rejected": -7.186196327209473, "step": 220 }, { "epoch": 0.44, "grad_norm": 66.03868103027344, "learning_rate": 4.7370370370370364e-07, "logps/chosen": -31.484960556030273, "logps/rejected": -113.89936828613281, "loss": 0.2053, "losses/dpo": 0.5744110941886902, "losses/sft": 0.8293743133544922, "losses/total": 0.5744110941886902, "ref_logps/chosen": -12.668027877807617, "ref_logps/rejected": -46.156768798828125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8816933631896973, "rewards/margins": 4.892566680908203, "rewards/rejected": -6.7742600440979, "step": 221 }, { "epoch": 0.44, "grad_norm": 36.92087936401367, "learning_rate": 4.733333333333333e-07, "logps/chosen": -34.15308380126953, "logps/rejected": -102.67060852050781, "loss": 0.1643, "losses/dpo": 0.04543168097734451, "losses/sft": 0.6770851016044617, "losses/total": 0.04543168097734451, "ref_logps/chosen": -13.57850456237793, "ref_logps/rejected": -39.29328918457031, "rewards/accuracies": 0.9375, "rewards/chosen": -2.05745792388916, "rewards/margins": 4.280274391174316, "rewards/rejected": -6.337732315063477, "step": 222 }, { "epoch": 0.45, "grad_norm": 47.12366485595703, "learning_rate": 4.72962962962963e-07, "logps/chosen": -31.432491302490234, "logps/rejected": -99.98987579345703, "loss": 0.1713, "losses/dpo": 0.1538832187652588, "losses/sft": 0.8228436708450317, "losses/total": 0.1538832187652588, "ref_logps/chosen": -12.841203689575195, "ref_logps/rejected": -39.756378173828125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8591285943984985, "rewards/margins": 4.16422176361084, "rewards/rejected": -6.023350238800049, "step": 223 }, { "epoch": 0.45, "grad_norm": 74.00916290283203, "learning_rate": 4.725925925925926e-07, "logps/chosen": -33.925750732421875, "logps/rejected": -107.38592529296875, "loss": 0.2729, "losses/dpo": 0.24903066456317902, "losses/sft": 0.5291041135787964, "losses/total": 0.24903066456317902, "ref_logps/chosen": -17.715248107910156, "ref_logps/rejected": -48.10115051269531, "rewards/accuracies": 0.875, "rewards/chosen": -1.621050238609314, "rewards/margins": 4.307427406311035, "rewards/rejected": -5.9284772872924805, "step": 224 }, { "epoch": 0.45, "grad_norm": 26.433197021484375, "learning_rate": 4.722222222222222e-07, "logps/chosen": -29.673873901367188, "logps/rejected": -96.80340576171875, "loss": 0.1643, "losses/dpo": 0.0513819195330143, "losses/sft": 0.5060651302337646, "losses/total": 0.0513819195330143, "ref_logps/chosen": -13.205547332763672, "ref_logps/rejected": -39.28229522705078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6468327045440674, "rewards/margins": 4.105278491973877, "rewards/rejected": -5.752111434936523, "step": 225 }, { "epoch": 0.45, "grad_norm": 24.875255584716797, "learning_rate": 4.7185185185185185e-07, "logps/chosen": -34.83883285522461, "logps/rejected": -128.52703857421875, "loss": 0.0889, "losses/dpo": 0.26298192143440247, "losses/sft": 0.489166259765625, "losses/total": 0.26298192143440247, "ref_logps/chosen": -18.28306770324707, "ref_logps/rejected": -54.15491485595703, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6555763483047485, "rewards/margins": 5.78163480758667, "rewards/rejected": -7.437211513519287, "step": 226 }, { "epoch": 0.45, "grad_norm": 27.833566665649414, "learning_rate": 4.714814814814815e-07, "logps/chosen": -32.174381256103516, "logps/rejected": -122.44319152832031, "loss": 0.0825, "losses/dpo": 0.08397988975048065, "losses/sft": 0.7054194808006287, "losses/total": 0.08397988975048065, "ref_logps/chosen": -13.736897468566895, "ref_logps/rejected": -52.05677032470703, "rewards/accuracies": 1.0, "rewards/chosen": -1.8437484502792358, "rewards/margins": 5.194893836975098, "rewards/rejected": -7.038642406463623, "step": 227 }, { "epoch": 0.46, "grad_norm": 8.22507095336914, "learning_rate": 4.711111111111111e-07, "logps/chosen": -33.30576705932617, "logps/rejected": -127.82069396972656, "loss": 0.0227, "losses/dpo": 0.049265846610069275, "losses/sft": 0.48058852553367615, "losses/total": 0.049265846610069275, "ref_logps/chosen": -15.762434959411621, "ref_logps/rejected": -50.94194030761719, "rewards/accuracies": 1.0, "rewards/chosen": -1.75433349609375, "rewards/margins": 5.933541297912598, "rewards/rejected": -7.687874794006348, "step": 228 }, { "epoch": 0.46, "grad_norm": 17.72207260131836, "learning_rate": 4.707407407407407e-07, "logps/chosen": -37.80146789550781, "logps/rejected": -109.19346618652344, "loss": 0.0558, "losses/dpo": 0.10598674416542053, "losses/sft": 0.5179128050804138, "losses/total": 0.10598674416542053, "ref_logps/chosen": -17.415359497070312, "ref_logps/rejected": -44.45947265625, "rewards/accuracies": 1.0, "rewards/chosen": -2.0386109352111816, "rewards/margins": 4.434789657592773, "rewards/rejected": -6.473400592803955, "step": 229 }, { "epoch": 0.46, "grad_norm": 76.22272491455078, "learning_rate": 4.7037037037037036e-07, "logps/chosen": -37.97969436645508, "logps/rejected": -98.01371002197266, "loss": 0.303, "losses/dpo": 0.26055893301963806, "losses/sft": 0.7976289987564087, "losses/total": 0.26055893301963806, "ref_logps/chosen": -15.016735076904297, "ref_logps/rejected": -39.44939422607422, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2962958812713623, "rewards/margins": 3.560136318206787, "rewards/rejected": -5.85643196105957, "step": 230 }, { "epoch": 0.46, "grad_norm": 4.947128772735596, "learning_rate": 4.6999999999999995e-07, "logps/chosen": -32.68196487426758, "logps/rejected": -122.32713317871094, "loss": 0.0543, "losses/dpo": 0.1880553811788559, "losses/sft": 0.4257510006427765, "losses/total": 0.1880553811788559, "ref_logps/chosen": -17.893299102783203, "ref_logps/rejected": -49.445945739746094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4788663387298584, "rewards/margins": 5.809252738952637, "rewards/rejected": -7.288119316101074, "step": 231 }, { "epoch": 0.46, "grad_norm": 29.260194778442383, "learning_rate": 4.696296296296296e-07, "logps/chosen": -30.062938690185547, "logps/rejected": -100.40599060058594, "loss": 0.0955, "losses/dpo": 0.04028013348579407, "losses/sft": 0.6679549217224121, "losses/total": 0.04028013348579407, "ref_logps/chosen": -12.384693145751953, "ref_logps/rejected": -39.80000305175781, "rewards/accuracies": 1.0, "rewards/chosen": -1.767824411392212, "rewards/margins": 4.292774200439453, "rewards/rejected": -6.060598850250244, "step": 232 }, { "epoch": 0.47, "grad_norm": 70.72667694091797, "learning_rate": 4.6925925925925923e-07, "logps/chosen": -37.28951644897461, "logps/rejected": -87.13302612304688, "loss": 0.4117, "losses/dpo": 0.19169825315475464, "losses/sft": 0.9827092885971069, "losses/total": 0.19169825315475464, "ref_logps/chosen": -11.481277465820312, "ref_logps/rejected": -34.47808074951172, "rewards/accuracies": 0.6875, "rewards/chosen": -2.5808238983154297, "rewards/margins": 2.6846706867218018, "rewards/rejected": -5.265494346618652, "step": 233 }, { "epoch": 0.47, "grad_norm": 38.08920669555664, "learning_rate": 4.6888888888888887e-07, "logps/chosen": -46.131221771240234, "logps/rejected": -120.89785766601562, "loss": 0.1526, "losses/dpo": 0.06840162724256516, "losses/sft": 0.5885744094848633, "losses/total": 0.06840162724256516, "ref_logps/chosen": -29.300039291381836, "ref_logps/rejected": -57.00225830078125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.683118224143982, "rewards/margins": 4.706441402435303, "rewards/rejected": -6.389559745788574, "step": 234 }, { "epoch": 0.47, "grad_norm": 60.85316848754883, "learning_rate": 4.6851851851851846e-07, "logps/chosen": -36.02318572998047, "logps/rejected": -101.59310913085938, "loss": 0.2075, "losses/dpo": 0.1142917200922966, "losses/sft": 0.7939429879188538, "losses/total": 0.1142917200922966, "ref_logps/chosen": -14.844230651855469, "ref_logps/rejected": -42.496620178222656, "rewards/accuracies": 0.875, "rewards/chosen": -2.1178956031799316, "rewards/margins": 3.7917532920837402, "rewards/rejected": -5.909648895263672, "step": 235 }, { "epoch": 0.47, "grad_norm": 30.55718421936035, "learning_rate": 4.681481481481481e-07, "logps/chosen": -40.79773712158203, "logps/rejected": -105.50750732421875, "loss": 0.0906, "losses/dpo": 0.08892350643873215, "losses/sft": 0.33517754077911377, "losses/total": 0.08892350643873215, "ref_logps/chosen": -20.355552673339844, "ref_logps/rejected": -45.50160217285156, "rewards/accuracies": 1.0, "rewards/chosen": -2.0442183017730713, "rewards/margins": 3.956371545791626, "rewards/rejected": -6.0005903244018555, "step": 236 }, { "epoch": 0.47, "grad_norm": 36.39009475708008, "learning_rate": 4.677777777777778e-07, "logps/chosen": -35.50791931152344, "logps/rejected": -107.39195251464844, "loss": 0.124, "losses/dpo": 0.1986294537782669, "losses/sft": 0.3821603059768677, "losses/total": 0.1986294537782669, "ref_logps/chosen": -16.365543365478516, "ref_logps/rejected": -40.81610107421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9142378568649292, "rewards/margins": 4.74334716796875, "rewards/rejected": -6.6575846672058105, "step": 237 }, { "epoch": 0.48, "grad_norm": 34.70072555541992, "learning_rate": 4.674074074074074e-07, "logps/chosen": -31.894338607788086, "logps/rejected": -115.76055145263672, "loss": 0.0858, "losses/dpo": 0.17275740206241608, "losses/sft": 0.6638415455818176, "losses/total": 0.17275740206241608, "ref_logps/chosen": -14.566429138183594, "ref_logps/rejected": -48.65589904785156, "rewards/accuracies": 1.0, "rewards/chosen": -1.7327909469604492, "rewards/margins": 4.97767448425293, "rewards/rejected": -6.710465431213379, "step": 238 }, { "epoch": 0.48, "grad_norm": 65.5306625366211, "learning_rate": 4.67037037037037e-07, "logps/chosen": -36.98841857910156, "logps/rejected": -89.85264587402344, "loss": 0.2781, "losses/dpo": 0.0937236100435257, "losses/sft": 0.5629141330718994, "losses/total": 0.0937236100435257, "ref_logps/chosen": -15.890052795410156, "ref_logps/rejected": -37.154483795166016, "rewards/accuracies": 0.875, "rewards/chosen": -2.1098363399505615, "rewards/margins": 3.159980058670044, "rewards/rejected": -5.2698163986206055, "step": 239 }, { "epoch": 0.48, "grad_norm": 29.891502380371094, "learning_rate": 4.6666666666666666e-07, "logps/chosen": -25.592954635620117, "logps/rejected": -114.64250183105469, "loss": 0.1173, "losses/dpo": 0.15019918978214264, "losses/sft": 0.5278570652008057, "losses/total": 0.15019918978214264, "ref_logps/chosen": -13.68138313293457, "ref_logps/rejected": -52.11135482788086, "rewards/accuracies": 1.0, "rewards/chosen": -1.1911571025848389, "rewards/margins": 5.061957359313965, "rewards/rejected": -6.253114700317383, "step": 240 }, { "epoch": 0.48, "grad_norm": 16.76541519165039, "learning_rate": 4.662962962962963e-07, "logps/chosen": -31.80239486694336, "logps/rejected": -137.59100341796875, "loss": 0.0861, "losses/dpo": 0.23009899258613586, "losses/sft": 0.5560140013694763, "losses/total": 0.23009899258613586, "ref_logps/chosen": -11.215553283691406, "ref_logps/rejected": -52.46674346923828, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0586843490600586, "rewards/margins": 6.453742027282715, "rewards/rejected": -8.512426376342773, "step": 241 }, { "epoch": 0.48, "grad_norm": 54.48352813720703, "learning_rate": 4.659259259259259e-07, "logps/chosen": -30.94244384765625, "logps/rejected": -103.1988525390625, "loss": 0.1839, "losses/dpo": 0.4775644540786743, "losses/sft": 0.7302254438400269, "losses/total": 0.4775644540786743, "ref_logps/chosen": -13.036834716796875, "ref_logps/rejected": -44.61477279663086, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7905609607696533, "rewards/margins": 4.067846775054932, "rewards/rejected": -5.858407974243164, "step": 242 }, { "epoch": 0.49, "grad_norm": 68.84845733642578, "learning_rate": 4.6555555555555553e-07, "logps/chosen": -30.0528564453125, "logps/rejected": -136.130859375, "loss": 0.1851, "losses/dpo": 0.0996524840593338, "losses/sft": 0.5073419213294983, "losses/total": 0.0996524840593338, "ref_logps/chosen": -13.253033638000488, "ref_logps/rejected": -60.84919738769531, "rewards/accuracies": 0.9375, "rewards/chosen": -1.679982304573059, "rewards/margins": 5.848184585571289, "rewards/rejected": -7.5281662940979, "step": 243 }, { "epoch": 0.49, "grad_norm": 50.93902587890625, "learning_rate": 4.651851851851852e-07, "logps/chosen": -33.61359405517578, "logps/rejected": -108.6624984741211, "loss": 0.1313, "losses/dpo": 0.01921762339770794, "losses/sft": 0.6857509613037109, "losses/total": 0.01921762339770794, "ref_logps/chosen": -15.684530258178711, "ref_logps/rejected": -44.83380126953125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7929065227508545, "rewards/margins": 4.589962959289551, "rewards/rejected": -6.382869243621826, "step": 244 }, { "epoch": 0.49, "grad_norm": 32.41175079345703, "learning_rate": 4.6481481481481476e-07, "logps/chosen": -30.212507247924805, "logps/rejected": -133.505126953125, "loss": 0.0789, "losses/dpo": 0.2453601062297821, "losses/sft": 0.7741209268569946, "losses/total": 0.2453601062297821, "ref_logps/chosen": -11.35911750793457, "ref_logps/rejected": -49.58050537109375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8853390216827393, "rewards/margins": 6.507124423980713, "rewards/rejected": -8.392463684082031, "step": 245 }, { "epoch": 0.49, "grad_norm": 29.210350036621094, "learning_rate": 4.644444444444444e-07, "logps/chosen": -32.476158142089844, "logps/rejected": -92.0237045288086, "loss": 0.0859, "losses/dpo": 0.06182260811328888, "losses/sft": 0.8990675806999207, "losses/total": 0.06182260811328888, "ref_logps/chosen": -12.002470970153809, "ref_logps/rejected": -30.318471908569336, "rewards/accuracies": 1.0, "rewards/chosen": -2.0473690032958984, "rewards/margins": 4.123154640197754, "rewards/rejected": -6.170523643493652, "step": 246 }, { "epoch": 0.49, "grad_norm": 40.80755615234375, "learning_rate": 4.6407407407407404e-07, "logps/chosen": -33.550479888916016, "logps/rejected": -115.95787811279297, "loss": 0.1212, "losses/dpo": 0.10118857771158218, "losses/sft": 0.5980912446975708, "losses/total": 0.10118857771158218, "ref_logps/chosen": -12.059894561767578, "ref_logps/rejected": -49.10792922973633, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1490585803985596, "rewards/margins": 4.535937309265137, "rewards/rejected": -6.684995651245117, "step": 247 }, { "epoch": 0.5, "grad_norm": 46.593788146972656, "learning_rate": 4.637037037037037e-07, "logps/chosen": -40.190673828125, "logps/rejected": -125.31559753417969, "loss": 0.1325, "losses/dpo": 0.12757886946201324, "losses/sft": 0.7891998291015625, "losses/total": 0.12757886946201324, "ref_logps/chosen": -17.106029510498047, "ref_logps/rejected": -51.4550895690918, "rewards/accuracies": 1.0, "rewards/chosen": -2.308464288711548, "rewards/margins": 5.077587127685547, "rewards/rejected": -7.386051654815674, "step": 248 }, { "epoch": 0.5, "grad_norm": 12.482865333557129, "learning_rate": 4.633333333333333e-07, "logps/chosen": -33.39018249511719, "logps/rejected": -124.94376373291016, "loss": 0.0251, "losses/dpo": 0.0052696047350764275, "losses/sft": 0.7853429317474365, "losses/total": 0.0052696047350764275, "ref_logps/chosen": -13.695304870605469, "ref_logps/rejected": -46.375450134277344, "rewards/accuracies": 1.0, "rewards/chosen": -1.9694875478744507, "rewards/margins": 5.887343883514404, "rewards/rejected": -7.856831073760986, "step": 249 }, { "epoch": 0.5, "grad_norm": 13.024248123168945, "learning_rate": 4.6296296296296297e-07, "logps/chosen": -25.502864837646484, "logps/rejected": -125.57063293457031, "loss": 0.0692, "losses/dpo": 0.015543513000011444, "losses/sft": 0.41955021023750305, "losses/total": 0.015543513000011444, "ref_logps/chosen": -12.20850658416748, "ref_logps/rejected": -49.56139373779297, "rewards/accuracies": 1.0, "rewards/chosen": -1.3294358253479004, "rewards/margins": 6.271487236022949, "rewards/rejected": -7.600924015045166, "step": 250 }, { "epoch": 0.5, "grad_norm": 23.54766082763672, "learning_rate": 4.625925925925926e-07, "logps/chosen": -32.23238754272461, "logps/rejected": -103.67035675048828, "loss": 0.1256, "losses/dpo": 0.06304627656936646, "losses/sft": 0.591474175453186, "losses/total": 0.06304627656936646, "ref_logps/chosen": -11.873893737792969, "ref_logps/rejected": -39.451812744140625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0358495712280273, "rewards/margins": 4.386005401611328, "rewards/rejected": -6.4218549728393555, "step": 251 }, { "epoch": 0.5, "grad_norm": 13.067554473876953, "learning_rate": 4.622222222222222e-07, "logps/chosen": -36.21117401123047, "logps/rejected": -123.28595733642578, "loss": 0.0377, "losses/dpo": 0.035405233502388, "losses/sft": 0.7797837853431702, "losses/total": 0.035405233502388, "ref_logps/chosen": -16.308122634887695, "ref_logps/rejected": -51.14239501953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.9903050661087036, "rewards/margins": 5.224051475524902, "rewards/rejected": -7.214356422424316, "step": 252 }, { "epoch": 0.51, "grad_norm": 39.37320327758789, "learning_rate": 4.6185185185185184e-07, "logps/chosen": -39.30860900878906, "logps/rejected": -93.77995300292969, "loss": 0.2996, "losses/dpo": 0.1254265010356903, "losses/sft": 0.975360631942749, "losses/total": 0.1254265010356903, "ref_logps/chosen": -10.353572845458984, "ref_logps/rejected": -28.20992088317871, "rewards/accuracies": 0.8125, "rewards/chosen": -2.895503520965576, "rewards/margins": 3.661499500274658, "rewards/rejected": -6.557003021240234, "step": 253 }, { "epoch": 0.51, "grad_norm": 15.333211898803711, "learning_rate": 4.614814814814815e-07, "logps/chosen": -37.52141571044922, "logps/rejected": -137.45230102539062, "loss": 0.0751, "losses/dpo": 0.0252683162689209, "losses/sft": 0.4655612111091614, "losses/total": 0.0252683162689209, "ref_logps/chosen": -18.3541259765625, "ref_logps/rejected": -56.88932800292969, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9167288541793823, "rewards/margins": 6.139569282531738, "rewards/rejected": -8.056299209594727, "step": 254 }, { "epoch": 0.51, "grad_norm": 32.789756774902344, "learning_rate": 4.611111111111111e-07, "logps/chosen": -36.83077621459961, "logps/rejected": -113.62246704101562, "loss": 0.1548, "losses/dpo": 0.2664481997489929, "losses/sft": 0.7239383459091187, "losses/total": 0.2664481997489929, "ref_logps/chosen": -14.862010955810547, "ref_logps/rejected": -41.03871536254883, "rewards/accuracies": 0.875, "rewards/chosen": -2.1968765258789062, "rewards/margins": 5.061498641967773, "rewards/rejected": -7.25837516784668, "step": 255 }, { "epoch": 0.51, "grad_norm": 15.145865440368652, "learning_rate": 4.607407407407407e-07, "logps/chosen": -36.43583297729492, "logps/rejected": -128.9272003173828, "loss": 0.0594, "losses/dpo": 0.15890918672084808, "losses/sft": 1.0230506658554077, "losses/total": 0.15890918672084808, "ref_logps/chosen": -13.353048324584961, "ref_logps/rejected": -46.61483383178711, "rewards/accuracies": 1.0, "rewards/chosen": -2.3082785606384277, "rewards/margins": 5.9229583740234375, "rewards/rejected": -8.231237411499023, "step": 256 }, { "epoch": 0.51, "grad_norm": 34.71031951904297, "learning_rate": 4.6037037037037035e-07, "logps/chosen": -36.72929382324219, "logps/rejected": -120.64111328125, "loss": 0.1215, "losses/dpo": 0.34539294242858887, "losses/sft": 0.5950815677642822, "losses/total": 0.34539294242858887, "ref_logps/chosen": -14.87203311920166, "ref_logps/rejected": -43.00688934326172, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1857261657714844, "rewards/margins": 5.577695846557617, "rewards/rejected": -7.763422012329102, "step": 257 }, { "epoch": 0.52, "grad_norm": 39.24991226196289, "learning_rate": 4.6e-07, "logps/chosen": -36.05840301513672, "logps/rejected": -112.58522033691406, "loss": 0.12, "losses/dpo": 0.390174001455307, "losses/sft": 0.3807469606399536, "losses/total": 0.390174001455307, "ref_logps/chosen": -15.537040710449219, "ref_logps/rejected": -48.35340118408203, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0521364212036133, "rewards/margins": 4.371045112609863, "rewards/rejected": -6.423181533813477, "step": 258 }, { "epoch": 0.52, "grad_norm": 34.78696060180664, "learning_rate": 4.596296296296296e-07, "logps/chosen": -30.690231323242188, "logps/rejected": -98.33554077148438, "loss": 0.1078, "losses/dpo": 0.0833856463432312, "losses/sft": 0.6299813985824585, "losses/total": 0.0833856463432312, "ref_logps/chosen": -11.812528610229492, "ref_logps/rejected": -32.57080078125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.887770175933838, "rewards/margins": 4.688704490661621, "rewards/rejected": -6.576474666595459, "step": 259 }, { "epoch": 0.52, "grad_norm": 13.130879402160645, "learning_rate": 4.592592592592592e-07, "logps/chosen": -30.18829345703125, "logps/rejected": -131.82125854492188, "loss": 0.0388, "losses/dpo": 0.04645497351884842, "losses/sft": 0.4061301350593567, "losses/total": 0.04645497351884842, "ref_logps/chosen": -13.528017044067383, "ref_logps/rejected": -56.977508544921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6660277843475342, "rewards/margins": 5.818347930908203, "rewards/rejected": -7.484375953674316, "step": 260 }, { "epoch": 0.52, "grad_norm": 19.638797760009766, "learning_rate": 4.5888888888888886e-07, "logps/chosen": -28.61343765258789, "logps/rejected": -122.08333587646484, "loss": 0.063, "losses/dpo": 0.039696015417575836, "losses/sft": 0.5925369262695312, "losses/total": 0.039696015417575836, "ref_logps/chosen": -12.935312271118164, "ref_logps/rejected": -47.2875862121582, "rewards/accuracies": 1.0, "rewards/chosen": -1.5678125619888306, "rewards/margins": 5.911762237548828, "rewards/rejected": -7.479574203491211, "step": 261 }, { "epoch": 0.52, "grad_norm": 25.106760025024414, "learning_rate": 4.5851851851851845e-07, "logps/chosen": -30.96830177307129, "logps/rejected": -123.52799987792969, "loss": 0.129, "losses/dpo": 0.1747172325849533, "losses/sft": 0.6229246258735657, "losses/total": 0.1747172325849533, "ref_logps/chosen": -9.280638694763184, "ref_logps/rejected": -40.720794677734375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1687662601470947, "rewards/margins": 6.111954212188721, "rewards/rejected": -8.280721664428711, "step": 262 }, { "epoch": 0.53, "grad_norm": 16.804208755493164, "learning_rate": 4.5814814814814814e-07, "logps/chosen": -30.801437377929688, "logps/rejected": -137.78640747070312, "loss": 0.056, "losses/dpo": 0.11071071773767471, "losses/sft": 0.5817344188690186, "losses/total": 0.11071071773767471, "ref_logps/chosen": -12.325854301452637, "ref_logps/rejected": -59.880088806152344, "rewards/accuracies": 1.0, "rewards/chosen": -1.8475583791732788, "rewards/margins": 5.943074703216553, "rewards/rejected": -7.790633201599121, "step": 263 }, { "epoch": 0.53, "grad_norm": 22.54884910583496, "learning_rate": 4.577777777777778e-07, "logps/chosen": -36.032554626464844, "logps/rejected": -111.55353546142578, "loss": 0.0648, "losses/dpo": 0.08020076900720596, "losses/sft": 0.5052534341812134, "losses/total": 0.08020076900720596, "ref_logps/chosen": -12.630363464355469, "ref_logps/rejected": -38.44928741455078, "rewards/accuracies": 1.0, "rewards/chosen": -2.340219259262085, "rewards/margins": 4.970205307006836, "rewards/rejected": -7.310424327850342, "step": 264 }, { "epoch": 0.53, "grad_norm": 49.5006103515625, "learning_rate": 4.574074074074074e-07, "logps/chosen": -42.61256790161133, "logps/rejected": -110.77767944335938, "loss": 0.1175, "losses/dpo": 0.14292128384113312, "losses/sft": 0.8216965198516846, "losses/total": 0.14292128384113312, "ref_logps/chosen": -14.239264488220215, "ref_logps/rejected": -39.324241638183594, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8373303413391113, "rewards/margins": 4.308013916015625, "rewards/rejected": -7.145343780517578, "step": 265 }, { "epoch": 0.53, "grad_norm": 39.606327056884766, "learning_rate": 4.57037037037037e-07, "logps/chosen": -36.93309020996094, "logps/rejected": -108.97806549072266, "loss": 0.111, "losses/dpo": 0.30392947793006897, "losses/sft": 0.7737802267074585, "losses/total": 0.30392947793006897, "ref_logps/chosen": -14.136423110961914, "ref_logps/rejected": -38.00397491455078, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2796666622161865, "rewards/margins": 4.817742347717285, "rewards/rejected": -7.097409725189209, "step": 266 }, { "epoch": 0.53, "grad_norm": 41.95115661621094, "learning_rate": 4.5666666666666665e-07, "logps/chosen": -33.319175720214844, "logps/rejected": -120.86522674560547, "loss": 0.0801, "losses/dpo": 0.25569969415664673, "losses/sft": 0.8948369026184082, "losses/total": 0.25569969415664673, "ref_logps/chosen": -12.202211380004883, "ref_logps/rejected": -40.148895263671875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.111696720123291, "rewards/margins": 5.95993709564209, "rewards/rejected": -8.071634292602539, "step": 267 }, { "epoch": 0.54, "grad_norm": 13.912221908569336, "learning_rate": 4.562962962962963e-07, "logps/chosen": -46.565673828125, "logps/rejected": -145.53082275390625, "loss": 0.0639, "losses/dpo": 0.2050100415945053, "losses/sft": 0.6665656566619873, "losses/total": 0.2050100415945053, "ref_logps/chosen": -17.806560516357422, "ref_logps/rejected": -48.13336181640625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8759117126464844, "rewards/margins": 6.863834857940674, "rewards/rejected": -9.73974609375, "step": 268 }, { "epoch": 0.54, "grad_norm": 50.010066986083984, "learning_rate": 4.559259259259259e-07, "logps/chosen": -34.211036682128906, "logps/rejected": -83.97434997558594, "loss": 0.1918, "losses/dpo": 0.2276979237794876, "losses/sft": 0.6309546828269958, "losses/total": 0.2276979237794876, "ref_logps/chosen": -13.882712364196777, "ref_logps/rejected": -31.278932571411133, "rewards/accuracies": 0.9375, "rewards/chosen": -2.032832622528076, "rewards/margins": 3.2367098331451416, "rewards/rejected": -5.269542217254639, "step": 269 }, { "epoch": 0.54, "grad_norm": 53.363426208496094, "learning_rate": 4.555555555555555e-07, "logps/chosen": -45.16518020629883, "logps/rejected": -133.35223388671875, "loss": 0.1578, "losses/dpo": 0.18790379166603088, "losses/sft": 0.7288150787353516, "losses/total": 0.18790379166603088, "ref_logps/chosen": -15.43204402923584, "ref_logps/rejected": -51.11394119262695, "rewards/accuracies": 0.9375, "rewards/chosen": -2.973313808441162, "rewards/margins": 5.250514030456543, "rewards/rejected": -8.223828315734863, "step": 270 }, { "epoch": 0.54, "grad_norm": 11.669504165649414, "learning_rate": 4.5518518518518516e-07, "logps/chosen": -45.2990837097168, "logps/rejected": -153.83236694335938, "loss": 0.0161, "losses/dpo": 0.012812875211238861, "losses/sft": 0.8085505962371826, "losses/total": 0.012812875211238861, "ref_logps/chosen": -16.569459915161133, "ref_logps/rejected": -61.62638473510742, "rewards/accuracies": 1.0, "rewards/chosen": -2.872962474822998, "rewards/margins": 6.347635269165039, "rewards/rejected": -9.220597267150879, "step": 271 }, { "epoch": 0.54, "grad_norm": 101.09085845947266, "learning_rate": 4.548148148148148e-07, "logps/chosen": -43.06938171386719, "logps/rejected": -112.60737609863281, "loss": 0.2584, "losses/dpo": 0.1160985678434372, "losses/sft": 0.5782561302185059, "losses/total": 0.1160985678434372, "ref_logps/chosen": -15.254789352416992, "ref_logps/rejected": -38.1031379699707, "rewards/accuracies": 0.875, "rewards/chosen": -2.781459331512451, "rewards/margins": 4.668964385986328, "rewards/rejected": -7.450423717498779, "step": 272 }, { "epoch": 0.55, "grad_norm": 50.457908630371094, "learning_rate": 4.544444444444444e-07, "logps/chosen": -41.965553283691406, "logps/rejected": -151.30963134765625, "loss": 0.1371, "losses/dpo": 0.24365629255771637, "losses/sft": 0.8020548820495605, "losses/total": 0.24365629255771637, "ref_logps/chosen": -17.38597869873047, "ref_logps/rejected": -60.589176177978516, "rewards/accuracies": 0.875, "rewards/chosen": -2.4579577445983887, "rewards/margins": 6.614086627960205, "rewards/rejected": -9.072044372558594, "step": 273 }, { "epoch": 0.55, "grad_norm": 43.55804443359375, "learning_rate": 4.5407407407407403e-07, "logps/chosen": -45.85125732421875, "logps/rejected": -116.78541564941406, "loss": 0.1379, "losses/dpo": 0.307068407535553, "losses/sft": 0.6947055459022522, "losses/total": 0.307068407535553, "ref_logps/chosen": -15.99758529663086, "ref_logps/rejected": -41.6662712097168, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9853672981262207, "rewards/margins": 4.526546478271484, "rewards/rejected": -7.511913776397705, "step": 274 }, { "epoch": 0.55, "grad_norm": 63.197059631347656, "learning_rate": 4.537037037037037e-07, "logps/chosen": -35.24736022949219, "logps/rejected": -103.81106567382812, "loss": 0.1881, "losses/dpo": 0.09665709733963013, "losses/sft": 0.39394059777259827, "losses/total": 0.09665709733963013, "ref_logps/chosen": -11.432135581970215, "ref_logps/rejected": -36.34547424316406, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3815226554870605, "rewards/margins": 4.36503791809082, "rewards/rejected": -6.746560096740723, "step": 275 }, { "epoch": 0.55, "grad_norm": 37.735565185546875, "learning_rate": 4.5333333333333326e-07, "logps/chosen": -43.59982681274414, "logps/rejected": -135.76087951660156, "loss": 0.0668, "losses/dpo": 0.10445442795753479, "losses/sft": 0.5132284164428711, "losses/total": 0.10445442795753479, "ref_logps/chosen": -21.570680618286133, "ref_logps/rejected": -52.23603820800781, "rewards/accuracies": 1.0, "rewards/chosen": -2.2029147148132324, "rewards/margins": 6.149569511413574, "rewards/rejected": -8.352483749389648, "step": 276 }, { "epoch": 0.55, "grad_norm": 28.7652530670166, "learning_rate": 4.5296296296296296e-07, "logps/chosen": -36.430747985839844, "logps/rejected": -103.90020751953125, "loss": 0.14, "losses/dpo": 0.31618010997772217, "losses/sft": 1.035854697227478, "losses/total": 0.31618010997772217, "ref_logps/chosen": -11.691465377807617, "ref_logps/rejected": -35.15990447998047, "rewards/accuracies": 0.9375, "rewards/chosen": -2.473928451538086, "rewards/margins": 4.400101661682129, "rewards/rejected": -6.874030113220215, "step": 277 }, { "epoch": 0.56, "grad_norm": 47.00922393798828, "learning_rate": 4.525925925925926e-07, "logps/chosen": -31.12659454345703, "logps/rejected": -111.13185119628906, "loss": 0.0839, "losses/dpo": 0.0065039535984396935, "losses/sft": 0.5701088309288025, "losses/total": 0.0065039535984396935, "ref_logps/chosen": -11.445125579833984, "ref_logps/rejected": -39.33173370361328, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9681470394134521, "rewards/margins": 5.211864948272705, "rewards/rejected": -7.180011749267578, "step": 278 }, { "epoch": 0.56, "grad_norm": 14.516613006591797, "learning_rate": 4.5222222222222224e-07, "logps/chosen": -47.697349548339844, "logps/rejected": -147.38427734375, "loss": 0.0261, "losses/dpo": 0.08990654349327087, "losses/sft": 1.041534423828125, "losses/total": 0.08990654349327087, "ref_logps/chosen": -16.217287063598633, "ref_logps/rejected": -51.448463439941406, "rewards/accuracies": 1.0, "rewards/chosen": -3.1480064392089844, "rewards/margins": 6.44557523727417, "rewards/rejected": -9.593582153320312, "step": 279 }, { "epoch": 0.56, "grad_norm": 29.43877601623535, "learning_rate": 4.5185185185185183e-07, "logps/chosen": -25.924114227294922, "logps/rejected": -103.9954833984375, "loss": 0.0962, "losses/dpo": 0.17584168910980225, "losses/sft": 0.3538723289966583, "losses/total": 0.17584168910980225, "ref_logps/chosen": -11.91704273223877, "ref_logps/rejected": -38.9530029296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4007071256637573, "rewards/margins": 5.103541374206543, "rewards/rejected": -6.50424861907959, "step": 280 }, { "epoch": 0.56, "grad_norm": 32.56535339355469, "learning_rate": 4.5148148148148147e-07, "logps/chosen": -34.9146728515625, "logps/rejected": -138.0476531982422, "loss": 0.1337, "losses/dpo": 0.19118238985538483, "losses/sft": 0.7913627624511719, "losses/total": 0.19118238985538483, "ref_logps/chosen": -14.662369728088379, "ref_logps/rejected": -48.69712448120117, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0252304077148438, "rewards/margins": 6.909822463989258, "rewards/rejected": -8.935052871704102, "step": 281 }, { "epoch": 0.56, "grad_norm": 25.12720489501953, "learning_rate": 4.511111111111111e-07, "logps/chosen": -39.446475982666016, "logps/rejected": -138.76202392578125, "loss": 0.0765, "losses/dpo": 0.04247686639428139, "losses/sft": 0.618464469909668, "losses/total": 0.04247686639428139, "ref_logps/chosen": -17.198135375976562, "ref_logps/rejected": -58.359657287597656, "rewards/accuracies": 1.0, "rewards/chosen": -2.2248339653015137, "rewards/margins": 5.815402984619141, "rewards/rejected": -8.040237426757812, "step": 282 }, { "epoch": 0.57, "grad_norm": 25.820880889892578, "learning_rate": 4.507407407407407e-07, "logps/chosen": -43.3232421875, "logps/rejected": -140.09263610839844, "loss": 0.0537, "losses/dpo": 0.1348758488893509, "losses/sft": 0.7474174499511719, "losses/total": 0.1348758488893509, "ref_logps/chosen": -17.421993255615234, "ref_logps/rejected": -50.615196228027344, "rewards/accuracies": 1.0, "rewards/chosen": -2.59012508392334, "rewards/margins": 6.357618808746338, "rewards/rejected": -8.94774341583252, "step": 283 }, { "epoch": 0.57, "grad_norm": 26.093074798583984, "learning_rate": 4.5037037037037034e-07, "logps/chosen": -33.18143081665039, "logps/rejected": -129.96148681640625, "loss": 0.0364, "losses/dpo": 0.01221714448183775, "losses/sft": 0.5038900971412659, "losses/total": 0.01221714448183775, "ref_logps/chosen": -11.697612762451172, "ref_logps/rejected": -47.87187957763672, "rewards/accuracies": 1.0, "rewards/chosen": -2.1483817100524902, "rewards/margins": 6.060579299926758, "rewards/rejected": -8.208961486816406, "step": 284 }, { "epoch": 0.57, "grad_norm": 47.605594635009766, "learning_rate": 4.5e-07, "logps/chosen": -43.6019172668457, "logps/rejected": -107.98235321044922, "loss": 0.1269, "losses/dpo": 0.006264102179557085, "losses/sft": 0.6233557462692261, "losses/total": 0.006264102179557085, "ref_logps/chosen": -19.681671142578125, "ref_logps/rejected": -36.80232620239258, "rewards/accuracies": 0.9375, "rewards/chosen": -2.392024278640747, "rewards/margins": 4.725978851318359, "rewards/rejected": -7.118002891540527, "step": 285 }, { "epoch": 0.57, "grad_norm": 15.573431968688965, "learning_rate": 4.496296296296296e-07, "logps/chosen": -41.77413558959961, "logps/rejected": -159.12973022460938, "loss": 0.0299, "losses/dpo": 0.10609380900859833, "losses/sft": 1.0648298263549805, "losses/total": 0.10609380900859833, "ref_logps/chosen": -16.024517059326172, "ref_logps/rejected": -60.03338623046875, "rewards/accuracies": 1.0, "rewards/chosen": -2.5749616622924805, "rewards/margins": 7.334672927856445, "rewards/rejected": -9.909634590148926, "step": 286 }, { "epoch": 0.57, "grad_norm": 27.241920471191406, "learning_rate": 4.492592592592592e-07, "logps/chosen": -41.994728088378906, "logps/rejected": -149.73220825195312, "loss": 0.0586, "losses/dpo": 0.08027364313602448, "losses/sft": 0.6191474795341492, "losses/total": 0.08027364313602448, "ref_logps/chosen": -15.291231155395508, "ref_logps/rejected": -62.80726623535156, "rewards/accuracies": 1.0, "rewards/chosen": -2.670349597930908, "rewards/margins": 6.0221452713012695, "rewards/rejected": -8.69249439239502, "step": 287 }, { "epoch": 0.58, "grad_norm": 76.77140808105469, "learning_rate": 4.4888888888888885e-07, "logps/chosen": -41.82225799560547, "logps/rejected": -102.08113098144531, "loss": 0.2931, "losses/dpo": 0.3177294135093689, "losses/sft": 0.9429760575294495, "losses/total": 0.3177294135093689, "ref_logps/chosen": -14.118833541870117, "ref_logps/rejected": -30.601680755615234, "rewards/accuracies": 0.875, "rewards/chosen": -2.7703428268432617, "rewards/margins": 4.377601623535156, "rewards/rejected": -7.147944450378418, "step": 288 }, { "epoch": 0.58, "grad_norm": 20.054603576660156, "learning_rate": 4.4851851851851854e-07, "logps/chosen": -40.90589904785156, "logps/rejected": -129.79904174804688, "loss": 0.0673, "losses/dpo": 0.008846285752952099, "losses/sft": 0.9324573278427124, "losses/total": 0.008846285752952099, "ref_logps/chosen": -13.950414657592773, "ref_logps/rejected": -42.79026794433594, "rewards/accuracies": 1.0, "rewards/chosen": -2.6955482959747314, "rewards/margins": 6.005330562591553, "rewards/rejected": -8.700878143310547, "step": 289 }, { "epoch": 0.58, "grad_norm": 27.402971267700195, "learning_rate": 4.4814814814814813e-07, "logps/chosen": -40.17758560180664, "logps/rejected": -131.27212524414062, "loss": 0.0893, "losses/dpo": 0.17474588751792908, "losses/sft": 0.7338415384292603, "losses/total": 0.17474588751792908, "ref_logps/chosen": -12.303985595703125, "ref_logps/rejected": -47.974098205566406, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7873599529266357, "rewards/margins": 5.542443752288818, "rewards/rejected": -8.329803466796875, "step": 290 }, { "epoch": 0.58, "grad_norm": 51.174476623535156, "learning_rate": 4.4777777777777777e-07, "logps/chosen": -37.67184066772461, "logps/rejected": -130.61839294433594, "loss": 0.0929, "losses/dpo": 0.1532328724861145, "losses/sft": 0.6164145469665527, "losses/total": 0.1532328724861145, "ref_logps/chosen": -13.739557266235352, "ref_logps/rejected": -47.06127166748047, "rewards/accuracies": 0.9375, "rewards/chosen": -2.39322829246521, "rewards/margins": 5.962482929229736, "rewards/rejected": -8.355711936950684, "step": 291 }, { "epoch": 0.58, "grad_norm": 24.83441925048828, "learning_rate": 4.474074074074074e-07, "logps/chosen": -43.6513671875, "logps/rejected": -114.49932098388672, "loss": 0.1051, "losses/dpo": 0.14195476472377777, "losses/sft": 0.6711016893386841, "losses/total": 0.14195476472377777, "ref_logps/chosen": -18.276350021362305, "ref_logps/rejected": -40.27934646606445, "rewards/accuracies": 1.0, "rewards/chosen": -2.537501573562622, "rewards/margins": 4.884495735168457, "rewards/rejected": -7.4219970703125, "step": 292 }, { "epoch": 0.59, "grad_norm": 54.920963287353516, "learning_rate": 4.47037037037037e-07, "logps/chosen": -39.11894989013672, "logps/rejected": -102.25743103027344, "loss": 0.1623, "losses/dpo": 0.13864727318286896, "losses/sft": 0.8034663200378418, "losses/total": 0.13864727318286896, "ref_logps/chosen": -16.265247344970703, "ref_logps/rejected": -31.22001075744629, "rewards/accuracies": 0.9375, "rewards/chosen": -2.285369873046875, "rewards/margins": 4.81837272644043, "rewards/rejected": -7.103742599487305, "step": 293 }, { "epoch": 0.59, "grad_norm": 42.93906021118164, "learning_rate": 4.4666666666666664e-07, "logps/chosen": -44.16669464111328, "logps/rejected": -126.49778747558594, "loss": 0.1054, "losses/dpo": 0.008677210658788681, "losses/sft": 0.7196841835975647, "losses/total": 0.008677210658788681, "ref_logps/chosen": -16.730802536010742, "ref_logps/rejected": -46.01910400390625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.743589401245117, "rewards/margins": 5.304279804229736, "rewards/rejected": -8.047868728637695, "step": 294 }, { "epoch": 0.59, "grad_norm": 19.96090316772461, "learning_rate": 4.462962962962963e-07, "logps/chosen": -37.56354522705078, "logps/rejected": -116.57569122314453, "loss": 0.079, "losses/dpo": 0.13941551744937897, "losses/sft": 1.1352434158325195, "losses/total": 0.13941551744937897, "ref_logps/chosen": -11.381913185119629, "ref_logps/rejected": -40.20533752441406, "rewards/accuracies": 1.0, "rewards/chosen": -2.6181631088256836, "rewards/margins": 5.018872261047363, "rewards/rejected": -7.637035846710205, "step": 295 }, { "epoch": 0.59, "grad_norm": 35.72517395019531, "learning_rate": 4.459259259259259e-07, "logps/chosen": -41.6621208190918, "logps/rejected": -103.08749389648438, "loss": 0.1509, "losses/dpo": 0.28686463832855225, "losses/sft": 0.8667877316474915, "losses/total": 0.28686463832855225, "ref_logps/chosen": -14.639177322387695, "ref_logps/rejected": -33.67582702636719, "rewards/accuracies": 1.0, "rewards/chosen": -2.7022948265075684, "rewards/margins": 4.238872051239014, "rewards/rejected": -6.941166877746582, "step": 296 }, { "epoch": 0.59, "grad_norm": 21.558837890625, "learning_rate": 4.455555555555555e-07, "logps/chosen": -40.06609344482422, "logps/rejected": -143.51748657226562, "loss": 0.068, "losses/dpo": 0.05549360811710358, "losses/sft": 0.6680642366409302, "losses/total": 0.05549360811710358, "ref_logps/chosen": -15.775091171264648, "ref_logps/rejected": -57.9827766418457, "rewards/accuracies": 1.0, "rewards/chosen": -2.4291000366210938, "rewards/margins": 6.124370574951172, "rewards/rejected": -8.553470611572266, "step": 297 }, { "epoch": 0.6, "grad_norm": 14.56146240234375, "learning_rate": 4.4518518518518515e-07, "logps/chosen": -34.31193542480469, "logps/rejected": -137.0986328125, "loss": 0.0469, "losses/dpo": 0.09848415851593018, "losses/sft": 0.5717564821243286, "losses/total": 0.09848415851593018, "ref_logps/chosen": -16.207962036132812, "ref_logps/rejected": -54.10321044921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8103973865509033, "rewards/margins": 6.4891462326049805, "rewards/rejected": -8.299543380737305, "step": 298 }, { "epoch": 0.6, "grad_norm": 21.25967788696289, "learning_rate": 4.448148148148148e-07, "logps/chosen": -55.163639068603516, "logps/rejected": -189.49459838867188, "loss": 0.0382, "losses/dpo": 0.01291961781680584, "losses/sft": 0.5291265249252319, "losses/total": 0.01291961781680584, "ref_logps/chosen": -25.353343963623047, "ref_logps/rejected": -74.54182434082031, "rewards/accuracies": 1.0, "rewards/chosen": -2.981029510498047, "rewards/margins": 8.514249801635742, "rewards/rejected": -11.495279312133789, "step": 299 }, { "epoch": 0.6, "grad_norm": 52.146820068359375, "learning_rate": 4.444444444444444e-07, "logps/chosen": -40.749237060546875, "logps/rejected": -144.6281280517578, "loss": 0.0857, "losses/dpo": 0.03312437981367111, "losses/sft": 0.7374606132507324, "losses/total": 0.03312437981367111, "ref_logps/chosen": -16.874542236328125, "ref_logps/rejected": -57.76238250732422, "rewards/accuracies": 0.9375, "rewards/chosen": -2.38746976852417, "rewards/margins": 6.299104690551758, "rewards/rejected": -8.686574935913086, "step": 300 }, { "epoch": 0.6, "grad_norm": 40.0439567565918, "learning_rate": 4.44074074074074e-07, "logps/chosen": -35.04018020629883, "logps/rejected": -151.41586303710938, "loss": 0.1326, "losses/dpo": 0.020077509805560112, "losses/sft": 0.6906237006187439, "losses/total": 0.020077509805560112, "ref_logps/chosen": -12.195897102355957, "ref_logps/rejected": -57.81694030761719, "rewards/accuracies": 0.875, "rewards/chosen": -2.284428358078003, "rewards/margins": 7.075465202331543, "rewards/rejected": -9.359892845153809, "step": 301 }, { "epoch": 0.6, "grad_norm": 31.01496696472168, "learning_rate": 4.4370370370370367e-07, "logps/chosen": -42.011356353759766, "logps/rejected": -120.56248474121094, "loss": 0.0772, "losses/dpo": 0.14634078741073608, "losses/sft": 1.0607680082321167, "losses/total": 0.14634078741073608, "ref_logps/chosen": -14.41528606414795, "ref_logps/rejected": -45.77004623413086, "rewards/accuracies": 1.0, "rewards/chosen": -2.7596068382263184, "rewards/margins": 4.7196364402771, "rewards/rejected": -7.479243278503418, "step": 302 }, { "epoch": 0.61, "grad_norm": 28.67228126525879, "learning_rate": 4.4333333333333336e-07, "logps/chosen": -46.753761291503906, "logps/rejected": -114.69883728027344, "loss": 0.1173, "losses/dpo": 0.042167238891124725, "losses/sft": 0.6821388006210327, "losses/total": 0.042167238891124725, "ref_logps/chosen": -17.26732635498047, "ref_logps/rejected": -40.17786407470703, "rewards/accuracies": 0.9375, "rewards/chosen": -2.948643684387207, "rewards/margins": 4.503453254699707, "rewards/rejected": -7.452096939086914, "step": 303 }, { "epoch": 0.61, "grad_norm": 44.84244918823242, "learning_rate": 4.4296296296296295e-07, "logps/chosen": -39.67682647705078, "logps/rejected": -121.08567810058594, "loss": 0.1254, "losses/dpo": 0.12976478040218353, "losses/sft": 0.7743264436721802, "losses/total": 0.12976478040218353, "ref_logps/chosen": -14.958328247070312, "ref_logps/rejected": -45.83427047729492, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4718499183654785, "rewards/margins": 5.053291320800781, "rewards/rejected": -7.525140762329102, "step": 304 }, { "epoch": 0.61, "grad_norm": 53.262367248535156, "learning_rate": 4.425925925925926e-07, "logps/chosen": -36.113426208496094, "logps/rejected": -141.60494995117188, "loss": 0.193, "losses/dpo": 0.5892627835273743, "losses/sft": 0.7533572316169739, "losses/total": 0.5892627835273743, "ref_logps/chosen": -13.810129165649414, "ref_logps/rejected": -58.52567672729492, "rewards/accuracies": 0.9375, "rewards/chosen": -2.230329990386963, "rewards/margins": 6.077596664428711, "rewards/rejected": -8.307926177978516, "step": 305 }, { "epoch": 0.61, "grad_norm": 26.402587890625, "learning_rate": 4.4222222222222223e-07, "logps/chosen": -33.75492858886719, "logps/rejected": -116.12544250488281, "loss": 0.1265, "losses/dpo": 0.22533872723579407, "losses/sft": 0.7445718050003052, "losses/total": 0.22533872723579407, "ref_logps/chosen": -14.405969619750977, "ref_logps/rejected": -45.15299987792969, "rewards/accuracies": 0.875, "rewards/chosen": -1.9348959922790527, "rewards/margins": 5.162348747253418, "rewards/rejected": -7.097244739532471, "step": 306 }, { "epoch": 0.61, "grad_norm": 40.44252014160156, "learning_rate": 4.418518518518518e-07, "logps/chosen": -36.53474426269531, "logps/rejected": -120.66770935058594, "loss": 0.1622, "losses/dpo": 0.1868322640657425, "losses/sft": 0.7348847389221191, "losses/total": 0.1868322640657425, "ref_logps/chosen": -10.541549682617188, "ref_logps/rejected": -43.96518325805664, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5993194580078125, "rewards/margins": 5.070933818817139, "rewards/rejected": -7.670252799987793, "step": 307 }, { "epoch": 0.62, "grad_norm": 33.434852600097656, "learning_rate": 4.4148148148148146e-07, "logps/chosen": -32.63652038574219, "logps/rejected": -122.68383026123047, "loss": 0.094, "losses/dpo": 0.08705702424049377, "losses/sft": 0.7313491106033325, "losses/total": 0.08705702424049377, "ref_logps/chosen": -13.012248039245605, "ref_logps/rejected": -46.19256591796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9624271392822266, "rewards/margins": 5.686699867248535, "rewards/rejected": -7.649127006530762, "step": 308 }, { "epoch": 0.62, "grad_norm": 61.25316619873047, "learning_rate": 4.411111111111111e-07, "logps/chosen": -36.29332733154297, "logps/rejected": -89.23573303222656, "loss": 0.1863, "losses/dpo": 0.2902761995792389, "losses/sft": 0.6693310141563416, "losses/total": 0.2902761995792389, "ref_logps/chosen": -13.322938919067383, "ref_logps/rejected": -34.61989974975586, "rewards/accuracies": 0.875, "rewards/chosen": -2.2970387935638428, "rewards/margins": 3.1645452976226807, "rewards/rejected": -5.461584091186523, "step": 309 }, { "epoch": 0.62, "grad_norm": 15.663551330566406, "learning_rate": 4.4074074074074074e-07, "logps/chosen": -39.670867919921875, "logps/rejected": -113.9607925415039, "loss": 0.0431, "losses/dpo": 0.05196515470743179, "losses/sft": 0.46948933601379395, "losses/total": 0.05196515470743179, "ref_logps/chosen": -17.334888458251953, "ref_logps/rejected": -42.784278869628906, "rewards/accuracies": 1.0, "rewards/chosen": -2.233597755432129, "rewards/margins": 4.8840532302856445, "rewards/rejected": -7.11765193939209, "step": 310 }, { "epoch": 0.62, "grad_norm": 52.19047164916992, "learning_rate": 4.4037037037037033e-07, "logps/chosen": -44.69192123413086, "logps/rejected": -132.44076538085938, "loss": 0.222, "losses/dpo": 0.32939571142196655, "losses/sft": 0.918161153793335, "losses/total": 0.32939571142196655, "ref_logps/chosen": -15.003911972045898, "ref_logps/rejected": -56.22412872314453, "rewards/accuracies": 0.875, "rewards/chosen": -2.9688005447387695, "rewards/margins": 4.652862548828125, "rewards/rejected": -7.6216630935668945, "step": 311 }, { "epoch": 0.62, "grad_norm": 26.470958709716797, "learning_rate": 4.3999999999999997e-07, "logps/chosen": -36.404624938964844, "logps/rejected": -128.2510223388672, "loss": 0.0642, "losses/dpo": 0.0827181488275528, "losses/sft": 0.6652327179908752, "losses/total": 0.0827181488275528, "ref_logps/chosen": -17.150863647460938, "ref_logps/rejected": -50.41554260253906, "rewards/accuracies": 1.0, "rewards/chosen": -1.9253758192062378, "rewards/margins": 5.858171463012695, "rewards/rejected": -7.783547401428223, "step": 312 }, { "epoch": 0.63, "grad_norm": 68.88140869140625, "learning_rate": 4.396296296296296e-07, "logps/chosen": -48.03262710571289, "logps/rejected": -126.91957092285156, "loss": 0.2486, "losses/dpo": 0.16818460822105408, "losses/sft": 0.5306582450866699, "losses/total": 0.16818460822105408, "ref_logps/chosen": -19.19322967529297, "ref_logps/rejected": -51.45057678222656, "rewards/accuracies": 0.875, "rewards/chosen": -2.883939743041992, "rewards/margins": 4.662960052490234, "rewards/rejected": -7.546899318695068, "step": 313 }, { "epoch": 0.63, "grad_norm": 71.32614135742188, "learning_rate": 4.392592592592592e-07, "logps/chosen": -33.718597412109375, "logps/rejected": -118.68995666503906, "loss": 0.2059, "losses/dpo": 0.5544869899749756, "losses/sft": 1.09904944896698, "losses/total": 0.5544869899749756, "ref_logps/chosen": -12.754974365234375, "ref_logps/rejected": -46.52629852294922, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0963621139526367, "rewards/margins": 5.120002746582031, "rewards/rejected": -7.216364860534668, "step": 314 }, { "epoch": 0.63, "grad_norm": 45.111080169677734, "learning_rate": 4.3888888888888884e-07, "logps/chosen": -29.031845092773438, "logps/rejected": -102.54756927490234, "loss": 0.1605, "losses/dpo": 0.19417327642440796, "losses/sft": 0.4396215081214905, "losses/total": 0.19417327642440796, "ref_logps/chosen": -12.911455154418945, "ref_logps/rejected": -41.638851165771484, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6120388507843018, "rewards/margins": 4.478832721710205, "rewards/rejected": -6.090871810913086, "step": 315 }, { "epoch": 0.63, "grad_norm": 44.88180923461914, "learning_rate": 4.3851851851851853e-07, "logps/chosen": -32.31817626953125, "logps/rejected": -94.99343872070312, "loss": 0.1621, "losses/dpo": 0.2723809778690338, "losses/sft": 0.5749231576919556, "losses/total": 0.2723809778690338, "ref_logps/chosen": -13.24195671081543, "ref_logps/rejected": -34.43061065673828, "rewards/accuracies": 0.875, "rewards/chosen": -1.907622218132019, "rewards/margins": 4.148660659790039, "rewards/rejected": -6.056282997131348, "step": 316 }, { "epoch": 0.63, "grad_norm": 45.00884246826172, "learning_rate": 4.381481481481482e-07, "logps/chosen": -38.91361999511719, "logps/rejected": -127.98786926269531, "loss": 0.1514, "losses/dpo": 0.0010936926119029522, "losses/sft": 0.7295569777488708, "losses/total": 0.0010936926119029522, "ref_logps/chosen": -15.756044387817383, "ref_logps/rejected": -47.62613296508789, "rewards/accuracies": 0.875, "rewards/chosen": -2.3157577514648438, "rewards/margins": 5.720416069030762, "rewards/rejected": -8.036173820495605, "step": 317 }, { "epoch": 0.64, "grad_norm": 18.036989212036133, "learning_rate": 4.3777777777777776e-07, "logps/chosen": -33.98314666748047, "logps/rejected": -131.54330444335938, "loss": 0.0428, "losses/dpo": 0.06015072390437126, "losses/sft": 0.5692850947380066, "losses/total": 0.06015072390437126, "ref_logps/chosen": -17.438108444213867, "ref_logps/rejected": -49.704689025878906, "rewards/accuracies": 1.0, "rewards/chosen": -1.6545041799545288, "rewards/margins": 6.529358863830566, "rewards/rejected": -8.183862686157227, "step": 318 }, { "epoch": 0.64, "grad_norm": 39.62465286254883, "learning_rate": 4.374074074074074e-07, "logps/chosen": -39.50983810424805, "logps/rejected": -133.33090209960938, "loss": 0.0765, "losses/dpo": 0.042518407106399536, "losses/sft": 0.6064221858978271, "losses/total": 0.042518407106399536, "ref_logps/chosen": -16.553245544433594, "ref_logps/rejected": -55.0023193359375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.295659065246582, "rewards/margins": 5.537198066711426, "rewards/rejected": -7.832857131958008, "step": 319 }, { "epoch": 0.64, "grad_norm": 30.962528228759766, "learning_rate": 4.3703703703703704e-07, "logps/chosen": -34.99256134033203, "logps/rejected": -144.94845581054688, "loss": 0.0733, "losses/dpo": 0.1906587928533554, "losses/sft": 0.5711092948913574, "losses/total": 0.1906587928533554, "ref_logps/chosen": -14.728666305541992, "ref_logps/rejected": -60.68986511230469, "rewards/accuracies": 1.0, "rewards/chosen": -2.0263895988464355, "rewards/margins": 6.399470329284668, "rewards/rejected": -8.425859451293945, "step": 320 }, { "epoch": 0.64, "grad_norm": 16.429805755615234, "learning_rate": 4.3666666666666663e-07, "logps/chosen": -29.450841903686523, "logps/rejected": -111.19835662841797, "loss": 0.0472, "losses/dpo": 0.032579537481069565, "losses/sft": 0.5942339301109314, "losses/total": 0.032579537481069565, "ref_logps/chosen": -13.149361610412598, "ref_logps/rejected": -44.32951736450195, "rewards/accuracies": 1.0, "rewards/chosen": -1.630147933959961, "rewards/margins": 5.056735992431641, "rewards/rejected": -6.68688440322876, "step": 321 }, { "epoch": 0.64, "grad_norm": 9.389930725097656, "learning_rate": 4.362962962962963e-07, "logps/chosen": -33.028564453125, "logps/rejected": -126.03584289550781, "loss": 0.024, "losses/dpo": 0.03465582802891731, "losses/sft": 0.531201958656311, "losses/total": 0.03465582802891731, "ref_logps/chosen": -14.504253387451172, "ref_logps/rejected": -47.91209030151367, "rewards/accuracies": 1.0, "rewards/chosen": -1.852430820465088, "rewards/margins": 5.959944725036621, "rewards/rejected": -7.812375068664551, "step": 322 }, { "epoch": 0.65, "grad_norm": 59.17805480957031, "learning_rate": 4.359259259259259e-07, "logps/chosen": -35.84375, "logps/rejected": -89.96308135986328, "loss": 0.1587, "losses/dpo": 0.45940184593200684, "losses/sft": 0.693610668182373, "losses/total": 0.45940184593200684, "ref_logps/chosen": -13.855268478393555, "ref_logps/rejected": -31.887981414794922, "rewards/accuracies": 0.875, "rewards/chosen": -2.198847770690918, "rewards/margins": 3.6086621284484863, "rewards/rejected": -5.8075103759765625, "step": 323 }, { "epoch": 0.65, "grad_norm": 35.07844543457031, "learning_rate": 4.355555555555555e-07, "logps/chosen": -37.46888732910156, "logps/rejected": -111.22254943847656, "loss": 0.1159, "losses/dpo": 0.13895408809185028, "losses/sft": 0.5949329733848572, "losses/total": 0.13895408809185028, "ref_logps/chosen": -16.702150344848633, "ref_logps/rejected": -48.511962890625, "rewards/accuracies": 1.0, "rewards/chosen": -2.0766735076904297, "rewards/margins": 4.194385051727295, "rewards/rejected": -6.271059036254883, "step": 324 }, { "epoch": 0.65, "grad_norm": 18.968955993652344, "learning_rate": 4.3518518518518514e-07, "logps/chosen": -42.50941467285156, "logps/rejected": -129.11965942382812, "loss": 0.0579, "losses/dpo": 0.06625945121049881, "losses/sft": 0.6571711301803589, "losses/total": 0.06625945121049881, "ref_logps/chosen": -16.408336639404297, "ref_logps/rejected": -47.11133575439453, "rewards/accuracies": 1.0, "rewards/chosen": -2.610107421875, "rewards/margins": 5.590725898742676, "rewards/rejected": -8.200833320617676, "step": 325 }, { "epoch": 0.65, "grad_norm": 21.00503158569336, "learning_rate": 4.348148148148148e-07, "logps/chosen": -33.02533721923828, "logps/rejected": -137.7234344482422, "loss": 0.0548, "losses/dpo": 0.150486558675766, "losses/sft": 0.5657316446304321, "losses/total": 0.150486558675766, "ref_logps/chosen": -16.13014030456543, "ref_logps/rejected": -54.781654357910156, "rewards/accuracies": 1.0, "rewards/chosen": -1.6895192861557007, "rewards/margins": 6.604659080505371, "rewards/rejected": -8.294178009033203, "step": 326 }, { "epoch": 0.65, "grad_norm": 9.425925254821777, "learning_rate": 4.344444444444444e-07, "logps/chosen": -29.731834411621094, "logps/rejected": -135.8011474609375, "loss": 0.0229, "losses/dpo": 0.01692948304116726, "losses/sft": 0.44572684168815613, "losses/total": 0.01692948304116726, "ref_logps/chosen": -14.26729965209961, "ref_logps/rejected": -50.366058349609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5464532375335693, "rewards/margins": 6.9970550537109375, "rewards/rejected": -8.543508529663086, "step": 327 }, { "epoch": 0.66, "grad_norm": 38.38417434692383, "learning_rate": 4.34074074074074e-07, "logps/chosen": -42.788551330566406, "logps/rejected": -121.70774841308594, "loss": 0.0896, "losses/dpo": 0.19306769967079163, "losses/sft": 0.448386549949646, "losses/total": 0.19306769967079163, "ref_logps/chosen": -21.298828125, "ref_logps/rejected": -44.85980987548828, "rewards/accuracies": 1.0, "rewards/chosen": -2.1489720344543457, "rewards/margins": 5.535821437835693, "rewards/rejected": -7.684793472290039, "step": 328 }, { "epoch": 0.66, "grad_norm": 74.86832427978516, "learning_rate": 4.337037037037037e-07, "logps/chosen": -37.2952766418457, "logps/rejected": -93.89851379394531, "loss": 0.2036, "losses/dpo": 0.1406317800283432, "losses/sft": 0.678645133972168, "losses/total": 0.1406317800283432, "ref_logps/chosen": -12.764579772949219, "ref_logps/rejected": -35.51376724243164, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4530696868896484, "rewards/margins": 3.3854055404663086, "rewards/rejected": -5.838475227355957, "step": 329 }, { "epoch": 0.66, "grad_norm": 101.73636627197266, "learning_rate": 4.3333333333333335e-07, "logps/chosen": -35.815696716308594, "logps/rejected": -116.6551513671875, "loss": 0.3297, "losses/dpo": 0.45946767926216125, "losses/sft": 0.6446714997291565, "losses/total": 0.45946767926216125, "ref_logps/chosen": -14.69547176361084, "ref_logps/rejected": -42.31071472167969, "rewards/accuracies": 0.8125, "rewards/chosen": -2.112022876739502, "rewards/margins": 5.322422027587891, "rewards/rejected": -7.434444427490234, "step": 330 }, { "epoch": 0.66, "grad_norm": 15.054662704467773, "learning_rate": 4.3296296296296294e-07, "logps/chosen": -31.542770385742188, "logps/rejected": -104.85406494140625, "loss": 0.049, "losses/dpo": 0.014171997085213661, "losses/sft": 0.7473487257957458, "losses/total": 0.014171997085213661, "ref_logps/chosen": -11.19998550415039, "ref_logps/rejected": -35.349510192871094, "rewards/accuracies": 1.0, "rewards/chosen": -2.034278392791748, "rewards/margins": 4.916176795959473, "rewards/rejected": -6.950455188751221, "step": 331 }, { "epoch": 0.66, "grad_norm": 34.530670166015625, "learning_rate": 4.325925925925926e-07, "logps/chosen": -36.74336242675781, "logps/rejected": -105.05992126464844, "loss": 0.0749, "losses/dpo": 0.05851326882839203, "losses/sft": 0.5923424959182739, "losses/total": 0.05851326882839203, "ref_logps/chosen": -14.333761215209961, "ref_logps/rejected": -31.491056442260742, "rewards/accuracies": 0.9375, "rewards/chosen": -2.240960121154785, "rewards/margins": 5.115926742553711, "rewards/rejected": -7.356886863708496, "step": 332 }, { "epoch": 0.67, "grad_norm": 53.47503662109375, "learning_rate": 4.322222222222222e-07, "logps/chosen": -40.43827819824219, "logps/rejected": -136.73532104492188, "loss": 0.1446, "losses/dpo": 0.0015527131035923958, "losses/sft": 0.5699902772903442, "losses/total": 0.0015527131035923958, "ref_logps/chosen": -15.056352615356445, "ref_logps/rejected": -47.89313507080078, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5381927490234375, "rewards/margins": 6.34602689743042, "rewards/rejected": -8.884220123291016, "step": 333 }, { "epoch": 0.67, "grad_norm": 20.74199867248535, "learning_rate": 4.3185185185185186e-07, "logps/chosen": -32.69434356689453, "logps/rejected": -102.65889739990234, "loss": 0.0676, "losses/dpo": 0.1204390749335289, "losses/sft": 0.6606833934783936, "losses/total": 0.1204390749335289, "ref_logps/chosen": -12.64737319946289, "ref_logps/rejected": -35.22722625732422, "rewards/accuracies": 1.0, "rewards/chosen": -2.004696846008301, "rewards/margins": 4.738471031188965, "rewards/rejected": -6.743167877197266, "step": 334 }, { "epoch": 0.67, "grad_norm": 26.58478546142578, "learning_rate": 4.3148148148148145e-07, "logps/chosen": -35.19960021972656, "logps/rejected": -93.163330078125, "loss": 0.0759, "losses/dpo": 0.14587007462978363, "losses/sft": 0.7762695550918579, "losses/total": 0.14587007462978363, "ref_logps/chosen": -9.405826568603516, "ref_logps/rejected": -29.789966583251953, "rewards/accuracies": 1.0, "rewards/chosen": -2.5793776512145996, "rewards/margins": 3.7579588890075684, "rewards/rejected": -6.337336540222168, "step": 335 }, { "epoch": 0.67, "grad_norm": 26.999387741088867, "learning_rate": 4.311111111111111e-07, "logps/chosen": -30.46701431274414, "logps/rejected": -122.60038757324219, "loss": 0.0843, "losses/dpo": 0.00098854408133775, "losses/sft": 0.6043179631233215, "losses/total": 0.00098854408133775, "ref_logps/chosen": -13.824264526367188, "ref_logps/rejected": -42.21704864501953, "rewards/accuracies": 1.0, "rewards/chosen": -1.6642749309539795, "rewards/margins": 6.374058723449707, "rewards/rejected": -8.038333892822266, "step": 336 }, { "epoch": 0.67, "grad_norm": 14.741119384765625, "learning_rate": 4.3074074074074073e-07, "logps/chosen": -31.685304641723633, "logps/rejected": -134.98548889160156, "loss": 0.1284, "losses/dpo": 0.107744500041008, "losses/sft": 0.6917870044708252, "losses/total": 0.107744500041008, "ref_logps/chosen": -11.8071928024292, "ref_logps/rejected": -49.07727813720703, "rewards/accuracies": 0.875, "rewards/chosen": -1.9878110885620117, "rewards/margins": 6.603010654449463, "rewards/rejected": -8.590821266174316, "step": 337 }, { "epoch": 0.68, "grad_norm": 20.093679428100586, "learning_rate": 4.303703703703703e-07, "logps/chosen": -42.22148132324219, "logps/rejected": -143.92608642578125, "loss": 0.0437, "losses/dpo": 0.0012397286482155323, "losses/sft": 0.7648298740386963, "losses/total": 0.0012397286482155323, "ref_logps/chosen": -19.591176986694336, "ref_logps/rejected": -57.07090759277344, "rewards/accuracies": 1.0, "rewards/chosen": -2.2630302906036377, "rewards/margins": 6.422488689422607, "rewards/rejected": -8.685519218444824, "step": 338 }, { "epoch": 0.68, "grad_norm": 40.627845764160156, "learning_rate": 4.2999999999999996e-07, "logps/chosen": -45.338165283203125, "logps/rejected": -124.24771118164062, "loss": 0.1153, "losses/dpo": 0.03297574073076248, "losses/sft": 0.7030767798423767, "losses/total": 0.03297574073076248, "ref_logps/chosen": -17.034236907958984, "ref_logps/rejected": -46.402137756347656, "rewards/accuracies": 0.9375, "rewards/chosen": -2.830392837524414, "rewards/margins": 4.954164981842041, "rewards/rejected": -7.784557819366455, "step": 339 }, { "epoch": 0.68, "grad_norm": 21.781951904296875, "learning_rate": 4.296296296296296e-07, "logps/chosen": -38.99799728393555, "logps/rejected": -129.36880493164062, "loss": 0.0632, "losses/dpo": 0.17519626021385193, "losses/sft": 0.4823850393295288, "losses/total": 0.17519626021385193, "ref_logps/chosen": -19.250946044921875, "ref_logps/rejected": -46.28950881958008, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9747052192687988, "rewards/margins": 6.333225250244141, "rewards/rejected": -8.307929992675781, "step": 340 }, { "epoch": 0.68, "grad_norm": 33.246971130371094, "learning_rate": 4.2925925925925924e-07, "logps/chosen": -34.87786865234375, "logps/rejected": -110.35307312011719, "loss": 0.1453, "losses/dpo": 0.2280084192752838, "losses/sft": 0.822892427444458, "losses/total": 0.2280084192752838, "ref_logps/chosen": -10.642579078674316, "ref_logps/rejected": -36.44561004638672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4235291481018066, "rewards/margins": 4.967216968536377, "rewards/rejected": -7.390746116638184, "step": 341 }, { "epoch": 0.68, "grad_norm": 25.239110946655273, "learning_rate": 4.2888888888888883e-07, "logps/chosen": -34.88047790527344, "logps/rejected": -112.80699157714844, "loss": 0.073, "losses/dpo": 0.13405922055244446, "losses/sft": 0.5381667017936707, "losses/total": 0.13405922055244446, "ref_logps/chosen": -16.415294647216797, "ref_logps/rejected": -45.48058319091797, "rewards/accuracies": 1.0, "rewards/chosen": -1.8465182781219482, "rewards/margins": 4.886122226715088, "rewards/rejected": -6.732640266418457, "step": 342 }, { "epoch": 0.69, "grad_norm": 63.96232986450195, "learning_rate": 4.285185185185185e-07, "logps/chosen": -42.0777473449707, "logps/rejected": -113.29154968261719, "loss": 0.2174, "losses/dpo": 0.3859434127807617, "losses/sft": 0.7559604048728943, "losses/total": 0.3859434127807617, "ref_logps/chosen": -14.650555610656738, "ref_logps/rejected": -45.057212829589844, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7427194118499756, "rewards/margins": 4.080714225769043, "rewards/rejected": -6.8234333992004395, "step": 343 }, { "epoch": 0.69, "grad_norm": 56.83890151977539, "learning_rate": 4.2814814814814816e-07, "logps/chosen": -36.53527069091797, "logps/rejected": -108.47628784179688, "loss": 0.103, "losses/dpo": 0.31091389060020447, "losses/sft": 0.9659160375595093, "losses/total": 0.31091389060020447, "ref_logps/chosen": -11.020648956298828, "ref_logps/rejected": -31.206253051757812, "rewards/accuracies": 0.9375, "rewards/chosen": -2.551462173461914, "rewards/margins": 5.175541877746582, "rewards/rejected": -7.72700309753418, "step": 344 }, { "epoch": 0.69, "grad_norm": 58.79685974121094, "learning_rate": 4.2777777777777775e-07, "logps/chosen": -42.97200012207031, "logps/rejected": -116.67851257324219, "loss": 0.1638, "losses/dpo": 0.14491723477840424, "losses/sft": 0.57149738073349, "losses/total": 0.14491723477840424, "ref_logps/chosen": -16.25571060180664, "ref_logps/rejected": -47.97196578979492, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6716291904449463, "rewards/margins": 4.199025630950928, "rewards/rejected": -6.870654582977295, "step": 345 }, { "epoch": 0.69, "grad_norm": 35.571510314941406, "learning_rate": 4.274074074074074e-07, "logps/chosen": -43.3200798034668, "logps/rejected": -131.97885131835938, "loss": 0.0656, "losses/dpo": 0.10765451937913895, "losses/sft": 0.9277602434158325, "losses/total": 0.10765451937913895, "ref_logps/chosen": -16.17816925048828, "ref_logps/rejected": -45.527259826660156, "rewards/accuracies": 1.0, "rewards/chosen": -2.71419095993042, "rewards/margins": 5.930967330932617, "rewards/rejected": -8.645158767700195, "step": 346 }, { "epoch": 0.69, "grad_norm": 87.16278839111328, "learning_rate": 4.2703703703703703e-07, "logps/chosen": -45.16498947143555, "logps/rejected": -122.59102630615234, "loss": 0.1582, "losses/dpo": 0.17615115642547607, "losses/sft": 0.7514989376068115, "losses/total": 0.17615115642547607, "ref_logps/chosen": -14.824213027954102, "ref_logps/rejected": -43.101112365722656, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0340776443481445, "rewards/margins": 4.914913654327393, "rewards/rejected": -7.948991298675537, "step": 347 }, { "epoch": 0.7, "grad_norm": 14.873774528503418, "learning_rate": 4.266666666666667e-07, "logps/chosen": -37.2757568359375, "logps/rejected": -136.62548828125, "loss": 0.0518, "losses/dpo": 0.0002141115692211315, "losses/sft": 0.6635446548461914, "losses/total": 0.0002141115692211315, "ref_logps/chosen": -13.512396812438965, "ref_logps/rejected": -46.763038635253906, "rewards/accuracies": 1.0, "rewards/chosen": -2.376336097717285, "rewards/margins": 6.609910011291504, "rewards/rejected": -8.986246109008789, "step": 348 }, { "epoch": 0.7, "grad_norm": 46.89150619506836, "learning_rate": 4.2629629629629626e-07, "logps/chosen": -50.33883285522461, "logps/rejected": -103.36832427978516, "loss": 0.1483, "losses/dpo": 0.06724968552589417, "losses/sft": 0.8436750173568726, "losses/total": 0.06724968552589417, "ref_logps/chosen": -18.469154357910156, "ref_logps/rejected": -33.844810485839844, "rewards/accuracies": 0.9375, "rewards/chosen": -3.1869678497314453, "rewards/margins": 3.7653839588165283, "rewards/rejected": -6.9523515701293945, "step": 349 }, { "epoch": 0.7, "grad_norm": 22.70137596130371, "learning_rate": 4.259259259259259e-07, "logps/chosen": -36.81205749511719, "logps/rejected": -128.5601806640625, "loss": 0.0615, "losses/dpo": 0.08877705037593842, "losses/sft": 0.5782928466796875, "losses/total": 0.08877705037593842, "ref_logps/chosen": -18.262386322021484, "ref_logps/rejected": -46.65271759033203, "rewards/accuracies": 1.0, "rewards/chosen": -1.8549673557281494, "rewards/margins": 6.33577823638916, "rewards/rejected": -8.190746307373047, "step": 350 }, { "epoch": 0.7, "grad_norm": 31.05264663696289, "learning_rate": 4.2555555555555555e-07, "logps/chosen": -49.91789245605469, "logps/rejected": -129.2419891357422, "loss": 0.0952, "losses/dpo": 0.199846088886261, "losses/sft": 0.975861668586731, "losses/total": 0.199846088886261, "ref_logps/chosen": -13.545465469360352, "ref_logps/rejected": -42.03595733642578, "rewards/accuracies": 1.0, "rewards/chosen": -3.6372427940368652, "rewards/margins": 5.083359718322754, "rewards/rejected": -8.720602989196777, "step": 351 }, { "epoch": 0.7, "grad_norm": 48.4364013671875, "learning_rate": 4.2518518518518513e-07, "logps/chosen": -38.42912673950195, "logps/rejected": -151.83380126953125, "loss": 0.086, "losses/dpo": 0.02164197713136673, "losses/sft": 0.6958895325660706, "losses/total": 0.02164197713136673, "ref_logps/chosen": -14.27855396270752, "ref_logps/rejected": -56.75341033935547, "rewards/accuracies": 0.9375, "rewards/chosen": -2.415057420730591, "rewards/margins": 7.092981338500977, "rewards/rejected": -9.508038520812988, "step": 352 }, { "epoch": 0.71, "grad_norm": 33.09815216064453, "learning_rate": 4.248148148148148e-07, "logps/chosen": -36.29337692260742, "logps/rejected": -129.50125122070312, "loss": 0.121, "losses/dpo": 0.18076889216899872, "losses/sft": 0.9632574915885925, "losses/total": 0.18076889216899872, "ref_logps/chosen": -11.399040222167969, "ref_logps/rejected": -50.23072814941406, "rewards/accuracies": 0.875, "rewards/chosen": -2.4894332885742188, "rewards/margins": 5.437620162963867, "rewards/rejected": -7.927053451538086, "step": 353 }, { "epoch": 0.71, "grad_norm": 50.44475555419922, "learning_rate": 4.244444444444444e-07, "logps/chosen": -36.57550811767578, "logps/rejected": -105.3508071899414, "loss": 0.158, "losses/dpo": 0.01699255406856537, "losses/sft": 0.6166819334030151, "losses/total": 0.01699255406856537, "ref_logps/chosen": -12.352455139160156, "ref_logps/rejected": -37.243473052978516, "rewards/accuracies": 0.9375, "rewards/chosen": -2.422305107116699, "rewards/margins": 4.388428688049316, "rewards/rejected": -6.810733795166016, "step": 354 }, { "epoch": 0.71, "grad_norm": 41.80162811279297, "learning_rate": 4.24074074074074e-07, "logps/chosen": -32.989688873291016, "logps/rejected": -104.3983154296875, "loss": 0.138, "losses/dpo": 0.13521035015583038, "losses/sft": 0.48757117986679077, "losses/total": 0.13521035015583038, "ref_logps/chosen": -15.936813354492188, "ref_logps/rejected": -38.816070556640625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7052874565124512, "rewards/margins": 4.852938175201416, "rewards/rejected": -6.558225631713867, "step": 355 }, { "epoch": 0.71, "grad_norm": 27.352943420410156, "learning_rate": 4.237037037037037e-07, "logps/chosen": -31.969324111938477, "logps/rejected": -96.71903991699219, "loss": 0.1022, "losses/dpo": 0.20521797239780426, "losses/sft": 0.7039509415626526, "losses/total": 0.20521797239780426, "ref_logps/chosen": -10.606605529785156, "ref_logps/rejected": -35.394012451171875, "rewards/accuracies": 1.0, "rewards/chosen": -2.1362719535827637, "rewards/margins": 3.9962306022644043, "rewards/rejected": -6.132502555847168, "step": 356 }, { "epoch": 0.71, "grad_norm": 34.827945709228516, "learning_rate": 4.2333333333333334e-07, "logps/chosen": -39.3182373046875, "logps/rejected": -122.38554382324219, "loss": 0.0739, "losses/dpo": 0.15058466792106628, "losses/sft": 0.6575937271118164, "losses/total": 0.15058466792106628, "ref_logps/chosen": -14.849676132202148, "ref_logps/rejected": -45.92652893066406, "rewards/accuracies": 1.0, "rewards/chosen": -2.4468560218811035, "rewards/margins": 5.199045181274414, "rewards/rejected": -7.645901203155518, "step": 357 }, { "epoch": 0.72, "grad_norm": 35.0914421081543, "learning_rate": 4.22962962962963e-07, "logps/chosen": -34.46870422363281, "logps/rejected": -106.36213684082031, "loss": 0.0733, "losses/dpo": 0.22453603148460388, "losses/sft": 1.1330952644348145, "losses/total": 0.22453603148460388, "ref_logps/chosen": -12.364035606384277, "ref_logps/rejected": -34.82964324951172, "rewards/accuracies": 1.0, "rewards/chosen": -2.2104668617248535, "rewards/margins": 4.942782878875732, "rewards/rejected": -7.153249740600586, "step": 358 }, { "epoch": 0.72, "grad_norm": 26.42351531982422, "learning_rate": 4.2259259259259257e-07, "logps/chosen": -34.861663818359375, "logps/rejected": -108.93109130859375, "loss": 0.0815, "losses/dpo": 0.04924841225147247, "losses/sft": 0.89609694480896, "losses/total": 0.04924841225147247, "ref_logps/chosen": -13.483573913574219, "ref_logps/rejected": -35.11107635498047, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1378092765808105, "rewards/margins": 5.244193077087402, "rewards/rejected": -7.382001876831055, "step": 359 }, { "epoch": 0.72, "grad_norm": 78.83193969726562, "learning_rate": 4.222222222222222e-07, "logps/chosen": -37.34288787841797, "logps/rejected": -124.1546630859375, "loss": 0.1932, "losses/dpo": 0.0520499125123024, "losses/sft": 0.7105432748794556, "losses/total": 0.0520499125123024, "ref_logps/chosen": -16.80919647216797, "ref_logps/rejected": -44.207786560058594, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0533690452575684, "rewards/margins": 5.941318511962891, "rewards/rejected": -7.994687557220459, "step": 360 }, { "epoch": 0.72, "grad_norm": 16.736604690551758, "learning_rate": 4.2185185185185185e-07, "logps/chosen": -43.460426330566406, "logps/rejected": -139.9253387451172, "loss": 0.0443, "losses/dpo": 0.010741611942648888, "losses/sft": 0.5808259844779968, "losses/total": 0.010741611942648888, "ref_logps/chosen": -18.60515785217285, "ref_logps/rejected": -51.46515655517578, "rewards/accuracies": 1.0, "rewards/chosen": -2.4855270385742188, "rewards/margins": 6.360491752624512, "rewards/rejected": -8.846017837524414, "step": 361 }, { "epoch": 0.72, "grad_norm": 47.535552978515625, "learning_rate": 4.2148148148148144e-07, "logps/chosen": -32.118614196777344, "logps/rejected": -121.19420623779297, "loss": 0.156, "losses/dpo": 0.1825261265039444, "losses/sft": 0.44307950139045715, "losses/total": 0.1825261265039444, "ref_logps/chosen": -10.199451446533203, "ref_logps/rejected": -48.19236755371094, "rewards/accuracies": 1.0, "rewards/chosen": -2.1919162273406982, "rewards/margins": 5.108268737792969, "rewards/rejected": -7.300185203552246, "step": 362 }, { "epoch": 0.73, "grad_norm": 16.611141204833984, "learning_rate": 4.211111111111111e-07, "logps/chosen": -31.812637329101562, "logps/rejected": -127.39079284667969, "loss": 0.0425, "losses/dpo": 0.002231371821835637, "losses/sft": 0.6797657012939453, "losses/total": 0.002231371821835637, "ref_logps/chosen": -14.011062622070312, "ref_logps/rejected": -46.2428092956543, "rewards/accuracies": 1.0, "rewards/chosen": -1.7801575660705566, "rewards/margins": 6.334640979766846, "rewards/rejected": -8.114798545837402, "step": 363 }, { "epoch": 0.73, "grad_norm": 118.1374740600586, "learning_rate": 4.207407407407407e-07, "logps/chosen": -45.69947814941406, "logps/rejected": -123.48897552490234, "loss": 0.3475, "losses/dpo": 0.0039491476491093636, "losses/sft": 0.6805664300918579, "losses/total": 0.0039491476491093636, "ref_logps/chosen": -15.519075393676758, "ref_logps/rejected": -47.35173034667969, "rewards/accuracies": 0.875, "rewards/chosen": -3.018040657043457, "rewards/margins": 4.5956830978393555, "rewards/rejected": -7.613724231719971, "step": 364 }, { "epoch": 0.73, "grad_norm": 24.093891143798828, "learning_rate": 4.2037037037037036e-07, "logps/chosen": -37.241790771484375, "logps/rejected": -149.10354614257812, "loss": 0.078, "losses/dpo": 0.2201634645462036, "losses/sft": 0.4000563323497772, "losses/total": 0.2201634645462036, "ref_logps/chosen": -18.728097915649414, "ref_logps/rejected": -60.30243682861328, "rewards/accuracies": 1.0, "rewards/chosen": -1.8513693809509277, "rewards/margins": 7.028740406036377, "rewards/rejected": -8.880109786987305, "step": 365 }, { "epoch": 0.73, "grad_norm": 31.15038299560547, "learning_rate": 4.1999999999999995e-07, "logps/chosen": -37.69719314575195, "logps/rejected": -116.640869140625, "loss": 0.0791, "losses/dpo": 0.1046699583530426, "losses/sft": 0.5402253270149231, "losses/total": 0.1046699583530426, "ref_logps/chosen": -14.103933334350586, "ref_logps/rejected": -36.36824417114258, "rewards/accuracies": 1.0, "rewards/chosen": -2.359325885772705, "rewards/margins": 5.6679368019104, "rewards/rejected": -8.027262687683105, "step": 366 }, { "epoch": 0.73, "grad_norm": 62.569313049316406, "learning_rate": 4.196296296296296e-07, "logps/chosen": -38.47657775878906, "logps/rejected": -107.96405792236328, "loss": 0.2867, "losses/dpo": 0.33012351393699646, "losses/sft": 0.91560298204422, "losses/total": 0.33012351393699646, "ref_logps/chosen": -12.317329406738281, "ref_logps/rejected": -38.52401351928711, "rewards/accuracies": 0.875, "rewards/chosen": -2.615924835205078, "rewards/margins": 4.328080177307129, "rewards/rejected": -6.944005489349365, "step": 367 }, { "epoch": 0.74, "grad_norm": 25.4249267578125, "learning_rate": 4.1925925925925923e-07, "logps/chosen": -41.08087921142578, "logps/rejected": -132.00863647460938, "loss": 0.093, "losses/dpo": 0.175454780459404, "losses/sft": 0.7735669612884521, "losses/total": 0.175454780459404, "ref_logps/chosen": -16.551727294921875, "ref_logps/rejected": -49.65811538696289, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4529147148132324, "rewards/margins": 5.782136917114258, "rewards/rejected": -8.235052108764648, "step": 368 }, { "epoch": 0.74, "grad_norm": 38.16954040527344, "learning_rate": 4.1888888888888887e-07, "logps/chosen": -35.82998275756836, "logps/rejected": -96.82465362548828, "loss": 0.0917, "losses/dpo": 0.021983064711093903, "losses/sft": 0.570757269859314, "losses/total": 0.021983064711093903, "ref_logps/chosen": -13.760212898254395, "ref_logps/rejected": -32.04032516479492, "rewards/accuracies": 1.0, "rewards/chosen": -2.206976890563965, "rewards/margins": 4.271455764770508, "rewards/rejected": -6.478432655334473, "step": 369 }, { "epoch": 0.74, "grad_norm": 18.851280212402344, "learning_rate": 4.185185185185185e-07, "logps/chosen": -44.81340026855469, "logps/rejected": -119.28524780273438, "loss": 0.0781, "losses/dpo": 0.012161415070295334, "losses/sft": 0.599815845489502, "losses/total": 0.012161415070295334, "ref_logps/chosen": -19.234413146972656, "ref_logps/rejected": -46.1428108215332, "rewards/accuracies": 1.0, "rewards/chosen": -2.557898998260498, "rewards/margins": 4.756345748901367, "rewards/rejected": -7.314244270324707, "step": 370 }, { "epoch": 0.74, "grad_norm": 8.515548706054688, "learning_rate": 4.1814814814814815e-07, "logps/chosen": -36.610347747802734, "logps/rejected": -104.79900360107422, "loss": 0.0264, "losses/dpo": 0.023754268884658813, "losses/sft": 0.8289971947669983, "losses/total": 0.023754268884658813, "ref_logps/chosen": -13.560300827026367, "ref_logps/rejected": -33.50470733642578, "rewards/accuracies": 1.0, "rewards/chosen": -2.305004596710205, "rewards/margins": 4.824424743652344, "rewards/rejected": -7.129429817199707, "step": 371 }, { "epoch": 0.74, "grad_norm": 54.51543426513672, "learning_rate": 4.177777777777778e-07, "logps/chosen": -40.711090087890625, "logps/rejected": -114.20442962646484, "loss": 0.1797, "losses/dpo": 0.01912502571940422, "losses/sft": 0.7507596015930176, "losses/total": 0.01912502571940422, "ref_logps/chosen": -8.806960105895996, "ref_logps/rejected": -40.375144958496094, "rewards/accuracies": 1.0, "rewards/chosen": -3.190412998199463, "rewards/margins": 4.1925153732299805, "rewards/rejected": -7.382928371429443, "step": 372 }, { "epoch": 0.75, "grad_norm": 34.81773376464844, "learning_rate": 4.174074074074074e-07, "logps/chosen": -39.61103057861328, "logps/rejected": -139.82696533203125, "loss": 0.0626, "losses/dpo": 0.0014186090556904674, "losses/sft": 0.5715436339378357, "losses/total": 0.0014186090556904674, "ref_logps/chosen": -19.26919937133789, "ref_logps/rejected": -52.185546875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0341830253601074, "rewards/margins": 6.729958534240723, "rewards/rejected": -8.764141082763672, "step": 373 }, { "epoch": 0.75, "grad_norm": 61.708709716796875, "learning_rate": 4.17037037037037e-07, "logps/chosen": -38.32596969604492, "logps/rejected": -139.75997924804688, "loss": 0.1031, "losses/dpo": 0.03926457092165947, "losses/sft": 0.5558092594146729, "losses/total": 0.03926457092165947, "ref_logps/chosen": -14.84549617767334, "ref_logps/rejected": -56.434959411621094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3480472564697266, "rewards/margins": 5.984454154968262, "rewards/rejected": -8.332502365112305, "step": 374 }, { "epoch": 0.75, "grad_norm": 38.21398162841797, "learning_rate": 4.1666666666666667e-07, "logps/chosen": -29.691017150878906, "logps/rejected": -132.10032653808594, "loss": 0.0956, "losses/dpo": 0.05562710016965866, "losses/sft": 0.7517297267913818, "losses/total": 0.05562710016965866, "ref_logps/chosen": -9.127862930297852, "ref_logps/rejected": -46.169281005859375, "rewards/accuracies": 1.0, "rewards/chosen": -2.0563154220581055, "rewards/margins": 6.5367889404296875, "rewards/rejected": -8.593104362487793, "step": 375 }, { "epoch": 0.75, "grad_norm": 39.852298736572266, "learning_rate": 4.1629629629629625e-07, "logps/chosen": -34.75245666503906, "logps/rejected": -136.33731079101562, "loss": 0.142, "losses/dpo": 0.31215742230415344, "losses/sft": 0.5526888370513916, "losses/total": 0.31215742230415344, "ref_logps/chosen": -15.972925186157227, "ref_logps/rejected": -50.73971176147461, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8779528141021729, "rewards/margins": 6.681807518005371, "rewards/rejected": -8.559760093688965, "step": 376 }, { "epoch": 0.75, "grad_norm": 36.76629638671875, "learning_rate": 4.159259259259259e-07, "logps/chosen": -38.22616195678711, "logps/rejected": -103.80357360839844, "loss": 0.1058, "losses/dpo": 0.12199509143829346, "losses/sft": 0.79285728931427, "losses/total": 0.12199509143829346, "ref_logps/chosen": -13.101573944091797, "ref_logps/rejected": -33.90134048461914, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5124588012695312, "rewards/margins": 4.47776460647583, "rewards/rejected": -6.9902238845825195, "step": 377 }, { "epoch": 0.76, "grad_norm": 49.55006408691406, "learning_rate": 4.1555555555555554e-07, "logps/chosen": -37.08061218261719, "logps/rejected": -144.79342651367188, "loss": 0.1404, "losses/dpo": 0.3125501871109009, "losses/sft": 0.604672372341156, "losses/total": 0.3125501871109009, "ref_logps/chosen": -16.00560760498047, "ref_logps/rejected": -55.147979736328125, "rewards/accuracies": 1.0, "rewards/chosen": -2.1075005531311035, "rewards/margins": 6.857043266296387, "rewards/rejected": -8.964544296264648, "step": 378 }, { "epoch": 0.76, "grad_norm": 19.39054298400879, "learning_rate": 4.151851851851852e-07, "logps/chosen": -33.039878845214844, "logps/rejected": -144.43283081054688, "loss": 0.0474, "losses/dpo": 0.03315652906894684, "losses/sft": 0.6851824522018433, "losses/total": 0.03315652906894684, "ref_logps/chosen": -11.591955184936523, "ref_logps/rejected": -54.99343490600586, "rewards/accuracies": 1.0, "rewards/chosen": -2.144792079925537, "rewards/margins": 6.79914665222168, "rewards/rejected": -8.943939208984375, "step": 379 }, { "epoch": 0.76, "grad_norm": 14.704289436340332, "learning_rate": 4.1481481481481476e-07, "logps/chosen": -36.49052810668945, "logps/rejected": -112.64339447021484, "loss": 0.0508, "losses/dpo": 0.05176647752523422, "losses/sft": 0.7867259979248047, "losses/total": 0.05176647752523422, "ref_logps/chosen": -16.324478149414062, "ref_logps/rejected": -38.75141906738281, "rewards/accuracies": 1.0, "rewards/chosen": -2.0166049003601074, "rewards/margins": 5.372592926025391, "rewards/rejected": -7.389198303222656, "step": 380 }, { "epoch": 0.76, "grad_norm": 9.825971603393555, "learning_rate": 4.144444444444444e-07, "logps/chosen": -38.247528076171875, "logps/rejected": -134.92242431640625, "loss": 0.0183, "losses/dpo": 0.037824541330337524, "losses/sft": 0.624715268611908, "losses/total": 0.037824541330337524, "ref_logps/chosen": -16.953731536865234, "ref_logps/rejected": -50.335994720458984, "rewards/accuracies": 1.0, "rewards/chosen": -2.1293797492980957, "rewards/margins": 6.329263687133789, "rewards/rejected": -8.458642959594727, "step": 381 }, { "epoch": 0.76, "grad_norm": 9.921040534973145, "learning_rate": 4.140740740740741e-07, "logps/chosen": -41.63106155395508, "logps/rejected": -138.38983154296875, "loss": 0.0245, "losses/dpo": 0.024639783427119255, "losses/sft": 0.7088327407836914, "losses/total": 0.024639783427119255, "ref_logps/chosen": -16.87957763671875, "ref_logps/rejected": -50.51683807373047, "rewards/accuracies": 1.0, "rewards/chosen": -2.4751484394073486, "rewards/margins": 6.312151908874512, "rewards/rejected": -8.787300109863281, "step": 382 }, { "epoch": 0.77, "grad_norm": 11.32041072845459, "learning_rate": 4.137037037037037e-07, "logps/chosen": -41.30024337768555, "logps/rejected": -127.23213958740234, "loss": 0.0502, "losses/dpo": 0.11755054444074631, "losses/sft": 0.6314442157745361, "losses/total": 0.11755054444074631, "ref_logps/chosen": -22.129472732543945, "ref_logps/rejected": -50.32065200805664, "rewards/accuracies": 1.0, "rewards/chosen": -1.917076826095581, "rewards/margins": 5.77407169342041, "rewards/rejected": -7.69114875793457, "step": 383 }, { "epoch": 0.77, "grad_norm": 50.16292953491211, "learning_rate": 4.1333333333333333e-07, "logps/chosen": -44.699562072753906, "logps/rejected": -103.92740631103516, "loss": 0.0995, "losses/dpo": 0.205742746591568, "losses/sft": 0.5135591626167297, "losses/total": 0.205742746591568, "ref_logps/chosen": -21.934329986572266, "ref_logps/rejected": -38.492950439453125, "rewards/accuracies": 1.0, "rewards/chosen": -2.2765228748321533, "rewards/margins": 4.2669219970703125, "rewards/rejected": -6.543445587158203, "step": 384 }, { "epoch": 0.77, "grad_norm": 26.36185073852539, "learning_rate": 4.1296296296296297e-07, "logps/chosen": -48.22766876220703, "logps/rejected": -134.123779296875, "loss": 0.0517, "losses/dpo": 0.058786191046237946, "losses/sft": 0.8921412229537964, "losses/total": 0.058786191046237946, "ref_logps/chosen": -16.519601821899414, "ref_logps/rejected": -44.94132995605469, "rewards/accuracies": 1.0, "rewards/chosen": -3.170806884765625, "rewards/margins": 5.747437953948975, "rewards/rejected": -8.918245315551758, "step": 385 }, { "epoch": 0.77, "grad_norm": 49.396728515625, "learning_rate": 4.1259259259259256e-07, "logps/chosen": -40.38239288330078, "logps/rejected": -129.00726318359375, "loss": 0.1603, "losses/dpo": 0.26252609491348267, "losses/sft": 0.7657068371772766, "losses/total": 0.26252609491348267, "ref_logps/chosen": -14.554398536682129, "ref_logps/rejected": -49.01328659057617, "rewards/accuracies": 0.875, "rewards/chosen": -2.5827994346618652, "rewards/margins": 5.416598320007324, "rewards/rejected": -7.999397277832031, "step": 386 }, { "epoch": 0.77, "grad_norm": 32.83975601196289, "learning_rate": 4.122222222222222e-07, "logps/chosen": -40.15536880493164, "logps/rejected": -117.92662048339844, "loss": 0.0849, "losses/dpo": 0.2725093960762024, "losses/sft": 0.75511634349823, "losses/total": 0.2725093960762024, "ref_logps/chosen": -16.408966064453125, "ref_logps/rejected": -45.3177604675293, "rewards/accuracies": 1.0, "rewards/chosen": -2.374640464782715, "rewards/margins": 4.886244773864746, "rewards/rejected": -7.260885238647461, "step": 387 }, { "epoch": 0.78, "grad_norm": 55.663578033447266, "learning_rate": 4.1185185185185184e-07, "logps/chosen": -43.84164047241211, "logps/rejected": -110.4112319946289, "loss": 0.1738, "losses/dpo": 0.023735491558909416, "losses/sft": 0.7960777282714844, "losses/total": 0.023735491558909416, "ref_logps/chosen": -15.776308059692383, "ref_logps/rejected": -38.48309326171875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8065333366394043, "rewards/margins": 4.386280536651611, "rewards/rejected": -7.192813873291016, "step": 388 }, { "epoch": 0.78, "grad_norm": 46.071250915527344, "learning_rate": 4.114814814814815e-07, "logps/chosen": -41.7428092956543, "logps/rejected": -143.06747436523438, "loss": 0.1379, "losses/dpo": 0.0022226774599403143, "losses/sft": 0.5881319642066956, "losses/total": 0.0022226774599403143, "ref_logps/chosen": -15.068788528442383, "ref_logps/rejected": -46.10564422607422, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6674017906188965, "rewards/margins": 7.028781414031982, "rewards/rejected": -9.696184158325195, "step": 389 }, { "epoch": 0.78, "grad_norm": 18.005714416503906, "learning_rate": 4.1111111111111107e-07, "logps/chosen": -47.57895278930664, "logps/rejected": -116.93091583251953, "loss": 0.0512, "losses/dpo": 0.021306635811924934, "losses/sft": 0.5592812895774841, "losses/total": 0.021306635811924934, "ref_logps/chosen": -20.549213409423828, "ref_logps/rejected": -42.300048828125, "rewards/accuracies": 1.0, "rewards/chosen": -2.7029738426208496, "rewards/margins": 4.760113716125488, "rewards/rejected": -7.463088035583496, "step": 390 }, { "epoch": 0.78, "grad_norm": 62.33810043334961, "learning_rate": 4.107407407407407e-07, "logps/chosen": -44.71644592285156, "logps/rejected": -147.71343994140625, "loss": 0.1605, "losses/dpo": 0.025463107973337173, "losses/sft": 0.6098695993423462, "losses/total": 0.025463107973337173, "ref_logps/chosen": -14.803157806396484, "ref_logps/rejected": -50.16062927246094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.991328716278076, "rewards/margins": 6.76395320892334, "rewards/rejected": -9.755282402038574, "step": 391 }, { "epoch": 0.78, "grad_norm": 24.61797523498535, "learning_rate": 4.1037037037037035e-07, "logps/chosen": -45.47924041748047, "logps/rejected": -134.3936004638672, "loss": 0.0449, "losses/dpo": 0.1113649234175682, "losses/sft": 0.7880243062973022, "losses/total": 0.1113649234175682, "ref_logps/chosen": -16.07276153564453, "ref_logps/rejected": -39.296424865722656, "rewards/accuracies": 1.0, "rewards/chosen": -2.940647840499878, "rewards/margins": 6.5690693855285645, "rewards/rejected": -9.509716987609863, "step": 392 }, { "epoch": 0.79, "grad_norm": 62.34648895263672, "learning_rate": 4.0999999999999994e-07, "logps/chosen": -53.482086181640625, "logps/rejected": -143.56060791015625, "loss": 0.1232, "losses/dpo": 0.05823798477649689, "losses/sft": 1.0682368278503418, "losses/total": 0.05823798477649689, "ref_logps/chosen": -18.055744171142578, "ref_logps/rejected": -50.04603576660156, "rewards/accuracies": 0.9375, "rewards/chosen": -3.5426340103149414, "rewards/margins": 5.8088226318359375, "rewards/rejected": -9.351457595825195, "step": 393 }, { "epoch": 0.79, "grad_norm": 27.68257713317871, "learning_rate": 4.096296296296296e-07, "logps/chosen": -41.640113830566406, "logps/rejected": -128.0128631591797, "loss": 0.0461, "losses/dpo": 0.052962690591812134, "losses/sft": 0.46492961049079895, "losses/total": 0.052962690591812134, "ref_logps/chosen": -14.865592002868652, "ref_logps/rejected": -43.59786605834961, "rewards/accuracies": 1.0, "rewards/chosen": -2.6774520874023438, "rewards/margins": 5.764047622680664, "rewards/rejected": -8.441499710083008, "step": 394 }, { "epoch": 0.79, "grad_norm": 23.043331146240234, "learning_rate": 4.092592592592593e-07, "logps/chosen": -35.16297912597656, "logps/rejected": -113.73015594482422, "loss": 0.0636, "losses/dpo": 0.05132364481687546, "losses/sft": 0.5571942329406738, "losses/total": 0.05132364481687546, "ref_logps/chosen": -11.585695266723633, "ref_logps/rejected": -32.01404571533203, "rewards/accuracies": 1.0, "rewards/chosen": -2.3577282428741455, "rewards/margins": 5.813882827758789, "rewards/rejected": -8.171610832214355, "step": 395 }, { "epoch": 0.79, "grad_norm": 41.0941276550293, "learning_rate": 4.088888888888889e-07, "logps/chosen": -42.99498748779297, "logps/rejected": -106.75965881347656, "loss": 0.1797, "losses/dpo": 0.40317678451538086, "losses/sft": 0.5293663740158081, "losses/total": 0.40317678451538086, "ref_logps/chosen": -14.02493953704834, "ref_logps/rejected": -32.46437454223633, "rewards/accuracies": 0.9375, "rewards/chosen": -2.897005081176758, "rewards/margins": 4.532524108886719, "rewards/rejected": -7.429529190063477, "step": 396 }, { "epoch": 0.79, "grad_norm": 35.72065734863281, "learning_rate": 4.085185185185185e-07, "logps/chosen": -54.67194366455078, "logps/rejected": -155.37124633789062, "loss": 0.0593, "losses/dpo": 0.015326184220612049, "losses/sft": 0.7019369602203369, "losses/total": 0.015326184220612049, "ref_logps/chosen": -20.30701446533203, "ref_logps/rejected": -50.850345611572266, "rewards/accuracies": 0.9375, "rewards/chosen": -3.436492919921875, "rewards/margins": 7.015597343444824, "rewards/rejected": -10.452091217041016, "step": 397 }, { "epoch": 0.8, "grad_norm": 43.3508415222168, "learning_rate": 4.0814814814814814e-07, "logps/chosen": -47.85393524169922, "logps/rejected": -113.46194458007812, "loss": 0.1027, "losses/dpo": 0.026841329410672188, "losses/sft": 0.7813979387283325, "losses/total": 0.026841329410672188, "ref_logps/chosen": -14.83338737487793, "ref_logps/rejected": -40.13924789428711, "rewards/accuracies": 0.9375, "rewards/chosen": -3.3020548820495605, "rewards/margins": 4.030215263366699, "rewards/rejected": -7.33227014541626, "step": 398 }, { "epoch": 0.8, "grad_norm": 49.429466247558594, "learning_rate": 4.077777777777778e-07, "logps/chosen": -43.05474853515625, "logps/rejected": -133.95877075195312, "loss": 0.1155, "losses/dpo": 0.2445557415485382, "losses/sft": 0.6521463394165039, "losses/total": 0.2445557415485382, "ref_logps/chosen": -15.042274475097656, "ref_logps/rejected": -43.79792404174805, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8012471199035645, "rewards/margins": 6.214838027954102, "rewards/rejected": -9.016085624694824, "step": 399 }, { "epoch": 0.8, "grad_norm": 37.26084899902344, "learning_rate": 4.0740740740740737e-07, "logps/chosen": -38.936378479003906, "logps/rejected": -123.11367797851562, "loss": 0.097, "losses/dpo": 0.2155800759792328, "losses/sft": 0.5946239829063416, "losses/total": 0.2155800759792328, "ref_logps/chosen": -13.216875076293945, "ref_logps/rejected": -42.703304290771484, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5719504356384277, "rewards/margins": 5.469087600708008, "rewards/rejected": -8.041038513183594, "step": 400 }, { "epoch": 0.8, "grad_norm": 31.475679397583008, "learning_rate": 4.07037037037037e-07, "logps/chosen": -34.12742614746094, "logps/rejected": -130.51797485351562, "loss": 0.0595, "losses/dpo": 0.03444742411375046, "losses/sft": 0.668463945388794, "losses/total": 0.03444742411375046, "ref_logps/chosen": -15.204010009765625, "ref_logps/rejected": -47.96771240234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.8923413753509521, "rewards/margins": 6.36268424987793, "rewards/rejected": -8.255025863647461, "step": 401 }, { "epoch": 0.8, "grad_norm": 47.62376022338867, "learning_rate": 4.0666666666666666e-07, "logps/chosen": -38.50322723388672, "logps/rejected": -122.90655517578125, "loss": 0.1303, "losses/dpo": 0.1284504532814026, "losses/sft": 0.7032700777053833, "losses/total": 0.1284504532814026, "ref_logps/chosen": -14.979454040527344, "ref_logps/rejected": -43.11996841430664, "rewards/accuracies": 1.0, "rewards/chosen": -2.352377414703369, "rewards/margins": 5.62628173828125, "rewards/rejected": -7.978658676147461, "step": 402 }, { "epoch": 0.81, "grad_norm": 23.952451705932617, "learning_rate": 4.062962962962963e-07, "logps/chosen": -45.23053741455078, "logps/rejected": -132.68417358398438, "loss": 0.0404, "losses/dpo": 0.09107367694377899, "losses/sft": 0.8164919018745422, "losses/total": 0.09107367694377899, "ref_logps/chosen": -15.182647705078125, "ref_logps/rejected": -45.23405456542969, "rewards/accuracies": 1.0, "rewards/chosen": -3.004788875579834, "rewards/margins": 5.740222930908203, "rewards/rejected": -8.745011329650879, "step": 403 }, { "epoch": 0.81, "grad_norm": 53.22486114501953, "learning_rate": 4.059259259259259e-07, "logps/chosen": -42.65351867675781, "logps/rejected": -136.69607543945312, "loss": 0.1512, "losses/dpo": 0.19927635788917542, "losses/sft": 0.6840373277664185, "losses/total": 0.19927635788917542, "ref_logps/chosen": -18.133865356445312, "ref_logps/rejected": -49.21907043457031, "rewards/accuracies": 0.875, "rewards/chosen": -2.451965808868408, "rewards/margins": 6.295734882354736, "rewards/rejected": -8.747699737548828, "step": 404 }, { "epoch": 0.81, "grad_norm": 51.80613327026367, "learning_rate": 4.055555555555555e-07, "logps/chosen": -36.814353942871094, "logps/rejected": -134.43319702148438, "loss": 0.0981, "losses/dpo": 0.28013885021209717, "losses/sft": 0.7207523584365845, "losses/total": 0.28013885021209717, "ref_logps/chosen": -11.887121200561523, "ref_logps/rejected": -44.62443542480469, "rewards/accuracies": 0.9375, "rewards/chosen": -2.492722988128662, "rewards/margins": 6.488152503967285, "rewards/rejected": -8.980875015258789, "step": 405 }, { "epoch": 0.81, "grad_norm": 24.698190689086914, "learning_rate": 4.0518518518518517e-07, "logps/chosen": -31.342304229736328, "logps/rejected": -114.50019836425781, "loss": 0.0902, "losses/dpo": 0.04047441482543945, "losses/sft": 0.8049769401550293, "losses/total": 0.04047441482543945, "ref_logps/chosen": -12.826787948608398, "ref_logps/rejected": -41.65668869018555, "rewards/accuracies": 1.0, "rewards/chosen": -1.8515514135360718, "rewards/margins": 5.43280029296875, "rewards/rejected": -7.284351348876953, "step": 406 }, { "epoch": 0.81, "grad_norm": 45.53025436401367, "learning_rate": 4.0481481481481475e-07, "logps/chosen": -37.1544189453125, "logps/rejected": -121.78252410888672, "loss": 0.1076, "losses/dpo": 0.32062214612960815, "losses/sft": 0.5877598524093628, "losses/total": 0.32062214612960815, "ref_logps/chosen": -11.954411506652832, "ref_logps/rejected": -42.16912841796875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.52000093460083, "rewards/margins": 5.441339015960693, "rewards/rejected": -7.961340427398682, "step": 407 }, { "epoch": 0.82, "grad_norm": 17.013662338256836, "learning_rate": 4.044444444444444e-07, "logps/chosen": -36.11525344848633, "logps/rejected": -126.46176147460938, "loss": 0.1019, "losses/dpo": 0.06139393895864487, "losses/sft": 0.6258932948112488, "losses/total": 0.06139393895864487, "ref_logps/chosen": -12.489351272583008, "ref_logps/rejected": -47.692588806152344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3625903129577637, "rewards/margins": 5.514326095581055, "rewards/rejected": -7.87691593170166, "step": 408 }, { "epoch": 0.82, "grad_norm": 13.054990768432617, "learning_rate": 4.040740740740741e-07, "logps/chosen": -39.20123291015625, "logps/rejected": -159.31448364257812, "loss": 0.0268, "losses/dpo": 0.054594915360212326, "losses/sft": 0.7585716843605042, "losses/total": 0.054594915360212326, "ref_logps/chosen": -17.77873992919922, "ref_logps/rejected": -61.389122009277344, "rewards/accuracies": 1.0, "rewards/chosen": -2.14224910736084, "rewards/margins": 7.650287628173828, "rewards/rejected": -9.792536735534668, "step": 409 }, { "epoch": 0.82, "grad_norm": 47.22980880737305, "learning_rate": 4.0370370370370373e-07, "logps/chosen": -47.70698928833008, "logps/rejected": -131.38491821289062, "loss": 0.0699, "losses/dpo": 0.22821170091629028, "losses/sft": 1.048500657081604, "losses/total": 0.22821170091629028, "ref_logps/chosen": -16.506330490112305, "ref_logps/rejected": -40.986934661865234, "rewards/accuracies": 0.9375, "rewards/chosen": -3.120065689086914, "rewards/margins": 5.919732093811035, "rewards/rejected": -9.03979778289795, "step": 410 }, { "epoch": 0.82, "grad_norm": 22.45016860961914, "learning_rate": 4.033333333333333e-07, "logps/chosen": -40.05971908569336, "logps/rejected": -125.94699096679688, "loss": 0.1039, "losses/dpo": 0.05524634197354317, "losses/sft": 0.7447971105575562, "losses/total": 0.05524634197354317, "ref_logps/chosen": -13.532623291015625, "ref_logps/rejected": -42.57023620605469, "rewards/accuracies": 0.9375, "rewards/chosen": -2.652709484100342, "rewards/margins": 5.68496561050415, "rewards/rejected": -8.337675094604492, "step": 411 }, { "epoch": 0.82, "grad_norm": 36.30315399169922, "learning_rate": 4.0296296296296296e-07, "logps/chosen": -39.070838928222656, "logps/rejected": -163.58050537109375, "loss": 0.0916, "losses/dpo": 0.21355107426643372, "losses/sft": 0.7528839707374573, "losses/total": 0.21355107426643372, "ref_logps/chosen": -14.982392311096191, "ref_logps/rejected": -60.20352554321289, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4088447093963623, "rewards/margins": 7.928853988647461, "rewards/rejected": -10.337697982788086, "step": 412 }, { "epoch": 0.83, "grad_norm": 12.464835166931152, "learning_rate": 4.025925925925926e-07, "logps/chosen": -37.97372055053711, "logps/rejected": -147.67945861816406, "loss": 0.0455, "losses/dpo": 0.14440147578716278, "losses/sft": 1.0511807203292847, "losses/total": 0.14440147578716278, "ref_logps/chosen": -10.53607177734375, "ref_logps/rejected": -51.0684814453125, "rewards/accuracies": 1.0, "rewards/chosen": -2.743765115737915, "rewards/margins": 6.917333602905273, "rewards/rejected": -9.66109848022461, "step": 413 }, { "epoch": 0.83, "grad_norm": 34.10635757446289, "learning_rate": 4.022222222222222e-07, "logps/chosen": -48.30426025390625, "logps/rejected": -106.06845092773438, "loss": 0.0948, "losses/dpo": 0.15190474689006805, "losses/sft": 0.7456372976303101, "losses/total": 0.15190474689006805, "ref_logps/chosen": -17.072410583496094, "ref_logps/rejected": -34.360252380371094, "rewards/accuracies": 1.0, "rewards/chosen": -3.1231849193573, "rewards/margins": 4.047634601593018, "rewards/rejected": -7.170819282531738, "step": 414 }, { "epoch": 0.83, "grad_norm": 19.513986587524414, "learning_rate": 4.0185185185185183e-07, "logps/chosen": -34.9712028503418, "logps/rejected": -150.7466278076172, "loss": 0.066, "losses/dpo": 0.1424039751291275, "losses/sft": 0.4295719265937805, "losses/total": 0.1424039751291275, "ref_logps/chosen": -14.411608695983887, "ref_logps/rejected": -58.60187530517578, "rewards/accuracies": 1.0, "rewards/chosen": -2.055959701538086, "rewards/margins": 7.158515930175781, "rewards/rejected": -9.214475631713867, "step": 415 }, { "epoch": 0.83, "grad_norm": 30.81915855407715, "learning_rate": 4.0148148148148147e-07, "logps/chosen": -40.00956726074219, "logps/rejected": -146.01316833496094, "loss": 0.141, "losses/dpo": 0.2193525731563568, "losses/sft": 0.7795408964157104, "losses/total": 0.2193525731563568, "ref_logps/chosen": -14.545209884643555, "ref_logps/rejected": -53.822059631347656, "rewards/accuracies": 0.875, "rewards/chosen": -2.546435832977295, "rewards/margins": 6.672675132751465, "rewards/rejected": -9.219110488891602, "step": 416 }, { "epoch": 0.83, "grad_norm": 11.609371185302734, "learning_rate": 4.0111111111111106e-07, "logps/chosen": -38.845733642578125, "logps/rejected": -145.8206024169922, "loss": 0.0669, "losses/dpo": 0.23528042435646057, "losses/sft": 0.813218355178833, "losses/total": 0.23528042435646057, "ref_logps/chosen": -13.398239135742188, "ref_logps/rejected": -48.828125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5447497367858887, "rewards/margins": 7.154498100280762, "rewards/rejected": -9.699247360229492, "step": 417 }, { "epoch": 0.84, "grad_norm": 15.4019193649292, "learning_rate": 4.007407407407407e-07, "logps/chosen": -25.659170150756836, "logps/rejected": -113.36529541015625, "loss": 0.0456, "losses/dpo": 0.07958737760782242, "losses/sft": 0.48200511932373047, "losses/total": 0.07958737760782242, "ref_logps/chosen": -11.393777847290039, "ref_logps/rejected": -39.325225830078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.426539421081543, "rewards/margins": 5.977468490600586, "rewards/rejected": -7.404007911682129, "step": 418 }, { "epoch": 0.84, "grad_norm": 37.076507568359375, "learning_rate": 4.0037037037037034e-07, "logps/chosen": -33.09357833862305, "logps/rejected": -125.79651641845703, "loss": 0.1009, "losses/dpo": 0.3316452205181122, "losses/sft": 0.946098804473877, "losses/total": 0.3316452205181122, "ref_logps/chosen": -11.90180492401123, "ref_logps/rejected": -47.329490661621094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1191773414611816, "rewards/margins": 5.727524757385254, "rewards/rejected": -7.846702575683594, "step": 419 }, { "epoch": 0.84, "grad_norm": 31.1119384765625, "learning_rate": 4e-07, "logps/chosen": -37.595184326171875, "logps/rejected": -114.86734008789062, "loss": 0.0765, "losses/dpo": 0.15944091975688934, "losses/sft": 1.0308585166931152, "losses/total": 0.15944091975688934, "ref_logps/chosen": -14.364852905273438, "ref_logps/rejected": -39.427894592285156, "rewards/accuracies": 1.0, "rewards/chosen": -2.323032855987549, "rewards/margins": 5.220911026000977, "rewards/rejected": -7.543943881988525, "step": 420 }, { "epoch": 0.84, "grad_norm": 31.826692581176758, "learning_rate": 3.9962962962962957e-07, "logps/chosen": -38.185302734375, "logps/rejected": -102.858642578125, "loss": 0.1069, "losses/dpo": 0.007780781015753746, "losses/sft": 0.9135001301765442, "losses/total": 0.007780781015753746, "ref_logps/chosen": -12.624923706054688, "ref_logps/rejected": -36.459877014160156, "rewards/accuracies": 1.0, "rewards/chosen": -2.5560381412506104, "rewards/margins": 4.083838939666748, "rewards/rejected": -6.6398773193359375, "step": 421 }, { "epoch": 0.84, "grad_norm": 48.20242691040039, "learning_rate": 3.9925925925925926e-07, "logps/chosen": -39.689151763916016, "logps/rejected": -107.40838623046875, "loss": 0.1396, "losses/dpo": 0.08528731763362885, "losses/sft": 0.7178528308868408, "losses/total": 0.08528731763362885, "ref_logps/chosen": -14.831443786621094, "ref_logps/rejected": -35.24195861816406, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4857709407806396, "rewards/margins": 4.73087215423584, "rewards/rejected": -7.2166428565979, "step": 422 }, { "epoch": 0.85, "grad_norm": 35.273048400878906, "learning_rate": 3.988888888888889e-07, "logps/chosen": -35.35859680175781, "logps/rejected": -95.65461730957031, "loss": 0.1087, "losses/dpo": 0.16164197027683258, "losses/sft": 0.9079208374023438, "losses/total": 0.16164197027683258, "ref_logps/chosen": -11.853939056396484, "ref_logps/rejected": -33.45928192138672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.350465774536133, "rewards/margins": 3.8690683841705322, "rewards/rejected": -6.219534397125244, "step": 423 }, { "epoch": 0.85, "grad_norm": 20.826913833618164, "learning_rate": 3.985185185185185e-07, "logps/chosen": -33.33007049560547, "logps/rejected": -121.75100708007812, "loss": 0.0494, "losses/dpo": 0.08566058427095413, "losses/sft": 0.4007030129432678, "losses/total": 0.08566058427095413, "ref_logps/chosen": -18.737146377563477, "ref_logps/rejected": -46.80057144165039, "rewards/accuracies": 1.0, "rewards/chosen": -1.4592924118041992, "rewards/margins": 6.035750389099121, "rewards/rejected": -7.4950432777404785, "step": 424 }, { "epoch": 0.85, "grad_norm": 10.35861587524414, "learning_rate": 3.9814814814814813e-07, "logps/chosen": -34.687767028808594, "logps/rejected": -121.10075378417969, "loss": 0.0208, "losses/dpo": 0.005229136906564236, "losses/sft": 0.6532012224197388, "losses/total": 0.005229136906564236, "ref_logps/chosen": -13.203108787536621, "ref_logps/rejected": -41.180877685546875, "rewards/accuracies": 1.0, "rewards/chosen": -2.148465871810913, "rewards/margins": 5.843521595001221, "rewards/rejected": -7.991987705230713, "step": 425 }, { "epoch": 0.85, "grad_norm": 20.87963104248047, "learning_rate": 3.977777777777778e-07, "logps/chosen": -39.72007751464844, "logps/rejected": -120.63939666748047, "loss": 0.06, "losses/dpo": 0.057065702974796295, "losses/sft": 0.828675389289856, "losses/total": 0.057065702974796295, "ref_logps/chosen": -13.151759147644043, "ref_logps/rejected": -41.24854278564453, "rewards/accuracies": 1.0, "rewards/chosen": -2.656831979751587, "rewards/margins": 5.282253265380859, "rewards/rejected": -7.939085006713867, "step": 426 }, { "epoch": 0.85, "grad_norm": 13.600708961486816, "learning_rate": 3.974074074074074e-07, "logps/chosen": -30.19253921508789, "logps/rejected": -126.12801361083984, "loss": 0.0383, "losses/dpo": 0.0667053759098053, "losses/sft": 0.6852974891662598, "losses/total": 0.0667053759098053, "ref_logps/chosen": -9.91547966003418, "ref_logps/rejected": -44.596343994140625, "rewards/accuracies": 1.0, "rewards/chosen": -2.027705669403076, "rewards/margins": 6.125460624694824, "rewards/rejected": -8.153166770935059, "step": 427 }, { "epoch": 0.86, "grad_norm": 18.37424659729004, "learning_rate": 3.97037037037037e-07, "logps/chosen": -38.085697174072266, "logps/rejected": -116.51532745361328, "loss": 0.0591, "losses/dpo": 0.0367395393550396, "losses/sft": 0.8513802289962769, "losses/total": 0.0367395393550396, "ref_logps/chosen": -12.653528213500977, "ref_logps/rejected": -40.740966796875, "rewards/accuracies": 1.0, "rewards/chosen": -2.5432169437408447, "rewards/margins": 5.034219741821289, "rewards/rejected": -7.577436447143555, "step": 428 }, { "epoch": 0.86, "grad_norm": 86.92479705810547, "learning_rate": 3.9666666666666665e-07, "logps/chosen": -33.40900802612305, "logps/rejected": -97.5163345336914, "loss": 0.1676, "losses/dpo": 0.06531870365142822, "losses/sft": 0.718450665473938, "losses/total": 0.06531870365142822, "ref_logps/chosen": -12.761130332946777, "ref_logps/rejected": -34.69020462036133, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0647878646850586, "rewards/margins": 4.217825412750244, "rewards/rejected": -6.282613277435303, "step": 429 }, { "epoch": 0.86, "grad_norm": 16.273277282714844, "learning_rate": 3.962962962962963e-07, "logps/chosen": -38.851871490478516, "logps/rejected": -114.59307098388672, "loss": 0.0732, "losses/dpo": 0.1843334287405014, "losses/sft": 0.5088679790496826, "losses/total": 0.1843334287405014, "ref_logps/chosen": -20.497560501098633, "ref_logps/rejected": -39.37632751464844, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8354310989379883, "rewards/margins": 5.686243057250977, "rewards/rejected": -7.521674156188965, "step": 430 }, { "epoch": 0.86, "grad_norm": 23.66449737548828, "learning_rate": 3.959259259259259e-07, "logps/chosen": -36.00865936279297, "logps/rejected": -99.69367218017578, "loss": 0.0811, "losses/dpo": 0.11141210049390793, "losses/sft": 0.8997406959533691, "losses/total": 0.11141210049390793, "ref_logps/chosen": -11.728845596313477, "ref_logps/rejected": -33.64956283569336, "rewards/accuracies": 1.0, "rewards/chosen": -2.42798113822937, "rewards/margins": 4.176429748535156, "rewards/rejected": -6.604410648345947, "step": 431 }, { "epoch": 0.86, "grad_norm": 22.341249465942383, "learning_rate": 3.955555555555555e-07, "logps/chosen": -46.45829391479492, "logps/rejected": -127.15127563476562, "loss": 0.0827, "losses/dpo": 0.14688825607299805, "losses/sft": 0.8811045289039612, "losses/total": 0.14688825607299805, "ref_logps/chosen": -14.368127822875977, "ref_logps/rejected": -39.30841064453125, "rewards/accuracies": 1.0, "rewards/chosen": -3.2090163230895996, "rewards/margins": 5.57526969909668, "rewards/rejected": -8.784286499023438, "step": 432 }, { "epoch": 0.87, "grad_norm": 20.467079162597656, "learning_rate": 3.9518518518518516e-07, "logps/chosen": -40.27122497558594, "logps/rejected": -133.2883758544922, "loss": 0.0947, "losses/dpo": 0.00427745096385479, "losses/sft": 0.7645794749259949, "losses/total": 0.00427745096385479, "ref_logps/chosen": -16.05865478515625, "ref_logps/rejected": -45.82427978515625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4212567806243896, "rewards/margins": 6.325152397155762, "rewards/rejected": -8.74640941619873, "step": 433 }, { "epoch": 0.87, "grad_norm": 43.505855560302734, "learning_rate": 3.948148148148148e-07, "logps/chosen": -41.396522521972656, "logps/rejected": -124.23265838623047, "loss": 0.1128, "losses/dpo": 0.003313305089250207, "losses/sft": 0.8131544589996338, "losses/total": 0.003313305089250207, "ref_logps/chosen": -12.796215057373047, "ref_logps/rejected": -39.55033874511719, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8600306510925293, "rewards/margins": 5.60820198059082, "rewards/rejected": -8.468232154846191, "step": 434 }, { "epoch": 0.87, "grad_norm": 43.20351028442383, "learning_rate": 3.9444444444444444e-07, "logps/chosen": -39.572059631347656, "logps/rejected": -113.74073028564453, "loss": 0.1422, "losses/dpo": 0.3530581593513489, "losses/sft": 0.836111307144165, "losses/total": 0.3530581593513489, "ref_logps/chosen": -14.05323600769043, "ref_logps/rejected": -39.58940505981445, "rewards/accuracies": 0.9375, "rewards/chosen": -2.55188250541687, "rewards/margins": 4.863250732421875, "rewards/rejected": -7.415132522583008, "step": 435 }, { "epoch": 0.87, "grad_norm": 17.075193405151367, "learning_rate": 3.940740740740741e-07, "logps/chosen": -35.89055252075195, "logps/rejected": -122.32846069335938, "loss": 0.0407, "losses/dpo": 0.09936435520648956, "losses/sft": 0.6812224984169006, "losses/total": 0.09936435520648956, "ref_logps/chosen": -12.606057167053223, "ref_logps/rejected": -42.71528625488281, "rewards/accuracies": 1.0, "rewards/chosen": -2.3284494876861572, "rewards/margins": 5.632867813110352, "rewards/rejected": -7.961318016052246, "step": 436 }, { "epoch": 0.87, "grad_norm": 23.870943069458008, "learning_rate": 3.937037037037037e-07, "logps/chosen": -41.031890869140625, "logps/rejected": -152.42169189453125, "loss": 0.0415, "losses/dpo": 0.001001848024316132, "losses/sft": 0.504059910774231, "losses/total": 0.001001848024316132, "ref_logps/chosen": -17.22356414794922, "ref_logps/rejected": -54.20398712158203, "rewards/accuracies": 1.0, "rewards/chosen": -2.3808324337005615, "rewards/margins": 7.4409379959106445, "rewards/rejected": -9.821769714355469, "step": 437 }, { "epoch": 0.88, "grad_norm": 62.79549789428711, "learning_rate": 3.933333333333333e-07, "logps/chosen": -51.775909423828125, "logps/rejected": -132.87208557128906, "loss": 0.1194, "losses/dpo": 0.025578390806913376, "losses/sft": 0.8345500230789185, "losses/total": 0.025578390806913376, "ref_logps/chosen": -20.002838134765625, "ref_logps/rejected": -48.348453521728516, "rewards/accuracies": 0.875, "rewards/chosen": -3.17730712890625, "rewards/margins": 5.275055885314941, "rewards/rejected": -8.452363014221191, "step": 438 }, { "epoch": 0.88, "grad_norm": 56.714908599853516, "learning_rate": 3.9296296296296295e-07, "logps/chosen": -46.454166412353516, "logps/rejected": -162.05075073242188, "loss": 0.1845, "losses/dpo": 0.10782374441623688, "losses/sft": 0.814045786857605, "losses/total": 0.10782374441623688, "ref_logps/chosen": -16.248411178588867, "ref_logps/rejected": -61.597415924072266, "rewards/accuracies": 0.9375, "rewards/chosen": -3.020575761795044, "rewards/margins": 7.024758338928223, "rewards/rejected": -10.045333862304688, "step": 439 }, { "epoch": 0.88, "grad_norm": 55.224342346191406, "learning_rate": 3.925925925925926e-07, "logps/chosen": -39.01628875732422, "logps/rejected": -111.15766143798828, "loss": 0.1918, "losses/dpo": 0.06872375309467316, "losses/sft": 0.5836397409439087, "losses/total": 0.06872375309467316, "ref_logps/chosen": -10.98344898223877, "ref_logps/rejected": -35.79201126098633, "rewards/accuracies": 0.9375, "rewards/chosen": -2.803284168243408, "rewards/margins": 4.733281135559082, "rewards/rejected": -7.536565780639648, "step": 440 }, { "epoch": 0.88, "grad_norm": 77.9546890258789, "learning_rate": 3.9222222222222223e-07, "logps/chosen": -43.733333587646484, "logps/rejected": -130.84396362304688, "loss": 0.1259, "losses/dpo": 0.02421252429485321, "losses/sft": 0.8828197717666626, "losses/total": 0.02421252429485321, "ref_logps/chosen": -15.622417449951172, "ref_logps/rejected": -42.70435333251953, "rewards/accuracies": 0.9375, "rewards/chosen": -2.811091899871826, "rewards/margins": 6.002869606018066, "rewards/rejected": -8.813961029052734, "step": 441 }, { "epoch": 0.88, "grad_norm": 24.473661422729492, "learning_rate": 3.918518518518518e-07, "logps/chosen": -51.01894760131836, "logps/rejected": -132.1776123046875, "loss": 0.0618, "losses/dpo": 0.05584227293729782, "losses/sft": 0.6618070006370544, "losses/total": 0.05584227293729782, "ref_logps/chosen": -20.57441520690918, "ref_logps/rejected": -45.225921630859375, "rewards/accuracies": 1.0, "rewards/chosen": -3.0444531440734863, "rewards/margins": 5.6507158279418945, "rewards/rejected": -8.695169448852539, "step": 442 }, { "epoch": 0.89, "grad_norm": 28.41942024230957, "learning_rate": 3.9148148148148146e-07, "logps/chosen": -41.577335357666016, "logps/rejected": -120.86549377441406, "loss": 0.0793, "losses/dpo": 0.02593686804175377, "losses/sft": 0.5973250865936279, "losses/total": 0.02593686804175377, "ref_logps/chosen": -17.241764068603516, "ref_logps/rejected": -46.92060089111328, "rewards/accuracies": 1.0, "rewards/chosen": -2.4335572719573975, "rewards/margins": 4.960931777954102, "rewards/rejected": -7.394489765167236, "step": 443 }, { "epoch": 0.89, "grad_norm": 20.679649353027344, "learning_rate": 3.911111111111111e-07, "logps/chosen": -36.46300506591797, "logps/rejected": -139.58978271484375, "loss": 0.0482, "losses/dpo": 0.06509046256542206, "losses/sft": 0.7196379899978638, "losses/total": 0.06509046256542206, "ref_logps/chosen": -14.438896179199219, "ref_logps/rejected": -48.49089050292969, "rewards/accuracies": 1.0, "rewards/chosen": -2.202410936355591, "rewards/margins": 6.907477855682373, "rewards/rejected": -9.109889030456543, "step": 444 }, { "epoch": 0.89, "grad_norm": 20.597949981689453, "learning_rate": 3.907407407407407e-07, "logps/chosen": -39.067344665527344, "logps/rejected": -131.234619140625, "loss": 0.0492, "losses/dpo": 0.024714581668376923, "losses/sft": 0.4357023239135742, "losses/total": 0.024714581668376923, "ref_logps/chosen": -17.182262420654297, "ref_logps/rejected": -50.957862854003906, "rewards/accuracies": 1.0, "rewards/chosen": -2.1885082721710205, "rewards/margins": 5.839167594909668, "rewards/rejected": -8.02767562866211, "step": 445 }, { "epoch": 0.89, "grad_norm": 23.935184478759766, "learning_rate": 3.9037037037037033e-07, "logps/chosen": -37.2645378112793, "logps/rejected": -108.57901000976562, "loss": 0.07, "losses/dpo": 0.08478090912103653, "losses/sft": 0.6661969423294067, "losses/total": 0.08478090912103653, "ref_logps/chosen": -16.068418502807617, "ref_logps/rejected": -35.516510009765625, "rewards/accuracies": 1.0, "rewards/chosen": -2.1196117401123047, "rewards/margins": 5.186638832092285, "rewards/rejected": -7.30625057220459, "step": 446 }, { "epoch": 0.89, "grad_norm": 56.30306625366211, "learning_rate": 3.8999999999999997e-07, "logps/chosen": -40.75579833984375, "logps/rejected": -103.54489135742188, "loss": 0.1354, "losses/dpo": 0.06459180265665054, "losses/sft": 0.676565945148468, "losses/total": 0.06459180265665054, "ref_logps/chosen": -18.714557647705078, "ref_logps/rejected": -38.181575775146484, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2041239738464355, "rewards/margins": 4.332207679748535, "rewards/rejected": -6.536331653594971, "step": 447 }, { "epoch": 0.9, "grad_norm": 53.9810905456543, "learning_rate": 3.8962962962962956e-07, "logps/chosen": -44.43140411376953, "logps/rejected": -120.11578369140625, "loss": 0.205, "losses/dpo": 0.2935711741447449, "losses/sft": 0.7037935256958008, "losses/total": 0.2935711741447449, "ref_logps/chosen": -17.124698638916016, "ref_logps/rejected": -42.24882125854492, "rewards/accuracies": 0.8125, "rewards/chosen": -2.73067045211792, "rewards/margins": 5.056025505065918, "rewards/rejected": -7.786696434020996, "step": 448 }, { "epoch": 0.9, "grad_norm": 21.460792541503906, "learning_rate": 3.8925925925925925e-07, "logps/chosen": -34.50408935546875, "logps/rejected": -140.84860229492188, "loss": 0.0972, "losses/dpo": 0.0007553499890491366, "losses/sft": 0.749845027923584, "losses/total": 0.0007553499890491366, "ref_logps/chosen": -13.832296371459961, "ref_logps/rejected": -55.203399658203125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0671794414520264, "rewards/margins": 6.497340202331543, "rewards/rejected": -8.564519882202148, "step": 449 }, { "epoch": 0.9, "grad_norm": 39.593719482421875, "learning_rate": 3.888888888888889e-07, "logps/chosen": -38.65271759033203, "logps/rejected": -110.78114318847656, "loss": 0.1592, "losses/dpo": 0.2055157870054245, "losses/sft": 0.3508860766887665, "losses/total": 0.2055157870054245, "ref_logps/chosen": -17.12994384765625, "ref_logps/rejected": -39.235443115234375, "rewards/accuracies": 0.875, "rewards/chosen": -2.1522774696350098, "rewards/margins": 5.002292633056641, "rewards/rejected": -7.154570579528809, "step": 450 }, { "epoch": 0.9, "grad_norm": 12.109270095825195, "learning_rate": 3.8851851851851854e-07, "logps/chosen": -49.65500259399414, "logps/rejected": -144.41647338867188, "loss": 0.0443, "losses/dpo": 0.11350575089454651, "losses/sft": 0.6022939085960388, "losses/total": 0.11350575089454651, "ref_logps/chosen": -20.374874114990234, "ref_logps/rejected": -49.809410095214844, "rewards/accuracies": 1.0, "rewards/chosen": -2.928013324737549, "rewards/margins": 6.532693386077881, "rewards/rejected": -9.46070671081543, "step": 451 }, { "epoch": 0.9, "grad_norm": 11.183013916015625, "learning_rate": 3.881481481481481e-07, "logps/chosen": -36.81501007080078, "logps/rejected": -180.78744506835938, "loss": 0.032, "losses/dpo": 0.01637108251452446, "losses/sft": 0.5078365802764893, "losses/total": 0.01637108251452446, "ref_logps/chosen": -17.165584564208984, "ref_logps/rejected": -79.67362213134766, "rewards/accuracies": 1.0, "rewards/chosen": -1.9649423360824585, "rewards/margins": 8.146439552307129, "rewards/rejected": -10.111382484436035, "step": 452 }, { "epoch": 0.91, "grad_norm": 41.93173599243164, "learning_rate": 3.8777777777777776e-07, "logps/chosen": -46.13130569458008, "logps/rejected": -105.22616577148438, "loss": 0.1333, "losses/dpo": 0.09395473450422287, "losses/sft": 0.7680277228355408, "losses/total": 0.09395473450422287, "ref_logps/chosen": -16.89548110961914, "ref_logps/rejected": -32.069122314453125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9235825538635254, "rewards/margins": 4.392122268676758, "rewards/rejected": -7.315704822540283, "step": 453 }, { "epoch": 0.91, "grad_norm": 81.26608276367188, "learning_rate": 3.874074074074074e-07, "logps/chosen": -39.240081787109375, "logps/rejected": -134.875244140625, "loss": 0.3474, "losses/dpo": 1.107574701309204, "losses/sft": 0.8552225232124329, "losses/total": 1.107574701309204, "ref_logps/chosen": -12.673192977905273, "ref_logps/rejected": -44.445648193359375, "rewards/accuracies": 0.875, "rewards/chosen": -2.656689167022705, "rewards/margins": 6.3862714767456055, "rewards/rejected": -9.042960166931152, "step": 454 }, { "epoch": 0.91, "grad_norm": 17.12772560119629, "learning_rate": 3.87037037037037e-07, "logps/chosen": -30.340044021606445, "logps/rejected": -118.41051483154297, "loss": 0.0557, "losses/dpo": 0.06403327733278275, "losses/sft": 0.8791080713272095, "losses/total": 0.06403327733278275, "ref_logps/chosen": -11.27092456817627, "ref_logps/rejected": -44.00303649902344, "rewards/accuracies": 1.0, "rewards/chosen": -1.9069119691848755, "rewards/margins": 5.533836364746094, "rewards/rejected": -7.44074821472168, "step": 455 }, { "epoch": 0.91, "grad_norm": 24.304636001586914, "learning_rate": 3.8666666666666664e-07, "logps/chosen": -31.13290023803711, "logps/rejected": -126.29150390625, "loss": 0.0812, "losses/dpo": 0.08947796374559402, "losses/sft": 0.5682837963104248, "losses/total": 0.08947796374559402, "ref_logps/chosen": -13.352134704589844, "ref_logps/rejected": -42.56218338012695, "rewards/accuracies": 0.9375, "rewards/chosen": -1.778076410293579, "rewards/margins": 6.594855308532715, "rewards/rejected": -8.372931480407715, "step": 456 }, { "epoch": 0.91, "grad_norm": 30.6898250579834, "learning_rate": 3.862962962962963e-07, "logps/chosen": -38.754295349121094, "logps/rejected": -122.19239807128906, "loss": 0.1225, "losses/dpo": 0.27622735500335693, "losses/sft": 0.6371285915374756, "losses/total": 0.27622735500335693, "ref_logps/chosen": -15.183916091918945, "ref_logps/rejected": -41.27008819580078, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3570375442504883, "rewards/margins": 5.735192775726318, "rewards/rejected": -8.092230796813965, "step": 457 }, { "epoch": 0.92, "grad_norm": 23.223514556884766, "learning_rate": 3.859259259259259e-07, "logps/chosen": -41.43684387207031, "logps/rejected": -117.0394287109375, "loss": 0.0683, "losses/dpo": 0.011316630057990551, "losses/sft": 0.6864824891090393, "losses/total": 0.011316630057990551, "ref_logps/chosen": -18.469993591308594, "ref_logps/rejected": -42.62580108642578, "rewards/accuracies": 1.0, "rewards/chosen": -2.296685218811035, "rewards/margins": 5.14467716217041, "rewards/rejected": -7.441361427307129, "step": 458 }, { "epoch": 0.92, "grad_norm": 27.830724716186523, "learning_rate": 3.855555555555555e-07, "logps/chosen": -42.79747009277344, "logps/rejected": -110.60104370117188, "loss": 0.0738, "losses/dpo": 0.08957645297050476, "losses/sft": 0.6658487319946289, "losses/total": 0.08957645297050476, "ref_logps/chosen": -16.33731460571289, "ref_logps/rejected": -38.744163513183594, "rewards/accuracies": 1.0, "rewards/chosen": -2.6460158824920654, "rewards/margins": 4.539671897888184, "rewards/rejected": -7.185688018798828, "step": 459 }, { "epoch": 0.92, "grad_norm": 14.787923812866211, "learning_rate": 3.8518518518518515e-07, "logps/chosen": -36.30558776855469, "logps/rejected": -123.15371704101562, "loss": 0.0272, "losses/dpo": 0.008614415302872658, "losses/sft": 0.710669994354248, "losses/total": 0.008614415302872658, "ref_logps/chosen": -12.608972549438477, "ref_logps/rejected": -41.72282409667969, "rewards/accuracies": 1.0, "rewards/chosen": -2.369661808013916, "rewards/margins": 5.773427963256836, "rewards/rejected": -8.143089294433594, "step": 460 }, { "epoch": 0.92, "grad_norm": 29.622129440307617, "learning_rate": 3.8481481481481484e-07, "logps/chosen": -43.93711471557617, "logps/rejected": -134.57131958007812, "loss": 0.0522, "losses/dpo": 0.022071661427617073, "losses/sft": 0.6957737803459167, "losses/total": 0.022071661427617073, "ref_logps/chosen": -19.354290008544922, "ref_logps/rejected": -46.689414978027344, "rewards/accuracies": 1.0, "rewards/chosen": -2.458282709121704, "rewards/margins": 6.32990837097168, "rewards/rejected": -8.788190841674805, "step": 461 }, { "epoch": 0.92, "grad_norm": 39.33469009399414, "learning_rate": 3.8444444444444443e-07, "logps/chosen": -42.33057403564453, "logps/rejected": -128.18800354003906, "loss": 0.0846, "losses/dpo": 0.07230532169342041, "losses/sft": 0.7907838821411133, "losses/total": 0.07230532169342041, "ref_logps/chosen": -15.459431648254395, "ref_logps/rejected": -44.470672607421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.687114715576172, "rewards/margins": 5.684619426727295, "rewards/rejected": -8.371733665466309, "step": 462 }, { "epoch": 0.93, "grad_norm": 77.01020812988281, "learning_rate": 3.8407407407407407e-07, "logps/chosen": -41.150062561035156, "logps/rejected": -97.16158294677734, "loss": 0.2559, "losses/dpo": 0.1757294088602066, "losses/sft": 0.8417171239852905, "losses/total": 0.1757294088602066, "ref_logps/chosen": -12.81360912322998, "ref_logps/rejected": -30.693498611450195, "rewards/accuracies": 0.875, "rewards/chosen": -2.8336455821990967, "rewards/margins": 3.8131628036499023, "rewards/rejected": -6.646808624267578, "step": 463 }, { "epoch": 0.93, "grad_norm": 44.11057662963867, "learning_rate": 3.837037037037037e-07, "logps/chosen": -39.26576232910156, "logps/rejected": -128.0055694580078, "loss": 0.0871, "losses/dpo": 0.17416299879550934, "losses/sft": 0.7194896936416626, "losses/total": 0.17416299879550934, "ref_logps/chosen": -14.974311828613281, "ref_logps/rejected": -43.98797607421875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.429144859313965, "rewards/margins": 5.972614288330078, "rewards/rejected": -8.40176010131836, "step": 464 }, { "epoch": 0.93, "grad_norm": 12.82344913482666, "learning_rate": 3.8333333333333335e-07, "logps/chosen": -40.717891693115234, "logps/rejected": -148.91168212890625, "loss": 0.0333, "losses/dpo": 0.040061675012111664, "losses/sft": 0.8902965784072876, "losses/total": 0.040061675012111664, "ref_logps/chosen": -12.840675354003906, "ref_logps/rejected": -51.16914367675781, "rewards/accuracies": 1.0, "rewards/chosen": -2.787721633911133, "rewards/margins": 6.986532211303711, "rewards/rejected": -9.774253845214844, "step": 465 }, { "epoch": 0.93, "grad_norm": 71.54187774658203, "learning_rate": 3.8296296296296294e-07, "logps/chosen": -44.93368148803711, "logps/rejected": -149.85275268554688, "loss": 0.1686, "losses/dpo": 0.4314330816268921, "losses/sft": 0.7240664958953857, "losses/total": 0.4314330816268921, "ref_logps/chosen": -13.75963020324707, "ref_logps/rejected": -53.12620544433594, "rewards/accuracies": 0.875, "rewards/chosen": -3.1174049377441406, "rewards/margins": 6.55525016784668, "rewards/rejected": -9.67265510559082, "step": 466 }, { "epoch": 0.93, "grad_norm": 22.28801727294922, "learning_rate": 3.825925925925926e-07, "logps/chosen": -39.607208251953125, "logps/rejected": -104.81693267822266, "loss": 0.0786, "losses/dpo": 0.0375826358795166, "losses/sft": 0.6283800601959229, "losses/total": 0.0375826358795166, "ref_logps/chosen": -18.35260772705078, "ref_logps/rejected": -37.030540466308594, "rewards/accuracies": 1.0, "rewards/chosen": -2.125459671020508, "rewards/margins": 4.653180122375488, "rewards/rejected": -6.778639793395996, "step": 467 }, { "epoch": 0.94, "grad_norm": 19.925273895263672, "learning_rate": 3.822222222222222e-07, "logps/chosen": -42.20486068725586, "logps/rejected": -147.87164306640625, "loss": 0.0363, "losses/dpo": 0.10509399324655533, "losses/sft": 0.804567813873291, "losses/total": 0.10509399324655533, "ref_logps/chosen": -13.097880363464355, "ref_logps/rejected": -49.719268798828125, "rewards/accuracies": 1.0, "rewards/chosen": -2.910698413848877, "rewards/margins": 6.904539108276367, "rewards/rejected": -9.815237045288086, "step": 468 }, { "epoch": 0.94, "grad_norm": 39.51319122314453, "learning_rate": 3.818518518518518e-07, "logps/chosen": -47.34339141845703, "logps/rejected": -125.14842224121094, "loss": 0.0805, "losses/dpo": 0.09030229598283768, "losses/sft": 0.8959075808525085, "losses/total": 0.09030229598283768, "ref_logps/chosen": -14.1100435256958, "ref_logps/rejected": -38.99297332763672, "rewards/accuracies": 1.0, "rewards/chosen": -3.3233349323272705, "rewards/margins": 5.292210578918457, "rewards/rejected": -8.615545272827148, "step": 469 }, { "epoch": 0.94, "grad_norm": 10.734367370605469, "learning_rate": 3.8148148148148145e-07, "logps/chosen": -59.496826171875, "logps/rejected": -159.21438598632812, "loss": 0.1082, "losses/dpo": 0.3646402955055237, "losses/sft": 0.7248314619064331, "losses/total": 0.3646402955055237, "ref_logps/chosen": -22.694992065429688, "ref_logps/rejected": -59.701148986816406, "rewards/accuracies": 0.875, "rewards/chosen": -3.680183172225952, "rewards/margins": 6.271141529083252, "rewards/rejected": -9.951324462890625, "step": 470 }, { "epoch": 0.94, "grad_norm": 18.338951110839844, "learning_rate": 3.811111111111111e-07, "logps/chosen": -40.23529815673828, "logps/rejected": -144.66590881347656, "loss": 0.0466, "losses/dpo": 0.00546817434951663, "losses/sft": 1.0657308101654053, "losses/total": 0.00546817434951663, "ref_logps/chosen": -11.44991683959961, "ref_logps/rejected": -45.38568115234375, "rewards/accuracies": 1.0, "rewards/chosen": -2.8785383701324463, "rewards/margins": 7.049485206604004, "rewards/rejected": -9.928024291992188, "step": 471 }, { "epoch": 0.94, "grad_norm": 35.54021072387695, "learning_rate": 3.8074074074074073e-07, "logps/chosen": -42.15109634399414, "logps/rejected": -151.8326416015625, "loss": 0.0724, "losses/dpo": 0.028490465134382248, "losses/sft": 0.8861966133117676, "losses/total": 0.028490465134382248, "ref_logps/chosen": -13.033123970031738, "ref_logps/rejected": -51.36277389526367, "rewards/accuracies": 1.0, "rewards/chosen": -2.911797285079956, "rewards/margins": 7.135190486907959, "rewards/rejected": -10.046987533569336, "step": 472 }, { "epoch": 0.95, "grad_norm": 56.16077423095703, "learning_rate": 3.803703703703703e-07, "logps/chosen": -47.64544677734375, "logps/rejected": -105.49983215332031, "loss": 0.2322, "losses/dpo": 0.06201707571744919, "losses/sft": 1.4952541589736938, "losses/total": 0.06201707571744919, "ref_logps/chosen": -10.078681945800781, "ref_logps/rejected": -29.885452270507812, "rewards/accuracies": 0.9375, "rewards/chosen": -3.75667667388916, "rewards/margins": 3.804760694503784, "rewards/rejected": -7.561437129974365, "step": 473 }, { "epoch": 0.95, "grad_norm": 66.81434631347656, "learning_rate": 3.7999999999999996e-07, "logps/chosen": -47.747276306152344, "logps/rejected": -119.25900268554688, "loss": 0.1594, "losses/dpo": 0.46635541319847107, "losses/sft": 0.7916244268417358, "losses/total": 0.46635541319847107, "ref_logps/chosen": -18.893095016479492, "ref_logps/rejected": -41.94963836669922, "rewards/accuracies": 0.9375, "rewards/chosen": -2.885418176651001, "rewards/margins": 4.845518112182617, "rewards/rejected": -7.730936527252197, "step": 474 }, { "epoch": 0.95, "grad_norm": 31.911237716674805, "learning_rate": 3.7962962962962966e-07, "logps/chosen": -38.076499938964844, "logps/rejected": -132.6082763671875, "loss": 0.0621, "losses/dpo": 0.0007034945301711559, "losses/sft": 0.6102277636528015, "losses/total": 0.0007034945301711559, "ref_logps/chosen": -9.867688179016113, "ref_logps/rejected": -48.1339111328125, "rewards/accuracies": 1.0, "rewards/chosen": -2.8208813667297363, "rewards/margins": 5.626555442810059, "rewards/rejected": -8.447437286376953, "step": 475 }, { "epoch": 0.95, "grad_norm": 31.460481643676758, "learning_rate": 3.7925925925925924e-07, "logps/chosen": -36.52131271362305, "logps/rejected": -137.44662475585938, "loss": 0.0412, "losses/dpo": 0.013979414477944374, "losses/sft": 0.40546882152557373, "losses/total": 0.013979414477944374, "ref_logps/chosen": -12.335689544677734, "ref_logps/rejected": -46.54832458496094, "rewards/accuracies": 1.0, "rewards/chosen": -2.418562412261963, "rewards/margins": 6.671266555786133, "rewards/rejected": -9.089829444885254, "step": 476 }, { "epoch": 0.95, "grad_norm": 38.84501647949219, "learning_rate": 3.788888888888889e-07, "logps/chosen": -51.319183349609375, "logps/rejected": -155.2894744873047, "loss": 0.1021, "losses/dpo": 0.23493897914886475, "losses/sft": 0.7687029838562012, "losses/total": 0.23493897914886475, "ref_logps/chosen": -20.000171661376953, "ref_logps/rejected": -53.499454498291016, "rewards/accuracies": 0.9375, "rewards/chosen": -3.1319010257720947, "rewards/margins": 7.04710054397583, "rewards/rejected": -10.179001808166504, "step": 477 }, { "epoch": 0.96, "grad_norm": 22.33247184753418, "learning_rate": 3.785185185185185e-07, "logps/chosen": -40.62896728515625, "logps/rejected": -123.1474609375, "loss": 0.0551, "losses/dpo": 0.05076390877366066, "losses/sft": 0.9945937991142273, "losses/total": 0.05076390877366066, "ref_logps/chosen": -11.130529403686523, "ref_logps/rejected": -41.4739875793457, "rewards/accuracies": 1.0, "rewards/chosen": -2.9498438835144043, "rewards/margins": 5.217503547668457, "rewards/rejected": -8.167346954345703, "step": 478 }, { "epoch": 0.96, "grad_norm": 44.30374526977539, "learning_rate": 3.781481481481481e-07, "logps/chosen": -42.91387176513672, "logps/rejected": -134.16275024414062, "loss": 0.0772, "losses/dpo": 0.02010273188352585, "losses/sft": 0.7935068607330322, "losses/total": 0.02010273188352585, "ref_logps/chosen": -13.567787170410156, "ref_logps/rejected": -44.79924774169922, "rewards/accuracies": 1.0, "rewards/chosen": -2.9346084594726562, "rewards/margins": 6.001742839813232, "rewards/rejected": -8.936351776123047, "step": 479 }, { "epoch": 0.96, "grad_norm": 18.497716903686523, "learning_rate": 3.7777777777777775e-07, "logps/chosen": -38.83562088012695, "logps/rejected": -119.28911590576172, "loss": 0.0535, "losses/dpo": 0.09974934905767441, "losses/sft": 0.7755734324455261, "losses/total": 0.09974934905767441, "ref_logps/chosen": -13.980093955993652, "ref_logps/rejected": -43.400691986083984, "rewards/accuracies": 1.0, "rewards/chosen": -2.4855527877807617, "rewards/margins": 5.10329008102417, "rewards/rejected": -7.58884334564209, "step": 480 }, { "epoch": 0.96, "grad_norm": 26.344017028808594, "learning_rate": 3.774074074074074e-07, "logps/chosen": -34.87080383300781, "logps/rejected": -129.17410278320312, "loss": 0.0709, "losses/dpo": 0.09608794748783112, "losses/sft": 0.6589397192001343, "losses/total": 0.09608794748783112, "ref_logps/chosen": -14.861343383789062, "ref_logps/rejected": -49.386024475097656, "rewards/accuracies": 1.0, "rewards/chosen": -2.000946044921875, "rewards/margins": 5.977862358093262, "rewards/rejected": -7.978808403015137, "step": 481 }, { "epoch": 0.96, "grad_norm": 23.19811248779297, "learning_rate": 3.7703703703703704e-07, "logps/chosen": -38.46788787841797, "logps/rejected": -129.5819549560547, "loss": 0.0465, "losses/dpo": 0.031357597559690475, "losses/sft": 0.6037446856498718, "losses/total": 0.031357597559690475, "ref_logps/chosen": -17.20541000366211, "ref_logps/rejected": -42.455162048339844, "rewards/accuracies": 1.0, "rewards/chosen": -2.1262478828430176, "rewards/margins": 6.586432933807373, "rewards/rejected": -8.71268081665039, "step": 482 }, { "epoch": 0.97, "grad_norm": 58.5645751953125, "learning_rate": 3.766666666666666e-07, "logps/chosen": -36.00074005126953, "logps/rejected": -140.50314331054688, "loss": 0.093, "losses/dpo": 5.722598507418297e-05, "losses/sft": 0.7557867169380188, "losses/total": 5.722598507418297e-05, "ref_logps/chosen": -12.93038272857666, "ref_logps/rejected": -47.007694244384766, "rewards/accuracies": 0.9375, "rewards/chosen": -2.307035446166992, "rewards/margins": 7.042509078979492, "rewards/rejected": -9.349544525146484, "step": 483 }, { "epoch": 0.97, "grad_norm": 8.557694435119629, "learning_rate": 3.7629629629629627e-07, "logps/chosen": -50.58363342285156, "logps/rejected": -162.1007080078125, "loss": 0.0152, "losses/dpo": 0.043552931398153305, "losses/sft": 0.7928053140640259, "losses/total": 0.043552931398153305, "ref_logps/chosen": -16.90772247314453, "ref_logps/rejected": -52.684326171875, "rewards/accuracies": 1.0, "rewards/chosen": -3.36759090423584, "rewards/margins": 7.574047088623047, "rewards/rejected": -10.941637992858887, "step": 484 }, { "epoch": 0.97, "grad_norm": 45.088653564453125, "learning_rate": 3.759259259259259e-07, "logps/chosen": -39.90214538574219, "logps/rejected": -137.9638671875, "loss": 0.1535, "losses/dpo": 0.1695178747177124, "losses/sft": 0.8066875338554382, "losses/total": 0.1695178747177124, "ref_logps/chosen": -12.019525527954102, "ref_logps/rejected": -50.03221130371094, "rewards/accuracies": 1.0, "rewards/chosen": -2.788262367248535, "rewards/margins": 6.004901885986328, "rewards/rejected": -8.79316520690918, "step": 485 }, { "epoch": 0.97, "grad_norm": 60.69712829589844, "learning_rate": 3.755555555555555e-07, "logps/chosen": -45.3070182800293, "logps/rejected": -136.23126220703125, "loss": 0.1879, "losses/dpo": 0.14317449927330017, "losses/sft": 0.4515833258628845, "losses/total": 0.14317449927330017, "ref_logps/chosen": -18.59630012512207, "ref_logps/rejected": -48.34473419189453, "rewards/accuracies": 0.8125, "rewards/chosen": -2.671072006225586, "rewards/margins": 6.117581367492676, "rewards/rejected": -8.788653373718262, "step": 486 }, { "epoch": 0.97, "grad_norm": 8.105025291442871, "learning_rate": 3.7518518518518514e-07, "logps/chosen": -42.688236236572266, "logps/rejected": -125.89185333251953, "loss": 0.0201, "losses/dpo": 0.03883223608136177, "losses/sft": 0.7434147596359253, "losses/total": 0.03883223608136177, "ref_logps/chosen": -18.851747512817383, "ref_logps/rejected": -39.032981872558594, "rewards/accuracies": 1.0, "rewards/chosen": -2.3836491107940674, "rewards/margins": 6.3022379875183105, "rewards/rejected": -8.68588638305664, "step": 487 }, { "epoch": 0.98, "grad_norm": 8.502668380737305, "learning_rate": 3.7481481481481483e-07, "logps/chosen": -35.2692756652832, "logps/rejected": -142.82443237304688, "loss": 0.0284, "losses/dpo": 0.06037643551826477, "losses/sft": 0.9963451623916626, "losses/total": 0.06037643551826477, "ref_logps/chosen": -11.326520919799805, "ref_logps/rejected": -46.48693084716797, "rewards/accuracies": 1.0, "rewards/chosen": -2.394275426864624, "rewards/margins": 7.239475250244141, "rewards/rejected": -9.633749961853027, "step": 488 }, { "epoch": 0.98, "grad_norm": 20.321788787841797, "learning_rate": 3.7444444444444447e-07, "logps/chosen": -37.23273849487305, "logps/rejected": -177.05303955078125, "loss": 0.0364, "losses/dpo": 0.021463543176651, "losses/sft": 0.8575751781463623, "losses/total": 0.021463543176651, "ref_logps/chosen": -13.77434253692627, "ref_logps/rejected": -58.2335205078125, "rewards/accuracies": 1.0, "rewards/chosen": -2.345839738845825, "rewards/margins": 9.536112785339355, "rewards/rejected": -11.881952285766602, "step": 489 }, { "epoch": 0.98, "grad_norm": 66.36872100830078, "learning_rate": 3.7407407407407406e-07, "logps/chosen": -46.914161682128906, "logps/rejected": -101.332763671875, "loss": 0.2092, "losses/dpo": 0.4841746687889099, "losses/sft": 0.9192337989807129, "losses/total": 0.4841746687889099, "ref_logps/chosen": -14.84537124633789, "ref_logps/rejected": -30.192947387695312, "rewards/accuracies": 0.875, "rewards/chosen": -3.206879138946533, "rewards/margins": 3.9071030616760254, "rewards/rejected": -7.1139817237854, "step": 490 }, { "epoch": 0.98, "grad_norm": 17.200246810913086, "learning_rate": 3.737037037037037e-07, "logps/chosen": -39.54547119140625, "logps/rejected": -117.11639404296875, "loss": 0.0466, "losses/dpo": 0.056069329380989075, "losses/sft": 0.6392867565155029, "losses/total": 0.056069329380989075, "ref_logps/chosen": -15.112298011779785, "ref_logps/rejected": -42.334571838378906, "rewards/accuracies": 1.0, "rewards/chosen": -2.4433178901672363, "rewards/margins": 5.034865379333496, "rewards/rejected": -7.478182792663574, "step": 491 }, { "epoch": 0.98, "grad_norm": 55.23065185546875, "learning_rate": 3.7333333333333334e-07, "logps/chosen": -39.42041778564453, "logps/rejected": -126.77442169189453, "loss": 0.1834, "losses/dpo": 0.4038184881210327, "losses/sft": 0.6278300285339355, "losses/total": 0.4038184881210327, "ref_logps/chosen": -12.766145706176758, "ref_logps/rejected": -43.821372985839844, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6654272079467773, "rewards/margins": 5.62987756729126, "rewards/rejected": -8.295305252075195, "step": 492 }, { "epoch": 0.99, "grad_norm": 43.6976318359375, "learning_rate": 3.7296296296296293e-07, "logps/chosen": -37.804630279541016, "logps/rejected": -143.37440490722656, "loss": 0.0993, "losses/dpo": 0.27842968702316284, "losses/sft": 0.7042969465255737, "losses/total": 0.27842968702316284, "ref_logps/chosen": -15.15415096282959, "ref_logps/rejected": -56.54274368286133, "rewards/accuracies": 0.9375, "rewards/chosen": -2.265048027038574, "rewards/margins": 6.418117523193359, "rewards/rejected": -8.68316650390625, "step": 493 }, { "epoch": 0.99, "grad_norm": 20.171466827392578, "learning_rate": 3.7259259259259257e-07, "logps/chosen": -43.048973083496094, "logps/rejected": -141.72682189941406, "loss": 0.0436, "losses/dpo": 0.044200535863637924, "losses/sft": 0.6779994964599609, "losses/total": 0.044200535863637924, "ref_logps/chosen": -23.580673217773438, "ref_logps/rejected": -57.72456359863281, "rewards/accuracies": 1.0, "rewards/chosen": -1.946830153465271, "rewards/margins": 6.453395366668701, "rewards/rejected": -8.400225639343262, "step": 494 }, { "epoch": 0.99, "grad_norm": 74.94364166259766, "learning_rate": 3.722222222222222e-07, "logps/chosen": -48.40394973754883, "logps/rejected": -145.72518920898438, "loss": 0.2522, "losses/dpo": 0.19847548007965088, "losses/sft": 1.0321438312530518, "losses/total": 0.19847548007965088, "ref_logps/chosen": -15.101470947265625, "ref_logps/rejected": -49.36922836303711, "rewards/accuracies": 0.875, "rewards/chosen": -3.3302478790283203, "rewards/margins": 6.305349826812744, "rewards/rejected": -9.635598182678223, "step": 495 }, { "epoch": 0.99, "grad_norm": 29.127410888671875, "learning_rate": 3.7185185185185185e-07, "logps/chosen": -41.526939392089844, "logps/rejected": -146.7291259765625, "loss": 0.0671, "losses/dpo": 0.013721669092774391, "losses/sft": 0.9741775989532471, "losses/total": 0.013721669092774391, "ref_logps/chosen": -16.26078224182129, "ref_logps/rejected": -49.89807891845703, "rewards/accuracies": 0.9375, "rewards/chosen": -2.526616096496582, "rewards/margins": 7.15648889541626, "rewards/rejected": -9.68310546875, "step": 496 }, { "epoch": 0.99, "grad_norm": 16.872493743896484, "learning_rate": 3.7148148148148144e-07, "logps/chosen": -39.59349060058594, "logps/rejected": -137.2967987060547, "loss": 0.045, "losses/dpo": 0.1408008486032486, "losses/sft": 1.041797399520874, "losses/total": 0.1408008486032486, "ref_logps/chosen": -14.02829360961914, "ref_logps/rejected": -45.13361740112305, "rewards/accuracies": 1.0, "rewards/chosen": -2.5565197467803955, "rewards/margins": 6.659798622131348, "rewards/rejected": -9.216318130493164, "step": 497 }, { "epoch": 1.0, "grad_norm": 50.911476135253906, "learning_rate": 3.711111111111111e-07, "logps/chosen": -38.06005096435547, "logps/rejected": -123.10880279541016, "loss": 0.1425, "losses/dpo": 0.01334542315453291, "losses/sft": 0.7792074680328369, "losses/total": 0.01334542315453291, "ref_logps/chosen": -14.236019134521484, "ref_logps/rejected": -38.64054489135742, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3824031352996826, "rewards/margins": 6.064422607421875, "rewards/rejected": -8.446825981140137, "step": 498 }, { "epoch": 1.0, "grad_norm": 37.36387252807617, "learning_rate": 3.707407407407407e-07, "logps/chosen": -34.86865997314453, "logps/rejected": -124.82080078125, "loss": 0.148, "losses/dpo": 0.02250811830163002, "losses/sft": 0.4309656322002411, "losses/total": 0.02250811830163002, "ref_logps/chosen": -13.683731079101562, "ref_logps/rejected": -43.348628997802734, "rewards/accuracies": 0.9375, "rewards/chosen": -2.118492603302002, "rewards/margins": 6.0287251472473145, "rewards/rejected": -8.147217750549316, "step": 499 }, { "epoch": 1.0, "grad_norm": 16.937225341796875, "learning_rate": 3.703703703703703e-07, "logps/chosen": -40.25804901123047, "logps/rejected": -125.63944244384766, "loss": 0.0417, "losses/dpo": 0.008113143965601921, "losses/sft": 0.924973726272583, "losses/total": 0.008113143965601921, "ref_logps/chosen": -12.001253128051758, "ref_logps/rejected": -40.38880920410156, "rewards/accuracies": 1.0, "rewards/chosen": -2.8256797790527344, "rewards/margins": 5.699383735656738, "rewards/rejected": -8.525063514709473, "step": 500 } ], "logging_steps": 1.0, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }