{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.247011952191235, "eval_steps": 500, "global_step": 1692, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "logps_train/chosen": -84.51311492919922, "logps_train/ref_chosen": -83.5, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -93.47111511230469, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.0958428904414177, "rewards_train/margins": -0.016700252890586853, "rewards_train/rejected": -0.07914263755083084, "step": 0 }, { "epoch": 0.0, "logps_train/chosen": -93.37198638916016, "logps_train/ref_chosen": -92.0, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -107.01815032958984, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.13563546538352966, "rewards_train/margins": 0.08024188876152039, "rewards_train/rejected": -0.21587735414505005, "step": 1 }, { "epoch": 0.0, "learning_rate": 6.622516556291391e-09, "loss": 0.6883, "step": 2 }, { "epoch": 0.0, "logps_train/chosen": -91.27803039550781, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -93.9560775756836, "rewards_train/accuracies": 0.125, "rewards_train/chosen": -0.15905329585075378, "rewards_train/margins": -0.3353206366300583, "rewards_train/rejected": 0.1762673407793045, "step": 2 }, { "epoch": 0.0, "logps_train/chosen": -68.57564544677734, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -68.01426696777344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.021627191454172134, "rewards_train/margins": 0.12198681756854057, "rewards_train/rejected": -0.1436140090227127, "step": 3 }, { "epoch": 0.01, "learning_rate": 1.3245033112582781e-08, "loss": 0.767, "step": 4 }, { "epoch": 0.01, "logps_train/chosen": -62.96977996826172, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -55.99552917480469, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.16729095578193665, "rewards_train/margins": -0.17867529951035976, "rewards_train/rejected": 0.011384343728423119, "step": 4 }, { "epoch": 0.01, "logps_train/chosen": -66.53120422363281, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -66.25678253173828, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.03125469759106636, "rewards_train/margins": 0.041307687759399414, "rewards_train/rejected": -0.010052990168333054, "step": 5 }, { "epoch": 0.01, "learning_rate": 1.9867549668874173e-08, "loss": 0.7398, "step": 6 }, { "epoch": 0.01, "logps_train/chosen": -61.914283752441406, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -71.35198974609375, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.005490925163030624, "rewards_train/margins": -0.181229617446661, "rewards_train/rejected": 0.17573869228363037, "step": 6 }, { "epoch": 0.01, "logps_train/chosen": -97.30050659179688, "logps_train/ref_chosen": -100.0, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -89.40110778808594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.28869879245758057, "rewards_train/margins": 0.16318489611148834, "rewards_train/rejected": 0.12551389634609222, "step": 7 }, { "epoch": 0.01, "learning_rate": 2.6490066225165563e-08, "loss": 0.7093, "step": 8 }, { "epoch": 0.01, "logps_train/chosen": -71.537353515625, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -75.89208221435547, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.031029988080263138, "rewards_train/margins": 0.1296129710972309, "rewards_train/rejected": -0.09858298301696777, "step": 8 }, { "epoch": 0.01, "logps_train/chosen": -56.81492233276367, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -57.13829040527344, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.10024227946996689, "rewards_train/margins": -0.02547553926706314, "rewards_train/rejected": -0.07476674020290375, "step": 9 }, { "epoch": 0.01, "learning_rate": 3.311258278145695e-08, "loss": 0.6746, "step": 10 }, { "epoch": 0.01, "logps_train/chosen": -48.844825744628906, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -57.598480224609375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.1053614467382431, "rewards_train/margins": 0.04724089428782463, "rewards_train/rejected": 0.05812055245041847, "step": 10 }, { "epoch": 0.01, "logps_train/chosen": -52.23993682861328, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -61.658687591552734, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.2528998851776123, "rewards_train/margins": -0.14484356343746185, "rewards_train/rejected": -0.10805632174015045, "step": 11 }, { "epoch": 0.02, "learning_rate": 3.9735099337748346e-08, "loss": 0.7293, "step": 12 }, { "epoch": 0.02, "logps_train/chosen": -59.14757537841797, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -68.62351989746094, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.09913238137960434, "rewards_train/margins": -0.028967678546905518, "rewards_train/rejected": -0.07016470283269882, "step": 12 }, { "epoch": 0.02, "logps_train/chosen": -51.99066162109375, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -58.12510299682617, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.16156604886054993, "rewards_train/margins": -0.12093072757124901, "rewards_train/rejected": -0.04063532128930092, "step": 13 }, { "epoch": 0.02, "learning_rate": 4.635761589403973e-08, "loss": 0.7412, "step": 14 }, { "epoch": 0.02, "logps_train/chosen": -77.4368896484375, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -84.78419494628906, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.04693605750799179, "rewards_train/margins": -0.02464430034160614, "rewards_train/rejected": 0.07158035784959793, "step": 14 }, { "epoch": 0.02, "logps_train/chosen": -44.22900390625, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -52.039039611816406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.013036925345659256, "rewards_train/margins": 0.18959732726216316, "rewards_train/rejected": -0.1765604019165039, "step": 15 }, { "epoch": 0.02, "learning_rate": 5.2980132450331126e-08, "loss": 0.6629, "step": 16 }, { "epoch": 0.02, "logps_train/chosen": -44.198028564453125, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -47.66931915283203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.01144709624350071, "rewards_train/margins": 0.0041603464633226395, "rewards_train/rejected": 0.00728674978017807, "step": 16 }, { "epoch": 0.02, "logps_train/chosen": -39.628387451171875, "logps_train/ref_chosen": -40.25, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -47.1257438659668, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.0684109702706337, "rewards_train/margins": 0.1266883872449398, "rewards_train/rejected": -0.05827741697430611, "step": 17 }, { "epoch": 0.02, "learning_rate": 5.960264900662251e-08, "loss": 0.6637, "step": 18 }, { "epoch": 0.02, "logps_train/chosen": -43.85874938964844, "logps_train/ref_chosen": -44.0, "logps_train/ref_rejected": -41.5, "logps_train/rejected": -41.347694396972656, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.014125023037195206, "rewards_train/margins": -0.0018867962062358856, "rewards_train/rejected": 0.01601181924343109, "step": 18 }, { "epoch": 0.03, "logps_train/chosen": -53.821372985839844, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -61.67257308959961, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.10713744163513184, "rewards_train/margins": -0.15550511330366135, "rewards_train/rejected": 0.04836767166852951, "step": 19 }, { "epoch": 0.03, "learning_rate": 6.62251655629139e-08, "loss": 0.7409, "step": 20 }, { "epoch": 0.03, "logps_train/chosen": -71.7474136352539, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -58.922584533691406, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.14974145591259003, "rewards_train/margins": -0.07467035949230194, "rewards_train/rejected": -0.07507109642028809, "step": 20 }, { "epoch": 0.03, "logps_train/chosen": -86.9837646484375, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -90.32958984375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.1313113570213318, "rewards_train/margins": 0.2759898006916046, "rewards_train/rejected": -0.14467844367027283, "step": 21 }, { "epoch": 0.03, "learning_rate": 7.284768211920529e-08, "loss": 0.6575, "step": 22 }, { "epoch": 0.03, "logps_train/chosen": -67.74806213378906, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -86.9749755859375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.028709255158901215, "rewards_train/margins": 0.004331827163696289, "rewards_train/rejected": 0.024377427995204926, "step": 22 }, { "epoch": 0.03, "logps_train/chosen": -69.238525390625, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -88.7351303100586, "rewards_train/accuracies": 0.125, "rewards_train/chosen": 0.03630336374044418, "rewards_train/margins": -0.1839330866932869, "rewards_train/rejected": 0.22023645043373108, "step": 23 }, { "epoch": 0.03, "learning_rate": 7.947019867549669e-08, "loss": 0.7637, "step": 24 }, { "epoch": 0.03, "logps_train/chosen": -36.02545166015625, "logps_train/ref_chosen": -37.0, "logps_train/ref_rejected": -42.0, "logps_train/rejected": -42.00984191894531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10682964324951172, "rewards_train/margins": 0.09687619283795357, "rewards_train/rejected": 0.009953450411558151, "step": 24 }, { "epoch": 0.03, "logps_train/chosen": -38.06768035888672, "logps_train/ref_chosen": -37.0, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -46.420928955078125, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.10286159813404083, "rewards_train/margins": -0.05803433805704117, "rewards_train/rejected": -0.044827260076999664, "step": 25 }, { "epoch": 0.03, "learning_rate": 8.609271523178807e-08, "loss": 0.6898, "step": 26 }, { "epoch": 0.03, "logps_train/chosen": -66.19087982177734, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -70.36297607421875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.11450610309839249, "rewards_train/margins": 0.13517922163009644, "rewards_train/rejected": -0.02067311853170395, "step": 26 }, { "epoch": 0.04, "logps_train/chosen": -64.31070709228516, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -71.96617126464844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.00017967447638511658, "rewards_train/margins": -0.04382825270295143, "rewards_train/rejected": 0.04400792717933655, "step": 27 }, { "epoch": 0.04, "learning_rate": 9.271523178807946e-08, "loss": 0.6823, "step": 28 }, { "epoch": 0.04, "logps_train/chosen": -45.852569580078125, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -68.71060180664062, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.08291341364383698, "rewards_train/margins": -0.010290443897247314, "rewards_train/rejected": -0.07262296974658966, "step": 28 }, { "epoch": 0.04, "logps_train/chosen": -58.55830764770508, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -48.15802764892578, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.0761432871222496, "rewards_train/margins": -0.02830956131219864, "rewards_train/rejected": -0.047833725810050964, "step": 29 }, { "epoch": 0.04, "learning_rate": 9.933774834437085e-08, "loss": 0.7121, "step": 30 }, { "epoch": 0.04, "logps_train/chosen": -78.21870422363281, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -84.91961669921875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.18594178557395935, "rewards_train/margins": -0.034597113728523254, "rewards_train/rejected": 0.2205388993024826, "step": 30 }, { "epoch": 0.04, "logps_train/chosen": -45.589080810546875, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -52.50957107543945, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.038107212632894516, "rewards_train/margins": -0.12816570326685905, "rewards_train/rejected": 0.09005849063396454, "step": 31 }, { "epoch": 0.04, "learning_rate": 1.0596026490066225e-07, "loss": 0.7561, "step": 32 }, { "epoch": 0.04, "logps_train/chosen": -66.94876098632812, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -78.79197692871094, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.030032292008399963, "rewards_train/margins": -0.1750534325838089, "rewards_train/rejected": 0.14502114057540894, "step": 32 }, { "epoch": 0.04, "logps_train/chosen": -71.43030548095703, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -67.35134887695312, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.10709311813116074, "rewards_train/margins": 0.007729530334472656, "rewards_train/rejected": -0.11482264846563339, "step": 33 }, { "epoch": 0.05, "learning_rate": 1.1258278145695364e-07, "loss": 0.7448, "step": 34 }, { "epoch": 0.05, "logps_train/chosen": -42.91541290283203, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -47.038291931152344, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.025721075013279915, "rewards_train/margins": -0.025016890838742256, "rewards_train/rejected": -0.0007041841745376587, "step": 34 }, { "epoch": 0.05, "logps_train/chosen": -56.560001373291016, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -53.944644927978516, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.09428122639656067, "rewards_train/margins": -0.020129263401031494, "rewards_train/rejected": -0.07415196299552917, "step": 35 }, { "epoch": 0.05, "learning_rate": 1.1920529801324502e-07, "loss": 0.714, "step": 36 }, { "epoch": 0.05, "logps_train/chosen": -34.96800231933594, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -53.19981002807617, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.09406614303588867, "rewards_train/margins": -0.22877249121665955, "rewards_train/rejected": 0.13470634818077087, "step": 36 }, { "epoch": 0.05, "logps_train/chosen": -83.79179382324219, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -73.38970184326172, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.041914697736501694, "rewards_train/margins": 0.19865834340453148, "rewards_train/rejected": -0.15674364566802979, "step": 37 }, { "epoch": 0.05, "learning_rate": 1.2582781456953642e-07, "loss": 0.736, "step": 38 }, { "epoch": 0.05, "logps_train/chosen": -50.90718078613281, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -53.10429763793945, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.027436543256044388, "rewards_train/margins": 0.13768086954951286, "rewards_train/rejected": -0.16511741280555725, "step": 38 }, { "epoch": 0.05, "logps_train/chosen": -51.373138427734375, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -44.57908630371094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.015810921788215637, "rewards_train/margins": 0.08465710282325745, "rewards_train/rejected": -0.06884618103504181, "step": 39 }, { "epoch": 0.05, "learning_rate": 1.324503311258278e-07, "loss": 0.6463, "step": 40 }, { "epoch": 0.05, "logps_train/chosen": -62.13481903076172, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -58.01111602783203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.06776849925518036, "rewards_train/margins": 0.07356762979179621, "rewards_train/rejected": -0.0057991305366158485, "step": 40 }, { "epoch": 0.05, "logps_train/chosen": -93.16519165039062, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -83.23350524902344, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.01339399442076683, "rewards_train/margins": -0.15098082646727562, "rewards_train/rejected": 0.1375868320465088, "step": 41 }, { "epoch": 0.06, "learning_rate": 1.390728476821192e-07, "loss": 0.7213, "step": 42 }, { "epoch": 0.06, "logps_train/chosen": -68.20494079589844, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -75.3348388671875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.023731615394353867, "rewards_train/margins": 0.015028480440378189, "rewards_train/rejected": 0.008703134953975677, "step": 42 }, { "epoch": 0.06, "logps_train/chosen": -38.220970153808594, "logps_train/ref_chosen": -38.75, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -50.798240661621094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.043137263506650925, "rewards_train/margins": 0.12178956344723701, "rewards_train/rejected": -0.07865229994058609, "step": 43 }, { "epoch": 0.06, "learning_rate": 1.4569536423841058e-07, "loss": 0.6651, "step": 44 }, { "epoch": 0.06, "logps_train/chosen": -86.4616928100586, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -92.05072784423828, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.11179409921169281, "rewards_train/margins": -0.18172165006399155, "rewards_train/rejected": 0.06992755085229874, "step": 44 }, { "epoch": 0.06, "logps_train/chosen": -46.09622573852539, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -50.20856475830078, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.06693994998931885, "rewards_train/margins": 0.0014684349298477173, "rewards_train/rejected": 0.06547151505947113, "step": 45 }, { "epoch": 0.06, "learning_rate": 1.5231788079470197e-07, "loss": 0.7467, "step": 46 }, { "epoch": 0.06, "logps_train/chosen": -49.72835922241211, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -40.57377243041992, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.03846099227666855, "rewards_train/margins": -0.011552426964044571, "rewards_train/rejected": -0.026908565312623978, "step": 46 }, { "epoch": 0.06, "logps_train/chosen": -80.9888916015625, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -66.10010528564453, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.010486461222171783, "rewards_train/margins": 0.09432516247034073, "rewards_train/rejected": -0.08383870124816895, "step": 47 }, { "epoch": 0.06, "learning_rate": 1.5894039735099338e-07, "loss": 0.6786, "step": 48 }, { "epoch": 0.06, "logps_train/chosen": -67.37806701660156, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -53.611351013183594, "rewards_train/accuracies": 0.25, "rewards_train/chosen": 0.03563149273395538, "rewards_train/margins": -0.11026449501514435, "rewards_train/rejected": 0.14589598774909973, "step": 48 }, { "epoch": 0.07, "logps_train/chosen": -72.42200469970703, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -81.93806457519531, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.1007942259311676, "rewards_train/margins": -0.01948826014995575, "rewards_train/rejected": -0.08130596578121185, "step": 49 }, { "epoch": 0.07, "learning_rate": 1.6556291390728477e-07, "loss": 0.7421, "step": 50 }, { "epoch": 0.07, "logps_train/chosen": -74.96185302734375, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -91.39497375488281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.1663145124912262, "rewards_train/margins": 0.037061452865600586, "rewards_train/rejected": 0.1292530596256256, "step": 50 }, { "epoch": 0.07, "logps_train/chosen": -25.62216567993164, "logps_train/ref_chosen": -25.0, "logps_train/ref_rejected": -31.5, "logps_train/rejected": -33.801307678222656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.06182580441236496, "rewards_train/margins": 0.168891079723835, "rewards_train/rejected": -0.23071688413619995, "step": 51 }, { "epoch": 0.07, "learning_rate": 1.7218543046357613e-07, "loss": 0.6492, "step": 52 }, { "epoch": 0.07, "logps_train/chosen": -80.94408416748047, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -88.69902038574219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.06887286901473999, "rewards_train/margins": 0.1254931017756462, "rewards_train/rejected": -0.05662023276090622, "step": 52 }, { "epoch": 0.07, "logps_train/chosen": -70.8918685913086, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -85.67730712890625, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.16321057081222534, "rewards_train/margins": -0.11891760677099228, "rewards_train/rejected": -0.04429296404123306, "step": 53 }, { "epoch": 0.07, "learning_rate": 1.7880794701986754e-07, "loss": 0.7014, "step": 54 }, { "epoch": 0.07, "logps_train/chosen": -40.09686279296875, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -42.26985549926758, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.005779888480901718, "rewards_train/margins": -0.07396034523844719, "rewards_train/rejected": 0.06818045675754547, "step": 54 }, { "epoch": 0.07, "logps_train/chosen": -55.24117660522461, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -71.84161376953125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.040524162352085114, "rewards_train/margins": 0.029574863612651825, "rewards_train/rejected": -0.07009902596473694, "step": 55 }, { "epoch": 0.07, "learning_rate": 1.8543046357615893e-07, "loss": 0.7135, "step": 56 }, { "epoch": 0.07, "logps_train/chosen": -111.13737487792969, "logps_train/ref_chosen": -111.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -81.009033203125, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.030925631523132324, "rewards_train/margins": -0.036272644996643066, "rewards_train/rejected": 0.005347013473510742, "step": 56 }, { "epoch": 0.08, "logps_train/chosen": -68.06702423095703, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -74.94760131835938, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.009703557938337326, "rewards_train/margins": -0.05510689690709114, "rewards_train/rejected": 0.06481045484542847, "step": 57 }, { "epoch": 0.08, "learning_rate": 1.9205298013245034e-07, "loss": 0.7229, "step": 58 }, { "epoch": 0.08, "logps_train/chosen": -81.5542221069336, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -82.67008972167969, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.03855158016085625, "rewards_train/margins": -0.01333947665989399, "rewards_train/rejected": -0.025212103500962257, "step": 58 }, { "epoch": 0.08, "logps_train/chosen": -62.39722442626953, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -59.491939544677734, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.09909740090370178, "rewards_train/margins": -0.12177861109375954, "rewards_train/rejected": 0.022681210190057755, "step": 59 }, { "epoch": 0.08, "learning_rate": 1.986754966887417e-07, "loss": 0.7326, "step": 60 }, { "epoch": 0.08, "logps_train/chosen": -54.281272888183594, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -54.476112365722656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.032028935849666595, "rewards_train/margins": -0.020848341286182404, "rewards_train/rejected": 0.052877277135849, "step": 60 }, { "epoch": 0.08, "logps_train/chosen": -58.266319274902344, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -63.26285171508789, "rewards_train/accuracies": 0.125, "rewards_train/chosen": -0.1311240941286087, "rewards_train/margins": -0.347710058093071, "rewards_train/rejected": 0.21658596396446228, "step": 61 }, { "epoch": 0.08, "learning_rate": 2.052980132450331e-07, "loss": 0.808, "step": 62 }, { "epoch": 0.08, "logps_train/chosen": -57.68476867675781, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -64.23268127441406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.11433573067188263, "rewards_train/margins": 0.2719792574644089, "rewards_train/rejected": -0.15764352679252625, "step": 62 }, { "epoch": 0.08, "logps_train/chosen": -44.09886932373047, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -57.92987060546875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.11184025555849075, "rewards_train/margins": -0.031353630125522614, "rewards_train/rejected": -0.08048662543296814, "step": 63 }, { "epoch": 0.08, "learning_rate": 2.119205298013245e-07, "loss": 0.6448, "step": 64 }, { "epoch": 0.08, "logps_train/chosen": -83.86918640136719, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -78.87030029296875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.1119191125035286, "rewards_train/margins": 0.15011154860258102, "rewards_train/rejected": -0.2620306611061096, "step": 64 }, { "epoch": 0.09, "logps_train/chosen": -44.80962371826172, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -58.183319091796875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.06494668126106262, "rewards_train/margins": -0.02473984658718109, "rewards_train/rejected": -0.04020683467388153, "step": 65 }, { "epoch": 0.09, "learning_rate": 2.185430463576159e-07, "loss": 0.6741, "step": 66 }, { "epoch": 0.09, "logps_train/chosen": -72.55757904052734, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -69.10940551757812, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.12450781464576721, "rewards_train/margins": -0.13700440526008606, "rewards_train/rejected": 0.012496590614318848, "step": 66 }, { "epoch": 0.09, "logps_train/chosen": -46.07583999633789, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -50.48297119140625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.06383398920297623, "rewards_train/margins": -0.010849431157112122, "rewards_train/rejected": -0.052984558045864105, "step": 67 }, { "epoch": 0.09, "learning_rate": 2.2516556291390728e-07, "loss": 0.737, "step": 68 }, { "epoch": 0.09, "logps_train/chosen": -52.24906539916992, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -53.30507278442383, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.12490688264369965, "rewards_train/margins": 0.043100327253341675, "rewards_train/rejected": -0.16800720989704132, "step": 68 }, { "epoch": 0.09, "logps_train/chosen": -79.39360046386719, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -74.55288696289062, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.08623476326465607, "rewards_train/margins": 0.01436634361743927, "rewards_train/rejected": -0.10060110688209534, "step": 69 }, { "epoch": 0.09, "learning_rate": 2.3178807947019866e-07, "loss": 0.6844, "step": 70 }, { "epoch": 0.09, "logps_train/chosen": -91.63473510742188, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -92.17798614501953, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.13925480842590332, "rewards_train/margins": 0.10901263356208801, "rewards_train/rejected": -0.24826744198799133, "step": 70 }, { "epoch": 0.09, "logps_train/chosen": -63.37866973876953, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -70.65562438964844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.08474203944206238, "rewards_train/margins": 0.07769595086574554, "rewards_train/rejected": -0.16243799030780792, "step": 71 }, { "epoch": 0.1, "learning_rate": 2.3841059602649005e-07, "loss": 0.6567, "step": 72 }, { "epoch": 0.1, "logps_train/chosen": -71.5158462524414, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -117.028076171875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.056227780878543854, "rewards_train/margins": 0.11840992420911789, "rewards_train/rejected": -0.062182143330574036, "step": 72 }, { "epoch": 0.1, "logps_train/chosen": -45.88077926635742, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -43.769962310791016, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.08885923773050308, "rewards_train/margins": -0.039987921714782715, "rewards_train/rejected": -0.04887131601572037, "step": 73 }, { "epoch": 0.1, "learning_rate": 2.4503311258278146e-07, "loss": 0.6879, "step": 74 }, { "epoch": 0.1, "logps_train/chosen": -52.970603942871094, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -47.509727478027344, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.07206050306558609, "rewards_train/margins": -0.07577532436698675, "rewards_train/rejected": 0.0037148213014006615, "step": 74 }, { "epoch": 0.1, "logps_train/chosen": -53.47748565673828, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -40.0, "logps_train/rejected": -39.79557800292969, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.014936279505491257, "rewards_train/margins": -0.033034609630703926, "rewards_train/rejected": 0.01809833012521267, "step": 75 }, { "epoch": 0.1, "learning_rate": 2.5165562913907285e-07, "loss": 0.7242, "step": 76 }, { "epoch": 0.1, "logps_train/chosen": -65.38147735595703, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -62.105567932128906, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.010022591799497604, "rewards_train/margins": -0.05102801322937012, "rewards_train/rejected": 0.04100542142987251, "step": 76 }, { "epoch": 0.1, "logps_train/chosen": -87.7823257446289, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -114.4891128540039, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.0835060402750969, "rewards_train/margins": -0.020532339811325073, "rewards_train/rejected": -0.06297370046377182, "step": 77 }, { "epoch": 0.1, "learning_rate": 2.5827814569536424e-07, "loss": 0.7218, "step": 78 }, { "epoch": 0.1, "logps_train/chosen": -39.83503341674805, "logps_train/ref_chosen": -38.25, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -45.94615936279297, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.1506907194852829, "rewards_train/margins": -0.07013702392578125, "rewards_train/rejected": -0.08055369555950165, "step": 78 }, { "epoch": 0.1, "logps_train/chosen": -48.3602294921875, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -48.82488250732422, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.12469467520713806, "rewards_train/margins": -0.06408148258924484, "rewards_train/rejected": -0.06061319261789322, "step": 79 }, { "epoch": 0.11, "learning_rate": 2.649006622516556e-07, "loss": 0.7301, "step": 80 }, { "epoch": 0.11, "logps_train/chosen": -57.425209045410156, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -65.28491973876953, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.04888518899679184, "rewards_train/margins": 0.09300201013684273, "rewards_train/rejected": -0.04411682114005089, "step": 80 }, { "epoch": 0.11, "logps_train/chosen": -62.06060028076172, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -76.48416137695312, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.09512293338775635, "rewards_train/margins": 0.05173105001449585, "rewards_train/rejected": -0.1468539834022522, "step": 81 }, { "epoch": 0.11, "learning_rate": 2.71523178807947e-07, "loss": 0.6648, "step": 82 }, { "epoch": 0.11, "logps_train/chosen": -70.86492919921875, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -55.90704345703125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.10055575519800186, "rewards_train/margins": 0.17217954248189926, "rewards_train/rejected": -0.2727352976799011, "step": 82 }, { "epoch": 0.11, "logps_train/chosen": -95.26240539550781, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -121.41331481933594, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.15436625480651855, "rewards_train/margins": -0.12553440034389496, "rewards_train/rejected": -0.028831854462623596, "step": 83 }, { "epoch": 0.11, "learning_rate": 2.781456953642384e-07, "loss": 0.6969, "step": 84 }, { "epoch": 0.11, "logps_train/chosen": -68.42686462402344, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -66.47966766357422, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.07549843937158585, "rewards_train/margins": -0.163469098508358, "rewards_train/rejected": 0.08797065913677216, "step": 84 }, { "epoch": 0.11, "logps_train/chosen": -41.177154541015625, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -36.0, "logps_train/rejected": -36.356689453125, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.038027845323085785, "rewards_train/margins": -0.009780626744031906, "rewards_train/rejected": -0.02824721857905388, "step": 85 }, { "epoch": 0.11, "learning_rate": 2.847682119205298e-07, "loss": 0.7441, "step": 86 }, { "epoch": 0.11, "logps_train/chosen": -57.412818908691406, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -62.96641540527344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.15053001046180725, "rewards_train/margins": 0.15654639713466167, "rewards_train/rejected": -0.0060163866728544235, "step": 86 }, { "epoch": 0.12, "logps_train/chosen": -56.833213806152344, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -68.21234893798828, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.05027265101671219, "rewards_train/margins": 0.034007830545306206, "rewards_train/rejected": 0.016264820471405983, "step": 87 }, { "epoch": 0.12, "learning_rate": 2.9139072847682117e-07, "loss": 0.6559, "step": 88 }, { "epoch": 0.12, "logps_train/chosen": -76.35112762451172, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -61.98542022705078, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.014800550416111946, "rewards_train/margins": 0.17749163322150707, "rewards_train/rejected": -0.19229218363761902, "step": 88 }, { "epoch": 0.12, "logps_train/chosen": -49.15941619873047, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -47.03160858154297, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.03679283335804939, "rewards_train/margins": 0.07862583175301552, "rewards_train/rejected": -0.041832998394966125, "step": 89 }, { "epoch": 0.12, "learning_rate": 2.980132450331126e-07, "loss": 0.6457, "step": 90 }, { "epoch": 0.12, "logps_train/chosen": -63.048431396484375, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -58.56714630126953, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.21578097343444824, "rewards_train/margins": 0.09093338251113892, "rewards_train/rejected": -0.30671435594558716, "step": 90 }, { "epoch": 0.12, "logps_train/chosen": -46.299842834472656, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -35.97846984863281, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.050296660512685776, "rewards_train/margins": -0.036824654787778854, "rewards_train/rejected": -0.013472005724906921, "step": 91 }, { "epoch": 0.12, "learning_rate": 3.0463576158940394e-07, "loss": 0.6899, "step": 92 }, { "epoch": 0.12, "logps_train/chosen": -43.684913635253906, "logps_train/ref_chosen": -43.25, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -49.776309967041016, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.03724118322134018, "rewards_train/margins": -0.13539118319749832, "rewards_train/rejected": 0.09814999997615814, "step": 92 }, { "epoch": 0.12, "logps_train/chosen": -92.63056945800781, "logps_train/ref_chosen": -95.5, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -108.69706726074219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.26770493388175964, "rewards_train/margins": 0.3069436140358448, "rewards_train/rejected": -0.03923868015408516, "step": 93 }, { "epoch": 0.12, "learning_rate": 3.1125827814569533e-07, "loss": 0.6728, "step": 94 }, { "epoch": 0.12, "logps_train/chosen": -64.66177368164062, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -74.56458282470703, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.2021145224571228, "rewards_train/margins": -0.059718817472457886, "rewards_train/rejected": -0.14239570498466492, "step": 94 }, { "epoch": 0.13, "logps_train/chosen": -48.30315399169922, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -68.46471405029297, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.03843483328819275, "rewards_train/margins": 0.07865602150559425, "rewards_train/rejected": -0.040221188217401505, "step": 95 }, { "epoch": 0.13, "learning_rate": 3.1788079470198677e-07, "loss": 0.7181, "step": 96 }, { "epoch": 0.13, "logps_train/chosen": -91.6103286743164, "logps_train/ref_chosen": -90.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -82.10276794433594, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.12275154143571854, "rewards_train/margins": -0.2640368416905403, "rewards_train/rejected": 0.14128530025482178, "step": 96 }, { "epoch": 0.13, "logps_train/chosen": -76.30892944335938, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -74.22945404052734, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.012143261730670929, "rewards_train/margins": -0.05716678127646446, "rewards_train/rejected": 0.04502351954579353, "step": 97 }, { "epoch": 0.13, "learning_rate": 3.245033112582781e-07, "loss": 0.7964, "step": 98 }, { "epoch": 0.13, "logps_train/chosen": -51.50962448120117, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -78.12258911132812, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.08299360424280167, "rewards_train/margins": -0.08714120648801327, "rewards_train/rejected": 0.004147602245211601, "step": 98 }, { "epoch": 0.13, "logps_train/chosen": -61.6601448059082, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -55.41648864746094, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.061329349875450134, "rewards_train/margins": -0.034521862864494324, "rewards_train/rejected": 0.09585121273994446, "step": 99 }, { "epoch": 0.13, "learning_rate": 3.3112582781456954e-07, "loss": 0.7289, "step": 100 }, { "epoch": 0.13, "logps_train/chosen": -67.82522583007812, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -52.624847412109375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.11611630767583847, "rewards_train/margins": 0.20339957624673843, "rewards_train/rejected": -0.3195158839225769, "step": 100 }, { "epoch": 0.13, "logps_train/chosen": -112.18073272705078, "logps_train/ref_chosen": -111.5, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -105.205078125, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.08994840830564499, "rewards_train/margins": -0.010066166520118713, "rewards_train/rejected": -0.07988224178552628, "step": 101 }, { "epoch": 0.14, "learning_rate": 3.3774834437086093e-07, "loss": 0.6666, "step": 102 }, { "epoch": 0.14, "logps_train/chosen": -61.86896514892578, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -78.6822509765625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.15564638376235962, "rewards_train/margins": -0.11242171376943588, "rewards_train/rejected": -0.04322466999292374, "step": 102 }, { "epoch": 0.14, "logps_train/chosen": -50.12421798706055, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -55.02149963378906, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.1355273276567459, "rewards_train/margins": -0.05919753015041351, "rewards_train/rejected": 0.19472485780715942, "step": 103 }, { "epoch": 0.14, "learning_rate": 3.4437086092715226e-07, "loss": 0.7486, "step": 104 }, { "epoch": 0.14, "logps_train/chosen": -75.27784729003906, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -101.50634765625, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.173777773976326, "rewards_train/margins": 0.15253830701112747, "rewards_train/rejected": 0.021239466965198517, "step": 104 }, { "epoch": 0.14, "logps_train/chosen": -107.19461059570312, "logps_train/ref_chosen": -105.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -72.81076049804688, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.15149196982383728, "rewards_train/margins": -0.021587401628494263, "rewards_train/rejected": -0.12990456819534302, "step": 105 }, { "epoch": 0.14, "learning_rate": 3.509933774834437e-07, "loss": 0.6806, "step": 106 }, { "epoch": 0.14, "logps_train/chosen": -81.36860656738281, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -64.16646575927734, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.0951707661151886, "rewards_train/margins": 0.016504988074302673, "rewards_train/rejected": 0.07866577804088593, "step": 106 }, { "epoch": 0.14, "logps_train/chosen": -37.63343048095703, "logps_train/ref_chosen": -37.75, "logps_train/ref_rejected": -36.75, "logps_train/rejected": -36.02953338623047, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.003844795748591423, "rewards_train/margins": -0.07054592855274677, "rewards_train/rejected": 0.0743907243013382, "step": 107 }, { "epoch": 0.14, "learning_rate": 3.576158940397351e-07, "loss": 0.7129, "step": 108 }, { "epoch": 0.14, "logps_train/chosen": -44.51172637939453, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -51.17156982421875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.09570383280515671, "rewards_train/margins": 0.08864045888185501, "rewards_train/rejected": -0.18434429168701172, "step": 108 }, { "epoch": 0.14, "logps_train/chosen": -51.38105392456055, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -53.045955657958984, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.08738280832767487, "rewards_train/margins": 0.03963473439216614, "rewards_train/rejected": 0.04774807393550873, "step": 109 }, { "epoch": 0.15, "learning_rate": 3.642384105960264e-07, "loss": 0.6678, "step": 110 }, { "epoch": 0.15, "logps_train/chosen": -49.302364349365234, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -58.857460021972656, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.07320532202720642, "rewards_train/margins": 0.07660304009914398, "rewards_train/rejected": -0.1498083621263504, "step": 110 }, { "epoch": 0.15, "logps_train/chosen": -72.46064758300781, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -66.34900665283203, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.15856443345546722, "rewards_train/margins": -0.06116417050361633, "rewards_train/rejected": -0.09740026295185089, "step": 111 }, { "epoch": 0.15, "learning_rate": 3.7086092715231786e-07, "loss": 0.6988, "step": 112 }, { "epoch": 0.15, "logps_train/chosen": -71.7870101928711, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -82.71179962158203, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.03956109285354614, "rewards_train/margins": 0.056053757667541504, "rewards_train/rejected": -0.01649266481399536, "step": 112 }, { "epoch": 0.15, "logps_train/chosen": -72.26091003417969, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -57.19587707519531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.0692216157913208, "rewards_train/margins": 0.19974707067012787, "rewards_train/rejected": -0.13052545487880707, "step": 113 }, { "epoch": 0.15, "learning_rate": 3.7748344370860925e-07, "loss": 0.6416, "step": 114 }, { "epoch": 0.15, "logps_train/chosen": -48.94877624511719, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -42.96794891357422, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.007377296686172485, "rewards_train/margins": -0.02152006048709154, "rewards_train/rejected": 0.014142763800919056, "step": 114 }, { "epoch": 0.15, "logps_train/chosen": -118.05181884765625, "logps_train/ref_chosen": -118.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -110.0438232421875, "rewards_train/accuracies": 0.25, "rewards_train/chosen": 0.00106887798756361, "rewards_train/margins": -0.07892351318150759, "rewards_train/rejected": 0.0799923911690712, "step": 115 }, { "epoch": 0.15, "learning_rate": 3.841059602649007e-07, "loss": 0.7267, "step": 116 }, { "epoch": 0.15, "logps_train/chosen": -58.02381896972656, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -64.1988754272461, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.10494227707386017, "rewards_train/margins": 0.084204763174057, "rewards_train/rejected": 0.02073751389980316, "step": 116 }, { "epoch": 0.16, "logps_train/chosen": -73.90216064453125, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -90.1214599609375, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.17615409195423126, "rewards_train/margins": -0.17650871723890305, "rewards_train/rejected": 0.00035462528467178345, "step": 117 }, { "epoch": 0.16, "learning_rate": 3.90728476821192e-07, "loss": 0.7343, "step": 118 }, { "epoch": 0.16, "logps_train/chosen": -51.456459045410156, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -63.093505859375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.0026772618293762207, "rewards_train/margins": 0.1941731870174408, "rewards_train/rejected": -0.19685044884681702, "step": 118 }, { "epoch": 0.16, "logps_train/chosen": -53.95072937011719, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -51.750728607177734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.05492675304412842, "rewards_train/margins": 0.017499543726444244, "rewards_train/rejected": 0.037427209317684174, "step": 119 }, { "epoch": 0.16, "learning_rate": 3.973509933774834e-07, "loss": 0.6564, "step": 120 }, { "epoch": 0.16, "logps_train/chosen": -95.12608337402344, "logps_train/ref_chosen": -97.0, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -95.79153442382812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.16942286491394043, "rewards_train/margins": 0.1782645247876644, "rewards_train/rejected": -0.008841659873723984, "step": 120 }, { "epoch": 0.16, "logps_train/chosen": -65.90806579589844, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -72.44938659667969, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.06033756583929062, "rewards_train/margins": 0.05335157364606857, "rewards_train/rejected": -0.11368913948535919, "step": 121 }, { "epoch": 0.16, "learning_rate": 4.0397350993377485e-07, "loss": 0.6521, "step": 122 }, { "epoch": 0.16, "logps_train/chosen": -66.84635162353516, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -52.49150085449219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.022135093808174133, "rewards_train/margins": 0.12310890853404999, "rewards_train/rejected": -0.14524400234222412, "step": 122 }, { "epoch": 0.16, "logps_train/chosen": -53.78950500488281, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -67.31559753417969, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.04917469620704651, "rewards_train/margins": 0.1572965681552887, "rewards_train/rejected": -0.10812187194824219, "step": 123 }, { "epoch": 0.16, "learning_rate": 4.105960264900662e-07, "loss": 0.6372, "step": 124 }, { "epoch": 0.16, "logps_train/chosen": -61.14987564086914, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -56.155784606933594, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.04779965430498123, "rewards_train/margins": 0.0021537616848945618, "rewards_train/rejected": -0.04995341598987579, "step": 124 }, { "epoch": 0.17, "logps_train/chosen": -40.7641487121582, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -48.47035217285156, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.019383713603019714, "rewards_train/margins": 0.0010889321565628052, "rewards_train/rejected": -0.02047264575958252, "step": 125 }, { "epoch": 0.17, "learning_rate": 4.172185430463576e-07, "loss": 0.699, "step": 126 }, { "epoch": 0.17, "logps_train/chosen": -57.5948371887207, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -62.32328796386719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.042078785598278046, "rewards_train/margins": 0.041594890877604485, "rewards_train/rejected": 0.0004838947206735611, "step": 126 }, { "epoch": 0.17, "logps_train/chosen": -71.44166564941406, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -67.69255828857422, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.19489586353302002, "rewards_train/margins": 0.13602691888809204, "rewards_train/rejected": 0.05886894464492798, "step": 127 }, { "epoch": 0.17, "learning_rate": 4.23841059602649e-07, "loss": 0.6663, "step": 128 }, { "epoch": 0.17, "logps_train/chosen": -78.86708068847656, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -81.16221618652344, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.013291553594172001, "rewards_train/margins": -0.024784128181636333, "rewards_train/rejected": 0.038075681775808334, "step": 128 }, { "epoch": 0.17, "logps_train/chosen": -57.14472961425781, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -58.5099983215332, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.14333954453468323, "rewards_train/margins": 0.1412142775952816, "rewards_train/rejected": 0.0021252669394016266, "step": 129 }, { "epoch": 0.17, "learning_rate": 4.3046357615894034e-07, "loss": 0.6866, "step": 130 }, { "epoch": 0.17, "logps_train/chosen": -64.92575073242188, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -63.77936553955078, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.07930011302232742, "rewards_train/margins": 0.21036160737276077, "rewards_train/rejected": -0.13106149435043335, "step": 130 }, { "epoch": 0.17, "logps_train/chosen": -70.21444702148438, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -77.1873779296875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.1667574793100357, "rewards_train/margins": -0.016770094633102417, "rewards_train/rejected": -0.1499873846769333, "step": 131 }, { "epoch": 0.18, "learning_rate": 4.370860927152318e-07, "loss": 0.6749, "step": 132 }, { "epoch": 0.18, "logps_train/chosen": -87.6690673828125, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -94.22894287109375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.09090535342693329, "rewards_train/margins": 0.2075498104095459, "rewards_train/rejected": -0.11664445698261261, "step": 132 }, { "epoch": 0.18, "logps_train/chosen": -82.33617401123047, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -67.46888732910156, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.04333607479929924, "rewards_train/margins": -0.013290923088788986, "rewards_train/rejected": 0.056626997888088226, "step": 133 }, { "epoch": 0.18, "learning_rate": 4.4370860927152317e-07, "loss": 0.6607, "step": 134 }, { "epoch": 0.18, "logps_train/chosen": -75.47453308105469, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -91.5, "logps_train/rejected": -92.07035064697266, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.03692217171192169, "rewards_train/margins": 0.08458223938941956, "rewards_train/rejected": -0.047660067677497864, "step": 134 }, { "epoch": 0.18, "logps_train/chosen": -54.34320831298828, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -54.20006561279297, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.09146048873662949, "rewards_train/margins": 0.048185981810092926, "rewards_train/rejected": 0.04327450692653656, "step": 135 }, { "epoch": 0.18, "learning_rate": 4.5033112582781455e-07, "loss": 0.6764, "step": 136 }, { "epoch": 0.18, "logps_train/chosen": -70.4914321899414, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -66.65330505371094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.1680445671081543, "rewards_train/margins": 0.2169685736298561, "rewards_train/rejected": -0.04892400652170181, "step": 136 }, { "epoch": 0.18, "logps_train/chosen": -100.63397216796875, "logps_train/ref_chosen": -101.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -79.61186218261719, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.04519620165228844, "rewards_train/margins": 0.07474162802100182, "rewards_train/rejected": -0.02954542636871338, "step": 137 }, { "epoch": 0.18, "learning_rate": 4.5695364238410594e-07, "loss": 0.6578, "step": 138 }, { "epoch": 0.18, "logps_train/chosen": -88.49551391601562, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -88.72273254394531, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.04419870302081108, "rewards_train/margins": -0.0772779993712902, "rewards_train/rejected": 0.12147670239210129, "step": 138 }, { "epoch": 0.18, "logps_train/chosen": -56.23085021972656, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -57.55557632446289, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.005040287971496582, "rewards_train/margins": 0.28872281312942505, "rewards_train/rejected": -0.28368252515792847, "step": 139 }, { "epoch": 0.19, "learning_rate": 4.635761589403973e-07, "loss": 0.6585, "step": 140 }, { "epoch": 0.19, "logps_train/chosen": -95.6607437133789, "logps_train/ref_chosen": -97.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -83.00914764404297, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.14154255390167236, "rewards_train/margins": 0.18151971325278282, "rewards_train/rejected": -0.03997715935111046, "step": 140 }, { "epoch": 0.19, "logps_train/chosen": -83.20655059814453, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -67.08049011230469, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.005029797554016113, "rewards_train/margins": -0.08760550618171692, "rewards_train/rejected": 0.0825757086277008, "step": 141 }, { "epoch": 0.19, "learning_rate": 4.701986754966887e-07, "loss": 0.7011, "step": 142 }, { "epoch": 0.19, "logps_train/chosen": -58.488731384277344, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -64.4619369506836, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.13081428408622742, "rewards_train/margins": 0.18716440349817276, "rewards_train/rejected": -0.05635011941194534, "step": 142 }, { "epoch": 0.19, "logps_train/chosen": -44.711212158203125, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -55.97105407714844, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.12073065340518951, "rewards_train/margins": -0.0017506703734397888, "rewards_train/rejected": -0.11897998303174973, "step": 143 }, { "epoch": 0.19, "learning_rate": 4.768211920529801e-07, "loss": 0.6712, "step": 144 }, { "epoch": 0.19, "logps_train/chosen": -61.643856048583984, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -61.83428192138672, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.16862204670906067, "rewards_train/margins": 0.08330032229423523, "rewards_train/rejected": 0.08532172441482544, "step": 144 }, { "epoch": 0.19, "logps_train/chosen": -49.488956451416016, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -42.87102508544922, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.06829171627759933, "rewards_train/margins": 0.03742557391524315, "rewards_train/rejected": 0.030866142362356186, "step": 145 }, { "epoch": 0.19, "learning_rate": 4.834437086092715e-07, "loss": 0.6815, "step": 146 }, { "epoch": 0.19, "logps_train/chosen": -75.34461975097656, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -77.51017761230469, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.0483504980802536, "rewards_train/margins": 0.047025082632899284, "rewards_train/rejected": 0.0013254154473543167, "step": 146 }, { "epoch": 0.2, "logps_train/chosen": -82.11029052734375, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -86.05786895751953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06084573641419411, "rewards_train/margins": 0.4760078825056553, "rewards_train/rejected": -0.4151621460914612, "step": 147 }, { "epoch": 0.2, "learning_rate": 4.900662251655629e-07, "loss": 0.5963, "step": 148 }, { "epoch": 0.2, "logps_train/chosen": -78.65860748291016, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -69.48919677734375, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.07289192080497742, "rewards_train/margins": 0.09243376553058624, "rewards_train/rejected": -0.16532568633556366, "step": 148 }, { "epoch": 0.2, "logps_train/chosen": -86.75151062011719, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -82.36729431152344, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.1904740333557129, "rewards_train/margins": 0.1213437020778656, "rewards_train/rejected": 0.06913033127784729, "step": 149 }, { "epoch": 0.2, "learning_rate": 4.966887417218543e-07, "loss": 0.6674, "step": 150 }, { "epoch": 0.2, "logps_train/chosen": -61.20390701293945, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -40.24456024169922, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.18273454904556274, "rewards_train/margins": 0.16109668835997581, "rewards_train/rejected": 0.02163786068558693, "step": 150 }, { "epoch": 0.2, "logps_train/chosen": -45.33393096923828, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -49.1523323059082, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.023637887090444565, "rewards_train/margins": 0.15449609979987144, "rewards_train/rejected": -0.13085821270942688, "step": 151 }, { "epoch": 0.2, "learning_rate": 4.999998488565838e-07, "loss": 0.6326, "step": 152 }, { "epoch": 0.2, "logps_train/chosen": -67.12495422363281, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -90.41917419433594, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.14062972366809845, "rewards_train/margins": 0.3200472891330719, "rewards_train/rejected": -0.17941756546497345, "step": 152 }, { "epoch": 0.2, "logps_train/chosen": -35.22588348388672, "logps_train/ref_chosen": -34.5, "logps_train/ref_rejected": -42.75, "logps_train/rejected": -43.22602081298828, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.06067415326833725, "rewards_train/margins": -0.0028182752430438995, "rewards_train/rejected": -0.05785587802529335, "step": 153 }, { "epoch": 0.2, "learning_rate": 4.999986397103514e-07, "loss": 0.6345, "step": 154 }, { "epoch": 0.2, "logps_train/chosen": -53.85517883300781, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -63.273860931396484, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.254268079996109, "rewards_train/margins": -0.017507046461105347, "rewards_train/rejected": -0.23676103353500366, "step": 154 }, { "epoch": 0.21, "logps_train/chosen": -54.74589920043945, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -59.33473205566406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.1675974428653717, "rewards_train/margins": 0.11825825273990631, "rewards_train/rejected": 0.04933919012546539, "step": 155 }, { "epoch": 0.21, "learning_rate": 4.999962214237345e-07, "loss": 0.6804, "step": 156 }, { "epoch": 0.21, "logps_train/chosen": -44.9425048828125, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -52.465545654296875, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.2215944528579712, "rewards_train/margins": -0.13910240679979324, "rewards_train/rejected": -0.08249204605817795, "step": 156 }, { "epoch": 0.21, "logps_train/chosen": -76.8110580444336, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -85.73584747314453, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.007668402045965195, "rewards_train/margins": -0.08720861002802849, "rewards_train/rejected": 0.0795402079820633, "step": 157 }, { "epoch": 0.21, "learning_rate": 4.999925940084296e-07, "loss": 0.7699, "step": 158 }, { "epoch": 0.21, "logps_train/chosen": -45.07207489013672, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -55.9972038269043, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.08810484409332275, "rewards_train/margins": 0.3167314976453781, "rewards_train/rejected": -0.22862665355205536, "step": 158 }, { "epoch": 0.21, "logps_train/chosen": -91.51863861083984, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -108.4189453125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.14701326191425323, "rewards_train/margins": 0.09125164151191711, "rewards_train/rejected": 0.05576162040233612, "step": 159 }, { "epoch": 0.21, "learning_rate": 4.999877574819808e-07, "loss": 0.6321, "step": 160 }, { "epoch": 0.21, "logps_train/chosen": -64.62169647216797, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -64.21802520751953, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.3722056746482849, "rewards_train/margins": 0.3815084397792816, "rewards_train/rejected": -0.009302765130996704, "step": 160 }, { "epoch": 0.21, "logps_train/chosen": -48.48457336425781, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -55.74340057373047, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.0968550443649292, "rewards_train/margins": 0.12275736406445503, "rewards_train/rejected": -0.025902319699525833, "step": 161 }, { "epoch": 0.22, "learning_rate": 4.999817118677806e-07, "loss": 0.6014, "step": 162 }, { "epoch": 0.22, "logps_train/chosen": -40.219757080078125, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -55.90486145019531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.04697583243250847, "rewards_train/margins": 0.09233813360333443, "rewards_train/rejected": -0.1393139660358429, "step": 162 }, { "epoch": 0.22, "logps_train/chosen": -53.66594696044922, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -57.26887512207031, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.0791085734963417, "rewards_train/margins": -0.001816391944885254, "rewards_train/rejected": 0.08092496544122696, "step": 163 }, { "epoch": 0.22, "learning_rate": 4.999744571950691e-07, "loss": 0.6981, "step": 164 }, { "epoch": 0.22, "logps_train/chosen": -55.405059814453125, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -58.94813919067383, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.1326933205127716, "rewards_train/margins": 0.18868306279182434, "rewards_train/rejected": -0.32137638330459595, "step": 164 }, { "epoch": 0.22, "logps_train/chosen": -62.670623779296875, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -56.03486251831055, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.06935718655586243, "rewards_train/margins": 0.06083807349205017, "rewards_train/rejected": -0.1301952600479126, "step": 165 }, { "epoch": 0.22, "learning_rate": 4.99965993498934e-07, "loss": 0.6793, "step": 166 }, { "epoch": 0.22, "logps_train/chosen": -55.48243713378906, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -57.08064651489258, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.08691291511058807, "rewards_train/margins": 0.2840401232242584, "rewards_train/rejected": -0.19712720811367035, "step": 166 }, { "epoch": 0.22, "logps_train/chosen": -63.22955322265625, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -72.77273559570312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.1098572239279747, "rewards_train/margins": 0.20783356577157974, "rewards_train/rejected": -0.09797634184360504, "step": 167 }, { "epoch": 0.22, "learning_rate": 4.999563208203109e-07, "loss": 0.6387, "step": 168 }, { "epoch": 0.22, "logps_train/chosen": -48.420230865478516, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -50.79782485961914, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.0876644104719162, "rewards_train/margins": 0.45729057490825653, "rewards_train/rejected": -0.36962616443634033, "step": 168 }, { "epoch": 0.22, "logps_train/chosen": -60.79928970336914, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -60.372554779052734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.16382136940956116, "rewards_train/margins": 0.6014676988124847, "rewards_train/rejected": -0.4376463294029236, "step": 169 }, { "epoch": 0.23, "learning_rate": 4.999454392059824e-07, "loss": 0.5151, "step": 170 }, { "epoch": 0.23, "logps_train/chosen": -80.40921783447266, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -103.83680725097656, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.03154726326465607, "rewards_train/margins": 0.05682013928890228, "rewards_train/rejected": -0.08836740255355835, "step": 170 }, { "epoch": 0.23, "logps_train/chosen": -65.59782409667969, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -65.05398559570312, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.019156955182552338, "rewards_train/margins": 0.29854581505060196, "rewards_train/rejected": -0.3177027702331543, "step": 171 }, { "epoch": 0.23, "learning_rate": 4.999333487085786e-07, "loss": 0.6326, "step": 172 }, { "epoch": 0.23, "logps_train/chosen": -43.34238815307617, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -58.07159423828125, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.03938404098153114, "rewards_train/margins": -0.035740336403250694, "rewards_train/rejected": -0.003643704578280449, "step": 172 }, { "epoch": 0.23, "logps_train/chosen": -51.10371398925781, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -56.03058624267578, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.132597416639328, "rewards_train/margins": 0.44737502932548523, "rewards_train/rejected": -0.3147776126861572, "step": 173 }, { "epoch": 0.23, "learning_rate": 4.999200493865761e-07, "loss": 0.6503, "step": 174 }, { "epoch": 0.23, "logps_train/chosen": -53.67700958251953, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -60.64039611816406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.11276771128177643, "rewards_train/margins": 0.5689948946237564, "rewards_train/rejected": -0.45622718334198, "step": 174 }, { "epoch": 0.23, "logps_train/chosen": -71.80726623535156, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -65.7108154296875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.05989867448806763, "rewards_train/margins": 0.2153552770614624, "rewards_train/rejected": -0.15545660257339478, "step": 175 }, { "epoch": 0.23, "learning_rate": 4.99905541304298e-07, "loss": 0.5703, "step": 176 }, { "epoch": 0.23, "logps_train/chosen": -54.552207946777344, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -64.19302368164062, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.016654152423143387, "rewards_train/margins": 0.3797064907848835, "rewards_train/rejected": -0.3630523383617401, "step": 176 }, { "epoch": 0.24, "logps_train/chosen": -39.572669982910156, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -45.44940185546875, "rewards_train/accuracies": 0.125, "rewards_train/chosen": -0.05570477247238159, "rewards_train/margins": -0.10842059552669525, "rewards_train/rejected": 0.05271582305431366, "step": 177 }, { "epoch": 0.24, "learning_rate": 4.998898245319145e-07, "loss": 0.6695, "step": 178 }, { "epoch": 0.24, "logps_train/chosen": -77.93913269042969, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -88.1424560546875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.4013994336128235, "rewards_train/margins": 0.6250197887420654, "rewards_train/rejected": -0.22362035512924194, "step": 178 }, { "epoch": 0.24, "logps_train/chosen": -40.6803092956543, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -49.181480407714844, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.06295301765203476, "rewards_train/margins": 0.1731637939810753, "rewards_train/rejected": -0.23611681163311005, "step": 179 }, { "epoch": 0.24, "learning_rate": 4.998728991454407e-07, "loss": 0.5836, "step": 180 }, { "epoch": 0.24, "logps_train/chosen": -54.1745491027832, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -42.0, "logps_train/rejected": -42.93806457519531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.1278574913740158, "rewards_train/margins": 0.2341638058423996, "rewards_train/rejected": -0.10630631446838379, "step": 180 }, { "epoch": 0.24, "logps_train/chosen": -42.657432556152344, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -49.74334716796875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.0067177414894104, "rewards_train/margins": 0.3238258957862854, "rewards_train/rejected": -0.317108154296875, "step": 181 }, { "epoch": 0.24, "learning_rate": 4.998547652267378e-07, "loss": 0.6148, "step": 182 }, { "epoch": 0.24, "logps_train/chosen": -77.77507019042969, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -122.24191284179688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.3404620885848999, "rewards_train/margins": 0.9459036588668823, "rewards_train/rejected": -0.6054415702819824, "step": 182 }, { "epoch": 0.24, "logps_train/chosen": -61.900638580322266, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -42.25, "logps_train/rejected": -44.3643798828125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.007251359522342682, "rewards_train/margins": 0.1948118731379509, "rewards_train/rejected": -0.20206323266029358, "step": 183 }, { "epoch": 0.24, "learning_rate": 4.998354228635121e-07, "loss": 0.5137, "step": 184 }, { "epoch": 0.24, "logps_train/chosen": -71.55892944335938, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -77.88035583496094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.15745575726032257, "rewards_train/margins": -0.14442064613103867, "rewards_train/rejected": -0.013035111129283905, "step": 184 }, { "epoch": 0.25, "logps_train/chosen": -65.37760925292969, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -73.73092651367188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.02994891256093979, "rewards_train/margins": 0.15876873582601547, "rewards_train/rejected": -0.18871764838695526, "step": 185 }, { "epoch": 0.25, "learning_rate": 4.998148721493147e-07, "loss": 0.7105, "step": 186 }, { "epoch": 0.25, "logps_train/chosen": -53.002296447753906, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -60.96434783935547, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.02633272111415863, "rewards_train/margins": 0.3727674037218094, "rewards_train/rejected": -0.34643468260765076, "step": 186 }, { "epoch": 0.25, "logps_train/chosen": -53.66192626953125, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -50.88130569458008, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.1213073655962944, "rewards_train/margins": 0.1867815926671028, "rewards_train/rejected": -0.06547422707080841, "step": 187 }, { "epoch": 0.25, "learning_rate": 4.997931131835409e-07, "loss": 0.6, "step": 188 }, { "epoch": 0.25, "logps_train/chosen": -87.65532684326172, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -89.2316665649414, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.08290465176105499, "rewards_train/margins": 0.29513366520404816, "rewards_train/rejected": -0.21222901344299316, "step": 188 }, { "epoch": 0.25, "logps_train/chosen": -60.90528869628906, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -64.6880874633789, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.11240382492542267, "rewards_train/margins": 0.08921770751476288, "rewards_train/rejected": -0.20162153244018555, "step": 189 }, { "epoch": 0.25, "learning_rate": 4.997701460714298e-07, "loss": 0.6243, "step": 190 }, { "epoch": 0.25, "logps_train/chosen": -78.42573547363281, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -74.37881469726562, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.029301732778549194, "rewards_train/margins": 0.12187006324529648, "rewards_train/rejected": -0.09256833046674728, "step": 190 }, { "epoch": 0.25, "logps_train/chosen": -39.57395553588867, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -42.25, "logps_train/rejected": -43.86948776245117, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.042326103895902634, "rewards_train/margins": 0.1289978213608265, "rewards_train/rejected": -0.17132392525672913, "step": 191 }, { "epoch": 0.25, "learning_rate": 4.99745970924064e-07, "loss": 0.6525, "step": 192 }, { "epoch": 0.25, "logps_train/chosen": -64.17208862304688, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -100.61174011230469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.19060376286506653, "rewards_train/margins": 0.7814652025699615, "rewards_train/rejected": -0.590861439704895, "step": 192 }, { "epoch": 0.26, "logps_train/chosen": -35.9684944152832, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -39.75, "logps_train/rejected": -43.29087829589844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.036549150943756104, "rewards_train/margins": 0.3875119686126709, "rewards_train/rejected": -0.3509628176689148, "step": 193 }, { "epoch": 0.26, "learning_rate": 4.997205878583687e-07, "loss": 0.4882, "step": 194 }, { "epoch": 0.26, "logps_train/chosen": -73.85863494873047, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -52.19065856933594, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.414917916059494, "rewards_train/margins": 0.3871087599545717, "rewards_train/rejected": 0.027809156104922295, "step": 194 }, { "epoch": 0.26, "logps_train/chosen": -59.111663818359375, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -80.7671127319336, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.07145082205533981, "rewards_train/margins": 0.3192560598254204, "rewards_train/rejected": -0.24780523777008057, "step": 195 }, { "epoch": 0.26, "learning_rate": 4.996939969971111e-07, "loss": 0.5863, "step": 196 }, { "epoch": 0.26, "logps_train/chosen": -86.3590316772461, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -96.23582458496094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 3.456324338912964e-05, "rewards_train/margins": 0.06424279510974884, "rewards_train/rejected": -0.06420823186635971, "step": 196 }, { "epoch": 0.26, "logps_train/chosen": -51.63418197631836, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -77.60122680664062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.07642568647861481, "rewards_train/margins": 0.7146729379892349, "rewards_train/rejected": -0.6382472515106201, "step": 197 }, { "epoch": 0.26, "learning_rate": 4.996661984689006e-07, "loss": 0.564, "step": 198 }, { "epoch": 0.26, "logps_train/chosen": -67.92711639404297, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -68.38245391845703, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.07161794602870941, "rewards_train/margins": 0.09943979978561401, "rewards_train/rejected": -0.17105774581432343, "step": 198 }, { "epoch": 0.26, "logps_train/chosen": -84.33158874511719, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -85.19648742675781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.0538620799779892, "rewards_train/margins": 0.5251610428094864, "rewards_train/rejected": -0.5790231227874756, "step": 199 }, { "epoch": 0.27, "learning_rate": 4.996371924081868e-07, "loss": 0.5895, "step": 200 }, { "epoch": 0.27, "logps_train/chosen": -36.751338958740234, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -39.0, "logps_train/rejected": -41.08473205566406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.12564730644226074, "rewards_train/margins": 0.32513612508773804, "rewards_train/rejected": -0.1994888186454773, "step": 200 }, { "epoch": 0.27, "logps_train/chosen": -72.52227783203125, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -66.40202331542969, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.008477356284856796, "rewards_train/margins": -0.02452481910586357, "rewards_train/rejected": 0.016047462821006775, "step": 201 }, { "epoch": 0.27, "learning_rate": 4.996069789552604e-07, "loss": 0.656, "step": 202 }, { "epoch": 0.27, "logps_train/chosen": -46.98569869995117, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -42.75, "logps_train/rejected": -46.165889739990234, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.09349186718463898, "rewards_train/margins": 0.23715956509113312, "rewards_train/rejected": -0.3306514322757721, "step": 202 }, { "epoch": 0.27, "logps_train/chosen": -61.59912872314453, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -78.4246597290039, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.00366289378143847, "rewards_train/margins": 0.22942834137938917, "rewards_train/rejected": -0.23309123516082764, "step": 203 }, { "epoch": 0.27, "learning_rate": 4.995755582562513e-07, "loss": 0.621, "step": 204 }, { "epoch": 0.27, "logps_train/chosen": -68.05762481689453, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -74.42012023925781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.11611265689134598, "rewards_train/margins": 0.24953097850084305, "rewards_train/rejected": -0.13341832160949707, "step": 204 }, { "epoch": 0.27, "logps_train/chosen": -50.00814437866211, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -39.75, "logps_train/rejected": -40.646636962890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28199827671051025, "rewards_train/margins": 0.37869328260421753, "rewards_train/rejected": -0.09669500589370728, "step": 205 }, { "epoch": 0.27, "learning_rate": 4.995429304631284e-07, "loss": 0.5676, "step": 206 }, { "epoch": 0.27, "logps_train/chosen": -45.341514587402344, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -50.673377990722656, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.09397386014461517, "rewards_train/margins": 0.4449055641889572, "rewards_train/rejected": -0.35093170404434204, "step": 206 }, { "epoch": 0.27, "logps_train/chosen": -72.52935791015625, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -77.76852416992188, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.01815762370824814, "rewards_train/margins": 0.38563456386327744, "rewards_train/rejected": -0.3674769401550293, "step": 207 }, { "epoch": 0.28, "learning_rate": 4.99509095733699e-07, "loss": 0.5756, "step": 208 }, { "epoch": 0.28, "logps_train/chosen": -39.61733627319336, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -38.75, "logps_train/rejected": -40.18269348144531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.03592272102832794, "rewards_train/margins": 0.1682545244693756, "rewards_train/rejected": -0.13233180344104767, "step": 208 }, { "epoch": 0.28, "logps_train/chosen": -41.508949279785156, "logps_train/ref_chosen": -42.5, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -50.5323486328125, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.11051559448242188, "rewards_train/margins": 0.19812536239624023, "rewards_train/rejected": -0.08760976791381836, "step": 209 }, { "epoch": 0.28, "learning_rate": 4.994740542316077e-07, "loss": 0.6436, "step": 210 }, { "epoch": 0.28, "logps_train/chosen": -67.76651763916016, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -93.08468627929688, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.036026522517204285, "rewards_train/margins": 0.17088010907173157, "rewards_train/rejected": -0.20690663158893585, "step": 210 }, { "epoch": 0.28, "logps_train/chosen": -89.65243530273438, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -86.19413757324219, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.33319321274757385, "rewards_train/margins": 0.45416951179504395, "rewards_train/rejected": -0.12097629904747009, "step": 211 }, { "epoch": 0.28, "learning_rate": 4.994378061263359e-07, "loss": 0.6295, "step": 212 }, { "epoch": 0.28, "logps_train/chosen": -70.17023468017578, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -80.16148376464844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4048513174057007, "rewards_train/margins": 0.744436502456665, "rewards_train/rejected": -0.33958518505096436, "step": 212 }, { "epoch": 0.28, "logps_train/chosen": -58.20573806762695, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -63.61360549926758, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.10377678275108337, "rewards_train/margins": 0.41852131485939026, "rewards_train/rejected": -0.5222980976104736, "step": 213 }, { "epoch": 0.28, "learning_rate": 4.994003515932005e-07, "loss": 0.537, "step": 214 }, { "epoch": 0.28, "logps_train/chosen": -73.37883758544922, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -80.34822082519531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.13867858052253723, "rewards_train/margins": 0.3086574822664261, "rewards_train/rejected": -0.16997890174388885, "step": 214 }, { "epoch": 0.29, "logps_train/chosen": -82.57935333251953, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -89.07049560546875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.2420646846294403, "rewards_train/margins": 0.5491144061088562, "rewards_train/rejected": -0.3070497214794159, "step": 215 }, { "epoch": 0.29, "learning_rate": 4.993616908133538e-07, "loss": 0.5552, "step": 216 }, { "epoch": 0.29, "logps_train/chosen": -62.509071350097656, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -46.25, "logps_train/rejected": -47.232276916503906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.1608111560344696, "rewards_train/margins": 0.2496638000011444, "rewards_train/rejected": -0.0888526439666748, "step": 216 }, { "epoch": 0.29, "logps_train/chosen": -46.436553955078125, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -73.767578125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.37353235483169556, "rewards_train/margins": 0.3127901256084442, "rewards_train/rejected": 0.06074222922325134, "step": 217 }, { "epoch": 0.29, "learning_rate": 4.993218239737822e-07, "loss": 0.6078, "step": 218 }, { "epoch": 0.29, "logps_train/chosen": -76.15928649902344, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -64.4079360961914, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.03311648219823837, "rewards_train/margins": 0.32017726451158524, "rewards_train/rejected": -0.3532937467098236, "step": 218 }, { "epoch": 0.29, "logps_train/chosen": -43.111488342285156, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -35.25, "logps_train/rejected": -37.24299240112305, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.026741839945316315, "rewards_train/margins": 0.2276037111878395, "rewards_train/rejected": -0.2008618712425232, "step": 219 }, { "epoch": 0.29, "learning_rate": 4.992807512673049e-07, "loss": 0.5998, "step": 220 }, { "epoch": 0.29, "logps_train/chosen": -72.62298583984375, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -58.47601318359375, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.15711729228496552, "rewards_train/margins": 0.21350783109664917, "rewards_train/rejected": -0.056390538811683655, "step": 220 }, { "epoch": 0.29, "logps_train/chosen": -45.64373779296875, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -37.25, "logps_train/rejected": -38.30986785888672, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.07413950562477112, "rewards_train/margins": 0.038097307085990906, "rewards_train/rejected": -0.11223681271076202, "step": 221 }, { "epoch": 0.29, "learning_rate": 4.992384728925738e-07, "loss": 0.6771, "step": 222 }, { "epoch": 0.29, "logps_train/chosen": -81.40364074707031, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -101.76524353027344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.009114399552345276, "rewards_train/margins": 0.7064722627401352, "rewards_train/rejected": -0.7155866622924805, "step": 222 }, { "epoch": 0.3, "logps_train/chosen": -40.5022087097168, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -62.6319580078125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.05690810829401016, "rewards_train/margins": 0.8615098968148232, "rewards_train/rejected": -0.804601788520813, "step": 223 }, { "epoch": 0.3, "learning_rate": 4.99194989054072e-07, "loss": 0.449, "step": 224 }, { "epoch": 0.3, "logps_train/chosen": -56.36979293823242, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -71.65719604492188, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.37395814061164856, "rewards_train/margins": 0.5599899291992188, "rewards_train/rejected": -0.1860317885875702, "step": 224 }, { "epoch": 0.3, "logps_train/chosen": -79.16781616210938, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -100.56978607177734, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.2839689254760742, "rewards_train/margins": 0.3089473247528076, "rewards_train/rejected": -0.5929162502288818, "step": 225 }, { "epoch": 0.3, "learning_rate": 4.991502999621128e-07, "loss": 0.6277, "step": 226 }, { "epoch": 0.3, "logps_train/chosen": -39.173728942871094, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -44.75, "logps_train/rejected": -48.77214050292969, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.15918953716754913, "rewards_train/margins": 0.5731227844953537, "rewards_train/rejected": -0.41393324732780457, "step": 226 }, { "epoch": 0.3, "logps_train/chosen": -86.06292724609375, "logps_train/ref_chosen": -90.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -70.32135009765625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.45151931047439575, "rewards_train/margins": 0.5352165624499321, "rewards_train/rejected": -0.08369725197553635, "step": 227 }, { "epoch": 0.3, "learning_rate": 4.99104405832839e-07, "loss": 0.5527, "step": 228 }, { "epoch": 0.3, "logps_train/chosen": -48.39280700683594, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -72.61219024658203, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.03923491761088371, "rewards_train/margins": 0.37076691910624504, "rewards_train/rejected": -0.33153200149536133, "step": 228 }, { "epoch": 0.3, "logps_train/chosen": -81.25634765625, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -80.35000610351562, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.10219761729240417, "rewards_train/margins": 0.15233466029167175, "rewards_train/rejected": -0.2545322775840759, "step": 229 }, { "epoch": 0.31, "learning_rate": 4.990573068882216e-07, "loss": 0.6863, "step": 230 }, { "epoch": 0.31, "logps_train/chosen": -53.20347213745117, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -60.157066345214844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.22669494152069092, "rewards_train/margins": -0.049269452691078186, "rewards_train/rejected": -0.17742548882961273, "step": 230 }, { "epoch": 0.31, "logps_train/chosen": -68.158447265625, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -70.55867004394531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2888917326927185, "rewards_train/margins": 0.277912974357605, "rewards_train/rejected": -0.5668047070503235, "step": 231 }, { "epoch": 0.31, "learning_rate": 4.990090033560585e-07, "loss": 0.6917, "step": 232 }, { "epoch": 0.31, "logps_train/chosen": -61.90306854248047, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -68.86416625976562, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.21413478255271912, "rewards_train/margins": 0.2761881649494171, "rewards_train/rejected": -0.49032294750213623, "step": 232 }, { "epoch": 0.31, "logps_train/chosen": -76.01016235351562, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -77.12147521972656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.30835872888565063, "rewards_train/margins": 0.8986305594444275, "rewards_train/rejected": -0.5902718305587769, "step": 233 }, { "epoch": 0.31, "learning_rate": 4.989594954699745e-07, "loss": 0.5612, "step": 234 }, { "epoch": 0.31, "logps_train/chosen": -110.30442810058594, "logps_train/ref_chosen": -111.0, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -100.64962005615234, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.07580772787332535, "rewards_train/margins": 0.5724109187722206, "rewards_train/rejected": -0.49660319089889526, "step": 234 }, { "epoch": 0.31, "logps_train/chosen": -54.16011428833008, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -34.75, "logps_train/rejected": -35.82278060913086, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.15663626790046692, "rewards_train/margins": -0.0583425834774971, "rewards_train/rejected": -0.09829368442296982, "step": 235 }, { "epoch": 0.31, "learning_rate": 4.989087834694185e-07, "loss": 0.6368, "step": 236 }, { "epoch": 0.31, "logps_train/chosen": -70.70149993896484, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -89.41949462890625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.01468108594417572, "rewards_train/margins": 0.6382056027650833, "rewards_train/rejected": -0.652886688709259, "step": 236 }, { "epoch": 0.31, "logps_train/chosen": -53.71183776855469, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -71.12318420410156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.19852736592292786, "rewards_train/margins": 0.6309789717197418, "rewards_train/rejected": -0.8295063376426697, "step": 237 }, { "epoch": 0.32, "learning_rate": 4.988568675996635e-07, "loss": 0.5057, "step": 238 }, { "epoch": 0.32, "logps_train/chosen": -66.60150146484375, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -58.6766242980957, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.051555898040533066, "rewards_train/margins": 0.053606484085321426, "rewards_train/rejected": -0.10516238212585449, "step": 238 }, { "epoch": 0.32, "logps_train/chosen": -42.52677536010742, "logps_train/ref_chosen": -43.25, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -57.406883239746094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.08267395198345184, "rewards_train/margins": 0.4117414802312851, "rewards_train/rejected": -0.32906752824783325, "step": 239 }, { "epoch": 0.32, "learning_rate": 4.988037481118053e-07, "loss": 0.6346, "step": 240 }, { "epoch": 0.32, "logps_train/chosen": -46.27587890625, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -54.25221252441406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.11057731509208679, "rewards_train/margins": 0.35708776116371155, "rewards_train/rejected": -0.24651044607162476, "step": 240 }, { "epoch": 0.32, "logps_train/chosen": -52.66162109375, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -57.842891693115234, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.30415022373199463, "rewards_train/margins": 0.932189404964447, "rewards_train/rejected": -0.6280391812324524, "step": 241 }, { "epoch": 0.32, "learning_rate": 4.987494252627611e-07, "loss": 0.5422, "step": 242 }, { "epoch": 0.32, "logps_train/chosen": -72.02131652832031, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -69.80036163330078, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.39744386076927185, "rewards_train/margins": -0.30568890273571014, "rewards_train/rejected": -0.0917549580335617, "step": 242 }, { "epoch": 0.32, "logps_train/chosen": -56.40485763549805, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -61.2404670715332, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.07290759682655334, "rewards_train/margins": 0.22550436854362488, "rewards_train/rejected": -0.2984119653701782, "step": 243 }, { "epoch": 0.32, "learning_rate": 4.986938993152679e-07, "loss": 0.7547, "step": 244 }, { "epoch": 0.32, "logps_train/chosen": -57.6363525390625, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -61.31168746948242, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.06597905606031418, "rewards_train/margins": 0.12378331273794174, "rewards_train/rejected": -0.18976236879825592, "step": 244 }, { "epoch": 0.33, "logps_train/chosen": -56.0382080078125, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -64.04031372070312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.31024178862571716, "rewards_train/margins": 0.8728669583797455, "rewards_train/rejected": -0.5626251697540283, "step": 245 }, { "epoch": 0.33, "learning_rate": 4.986371705378818e-07, "loss": 0.5424, "step": 246 }, { "epoch": 0.33, "logps_train/chosen": -64.49261474609375, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -68.74340057373047, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.08363626897335052, "rewards_train/margins": 0.5039849430322647, "rewards_train/rejected": -0.5876212120056152, "step": 246 }, { "epoch": 0.33, "logps_train/chosen": -70.82987976074219, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -86.32960510253906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.14044982194900513, "rewards_train/margins": 0.8562231659889221, "rewards_train/rejected": -0.715773344039917, "step": 247 }, { "epoch": 0.33, "learning_rate": 4.985792392049767e-07, "loss": 0.4757, "step": 248 }, { "epoch": 0.33, "logps_train/chosen": -62.25496292114258, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -61.74409484863281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.1362227350473404, "rewards_train/margins": 0.9793819934129715, "rewards_train/rejected": -0.8431592583656311, "step": 248 }, { "epoch": 0.33, "logps_train/chosen": -50.92721176147461, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -95.39331817626953, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.15102873742580414, "rewards_train/margins": 0.6309859603643417, "rewards_train/rejected": -0.4799572229385376, "step": 249 }, { "epoch": 0.33, "learning_rate": 4.985201055967425e-07, "loss": 0.5244, "step": 250 }, { "epoch": 0.33, "logps_train/chosen": -89.32518005371094, "logps_train/ref_chosen": -92.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -103.10430908203125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.27295079827308655, "rewards_train/margins": 0.9575996100902557, "rewards_train/rejected": -0.6846488118171692, "step": 250 }, { "epoch": 0.33, "logps_train/chosen": -68.58715057373047, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -62.4371337890625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.2209724485874176, "rewards_train/margins": 0.6537483632564545, "rewards_train/rejected": -0.43277591466903687, "step": 251 }, { "epoch": 0.33, "learning_rate": 4.98459769999184e-07, "loss": 0.5427, "step": 252 }, { "epoch": 0.33, "logps_train/chosen": -86.50833129882812, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -92.8584976196289, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.09926999360322952, "rewards_train/margins": 0.26158010214567184, "rewards_train/rejected": -0.36085009574890137, "step": 252 }, { "epoch": 0.34, "logps_train/chosen": -81.6760482788086, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -75.13443756103516, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.08869902789592743, "rewards_train/margins": -0.03736482560634613, "rewards_train/rejected": -0.0513342022895813, "step": 253 }, { "epoch": 0.34, "learning_rate": 4.983982327041198e-07, "loss": 0.6613, "step": 254 }, { "epoch": 0.34, "logps_train/chosen": -56.557308197021484, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -91.56390380859375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.3580141067504883, "rewards_train/margins": 1.1722168922424316, "rewards_train/rejected": -0.8142027854919434, "step": 254 }, { "epoch": 0.34, "logps_train/chosen": -62.262168884277344, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -65.96533203125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.10434179753065109, "rewards_train/margins": 0.20469141751527786, "rewards_train/rejected": -0.30903321504592896, "step": 255 }, { "epoch": 0.34, "learning_rate": 4.983354940091803e-07, "loss": 0.5755, "step": 256 }, { "epoch": 0.34, "logps_train/chosen": -49.22135543823242, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -73.50614929199219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.08072912693023682, "rewards_train/margins": 1.0987917184829712, "rewards_train/rejected": -1.179520845413208, "step": 256 }, { "epoch": 0.34, "logps_train/chosen": -68.78033447265625, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -84.46699523925781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.14306002855300903, "rewards_train/margins": 1.2370248436927795, "rewards_train/rejected": -1.0939648151397705, "step": 257 }, { "epoch": 0.34, "learning_rate": 4.982715542178068e-07, "loss": 0.4, "step": 258 }, { "epoch": 0.34, "logps_train/chosen": -65.43258666992188, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -61.63449478149414, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.36982113122940063, "rewards_train/margins": 0.0022222399711608887, "rewards_train/rejected": -0.3720433712005615, "step": 258 }, { "epoch": 0.34, "logps_train/chosen": -37.54210662841797, "logps_train/ref_chosen": -34.75, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -42.75946807861328, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2721795439720154, "rewards_train/margins": 0.24439257383346558, "rewards_train/rejected": -0.516572117805481, "step": 259 }, { "epoch": 0.35, "learning_rate": 4.982064136392495e-07, "loss": 0.6831, "step": 260 }, { "epoch": 0.35, "logps_train/chosen": -73.95309448242188, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -64.70706176757812, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.27343499660491943, "rewards_train/margins": -0.008197635412216187, "rewards_train/rejected": -0.26523736119270325, "step": 260 }, { "epoch": 0.35, "logps_train/chosen": -78.92915344238281, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -72.20160675048828, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.23900993168354034, "rewards_train/margins": 0.2604477256536484, "rewards_train/rejected": -0.4994576573371887, "step": 261 }, { "epoch": 0.35, "learning_rate": 4.981400725885669e-07, "loss": 0.728, "step": 262 }, { "epoch": 0.35, "logps_train/chosen": -40.57674789428711, "logps_train/ref_chosen": -43.25, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -72.76495361328125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.260977566242218, "rewards_train/margins": 1.0370829701423645, "rewards_train/rejected": -0.7761054039001465, "step": 262 }, { "epoch": 0.35, "logps_train/chosen": -59.44086456298828, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -56.47510528564453, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.10580535978078842, "rewards_train/margins": 0.4057677760720253, "rewards_train/rejected": -0.5115731358528137, "step": 263 }, { "epoch": 0.35, "learning_rate": 4.98072531386623e-07, "loss": 0.5195, "step": 264 }, { "epoch": 0.35, "logps_train/chosen": -68.55766296386719, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -65.55680847167969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.29735875129699707, "rewards_train/margins": 0.7186649739742279, "rewards_train/rejected": -0.42130622267723083, "step": 264 }, { "epoch": 0.35, "logps_train/chosen": -59.90800476074219, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -75.57239532470703, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.20314466953277588, "rewards_train/margins": 1.250227689743042, "rewards_train/rejected": -1.0470830202102661, "step": 265 }, { "epoch": 0.35, "learning_rate": 4.98003790360087e-07, "loss": 0.4394, "step": 266 }, { "epoch": 0.35, "logps_train/chosen": -94.09783935546875, "logps_train/ref_chosen": -92.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -76.35623168945312, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.17072133719921112, "rewards_train/margins": 0.32583968341350555, "rewards_train/rejected": -0.4965610206127167, "step": 266 }, { "epoch": 0.35, "logps_train/chosen": -50.88528060913086, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -42.0, "logps_train/rejected": -44.34239196777344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.17094998061656952, "rewards_train/margins": 0.05645342171192169, "rewards_train/rejected": -0.2274034023284912, "step": 267 }, { "epoch": 0.36, "learning_rate": 4.979338498414306e-07, "loss": 0.6395, "step": 268 }, { "epoch": 0.36, "logps_train/chosen": -45.69984436035156, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -48.23707580566406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.11126578599214554, "rewards_train/margins": 0.2724729999899864, "rewards_train/rejected": -0.16120721399784088, "step": 268 }, { "epoch": 0.36, "logps_train/chosen": -71.47410583496094, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -74.85859680175781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.07133936882019043, "rewards_train/margins": 0.7587611675262451, "rewards_train/rejected": -0.6874217987060547, "step": 269 }, { "epoch": 0.36, "learning_rate": 4.978627101689276e-07, "loss": 0.562, "step": 270 }, { "epoch": 0.36, "logps_train/chosen": -61.465003967285156, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -81.43618774414062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.01014012098312378, "rewards_train/margins": 0.9752429127693176, "rewards_train/rejected": -0.9651027917861938, "step": 270 }, { "epoch": 0.36, "logps_train/chosen": -93.34466552734375, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -104.21041870117188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.20634135603904724, "rewards_train/margins": 0.169388085603714, "rewards_train/rejected": -0.37572944164276123, "step": 271 }, { "epoch": 0.36, "learning_rate": 4.977903716866511e-07, "loss": 0.7196, "step": 272 }, { "epoch": 0.36, "logps_train/chosen": -45.60688018798828, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -54.93670654296875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.07514083385467529, "rewards_train/margins": 0.27477964758872986, "rewards_train/rejected": -0.34992048144340515, "step": 272 }, { "epoch": 0.36, "logps_train/chosen": -79.77778625488281, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -85.91455078125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.0809035375714302, "rewards_train/margins": 0.8574260398745537, "rewards_train/rejected": -0.9383295774459839, "step": 273 }, { "epoch": 0.36, "learning_rate": 4.977168347444725e-07, "loss": 0.5162, "step": 274 }, { "epoch": 0.36, "logps_train/chosen": -47.941383361816406, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -57.00628662109375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.17245863378047943, "rewards_train/margins": 0.5937950760126114, "rewards_train/rejected": -0.7662537097930908, "step": 274 }, { "epoch": 0.37, "logps_train/chosen": -75.7536392211914, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -76.83694458007812, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.4878634810447693, "rewards_train/margins": -0.008856594562530518, "rewards_train/rejected": -0.47900688648223877, "step": 275 }, { "epoch": 0.37, "learning_rate": 4.976420996980598e-07, "loss": 0.6649, "step": 276 }, { "epoch": 0.37, "logps_train/chosen": -42.84344482421875, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -38.799224853515625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.25628066062927246, "rewards_train/margins": 0.5717495977878571, "rewards_train/rejected": -0.3154689371585846, "step": 276 }, { "epoch": 0.37, "logps_train/chosen": -54.45808792114258, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -76.51191711425781, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.12705844640731812, "rewards_train/margins": 0.3819460868835449, "rewards_train/rejected": -0.509004533290863, "step": 277 }, { "epoch": 0.37, "learning_rate": 4.975661669088754e-07, "loss": 0.5544, "step": 278 }, { "epoch": 0.37, "logps_train/chosen": -60.03647232055664, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -72.93626403808594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.08489708602428436, "rewards_train/margins": 0.6290422230958939, "rewards_train/rejected": -0.7139393091201782, "step": 278 }, { "epoch": 0.37, "logps_train/chosen": -56.25327682495117, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -63.97766876220703, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.03639087826013565, "rewards_train/margins": 0.4865014925599098, "rewards_train/rejected": -0.45011061429977417, "step": 279 }, { "epoch": 0.37, "learning_rate": 4.974890367441752e-07, "loss": 0.5193, "step": 280 }, { "epoch": 0.37, "logps_train/chosen": -72.85212707519531, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -77.4080810546875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.14801473915576935, "rewards_train/margins": 0.6396040171384811, "rewards_train/rejected": -0.4915892779827118, "step": 280 }, { "epoch": 0.37, "logps_train/chosen": -62.62663269042969, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -71.96431732177734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.0982101559638977, "rewards_train/margins": 0.49509674310684204, "rewards_train/rejected": -0.5933068990707397, "step": 281 }, { "epoch": 0.37, "learning_rate": 4.974107095770059e-07, "loss": 0.5013, "step": 282 }, { "epoch": 0.37, "logps_train/chosen": -45.46825408935547, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -47.06733322143555, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.16098734736442566, "rewards_train/margins": 0.6063923537731171, "rewards_train/rejected": -0.4454050064086914, "step": 282 }, { "epoch": 0.38, "logps_train/chosen": -60.39244842529297, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -59.32325744628906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.009557027369737625, "rewards_train/margins": 0.445034246891737, "rewards_train/rejected": -0.4545912742614746, "step": 283 }, { "epoch": 0.38, "learning_rate": 4.973311857862036e-07, "loss": 0.5232, "step": 284 }, { "epoch": 0.38, "logps_train/chosen": -70.64202880859375, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -66.93512725830078, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.18032830953598022, "rewards_train/margins": 0.8500127792358398, "rewards_train/rejected": -0.6696844696998596, "step": 284 }, { "epoch": 0.38, "logps_train/chosen": -67.68699645996094, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -86.08522033691406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.04016879200935364, "rewards_train/margins": 0.299603670835495, "rewards_train/rejected": -0.33977246284484863, "step": 285 }, { "epoch": 0.38, "learning_rate": 4.972504657563922e-07, "loss": 0.6041, "step": 286 }, { "epoch": 0.38, "logps_train/chosen": -63.0327033996582, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -91.54096221923828, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.05795776844024658, "rewards_train/margins": 1.0820763111114502, "rewards_train/rejected": -1.1400340795516968, "step": 286 }, { "epoch": 0.38, "logps_train/chosen": -56.81370162963867, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -87.04418182373047, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.32331734895706177, "rewards_train/margins": 1.3011727929115295, "rewards_train/rejected": -0.9778554439544678, "step": 287 }, { "epoch": 0.38, "learning_rate": 4.971685498779813e-07, "loss": 0.3884, "step": 288 }, { "epoch": 0.38, "logps_train/chosen": -24.749740600585938, "logps_train/ref_chosen": -27.5, "logps_train/ref_rejected": -35.5, "logps_train/rejected": -40.481956481933594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.2730727791786194, "rewards_train/margins": 0.765408992767334, "rewards_train/rejected": -0.4923362135887146, "step": 288 }, { "epoch": 0.38, "logps_train/chosen": -62.213356018066406, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -54.45060729980469, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.2290196269750595, "rewards_train/margins": 0.08283768594264984, "rewards_train/rejected": -0.31185731291770935, "step": 289 }, { "epoch": 0.39, "learning_rate": 4.970854385471642e-07, "loss": 0.6327, "step": 290 }, { "epoch": 0.39, "logps_train/chosen": -63.23533630371094, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -65.18846893310547, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.09492029249668121, "rewards_train/margins": 0.31572385132312775, "rewards_train/rejected": -0.41064414381980896, "step": 290 }, { "epoch": 0.39, "logps_train/chosen": -60.477413177490234, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -72.69172668457031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.028821326792240143, "rewards_train/margins": 0.66518185287714, "rewards_train/rejected": -0.6363605260848999, "step": 291 }, { "epoch": 0.39, "learning_rate": 4.97001132165916e-07, "loss": 0.5527, "step": 292 }, { "epoch": 0.39, "logps_train/chosen": -80.474853515625, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -63.5, "logps_train/rejected": -68.58543395996094, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.5334223508834839, "rewards_train/margins": -0.02644103765487671, "rewards_train/rejected": -0.5069813132286072, "step": 292 }, { "epoch": 0.39, "logps_train/chosen": -73.7633056640625, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -81.98731994628906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.19039292633533478, "rewards_train/margins": 0.6599016040563583, "rewards_train/rejected": -0.8502945303916931, "step": 293 }, { "epoch": 0.39, "learning_rate": 4.969156311419921e-07, "loss": 0.6576, "step": 294 }, { "epoch": 0.39, "logps_train/chosen": -63.43901062011719, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -63.561363220214844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.2873491942882538, "rewards_train/margins": 0.6466104388237, "rewards_train/rejected": -0.35926124453544617, "step": 294 }, { "epoch": 0.39, "logps_train/chosen": -62.90460205078125, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -61.785762786865234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.15608543157577515, "rewards_train/margins": 0.555303156375885, "rewards_train/rejected": -0.7113885879516602, "step": 295 }, { "epoch": 0.39, "learning_rate": 4.968289358889256e-07, "loss": 0.5243, "step": 296 }, { "epoch": 0.39, "logps_train/chosen": -47.86812973022461, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -52.75, "logps_train/rejected": -57.66960906982422, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.11415675282478333, "rewards_train/margins": 0.3715541958808899, "rewards_train/rejected": -0.4857109487056732, "step": 296 }, { "epoch": 0.39, "logps_train/chosen": -70.55889129638672, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -93.11337280273438, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.20198270678520203, "rewards_train/margins": 0.23122942447662354, "rewards_train/rejected": -0.43321213126182556, "step": 297 }, { "epoch": 0.4, "learning_rate": 4.967410468260258e-07, "loss": 0.6547, "step": 298 }, { "epoch": 0.4, "logps_train/chosen": -68.3116455078125, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -74.50334930419922, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.03568052500486374, "rewards_train/margins": 0.6055467054247856, "rewards_train/rejected": -0.5698661804199219, "step": 298 }, { "epoch": 0.4, "logps_train/chosen": -70.03067016601562, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -75.57959747314453, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.2819732427597046, "rewards_train/margins": 0.19043996930122375, "rewards_train/rejected": -0.47241321206092834, "step": 299 }, { "epoch": 0.4, "learning_rate": 4.966519643783757e-07, "loss": 0.6075, "step": 300 }, { "epoch": 0.4, "logps_train/chosen": -63.84186553955078, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -99.1169204711914, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5275319218635559, "rewards_train/margins": 1.7415676712989807, "rewards_train/rejected": -1.2140357494354248, "step": 300 }, { "epoch": 0.4, "logps_train/chosen": -47.97945785522461, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -57.391029357910156, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.0356481559574604, "rewards_train/margins": 0.41518085077404976, "rewards_train/rejected": -0.37953269481658936, "step": 301 }, { "epoch": 0.4, "learning_rate": 4.965616889768307e-07, "loss": 0.4504, "step": 302 }, { "epoch": 0.4, "logps_train/chosen": -77.11512756347656, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -97.99443054199219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.22598733007907867, "rewards_train/margins": 1.2152746468782425, "rewards_train/rejected": -0.9892873167991638, "step": 302 }, { "epoch": 0.4, "logps_train/chosen": -75.13955688476562, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -68.55375671386719, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.20692503452301025, "rewards_train/margins": 0.06837305426597595, "rewards_train/rejected": -0.2752980887889862, "step": 303 }, { "epoch": 0.4, "learning_rate": 4.964702210580154e-07, "loss": 0.5417, "step": 304 }, { "epoch": 0.4, "logps_train/chosen": -77.9247817993164, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -62.123497009277344, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.11845962703227997, "rewards_train/margins": 0.4901845306158066, "rewards_train/rejected": -0.3717249035835266, "step": 304 }, { "epoch": 0.41, "logps_train/chosen": -80.21438598632812, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -69.56214141845703, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.2601107656955719, "rewards_train/margins": -0.014052674174308777, "rewards_train/rejected": -0.24605809152126312, "step": 305 }, { "epoch": 0.41, "learning_rate": 4.963775610643226e-07, "loss": 0.6385, "step": 306 }, { "epoch": 0.41, "logps_train/chosen": -68.18052673339844, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -59.258644104003906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.06085354834794998, "rewards_train/margins": 0.29843688756227493, "rewards_train/rejected": -0.23758333921432495, "step": 306 }, { "epoch": 0.41, "logps_train/chosen": -89.0968017578125, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -99.31160736083984, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.6780396699905396, "rewards_train/margins": 0.31874561309814453, "rewards_train/rejected": -0.9967852830886841, "step": 307 }, { "epoch": 0.41, "learning_rate": 4.962837094439104e-07, "loss": 0.6843, "step": 308 }, { "epoch": 0.41, "logps_train/chosen": -61.58351135253906, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -74.03178405761719, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.047636453062295914, "rewards_train/margins": 0.603549424558878, "rewards_train/rejected": -0.555912971496582, "step": 308 }, { "epoch": 0.41, "logps_train/chosen": -77.00076293945312, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -69.06582641601562, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.08445076644420624, "rewards_train/margins": 0.7791628986597061, "rewards_train/rejected": -0.8636136651039124, "step": 309 }, { "epoch": 0.41, "learning_rate": 4.961886666507005e-07, "loss": 0.5575, "step": 310 }, { "epoch": 0.41, "logps_train/chosen": -47.377220153808594, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -47.27829360961914, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.047824859619140625, "rewards_train/margins": 0.379560649394989, "rewards_train/rejected": -0.3317357897758484, "step": 310 }, { "epoch": 0.41, "logps_train/chosen": -79.80460357666016, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -89.61198425292969, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.10467958450317383, "rewards_train/margins": 0.1393318474292755, "rewards_train/rejected": -0.24401143193244934, "step": 311 }, { "epoch": 0.41, "learning_rate": 4.960924331443757e-07, "loss": 0.6267, "step": 312 }, { "epoch": 0.41, "logps_train/chosen": -56.72797775268555, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -77.94141387939453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08423339575529099, "rewards_train/margins": 0.7916556522250175, "rewards_train/rejected": -0.7074222564697266, "step": 312 }, { "epoch": 0.42, "logps_train/chosen": -85.17864990234375, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -111.50273132324219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.3915092945098877, "rewards_train/margins": 1.9886574745178223, "rewards_train/rejected": -1.5971481800079346, "step": 313 }, { "epoch": 0.42, "learning_rate": 4.959950093903778e-07, "loss": 0.3627, "step": 314 }, { "epoch": 0.42, "logps_train/chosen": -61.4759407043457, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -64.37782287597656, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.4038441777229309, "rewards_train/margins": 0.06831297278404236, "rewards_train/rejected": -0.47215715050697327, "step": 314 }, { "epoch": 0.42, "logps_train/chosen": -46.53078079223633, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -48.23991394042969, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.27348440885543823, "rewards_train/margins": 0.7396632432937622, "rewards_train/rejected": -0.466178834438324, "step": 315 }, { "epoch": 0.42, "learning_rate": 4.958963958599051e-07, "loss": 0.6122, "step": 316 }, { "epoch": 0.42, "logps_train/chosen": -85.80899047851562, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -72.19682312011719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.22535136342048645, "rewards_train/margins": 0.8200333416461945, "rewards_train/rejected": -0.594681978225708, "step": 316 }, { "epoch": 0.42, "logps_train/chosen": -66.50135040283203, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -60.84355926513672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.05638488382101059, "rewards_train/margins": 0.779533363878727, "rewards_train/rejected": -0.8359182476997375, "step": 317 }, { "epoch": 0.42, "learning_rate": 4.957965930299111e-07, "loss": 0.4619, "step": 318 }, { "epoch": 0.42, "logps_train/chosen": -42.19713592529297, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -49.122840881347656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.06463585048913956, "rewards_train/margins": 0.2484295293688774, "rewards_train/rejected": -0.31306537985801697, "step": 318 }, { "epoch": 0.42, "logps_train/chosen": -34.947113037109375, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -63.90047073364258, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.43302297592163086, "rewards_train/margins": 1.2254139184951782, "rewards_train/rejected": -0.7923909425735474, "step": 319 }, { "epoch": 0.42, "learning_rate": 4.956956013831006e-07, "loss": 0.5162, "step": 320 }, { "epoch": 0.42, "logps_train/chosen": -42.32313537597656, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -62.905433654785156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.20831172168254852, "rewards_train/margins": 0.8730739802122116, "rewards_train/rejected": -0.6647622585296631, "step": 320 }, { "epoch": 0.43, "logps_train/chosen": -77.35755920410156, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -86.58023071289062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.11934994906187057, "rewards_train/margins": 0.780079610645771, "rewards_train/rejected": -0.8994295597076416, "step": 321 }, { "epoch": 0.43, "learning_rate": 4.955934214079287e-07, "loss": 0.4413, "step": 322 }, { "epoch": 0.43, "logps_train/chosen": -37.126502990722656, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -36.75, "logps_train/rejected": -36.14399719238281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.13109973073005676, "rewards_train/margins": 0.08143697679042816, "rewards_train/rejected": 0.0496627539396286, "step": 322 }, { "epoch": 0.43, "logps_train/chosen": -68.67786407470703, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -88.44883728027344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.1494007557630539, "rewards_train/margins": 1.0177215188741684, "rewards_train/rejected": -0.8683207631111145, "step": 323 }, { "epoch": 0.43, "learning_rate": 4.954900535985977e-07, "loss": 0.5993, "step": 324 }, { "epoch": 0.43, "logps_train/chosen": -68.79336547851562, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -68.96935272216797, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.015274770557880402, "rewards_train/margins": 0.8582229539752007, "rewards_train/rejected": -0.873497724533081, "step": 324 }, { "epoch": 0.43, "logps_train/chosen": -107.9514389038086, "logps_train/ref_chosen": -115.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -110.89006042480469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.6939187049865723, "rewards_train/margins": 1.861049771308899, "rewards_train/rejected": -1.1671310663223267, "step": 325 }, { "epoch": 0.43, "learning_rate": 4.953854984550552e-07, "loss": 0.3781, "step": 326 }, { "epoch": 0.43, "logps_train/chosen": -37.977882385253906, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -39.0, "logps_train/rejected": -45.1030158996582, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.17252424359321594, "rewards_train/margins": 0.7726697027683258, "rewards_train/rejected": -0.6001454591751099, "step": 326 }, { "epoch": 0.43, "logps_train/chosen": -53.901161193847656, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -60.99518585205078, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.23230373859405518, "rewards_train/margins": 0.1328398585319519, "rewards_train/rejected": -0.3651435971260071, "step": 327 }, { "epoch": 0.44, "learning_rate": 4.952797564829914e-07, "loss": 0.6096, "step": 328 }, { "epoch": 0.44, "logps_train/chosen": -78.5294418334961, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -76.37509155273438, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3529440760612488, "rewards_train/margins": 0.21737802028656006, "rewards_train/rejected": -0.5703220963478088, "step": 328 }, { "epoch": 0.44, "logps_train/chosen": -81.8572006225586, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -99.53673553466797, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.07556366920471191, "rewards_train/margins": 1.1124851703643799, "rewards_train/rejected": -1.1880488395690918, "step": 329 }, { "epoch": 0.44, "learning_rate": 4.951728281938364e-07, "loss": 0.5967, "step": 330 }, { "epoch": 0.44, "logps_train/chosen": -49.91069030761719, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -73.83796691894531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.2737748920917511, "rewards_train/margins": 1.2731963098049164, "rewards_train/rejected": -0.9994214177131653, "step": 330 }, { "epoch": 0.44, "logps_train/chosen": -27.490650177001953, "logps_train/ref_chosen": -28.375, "logps_train/ref_rejected": -32.0, "logps_train/rejected": -35.97441101074219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.09077858179807663, "rewards_train/margins": 0.47845368832349777, "rewards_train/rejected": -0.38767510652542114, "step": 331 }, { "epoch": 0.44, "learning_rate": 4.950647141047585e-07, "loss": 0.4559, "step": 332 }, { "epoch": 0.44, "logps_train/chosen": -70.85807800292969, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -70.3080062866211, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.0858074426651001, "rewards_train/margins": 0.37936830520629883, "rewards_train/rejected": -0.4651757478713989, "step": 332 }, { "epoch": 0.44, "logps_train/chosen": -75.27079772949219, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -83.43508911132812, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5645802021026611, "rewards_train/margins": 0.29142916202545166, "rewards_train/rejected": -0.8560093641281128, "step": 333 }, { "epoch": 0.44, "learning_rate": 4.94955414738661e-07, "loss": 0.6423, "step": 334 }, { "epoch": 0.44, "logps_train/chosen": -70.99369812011719, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -85.11231231689453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.18218286335468292, "rewards_train/margins": 1.3665484637022018, "rewards_train/rejected": -1.5487313270568848, "step": 334 }, { "epoch": 0.44, "logps_train/chosen": -43.42034149169922, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -40.67958450317383, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32984113693237305, "rewards_train/margins": 0.8313934803009033, "rewards_train/rejected": -0.5015523433685303, "step": 335 }, { "epoch": 0.45, "learning_rate": 4.948449306241797e-07, "loss": 0.3684, "step": 336 }, { "epoch": 0.45, "logps_train/chosen": -61.599510192871094, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -76.62318420410156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.43145498633384705, "rewards_train/margins": 0.9359607398509979, "rewards_train/rejected": -0.5045057535171509, "step": 336 }, { "epoch": 0.45, "logps_train/chosen": -56.99884033203125, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -39.0, "logps_train/rejected": -47.48579406738281, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.07800908386707306, "rewards_train/margins": 0.7662735432386398, "rewards_train/rejected": -0.8442826271057129, "step": 337 }, { "epoch": 0.45, "learning_rate": 4.947332622956807e-07, "loss": 0.6157, "step": 338 }, { "epoch": 0.45, "logps_train/chosen": -59.299381256103516, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -46.25, "logps_train/rejected": -52.84434509277344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.10650087147951126, "rewards_train/margins": 0.5584022924304008, "rewards_train/rejected": -0.6649031639099121, "step": 338 }, { "epoch": 0.45, "logps_train/chosen": -79.95851135253906, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -68.88861846923828, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5856958031654358, "rewards_train/margins": 0.3781662583351135, "rewards_train/rejected": -0.9638620615005493, "step": 339 }, { "epoch": 0.45, "learning_rate": 4.94620410293258e-07, "loss": 0.6119, "step": 340 }, { "epoch": 0.45, "logps_train/chosen": -105.9596176147461, "logps_train/ref_chosen": -109.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -104.4390869140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31966346502304077, "rewards_train/margins": 1.447946846485138, "rewards_train/rejected": -1.1282833814620972, "step": 340 }, { "epoch": 0.45, "logps_train/chosen": -65.1340560913086, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -70.03223419189453, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.30378198623657227, "rewards_train/margins": 0.8163807392120361, "rewards_train/rejected": -0.5125987529754639, "step": 341 }, { "epoch": 0.45, "learning_rate": 4.945063751627299e-07, "loss": 0.3778, "step": 342 }, { "epoch": 0.45, "logps_train/chosen": -81.37132263183594, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -80.34870910644531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.05739879608154297, "rewards_train/margins": 0.8703944683074951, "rewards_train/rejected": -0.8129956722259521, "step": 342 }, { "epoch": 0.46, "logps_train/chosen": -89.68537139892578, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -102.48727416992188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4041192829608917, "rewards_train/margins": 1.5481591522693634, "rewards_train/rejected": -1.1440398693084717, "step": 343 }, { "epoch": 0.46, "learning_rate": 4.943911574556375e-07, "loss": 0.449, "step": 344 }, { "epoch": 0.46, "logps_train/chosen": -91.92752075195312, "logps_train/ref_chosen": -95.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -81.88960266113281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.30412301421165466, "rewards_train/margins": 1.4133960902690887, "rewards_train/rejected": -1.109273076057434, "step": 344 }, { "epoch": 0.46, "logps_train/chosen": -48.4801025390625, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -51.764827728271484, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.014416344463825226, "rewards_train/margins": 0.4800353869795799, "rewards_train/rejected": -0.49445173144340515, "step": 345 }, { "epoch": 0.46, "learning_rate": 4.942747577292414e-07, "loss": 0.5624, "step": 346 }, { "epoch": 0.46, "logps_train/chosen": -58.10334777832031, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -64.0764389038086, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.06730224192142487, "rewards_train/margins": 0.6444771438837051, "rewards_train/rejected": -0.5771749019622803, "step": 346 }, { "epoch": 0.46, "logps_train/chosen": -48.601993560791016, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -74.01139068603516, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.48296457529067993, "rewards_train/margins": 1.589670479297638, "rewards_train/rejected": -1.106705904006958, "step": 347 }, { "epoch": 0.46, "learning_rate": 4.941571765465189e-07, "loss": 0.425, "step": 348 }, { "epoch": 0.46, "logps_train/chosen": -57.15316390991211, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -83.12677001953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4104648530483246, "rewards_train/margins": 1.683297485113144, "rewards_train/rejected": -1.2728326320648193, "step": 348 }, { "epoch": 0.46, "logps_train/chosen": -76.943115234375, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -92.99237060546875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.28850117325782776, "rewards_train/margins": 1.4932072460651398, "rewards_train/rejected": -1.204706072807312, "step": 349 }, { "epoch": 0.46, "learning_rate": 4.940384144761619e-07, "loss": 0.2798, "step": 350 }, { "epoch": 0.46, "logps_train/chosen": -44.18724060058594, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -61.78169631958008, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.0042709968984127045, "rewards_train/margins": 0.6879613734781742, "rewards_train/rejected": -0.6922323703765869, "step": 350 }, { "epoch": 0.47, "logps_train/chosen": -82.18058013916016, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -75.33881378173828, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.18290191888809204, "rewards_train/margins": 0.12285453081130981, "rewards_train/rejected": -0.30575644969940186, "step": 351 }, { "epoch": 0.47, "learning_rate": 4.939184720925734e-07, "loss": 0.6843, "step": 352 }, { "epoch": 0.47, "logps_train/chosen": -69.00310516357422, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -63.12940979003906, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.46046683192253113, "rewards_train/margins": -0.21979157626628876, "rewards_train/rejected": -0.24067525565624237, "step": 352 }, { "epoch": 0.47, "logps_train/chosen": -66.58317565917969, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -61.301902770996094, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.28956806659698486, "rewards_train/margins": 0.10624682903289795, "rewards_train/rejected": -0.3958148956298828, "step": 353 }, { "epoch": 0.47, "learning_rate": 4.937973499758656e-07, "loss": 0.7655, "step": 354 }, { "epoch": 0.47, "logps_train/chosen": -48.65770721435547, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -69.75018310546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1936039924621582, "rewards_train/margins": 1.5748724937438965, "rewards_train/rejected": -1.3812685012817383, "step": 354 }, { "epoch": 0.47, "logps_train/chosen": -56.52252960205078, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -74.19181060791016, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.08023122698068619, "rewards_train/margins": 0.788950227200985, "rewards_train/rejected": -0.8691814541816711, "step": 355 }, { "epoch": 0.47, "learning_rate": 4.93675048711856e-07, "loss": 0.3794, "step": 356 }, { "epoch": 0.47, "logps_train/chosen": -58.07610321044922, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -81.87864685058594, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.011139392852783203, "rewards_train/margins": 1.0685354471206665, "rewards_train/rejected": -1.0573960542678833, "step": 356 }, { "epoch": 0.47, "logps_train/chosen": -67.69930267333984, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -93.54232788085938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4394451677799225, "rewards_train/margins": 1.7280526459217072, "rewards_train/rejected": -1.2886074781417847, "step": 357 }, { "epoch": 0.48, "learning_rate": 4.935515688920653e-07, "loss": 0.412, "step": 358 }, { "epoch": 0.48, "logps_train/chosen": -68.78108215332031, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -83.67660522460938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.04420631378889084, "rewards_train/margins": 0.30014824122190475, "rewards_train/rejected": -0.2559419274330139, "step": 358 }, { "epoch": 0.48, "logps_train/chosen": -46.6857795715332, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -58.41501235961914, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.16540639102458954, "rewards_train/margins": 0.8850326985120773, "rewards_train/rejected": -0.7196263074874878, "step": 359 }, { "epoch": 0.48, "learning_rate": 4.934269111137148e-07, "loss": 0.5129, "step": 360 }, { "epoch": 0.48, "logps_train/chosen": -78.81183624267578, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -76.78553771972656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.5985039472579956, "rewards_train/margins": 1.4020576477050781, "rewards_train/rejected": -0.8035537004470825, "step": 360 }, { "epoch": 0.48, "logps_train/chosen": -113.30257415771484, "logps_train/ref_chosen": -116.5, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -102.77424621582031, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.3189617395401001, "rewards_train/margins": 0.519042506814003, "rewards_train/rejected": -0.2000807672739029, "step": 361 }, { "epoch": 0.48, "learning_rate": 4.933010759797227e-07, "loss": 0.4731, "step": 362 }, { "epoch": 0.48, "logps_train/chosen": -32.92317199707031, "logps_train/ref_chosen": -35.25, "logps_train/ref_rejected": -29.75, "logps_train/rejected": -34.44102478027344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.23668691515922546, "rewards_train/margins": 0.6995397508144379, "rewards_train/rejected": -0.4628528356552124, "step": 362 }, { "epoch": 0.48, "logps_train/chosen": -46.1734733581543, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -56.22780227661133, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.1868787705898285, "rewards_train/margins": 0.1780889928340912, "rewards_train/rejected": -0.3649677634239197, "step": 363 }, { "epoch": 0.48, "learning_rate": 4.931740640987015e-07, "loss": 0.5609, "step": 364 }, { "epoch": 0.48, "logps_train/chosen": -72.57083129882812, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -94.20811462402344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.044582709670066833, "rewards_train/margins": 0.6231044083833694, "rewards_train/rejected": -0.6676871180534363, "step": 364 }, { "epoch": 0.48, "logps_train/chosen": -51.9361572265625, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -83.75634765625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.33841538429260254, "rewards_train/margins": 1.1796748638153076, "rewards_train/rejected": -0.8412594795227051, "step": 365 }, { "epoch": 0.49, "learning_rate": 4.930458760849557e-07, "loss": 0.5417, "step": 366 }, { "epoch": 0.49, "logps_train/chosen": -63.89988708496094, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -54.842193603515625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.01782386004924774, "rewards_train/margins": 0.37235577404499054, "rewards_train/rejected": -0.3545319139957428, "step": 366 }, { "epoch": 0.49, "logps_train/chosen": -44.214874267578125, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -81.29801940917969, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.11367510259151459, "rewards_train/margins": 1.0380017906427383, "rewards_train/rejected": -1.151676893234253, "step": 367 }, { "epoch": 0.49, "learning_rate": 4.929165125584775e-07, "loss": 0.5475, "step": 368 }, { "epoch": 0.49, "logps_train/chosen": -51.261497497558594, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -67.07583618164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4597877860069275, "rewards_train/margins": 1.0126840472221375, "rewards_train/rejected": -0.55289626121521, "step": 368 }, { "epoch": 0.49, "logps_train/chosen": -29.50927734375, "logps_train/ref_chosen": -30.375, "logps_train/ref_rejected": -31.0, "logps_train/rejected": -36.309104919433594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.0846189558506012, "rewards_train/margins": 0.6108417809009552, "rewards_train/rejected": -0.526222825050354, "step": 369 }, { "epoch": 0.49, "learning_rate": 4.92785974144945e-07, "loss": 0.4209, "step": 370 }, { "epoch": 0.49, "logps_train/chosen": -89.02214050292969, "logps_train/ref_chosen": -92.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -112.5206069946289, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.28216132521629333, "rewards_train/margins": 1.4850035607814789, "rewards_train/rejected": -1.2028422355651855, "step": 370 }, { "epoch": 0.49, "logps_train/chosen": -62.36957550048828, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -67.99769592285156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19898012280464172, "rewards_train/margins": 0.6932806968688965, "rewards_train/rejected": -0.49430057406425476, "step": 371 }, { "epoch": 0.49, "learning_rate": 4.92654261475719e-07, "loss": 0.4435, "step": 372 }, { "epoch": 0.49, "logps_train/chosen": -48.92306137084961, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -65.15996551513672, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.02644379436969757, "rewards_train/margins": 0.7869715243577957, "rewards_train/rejected": -0.7605277299880981, "step": 372 }, { "epoch": 0.5, "logps_train/chosen": -72.53135681152344, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -80.98039245605469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5171771049499512, "rewards_train/margins": 1.6464661359786987, "rewards_train/rejected": -1.1292890310287476, "step": 373 }, { "epoch": 0.5, "learning_rate": 4.925213751878392e-07, "loss": 0.4706, "step": 374 }, { "epoch": 0.5, "logps_train/chosen": -74.22731018066406, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -73.24191284179688, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.30851906538009644, "rewards_train/margins": 1.0670854449272156, "rewards_train/rejected": -0.7585663795471191, "step": 374 }, { "epoch": 0.5, "logps_train/chosen": -61.54717254638672, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -72.13079833984375, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.15471746027469635, "rewards_train/margins": 0.031800076365470886, "rewards_train/rejected": -0.18651753664016724, "step": 375 }, { "epoch": 0.5, "learning_rate": 4.923873159240218e-07, "loss": 0.605, "step": 376 }, { "epoch": 0.5, "logps_train/chosen": -46.21941375732422, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -54.117191314697266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.11633986979722977, "rewards_train/margins": 1.2749341204762459, "rewards_train/rejected": -1.1585942506790161, "step": 376 }, { "epoch": 0.5, "logps_train/chosen": -73.43480682373047, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -92.40774536132812, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.058081869035959244, "rewards_train/margins": 1.2426066435873508, "rewards_train/rejected": -1.1845247745513916, "step": 377 }, { "epoch": 0.5, "learning_rate": 4.922520843326562e-07, "loss": 0.5104, "step": 378 }, { "epoch": 0.5, "logps_train/chosen": -61.36712646484375, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -74.85710144042969, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.31641197204589844, "rewards_train/margins": 0.6255594193935394, "rewards_train/rejected": -0.309147447347641, "step": 378 }, { "epoch": 0.5, "logps_train/chosen": -53.90198516845703, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -55.54674530029297, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.12542665004730225, "rewards_train/margins": 0.7019765377044678, "rewards_train/rejected": -0.5765498876571655, "step": 379 }, { "epoch": 0.5, "learning_rate": 4.921156810678019e-07, "loss": 0.592, "step": 380 }, { "epoch": 0.5, "logps_train/chosen": -52.57203674316406, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -77.65272521972656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.1776473969221115, "rewards_train/margins": 1.2010255306959152, "rewards_train/rejected": -1.0233781337738037, "step": 380 }, { "epoch": 0.51, "logps_train/chosen": -103.65930938720703, "logps_train/ref_chosen": -102.0, "logps_train/ref_rejected": -99.5, "logps_train/rejected": -113.58880615234375, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.18780572712421417, "rewards_train/margins": 1.203105852007866, "rewards_train/rejected": -1.39091157913208, "step": 381 }, { "epoch": 0.51, "learning_rate": 4.919781067891853e-07, "loss": 0.5634, "step": 382 }, { "epoch": 0.51, "logps_train/chosen": -57.32777404785156, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -96.48308563232422, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.3375351130962372, "rewards_train/margins": 1.1202187836170197, "rewards_train/rejected": -0.7826836705207825, "step": 382 }, { "epoch": 0.51, "logps_train/chosen": -54.46477508544922, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -69.27169036865234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.08397741615772247, "rewards_train/margins": 0.9650664776563644, "rewards_train/rejected": -1.049043893814087, "step": 383 }, { "epoch": 0.51, "learning_rate": 4.918393621621964e-07, "loss": 0.4372, "step": 384 }, { "epoch": 0.51, "logps_train/chosen": -67.64324951171875, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -80.74447631835938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.3679015338420868, "rewards_train/margins": 0.7657865583896637, "rewards_train/rejected": -0.3978850245475769, "step": 384 }, { "epoch": 0.51, "logps_train/chosen": -55.20287322998047, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -75.34164428710938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.4390876889228821, "rewards_train/margins": 1.5216899514198303, "rewards_train/rejected": -1.0826022624969482, "step": 385 }, { "epoch": 0.51, "learning_rate": 4.916994478578859e-07, "loss": 0.454, "step": 386 }, { "epoch": 0.51, "logps_train/chosen": -58.48170852661133, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -96.11143493652344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.4533919095993042, "rewards_train/margins": 1.6254730224609375, "rewards_train/rejected": -1.1720811128616333, "step": 386 }, { "epoch": 0.51, "logps_train/chosen": -51.650306701660156, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -60.75696563720703, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.3501867949962616, "rewards_train/margins": -0.10456836223602295, "rewards_train/rejected": -0.24561843276023865, "step": 387 }, { "epoch": 0.52, "learning_rate": 4.915583645529615e-07, "loss": 0.6997, "step": 388 }, { "epoch": 0.52, "logps_train/chosen": -59.05073547363281, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -40.75, "logps_train/rejected": -44.821739196777344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.0691358894109726, "rewards_train/margins": 0.33647577464580536, "rewards_train/rejected": -0.40561166405677795, "step": 388 }, { "epoch": 0.52, "logps_train/chosen": -98.45674896240234, "logps_train/ref_chosen": -96.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -124.94722747802734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.24098728597164154, "rewards_train/margins": 1.051001325249672, "rewards_train/rejected": -1.2919886112213135, "step": 389 }, { "epoch": 0.52, "learning_rate": 4.914161129297852e-07, "loss": 0.5328, "step": 390 }, { "epoch": 0.52, "logps_train/chosen": -68.8272476196289, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -79.08497619628906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.7243063449859619, "rewards_train/margins": 2.165616273880005, "rewards_train/rejected": -1.441309928894043, "step": 390 }, { "epoch": 0.52, "logps_train/chosen": -85.1076889038086, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -101.42495727539062, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.21936267614364624, "rewards_train/margins": 0.6028200387954712, "rewards_train/rejected": -0.8221827149391174, "step": 391 }, { "epoch": 0.52, "learning_rate": 4.912726936763692e-07, "loss": 0.4175, "step": 392 }, { "epoch": 0.52, "logps_train/chosen": -49.854393005371094, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -72.9985122680664, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.46846693754196167, "rewards_train/margins": 1.4745680689811707, "rewards_train/rejected": -1.006101131439209, "step": 392 }, { "epoch": 0.52, "logps_train/chosen": -32.078857421875, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -63.90654754638672, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.18117690086364746, "rewards_train/margins": 1.3562067747116089, "rewards_train/rejected": -1.1750298738479614, "step": 393 }, { "epoch": 0.52, "learning_rate": 4.911281074863735e-07, "loss": 0.3996, "step": 394 }, { "epoch": 0.52, "logps_train/chosen": -67.24622344970703, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -71.84961700439453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30506497621536255, "rewards_train/margins": 1.1228389143943787, "rewards_train/rejected": -0.8177739381790161, "step": 394 }, { "epoch": 0.52, "logps_train/chosen": -55.591651916503906, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -80.74789428710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8548972606658936, "rewards_train/margins": 2.823436141014099, "rewards_train/rejected": -1.9685388803482056, "step": 395 }, { "epoch": 0.53, "learning_rate": 4.909823550591018e-07, "loss": 0.2353, "step": 396 }, { "epoch": 0.53, "logps_train/chosen": -64.13090515136719, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -79.94658660888672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.17597264051437378, "rewards_train/margins": 1.6503190398216248, "rewards_train/rejected": -1.474346399307251, "step": 396 }, { "epoch": 0.53, "logps_train/chosen": -61.688507080078125, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -67.70826721191406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4420871138572693, "rewards_train/margins": 1.6894757151603699, "rewards_train/rejected": -1.2473886013031006, "step": 397 }, { "epoch": 0.53, "learning_rate": 4.908354370994987e-07, "loss": 0.3082, "step": 398 }, { "epoch": 0.53, "logps_train/chosen": -83.1932373046875, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -104.8465576171875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.15994855761528015, "rewards_train/margins": 1.0153322517871857, "rewards_train/rejected": -1.1752808094024658, "step": 398 }, { "epoch": 0.53, "logps_train/chosen": -61.557281494140625, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -80.44500732421875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.07760316133499146, "rewards_train/margins": 0.5653344988822937, "rewards_train/rejected": -0.6429376602172852, "step": 399 }, { "epoch": 0.53, "learning_rate": 4.906873543181456e-07, "loss": 0.5385, "step": 400 }, { "epoch": 0.53, "logps_train/chosen": -46.099403381347656, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -57.333839416503906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.1619347631931305, "rewards_train/margins": 1.1343809068202972, "rewards_train/rejected": -0.9724461436271667, "step": 400 }, { "epoch": 0.53, "logps_train/chosen": -61.619873046875, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -46.24669647216797, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.041137948632240295, "rewards_train/margins": 0.24080754816532135, "rewards_train/rejected": -0.19966959953308105, "step": 401 }, { "epoch": 0.53, "learning_rate": 4.90538107431258e-07, "loss": 0.5621, "step": 402 }, { "epoch": 0.53, "logps_train/chosen": -73.45378112792969, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -109.92085266113281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.018034063279628754, "rewards_train/margins": 1.397488035261631, "rewards_train/rejected": -1.4155220985412598, "step": 402 }, { "epoch": 0.54, "logps_train/chosen": -84.02957916259766, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -86.36756896972656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5454791784286499, "rewards_train/margins": 0.9900488555431366, "rewards_train/rejected": -0.4445696771144867, "step": 403 }, { "epoch": 0.54, "learning_rate": 4.903876971606817e-07, "loss": 0.4382, "step": 404 }, { "epoch": 0.54, "logps_train/chosen": -68.48152160644531, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -72.48524475097656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.28465989232063293, "rewards_train/margins": 1.394121676683426, "rewards_train/rejected": -1.109461784362793, "step": 404 }, { "epoch": 0.54, "logps_train/chosen": -79.98915100097656, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -67.21395874023438, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.19110223650932312, "rewards_train/margins": 0.39162173867225647, "rewards_train/rejected": -0.5827239751815796, "step": 405 }, { "epoch": 0.54, "learning_rate": 4.902361242338889e-07, "loss": 0.4773, "step": 406 }, { "epoch": 0.54, "logps_train/chosen": -94.2977066040039, "logps_train/ref_chosen": -95.5, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -112.37322998046875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.13116693496704102, "rewards_train/margins": 1.7677087783813477, "rewards_train/rejected": -1.6365418434143066, "step": 406 }, { "epoch": 0.54, "logps_train/chosen": -70.03573608398438, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -59.45812225341797, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.20267634093761444, "rewards_train/margins": 1.0203634351491928, "rewards_train/rejected": -0.8176870942115784, "step": 407 }, { "epoch": 0.54, "learning_rate": 4.900833893839756e-07, "loss": 0.3619, "step": 408 }, { "epoch": 0.54, "logps_train/chosen": -40.740821838378906, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -70.22329711914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.42591768503189087, "rewards_train/margins": 1.8652393221855164, "rewards_train/rejected": -1.4393216371536255, "step": 408 }, { "epoch": 0.54, "logps_train/chosen": -53.683387756347656, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -60.01963806152344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.10822384059429169, "rewards_train/margins": 1.1664374321699142, "rewards_train/rejected": -1.0582135915756226, "step": 409 }, { "epoch": 0.54, "learning_rate": 4.899294933496571e-07, "loss": 0.3035, "step": 410 }, { "epoch": 0.54, "logps_train/chosen": -75.93618774414062, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -97.18470764160156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.1305999755859375, "rewards_train/margins": 1.514695644378662, "rewards_train/rejected": -1.3840956687927246, "step": 410 }, { "epoch": 0.55, "logps_train/chosen": -66.03938293457031, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -95.2296142578125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.544499397277832, "rewards_train/margins": 2.1408989429473877, "rewards_train/rejected": -1.5963995456695557, "step": 411 }, { "epoch": 0.55, "learning_rate": 4.897744368752655e-07, "loss": 0.3635, "step": 412 }, { "epoch": 0.55, "logps_train/chosen": -43.639827728271484, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -37.0, "logps_train/rejected": -42.41300582885742, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.5869936943054199, "rewards_train/margins": 1.12360680103302, "rewards_train/rejected": -0.5366131067276001, "step": 412 }, { "epoch": 0.55, "logps_train/chosen": -86.75970458984375, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -89.42134094238281, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.29159581661224365, "rewards_train/margins": 0.6411628723144531, "rewards_train/rejected": -0.9327586889266968, "step": 413 }, { "epoch": 0.55, "learning_rate": 4.896182207107446e-07, "loss": 0.5745, "step": 414 }, { "epoch": 0.55, "logps_train/chosen": -54.34934997558594, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -54.63396453857422, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.19118517637252808, "rewards_train/margins": 0.25502413511276245, "rewards_train/rejected": -0.4462093114852905, "step": 414 }, { "epoch": 0.55, "logps_train/chosen": -58.31639862060547, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -80.00607299804688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5683600306510925, "rewards_train/margins": 1.5939671397209167, "rewards_train/rejected": -1.0256071090698242, "step": 415 }, { "epoch": 0.55, "learning_rate": 4.894608456116479e-07, "loss": 0.4535, "step": 416 }, { "epoch": 0.55, "logps_train/chosen": -86.6167984008789, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -95.35467529296875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.20863284170627594, "rewards_train/margins": 1.4565999656915665, "rewards_train/rejected": -1.2479671239852905, "step": 416 }, { "epoch": 0.55, "logps_train/chosen": -46.06257629394531, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -85.15351867675781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.35311707854270935, "rewards_train/margins": 1.7778445184230804, "rewards_train/rejected": -1.424727439880371, "step": 417 }, { "epoch": 0.56, "learning_rate": 4.893023123391337e-07, "loss": 0.4398, "step": 418 }, { "epoch": 0.56, "logps_train/chosen": -73.14177703857422, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -97.67268371582031, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.2333184778690338, "rewards_train/margins": 0.7245757281780243, "rewards_train/rejected": -0.9578942060470581, "step": 418 }, { "epoch": 0.56, "logps_train/chosen": -61.0244140625, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -71.52328491210938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.26201167702674866, "rewards_train/margins": 1.4994970858097076, "rewards_train/rejected": -1.237485408782959, "step": 419 }, { "epoch": 0.56, "learning_rate": 4.891426216599623e-07, "loss": 0.5166, "step": 420 }, { "epoch": 0.56, "logps_train/chosen": -76.00321960449219, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -73.52159118652344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.28938475251197815, "rewards_train/margins": 0.634649246931076, "rewards_train/rejected": -0.9240339994430542, "step": 420 }, { "epoch": 0.56, "logps_train/chosen": -62.26316833496094, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -71.09805297851562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.06694143265485764, "rewards_train/margins": 0.6334886029362679, "rewards_train/rejected": -0.7004300355911255, "step": 421 }, { "epoch": 0.56, "learning_rate": 4.889817743464916e-07, "loss": 0.5631, "step": 422 }, { "epoch": 0.56, "logps_train/chosen": -54.261268615722656, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -73.1705093383789, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.12143951654434204, "rewards_train/margins": 1.2471739649772644, "rewards_train/rejected": -1.3686134815216064, "step": 422 }, { "epoch": 0.56, "logps_train/chosen": -40.85841369628906, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -57.837738037109375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.143846333026886, "rewards_train/margins": 0.8303542137145996, "rewards_train/rejected": -0.6865078806877136, "step": 423 }, { "epoch": 0.56, "learning_rate": 4.888197711766736e-07, "loss": 0.4953, "step": 424 }, { "epoch": 0.56, "logps_train/chosen": -48.88616180419922, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -64.90988159179688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.3746649920940399, "rewards_train/margins": 1.9347938001155853, "rewards_train/rejected": -1.5601288080215454, "step": 424 }, { "epoch": 0.56, "logps_train/chosen": -39.213130950927734, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -50.036067962646484, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.011937960982322693, "rewards_train/margins": 0.6338563114404678, "rewards_train/rejected": -0.6457942724227905, "step": 425 }, { "epoch": 0.57, "learning_rate": 4.886566129340512e-07, "loss": 0.486, "step": 426 }, { "epoch": 0.57, "logps_train/chosen": -48.407718658447266, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -71.31959533691406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.09594693779945374, "rewards_train/margins": 1.4154061377048492, "rewards_train/rejected": -1.3194591999053955, "step": 426 }, { "epoch": 0.57, "logps_train/chosen": -72.96916198730469, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -78.33598327636719, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.23129120469093323, "rewards_train/margins": 0.38121286034584045, "rewards_train/rejected": -0.6125040650367737, "step": 427 }, { "epoch": 0.57, "learning_rate": 4.884923004077534e-07, "loss": 0.5291, "step": 428 }, { "epoch": 0.57, "logps_train/chosen": -72.09370422363281, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -85.7626724243164, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.02187054604291916, "rewards_train/margins": 0.2200218364596367, "rewards_train/rejected": -0.24189238250255585, "step": 428 }, { "epoch": 0.57, "logps_train/chosen": -87.04708862304688, "logps_train/ref_chosen": -96.0, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -107.77870178222656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.8859164118766785, "rewards_train/margins": 1.9637873768806458, "rewards_train/rejected": -1.0778709650039673, "step": 429 }, { "epoch": 0.57, "learning_rate": 4.88326834392492e-07, "loss": 0.5969, "step": 430 }, { "epoch": 0.57, "logps_train/chosen": -67.62312316894531, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -67.78407287597656, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.13887476921081543, "rewards_train/margins": 0.2731262147426605, "rewards_train/rejected": -0.41200098395347595, "step": 430 }, { "epoch": 0.57, "logps_train/chosen": -84.49793243408203, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -104.53606414794922, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.11739447712898254, "rewards_train/margins": 1.5178757011890411, "rewards_train/rejected": -1.4004812240600586, "step": 431 }, { "epoch": 0.57, "learning_rate": 4.881602156885582e-07, "loss": 0.6548, "step": 432 }, { "epoch": 0.57, "logps_train/chosen": -60.758872985839844, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -75.25013732910156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.36083099246025085, "rewards_train/margins": 1.157719761133194, "rewards_train/rejected": -0.7968887686729431, "step": 432 }, { "epoch": 0.58, "logps_train/chosen": -64.21223449707031, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -90.84576416015625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.8115890622138977, "rewards_train/margins": 1.600852906703949, "rewards_train/rejected": -0.7892638444900513, "step": 433 }, { "epoch": 0.58, "learning_rate": 4.87992445101818e-07, "loss": 0.4658, "step": 434 }, { "epoch": 0.58, "logps_train/chosen": -98.69319915771484, "logps_train/ref_chosen": -98.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -102.20870971679688, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.06931973248720169, "rewards_train/margins": 0.5703013464808464, "rewards_train/rejected": -0.6396210789680481, "step": 434 }, { "epoch": 0.58, "logps_train/chosen": -50.661983489990234, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -46.25, "logps_train/rejected": -58.152793884277344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.08692646026611328, "rewards_train/margins": 1.2787686586380005, "rewards_train/rejected": -1.1918421983718872, "step": 435 }, { "epoch": 0.58, "learning_rate": 4.878235234437083e-07, "loss": 0.5462, "step": 436 }, { "epoch": 0.58, "logps_train/chosen": -52.73915100097656, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -69.75190734863281, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.009852513670921326, "rewards_train/margins": 0.29268230497837067, "rewards_train/rejected": -0.302534818649292, "step": 436 }, { "epoch": 0.58, "logps_train/chosen": -66.21002197265625, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -64.44618225097656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.45009124279022217, "rewards_train/margins": 1.8634592294692993, "rewards_train/rejected": -1.4133679866790771, "step": 437 }, { "epoch": 0.58, "learning_rate": 4.876534515312337e-07, "loss": 0.4549, "step": 438 }, { "epoch": 0.58, "logps_train/chosen": -37.91630554199219, "logps_train/ref_chosen": -38.25, "logps_train/ref_rejected": -30.625, "logps_train/rejected": -33.68592834472656, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.027900636196136475, "rewards_train/margins": 0.3359465003013611, "rewards_train/rejected": -0.3080458641052246, "step": 438 }, { "epoch": 0.58, "logps_train/chosen": -44.24502182006836, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -60.00847625732422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12334935367107391, "rewards_train/margins": 1.4113061875104904, "rewards_train/rejected": -1.2879568338394165, "step": 439 }, { "epoch": 0.58, "learning_rate": 4.87482230186962e-07, "loss": 0.4918, "step": 440 }, { "epoch": 0.58, "logps_train/chosen": -93.13862609863281, "logps_train/ref_chosen": -100.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -78.01073455810547, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.7580126523971558, "rewards_train/margins": 1.4481490850448608, "rewards_train/rejected": -0.6901364326477051, "step": 440 }, { "epoch": 0.59, "logps_train/chosen": -61.005889892578125, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -75.66497039794922, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.1103547215461731, "rewards_train/margins": 0.7170796990394592, "rewards_train/rejected": -0.8274344205856323, "step": 441 }, { "epoch": 0.59, "learning_rate": 4.873098602390202e-07, "loss": 0.4477, "step": 442 }, { "epoch": 0.59, "logps_train/chosen": -60.18880844116211, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -61.91142272949219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.36705663800239563, "rewards_train/margins": 1.0706988871097565, "rewards_train/rejected": -0.7036422491073608, "step": 442 }, { "epoch": 0.59, "logps_train/chosen": -53.55289840698242, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -71.93734741210938, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.03260091692209244, "rewards_train/margins": 1.4294608011841774, "rewards_train/rejected": -1.396859884262085, "step": 443 }, { "epoch": 0.59, "learning_rate": 4.871363425210907e-07, "loss": 0.4319, "step": 444 }, { "epoch": 0.59, "logps_train/chosen": -43.11422348022461, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -44.5, "logps_train/rejected": -52.44865417480469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.11435892432928085, "rewards_train/margins": 0.9166460260748863, "rewards_train/rejected": -0.8022871017456055, "step": 444 }, { "epoch": 0.59, "logps_train/chosen": -51.13971710205078, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -61.27005386352539, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.32274702191352844, "rewards_train/margins": 0.973190039396286, "rewards_train/rejected": -0.6504430174827576, "step": 445 }, { "epoch": 0.59, "learning_rate": 4.869616778724073e-07, "loss": 0.5021, "step": 446 }, { "epoch": 0.59, "logps_train/chosen": -43.573551177978516, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -58.84613037109375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.16516748070716858, "rewards_train/margins": 0.9741333425045013, "rewards_train/rejected": -1.13930082321167, "step": 446 }, { "epoch": 0.59, "logps_train/chosen": -64.185302734375, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -69.64631652832031, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.40022003650665283, "rewards_train/margins": 0.8390710055828094, "rewards_train/rejected": -0.4388509690761566, "step": 447 }, { "epoch": 0.59, "learning_rate": 4.867858671377508e-07, "loss": 0.4529, "step": 448 }, { "epoch": 0.59, "logps_train/chosen": -57.2574348449707, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -68.23394012451172, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.08746234327554703, "rewards_train/margins": 0.857806883752346, "rewards_train/rejected": -0.9452692270278931, "step": 448 }, { "epoch": 0.6, "logps_train/chosen": -56.99577331542969, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -63.00306701660156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.6629226207733154, "rewards_train/margins": 1.1366665363311768, "rewards_train/rejected": -0.47374391555786133, "step": 449 }, { "epoch": 0.6, "learning_rate": 4.866089111674452e-07, "loss": 0.4792, "step": 450 }, { "epoch": 0.6, "logps_train/chosen": -73.94206237792969, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -95.0, "logps_train/rejected": -112.22356414794922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5557940006256104, "rewards_train/margins": 2.2969000339508057, "rewards_train/rejected": -1.7411060333251953, "step": 450 }, { "epoch": 0.6, "logps_train/chosen": -58.695640563964844, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -72.12869262695312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.15231075882911682, "rewards_train/margins": 0.9808046519756317, "rewards_train/rejected": -0.8284938931465149, "step": 451 }, { "epoch": 0.6, "learning_rate": 4.864308108173538e-07, "loss": 0.3683, "step": 452 }, { "epoch": 0.6, "logps_train/chosen": -86.11430358886719, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -81.59931945800781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.34033703804016113, "rewards_train/margins": 0.1844393014907837, "rewards_train/rejected": -0.5247763395309448, "step": 452 }, { "epoch": 0.6, "logps_train/chosen": -73.1890869140625, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -99.24513244628906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.7810919284820557, "rewards_train/margins": 1.422011375427246, "rewards_train/rejected": -0.6409194469451904, "step": 453 }, { "epoch": 0.6, "learning_rate": 4.862515669488744e-07, "loss": 0.6111, "step": 454 }, { "epoch": 0.6, "logps_train/chosen": -60.15624237060547, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -72.10064697265625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.36875075101852417, "rewards_train/margins": 1.6006905436515808, "rewards_train/rejected": -1.2319397926330566, "step": 454 }, { "epoch": 0.6, "logps_train/chosen": -74.43499755859375, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -93.25853729248047, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.7174383997917175, "rewards_train/margins": 1.777666985988617, "rewards_train/rejected": -1.0602285861968994, "step": 455 }, { "epoch": 0.61, "learning_rate": 4.86071180428936e-07, "loss": 0.3871, "step": 456 }, { "epoch": 0.61, "logps_train/chosen": -67.68987274169922, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -92.97109985351562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.41695016622543335, "rewards_train/margins": 1.7437474131584167, "rewards_train/rejected": -1.3267972469329834, "step": 456 }, { "epoch": 0.61, "logps_train/chosen": -62.74115753173828, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -95.05789947509766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8883844017982483, "rewards_train/margins": 2.741049349308014, "rewards_train/rejected": -1.8526649475097656, "step": 457 }, { "epoch": 0.61, "learning_rate": 4.858896521299934e-07, "loss": 0.2323, "step": 458 }, { "epoch": 0.61, "logps_train/chosen": -63.582603454589844, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -65.38656616210938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.09798998385667801, "rewards_train/margins": 0.6616466119885445, "rewards_train/rejected": -0.5636566281318665, "step": 458 }, { "epoch": 0.61, "logps_train/chosen": -89.062255859375, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -106.24118041992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4718993306159973, "rewards_train/margins": 2.10383003950119, "rewards_train/rejected": -1.6319307088851929, "step": 459 }, { "epoch": 0.61, "learning_rate": 4.857069829300246e-07, "loss": 0.3481, "step": 460 }, { "epoch": 0.61, "logps_train/chosen": -61.54225158691406, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -93.00715637207031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.19108706712722778, "rewards_train/margins": 1.9386783242225647, "rewards_train/rejected": -1.747591257095337, "step": 460 }, { "epoch": 0.61, "logps_train/chosen": -51.11946105957031, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -89.09141540527344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.3427412807941437, "rewards_train/margins": 1.519070416688919, "rewards_train/rejected": -1.1763291358947754, "step": 461 }, { "epoch": 0.61, "learning_rate": 4.855231737125249e-07, "loss": 0.3517, "step": 462 }, { "epoch": 0.61, "logps_train/chosen": -50.43305206298828, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -73.903076171875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4086480736732483, "rewards_train/margins": 1.6661434769630432, "rewards_train/rejected": -1.257495403289795, "step": 462 }, { "epoch": 0.61, "logps_train/chosen": -48.7312126159668, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -64.36223602294922, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.21320679783821106, "rewards_train/margins": 1.1892743408679962, "rewards_train/rejected": -0.9760675430297852, "step": 463 }, { "epoch": 0.62, "learning_rate": 4.85338225366504e-07, "loss": 0.4167, "step": 464 }, { "epoch": 0.62, "logps_train/chosen": -76.38656616210938, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -72.82841491699219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.9097812175750732, "rewards_train/margins": 2.301215887069702, "rewards_train/rejected": -1.391434669494629, "step": 464 }, { "epoch": 0.62, "logps_train/chosen": -55.65221405029297, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -52.41108703613281, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.33240920305252075, "rewards_train/margins": 0.3176838159561157, "rewards_train/rejected": -0.6500930190086365, "step": 465 }, { "epoch": 0.62, "learning_rate": 4.851521387864806e-07, "loss": 0.5332, "step": 466 }, { "epoch": 0.62, "logps_train/chosen": -95.60820007324219, "logps_train/ref_chosen": -101.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -118.2801513671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5579298734664917, "rewards_train/margins": 2.7578204870224, "rewards_train/rejected": -2.199890613555908, "step": 466 }, { "epoch": 0.62, "logps_train/chosen": -48.76190948486328, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -52.718875885009766, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.19494125247001648, "rewards_train/margins": 0.5378837287425995, "rewards_train/rejected": -0.732824981212616, "step": 467 }, { "epoch": 0.62, "learning_rate": 4.849649148724789e-07, "loss": 0.4533, "step": 468 }, { "epoch": 0.62, "logps_train/chosen": -55.786354064941406, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -77.19148254394531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.8362081050872803, "rewards_train/margins": 1.538950800895691, "rewards_train/rejected": -0.7027426958084106, "step": 468 }, { "epoch": 0.62, "logps_train/chosen": -75.44213104248047, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -87.67589569091797, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.6971932053565979, "rewards_train/margins": 1.175720751285553, "rewards_train/rejected": -0.4785275459289551, "step": 469 }, { "epoch": 0.62, "learning_rate": 4.847765545300238e-07, "loss": 0.377, "step": 470 }, { "epoch": 0.62, "logps_train/chosen": -72.38058471679688, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -78.2701187133789, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.1575891077518463, "rewards_train/margins": 1.2209852635860443, "rewards_train/rejected": -1.3785743713378906, "step": 470 }, { "epoch": 0.63, "logps_train/chosen": -80.75613403320312, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -77.01436614990234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.6556369662284851, "rewards_train/margins": 1.0344175100326538, "rewards_train/rejected": -0.3787805438041687, "step": 471 }, { "epoch": 0.63, "learning_rate": 4.845870586701367e-07, "loss": 0.4686, "step": 472 }, { "epoch": 0.63, "logps_train/chosen": -42.87356948852539, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -66.19390869140625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.2454555183649063, "rewards_train/margins": 1.0968775898218155, "rewards_train/rejected": -0.8514220714569092, "step": 472 }, { "epoch": 0.63, "logps_train/chosen": -45.4461669921875, "logps_train/ref_chosen": -47.25, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -61.40019607543945, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18194574117660522, "rewards_train/margins": 1.5563400387763977, "rewards_train/rejected": -1.3743942975997925, "step": 473 }, { "epoch": 0.63, "learning_rate": 4.84396428209331e-07, "loss": 0.4146, "step": 474 }, { "epoch": 0.63, "logps_train/chosen": -86.17939758300781, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -84.92279052734375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.4413773715496063, "rewards_train/margins": 0.2665269672870636, "rewards_train/rejected": -0.7079043388366699, "step": 474 }, { "epoch": 0.63, "logps_train/chosen": -42.84635925292969, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -38.5, "logps_train/rejected": -48.5110969543457, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23216086626052856, "rewards_train/margins": 1.226239264011383, "rewards_train/rejected": -0.9940783977508545, "step": 475 }, { "epoch": 0.63, "learning_rate": 4.842046640696075e-07, "loss": 0.5151, "step": 476 }, { "epoch": 0.63, "logps_train/chosen": -70.34722900390625, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -85.06442260742188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5965275168418884, "rewards_train/margins": 1.881094515323639, "rewards_train/rejected": -1.2845669984817505, "step": 476 }, { "epoch": 0.63, "logps_train/chosen": -70.40995788574219, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -79.18315887451172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.21326139569282532, "rewards_train/margins": 1.004273384809494, "rewards_train/rejected": -1.2175347805023193, "step": 477 }, { "epoch": 0.63, "learning_rate": 4.840117671784504e-07, "loss": 0.3941, "step": 478 }, { "epoch": 0.63, "logps_train/chosen": -65.40243530273438, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -77.40702819824219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7550691962242126, "rewards_train/margins": 1.822334110736847, "rewards_train/rejected": -1.0672649145126343, "step": 478 }, { "epoch": 0.64, "logps_train/chosen": -84.39859771728516, "logps_train/ref_chosen": -94.5, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -119.66218566894531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 1.0163898468017578, "rewards_train/margins": 2.5365145206451416, "rewards_train/rejected": -1.5201246738433838, "step": 479 }, { "epoch": 0.64, "learning_rate": 4.838177384688225e-07, "loss": 0.2242, "step": 480 }, { "epoch": 0.64, "logps_train/chosen": -74.55684661865234, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -108.84405517578125, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.39431503415107727, "rewards_train/margins": 0.969345360994339, "rewards_train/rejected": -0.5750303268432617, "step": 480 }, { "epoch": 0.64, "logps_train/chosen": -74.06786346435547, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -77.08515167236328, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.04389568418264389, "rewards_train/margins": 0.8724319115281105, "rewards_train/rejected": -0.9163275957107544, "step": 481 }, { "epoch": 0.64, "learning_rate": 4.836225788791606e-07, "loss": 0.5093, "step": 482 }, { "epoch": 0.64, "logps_train/chosen": -78.9788818359375, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -97.8614501953125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2822626829147339, "rewards_train/margins": 0.9585698843002319, "rewards_train/rejected": -1.2408325672149658, "step": 482 }, { "epoch": 0.64, "logps_train/chosen": -39.03144836425781, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -39.25, "logps_train/rejected": -50.362998962402344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5556445121765137, "rewards_train/margins": 1.6556164026260376, "rewards_train/rejected": -1.099971890449524, "step": 483 }, { "epoch": 0.64, "learning_rate": 4.834262893533713e-07, "loss": 0.3714, "step": 484 }, { "epoch": 0.64, "logps_train/chosen": -103.36849975585938, "logps_train/ref_chosen": -102.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -126.58375549316406, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.12513086199760437, "rewards_train/margins": 0.9703533947467804, "rewards_train/rejected": -1.0954842567443848, "step": 484 }, { "epoch": 0.64, "logps_train/chosen": -91.72317504882812, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -84.20274353027344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.03794288635253906, "rewards_train/margins": 0.865924596786499, "rewards_train/rejected": -0.9038674831390381, "step": 485 }, { "epoch": 0.65, "learning_rate": 4.83228870840826e-07, "loss": 0.6328, "step": 486 }, { "epoch": 0.65, "logps_train/chosen": -67.49491119384766, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -89.57960510253906, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.36160022020339966, "rewards_train/margins": 0.3533911108970642, "rewards_train/rejected": -0.7149913311004639, "step": 486 }, { "epoch": 0.65, "logps_train/chosen": -55.67702102661133, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -59.93408203125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.13503268361091614, "rewards_train/margins": 0.6487534940242767, "rewards_train/rejected": -0.5137208104133606, "step": 487 }, { "epoch": 0.65, "learning_rate": 4.830303242963569e-07, "loss": 0.6315, "step": 488 }, { "epoch": 0.65, "logps_train/chosen": -57.108734130859375, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -71.58772277832031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.3375643491744995, "rewards_train/margins": 1.7150697708129883, "rewards_train/rejected": -1.3775054216384888, "step": 488 }, { "epoch": 0.65, "logps_train/chosen": -81.05008697509766, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -100.58329010009766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.416866272687912, "rewards_train/margins": 1.5095703303813934, "rewards_train/rejected": -1.0927040576934814, "step": 489 }, { "epoch": 0.65, "learning_rate": 4.828306506802516e-07, "loss": 0.3959, "step": 490 }, { "epoch": 0.65, "logps_train/chosen": -43.09886169433594, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -54.933494567871094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.30730152130126953, "rewards_train/margins": 1.191275954246521, "rewards_train/rejected": -0.8839744329452515, "step": 490 }, { "epoch": 0.65, "logps_train/chosen": -38.58753204345703, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -56.39495086669922, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.060315392911434174, "rewards_train/margins": 0.8994922414422035, "rewards_train/rejected": -0.9598076343536377, "step": 491 }, { "epoch": 0.65, "learning_rate": 4.826298509582492e-07, "loss": 0.4602, "step": 492 }, { "epoch": 0.65, "logps_train/chosen": -46.844520568847656, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -64.6305160522461, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.24304601550102234, "rewards_train/margins": 0.1325054168701172, "rewards_train/rejected": -0.3755514323711395, "step": 492 }, { "epoch": 0.65, "logps_train/chosen": -86.06369018554688, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -99.5, "logps_train/rejected": -116.0888900756836, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7295681238174438, "rewards_train/margins": 2.379863142967224, "rewards_train/rejected": -1.6502950191497803, "step": 493 }, { "epoch": 0.66, "learning_rate": 4.824279261015352e-07, "loss": 0.4359, "step": 494 }, { "epoch": 0.66, "logps_train/chosen": -48.86855697631836, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -87.00870513916016, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.10814464092254639, "rewards_train/margins": 1.6630383729934692, "rewards_train/rejected": -1.7711830139160156, "step": 494 }, { "epoch": 0.66, "logps_train/chosen": -54.31206512451172, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -76.25498962402344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.376605749130249, "rewards_train/margins": 0.7646042108535767, "rewards_train/rejected": -0.38799846172332764, "step": 495 }, { "epoch": 0.66, "learning_rate": 4.82224877086737e-07, "loss": 0.4099, "step": 496 }, { "epoch": 0.66, "logps_train/chosen": -66.955810546875, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -103.06253814697266, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.5559815168380737, "rewards_train/margins": 1.7966104745864868, "rewards_train/rejected": -1.240628957748413, "step": 496 }, { "epoch": 0.66, "logps_train/chosen": -42.07504653930664, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -71.94281768798828, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.2909327745437622, "rewards_train/margins": 1.5977145433425903, "rewards_train/rejected": -1.3067817687988281, "step": 497 }, { "epoch": 0.66, "learning_rate": 4.820207048959186e-07, "loss": 0.3154, "step": 498 }, { "epoch": 0.66, "logps_train/chosen": -69.0484848022461, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -66.33747100830078, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.6654641628265381, "rewards_train/margins": 1.6960864067077637, "rewards_train/rejected": -1.0306222438812256, "step": 498 }, { "epoch": 0.66, "logps_train/chosen": -58.36433029174805, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -83.72349548339844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10262942314147949, "rewards_train/margins": 1.709354281425476, "rewards_train/rejected": -1.6067248582839966, "step": 499 }, { "epoch": 0.66, "learning_rate": 4.818154105165772e-07, "loss": 0.3195, "step": 500 }, { "epoch": 0.66, "logps_train/chosen": -64.72539520263672, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -82.59965515136719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.03722620755434036, "rewards_train/margins": 1.0901601389050484, "rewards_train/rejected": -1.052933931350708, "step": 500 }, { "epoch": 0.67, "logps_train/chosen": -78.46614837646484, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -116.8248062133789, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.8393229246139526, "rewards_train/margins": 2.340553641319275, "rewards_train/rejected": -1.5012307167053223, "step": 501 }, { "epoch": 0.67, "learning_rate": 4.816089949416369e-07, "loss": 0.3819, "step": 502 }, { "epoch": 0.67, "logps_train/chosen": -44.76847457885742, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -56.458106994628906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.05284015089273453, "rewards_train/margins": 0.9736508503556252, "rewards_train/rejected": -0.9208106994628906, "step": 502 }, { "epoch": 0.67, "logps_train/chosen": -68.35040283203125, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -107.94994354248047, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.04464678838849068, "rewards_train/margins": 1.8271412141621113, "rewards_train/rejected": -1.7824944257736206, "step": 503 }, { "epoch": 0.67, "learning_rate": 4.814014591694448e-07, "loss": 0.4376, "step": 504 }, { "epoch": 0.67, "logps_train/chosen": -35.27831268310547, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -55.747501373291016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.544044017791748, "rewards_train/margins": 1.4187942147254944, "rewards_train/rejected": -0.8747501969337463, "step": 504 }, { "epoch": 0.67, "logps_train/chosen": -66.682861328125, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -114.08990478515625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.49577608704566956, "rewards_train/margins": 1.9907038509845734, "rewards_train/rejected": -1.4949277639389038, "step": 505 }, { "epoch": 0.67, "learning_rate": 4.811928042037658e-07, "loss": 0.3063, "step": 506 }, { "epoch": 0.67, "logps_train/chosen": -60.73814010620117, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -92.0196304321289, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.04649835824966431, "rewards_train/margins": 1.5498284697532654, "rewards_train/rejected": -1.503330111503601, "step": 506 }, { "epoch": 0.67, "logps_train/chosen": -61.18023681640625, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -82.28202819824219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.599945068359375, "rewards_train/margins": 1.6086167097091675, "rewards_train/rejected": -1.0086716413497925, "step": 507 }, { "epoch": 0.67, "learning_rate": 4.809830310537781e-07, "loss": 0.4885, "step": 508 }, { "epoch": 0.67, "logps_train/chosen": -79.5849609375, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -91.89620208740234, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.2649410367012024, "rewards_train/margins": 1.0717484951019287, "rewards_train/rejected": -0.8068074584007263, "step": 508 }, { "epoch": 0.68, "logps_train/chosen": -74.50267028808594, "logps_train/ref_chosen": -84.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -75.73983764648438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 1.0059832334518433, "rewards_train/margins": 2.048716425895691, "rewards_train/rejected": -1.0427331924438477, "step": 509 }, { "epoch": 0.68, "learning_rate": 4.807721407340679e-07, "loss": 0.4407, "step": 510 }, { "epoch": 0.68, "logps_train/chosen": -101.51117706298828, "logps_train/ref_chosen": -105.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -145.41726684570312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.36450710892677307, "rewards_train/margins": 1.9242017567157745, "rewards_train/rejected": -1.5596946477890015, "step": 510 }, { "epoch": 0.68, "logps_train/chosen": -94.90898895263672, "logps_train/ref_chosen": -95.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -119.2308578491211, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.002851039171218872, "rewards_train/margins": 1.7821872532367706, "rewards_train/rejected": -1.7793362140655518, "step": 511 }, { "epoch": 0.68, "learning_rate": 4.805601342646248e-07, "loss": 0.4452, "step": 512 }, { "epoch": 0.68, "logps_train/chosen": -97.69891357421875, "logps_train/ref_chosen": -100.5, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -124.135009765625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.2769832909107208, "rewards_train/margins": 1.5795471966266632, "rewards_train/rejected": -1.3025639057159424, "step": 512 }, { "epoch": 0.68, "logps_train/chosen": -63.78382110595703, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -70.51071166992188, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.012756690382957458, "rewards_train/margins": 0.8789393454790115, "rewards_train/rejected": -0.891696035861969, "step": 513 }, { "epoch": 0.68, "learning_rate": 4.803470126708366e-07, "loss": 0.5079, "step": 514 }, { "epoch": 0.68, "logps_train/chosen": -51.104488372802734, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -66.3725357055664, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5988103151321411, "rewards_train/margins": 1.5354778170585632, "rewards_train/rejected": -0.9366675019264221, "step": 514 }, { "epoch": 0.68, "logps_train/chosen": -49.2669792175293, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -53.96672058105469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4954700469970703, "rewards_train/margins": 1.2980016469955444, "rewards_train/rejected": -0.8025315999984741, "step": 515 }, { "epoch": 0.69, "learning_rate": 4.801327769834847e-07, "loss": 0.3506, "step": 516 }, { "epoch": 0.69, "logps_train/chosen": -70.37693786621094, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -71.87669372558594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.2626972496509552, "rewards_train/margins": 0.9868898689746857, "rewards_train/rejected": -0.7241926193237305, "step": 516 }, { "epoch": 0.69, "logps_train/chosen": -43.73329162597656, "logps_train/ref_chosen": -44.0, "logps_train/ref_rejected": -41.25, "logps_train/rejected": -48.40955352783203, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.025889843702316284, "rewards_train/margins": 0.7465327084064484, "rewards_train/rejected": -0.7206428647041321, "step": 517 }, { "epoch": 0.69, "learning_rate": 4.79917428238739e-07, "loss": 0.4547, "step": 518 }, { "epoch": 0.69, "logps_train/chosen": -43.598175048828125, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -66.93354797363281, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.6667448878288269, "rewards_train/margins": 1.8444743752479553, "rewards_train/rejected": -1.1777294874191284, "step": 518 }, { "epoch": 0.69, "logps_train/chosen": -43.813804626464844, "logps_train/ref_chosen": -46.5, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -71.97845458984375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.2615884244441986, "rewards_train/margins": 1.4438091218471527, "rewards_train/rejected": -1.182220697402954, "step": 519 }, { "epoch": 0.69, "learning_rate": 4.797009674781523e-07, "loss": 0.3965, "step": 520 }, { "epoch": 0.69, "logps_train/chosen": -39.76426696777344, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -63.16413116455078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.47044843435287476, "rewards_train/margins": 1.7993616461753845, "rewards_train/rejected": -1.3289132118225098, "step": 520 }, { "epoch": 0.69, "logps_train/chosen": -36.2171745300293, "logps_train/ref_chosen": -37.75, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -49.16302490234375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.15289199352264404, "rewards_train/margins": 1.0125537514686584, "rewards_train/rejected": -0.8596617579460144, "step": 521 }, { "epoch": 0.69, "learning_rate": 4.794833957486562e-07, "loss": 0.3429, "step": 522 }, { "epoch": 0.69, "logps_train/chosen": -89.44718933105469, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -113.53695678710938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.4443441331386566, "rewards_train/margins": 2.4218682944774628, "rewards_train/rejected": -1.9775241613388062, "step": 522 }, { "epoch": 0.69, "logps_train/chosen": -65.70970916748047, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -92.09840393066406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 1.1571540832519531, "rewards_train/margins": 2.8185572624206543, "rewards_train/rejected": -1.6614031791687012, "step": 523 }, { "epoch": 0.7, "learning_rate": 4.792647141025557e-07, "loss": 0.2876, "step": 524 }, { "epoch": 0.7, "logps_train/chosen": -85.04306030273438, "logps_train/ref_chosen": -95.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -118.11507415771484, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.9925685524940491, "rewards_train/margins": 2.74313884973526, "rewards_train/rejected": -1.750570297241211, "step": 524 }, { "epoch": 0.7, "logps_train/chosen": -47.077213287353516, "logps_train/ref_chosen": -46.5, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -64.46700286865234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.05098319053649902, "rewards_train/margins": 0.4019668400287628, "rewards_train/rejected": -0.45295003056526184, "step": 525 }, { "epoch": 0.7, "learning_rate": 4.790449235975235e-07, "loss": 0.5276, "step": 526 }, { "epoch": 0.7, "logps_train/chosen": -65.16095733642578, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -65.19416809082031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.42966973781585693, "rewards_train/margins": 0.9237319231033325, "rewards_train/rejected": -1.3534016609191895, "step": 526 }, { "epoch": 0.7, "logps_train/chosen": -58.66972351074219, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -94.27264404296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6892780065536499, "rewards_train/margins": 2.7020891904830933, "rewards_train/rejected": -2.0128111839294434, "step": 527 }, { "epoch": 0.7, "learning_rate": 4.788240252965957e-07, "loss": 0.3182, "step": 528 }, { "epoch": 0.7, "logps_train/chosen": -55.83493423461914, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -80.23184204101562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.444430947303772, "rewards_train/margins": 1.1334413290023804, "rewards_train/rejected": -1.5778722763061523, "step": 528 }, { "epoch": 0.7, "logps_train/chosen": -47.8358039855957, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -59.1377067565918, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5781385898590088, "rewards_train/margins": 1.2555812001228333, "rewards_train/rejected": -0.6774426102638245, "step": 529 }, { "epoch": 0.7, "learning_rate": 4.786020202681666e-07, "loss": 0.4616, "step": 530 }, { "epoch": 0.7, "logps_train/chosen": -60.454132080078125, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -86.97035217285156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.436618447303772, "rewards_train/margins": 1.9867793321609497, "rewards_train/rejected": -1.5501608848571777, "step": 530 }, { "epoch": 0.71, "logps_train/chosen": -60.42911911010742, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -81.11143493652344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17740082740783691, "rewards_train/margins": 1.4744820594787598, "rewards_train/rejected": -1.2970812320709229, "step": 531 }, { "epoch": 0.71, "learning_rate": 4.783789095859828e-07, "loss": 0.3121, "step": 532 }, { "epoch": 0.71, "logps_train/chosen": -53.50497817993164, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -66.90718078613281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.21434569358825684, "rewards_train/margins": 1.379673719406128, "rewards_train/rejected": -1.165328025817871, "step": 532 }, { "epoch": 0.71, "logps_train/chosen": -55.165855407714844, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -58.01957702636719, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.24630485475063324, "rewards_train/margins": 1.100215658545494, "rewards_train/rejected": -0.8539108037948608, "step": 533 }, { "epoch": 0.71, "learning_rate": 4.781546943291387e-07, "loss": 0.454, "step": 534 }, { "epoch": 0.71, "logps_train/chosen": -81.78358459472656, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -103.94087219238281, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.08460818231105804, "rewards_train/margins": 1.2782291322946548, "rewards_train/rejected": -1.362837314605713, "step": 534 }, { "epoch": 0.71, "logps_train/chosen": -58.39218521118164, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -62.64170455932617, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4240626096725464, "rewards_train/margins": 1.6991705894470215, "rewards_train/rejected": -1.275107979774475, "step": 535 }, { "epoch": 0.71, "learning_rate": 4.779293755820712e-07, "loss": 0.4349, "step": 536 }, { "epoch": 0.71, "logps_train/chosen": -61.329803466796875, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -90.47566223144531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.02204272150993347, "rewards_train/margins": 1.566148191690445, "rewards_train/rejected": -1.5881909132003784, "step": 536 }, { "epoch": 0.71, "logps_train/chosen": -80.10034942626953, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -81.34458923339844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.47277745604515076, "rewards_train/margins": 1.5916114747524261, "rewards_train/rejected": -1.1188340187072754, "step": 537 }, { "epoch": 0.71, "learning_rate": 4.777029544345543e-07, "loss": 0.302, "step": 538 }, { "epoch": 0.71, "logps_train/chosen": -62.506080627441406, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -94.4066390991211, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.8689229488372803, "rewards_train/margins": 2.4548988342285156, "rewards_train/rejected": -1.5859758853912354, "step": 538 }, { "epoch": 0.72, "logps_train/chosen": -53.01947021484375, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -61.01969909667969, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.0628846287727356, "rewards_train/margins": 1.023460328578949, "rewards_train/rejected": -1.0863449573516846, "step": 539 }, { "epoch": 0.72, "learning_rate": 4.774754319816936e-07, "loss": 0.3846, "step": 540 }, { "epoch": 0.72, "logps_train/chosen": -86.12532043457031, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -99.66450500488281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5593426823616028, "rewards_train/margins": 2.1148563027381897, "rewards_train/rejected": -1.555513620376587, "step": 540 }, { "epoch": 0.72, "logps_train/chosen": -89.7861328125, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -121.96415710449219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.15673795342445374, "rewards_train/margins": 1.1709276139736176, "rewards_train/rejected": -1.3276655673980713, "step": 541 }, { "epoch": 0.72, "learning_rate": 4.772468093239214e-07, "loss": 0.3335, "step": 542 }, { "epoch": 0.72, "logps_train/chosen": -65.43281555175781, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -88.52119445800781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8801558017730713, "rewards_train/margins": 2.388525128364563, "rewards_train/rejected": -1.5083693265914917, "step": 542 }, { "epoch": 0.72, "logps_train/chosen": -100.86902618408203, "logps_train/ref_chosen": -99.0, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -112.65705871582031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.17127755284309387, "rewards_train/margins": 0.7600526511669159, "rewards_train/rejected": -0.9313302040100098, "step": 543 }, { "epoch": 0.72, "learning_rate": 4.770170875669915e-07, "loss": 0.4514, "step": 544 }, { "epoch": 0.72, "logps_train/chosen": -75.40251159667969, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -103.71463775634766, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.41287341713905334, "rewards_train/margins": 2.479649692773819, "rewards_train/rejected": -2.0667762756347656, "step": 544 }, { "epoch": 0.72, "logps_train/chosen": -78.18917846679688, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -67.2279281616211, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.4767308235168457, "rewards_train/margins": 0.06949913501739502, "rewards_train/rejected": -0.5462299585342407, "step": 545 }, { "epoch": 0.73, "learning_rate": 4.767862678219731e-07, "loss": 0.5378, "step": 546 }, { "epoch": 0.73, "logps_train/chosen": -86.35397338867188, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -106.66950988769531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.055226948112249374, "rewards_train/margins": 2.0128040574491024, "rewards_train/rejected": -1.957577109336853, "step": 546 }, { "epoch": 0.73, "logps_train/chosen": -34.413246154785156, "logps_train/ref_chosen": -37.0, "logps_train/ref_rejected": -44.5, "logps_train/rejected": -50.791778564453125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.25437861680984497, "rewards_train/margins": 0.895861029624939, "rewards_train/rejected": -0.641482412815094, "step": 547 }, { "epoch": 0.73, "learning_rate": 4.765543512052463e-07, "loss": 0.4256, "step": 548 }, { "epoch": 0.73, "logps_train/chosen": -64.68608093261719, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -61.228912353515625, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.7217331528663635, "rewards_train/margins": -0.45431074500083923, "rewards_train/rejected": -0.2674224078655243, "step": 548 }, { "epoch": 0.73, "logps_train/chosen": -86.13417053222656, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -89.27899169921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2092394232749939, "rewards_train/margins": 1.8133100867271423, "rewards_train/rejected": -1.6040706634521484, "step": 549 }, { "epoch": 0.73, "learning_rate": 4.7632133883849623e-07, "loss": 0.699, "step": 550 }, { "epoch": 0.73, "logps_train/chosen": -77.55008697509766, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -79.2755126953125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.18420809507369995, "rewards_train/margins": 0.9898279309272766, "rewards_train/rejected": -1.1740360260009766, "step": 550 }, { "epoch": 0.73, "logps_train/chosen": -53.07927703857422, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -74.06660461425781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2993340790271759, "rewards_train/margins": 1.4542015492916107, "rewards_train/rejected": -1.7535356283187866, "step": 551 }, { "epoch": 0.73, "learning_rate": 4.7608723184870757e-07, "loss": 0.4229, "step": 552 }, { "epoch": 0.73, "logps_train/chosen": -63.72083282470703, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -93.95362854003906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35447901487350464, "rewards_train/margins": 2.702966630458832, "rewards_train/rejected": -2.348487615585327, "step": 552 }, { "epoch": 0.73, "logps_train/chosen": -63.52777862548828, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -75.32018280029297, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.13168418407440186, "rewards_train/margins": 1.4268964529037476, "rewards_train/rejected": -1.5585806369781494, "step": 553 }, { "epoch": 0.74, "learning_rate": 4.7585203136815945e-07, "loss": 0.2584, "step": 554 }, { "epoch": 0.74, "logps_train/chosen": -75.23867797851562, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -94.06930541992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15738201141357422, "rewards_train/margins": 2.759624719619751, "rewards_train/rejected": -2.6022427082061768, "step": 554 }, { "epoch": 0.74, "logps_train/chosen": -57.15555191040039, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -84.36032104492188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.6344447731971741, "rewards_train/margins": 1.9454770684242249, "rewards_train/rejected": -1.3110322952270508, "step": 555 }, { "epoch": 0.74, "learning_rate": 4.756157385344195e-07, "loss": 0.2589, "step": 556 }, { "epoch": 0.74, "logps_train/chosen": -50.42674255371094, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -39.773258209228516, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.03486183285713196, "rewards_train/margins": 0.3577960431575775, "rewards_train/rejected": -0.3926578760147095, "step": 556 }, { "epoch": 0.74, "logps_train/chosen": -44.924530029296875, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -66.36564636230469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.443484902381897, "rewards_train/margins": 1.3737996220588684, "rewards_train/rejected": -0.9303147196769714, "step": 557 }, { "epoch": 0.74, "learning_rate": 4.75378354490339e-07, "loss": 0.4999, "step": 558 }, { "epoch": 0.74, "logps_train/chosen": -26.142358779907227, "logps_train/ref_chosen": -30.5, "logps_train/ref_rejected": -33.0, "logps_train/rejected": -44.81050109863281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.43498295545578003, "rewards_train/margins": 1.6269704699516296, "rewards_train/rejected": -1.1919875144958496, "step": 558 }, { "epoch": 0.74, "logps_train/chosen": -38.286808013916016, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -56.23208236694336, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.034930914640426636, "rewards_train/margins": 1.289449006319046, "rewards_train/rejected": -1.3243799209594727, "step": 559 }, { "epoch": 0.74, "learning_rate": 4.7513988038404653e-07, "loss": 0.3509, "step": 560 }, { "epoch": 0.74, "logps_train/chosen": -48.065032958984375, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -52.52865982055664, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2533780634403229, "rewards_train/margins": 0.9471441209316254, "rewards_train/rejected": -1.2005221843719482, "step": 560 }, { "epoch": 0.75, "logps_train/chosen": -51.579593658447266, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -62.28582000732422, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.2532718777656555, "rewards_train/margins": 0.3842945694923401, "rewards_train/rejected": -0.6375664472579956, "step": 561 }, { "epoch": 0.75, "learning_rate": 4.749003173689432e-07, "loss": 0.5397, "step": 562 }, { "epoch": 0.75, "logps_train/chosen": -72.37049102783203, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -105.22444152832031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.19107604026794434, "rewards_train/margins": 2.619770050048828, "rewards_train/rejected": -2.428694009780884, "step": 562 }, { "epoch": 0.75, "logps_train/chosen": -88.8067855834961, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -90.50834655761719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4817650318145752, "rewards_train/margins": 0.8550066947937012, "rewards_train/rejected": -1.3367717266082764, "step": 563 }, { "epoch": 0.75, "learning_rate": 4.746596666036964e-07, "loss": 0.3867, "step": 564 }, { "epoch": 0.75, "logps_train/chosen": -92.42796325683594, "logps_train/ref_chosen": -97.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -92.03213500976562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5228289365768433, "rewards_train/margins": 1.8104172945022583, "rewards_train/rejected": -1.287588357925415, "step": 564 }, { "epoch": 0.75, "logps_train/chosen": -65.4759292602539, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -69.63304138183594, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.053969502449035645, "rewards_train/margins": 0.7282111644744873, "rewards_train/rejected": -0.6742416620254517, "step": 565 }, { "epoch": 0.75, "learning_rate": 4.744179292522349e-07, "loss": 0.4414, "step": 566 }, { "epoch": 0.75, "logps_train/chosen": -38.7037239074707, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -52.75, "logps_train/rejected": -64.80675506591797, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.059435099363327026, "rewards_train/margins": 1.1368655264377594, "rewards_train/rejected": -1.1963006258010864, "step": 566 }, { "epoch": 0.75, "logps_train/chosen": -57.235206604003906, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -77.59497833251953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2553855776786804, "rewards_train/margins": 1.805508315563202, "rewards_train/rejected": -1.5501227378845215, "step": 567 }, { "epoch": 0.75, "learning_rate": 4.741751064837426e-07, "loss": 0.3489, "step": 568 }, { "epoch": 0.75, "logps_train/chosen": -79.77310180664062, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -95.32447814941406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.09768963605165482, "rewards_train/margins": 1.492636926472187, "rewards_train/rejected": -1.3949472904205322, "step": 568 }, { "epoch": 0.76, "logps_train/chosen": -77.84187316894531, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -102.27454376220703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22518739104270935, "rewards_train/margins": 1.972954124212265, "rewards_train/rejected": -1.7477667331695557, "step": 569 }, { "epoch": 0.76, "learning_rate": 4.7393119947265303e-07, "loss": 0.3305, "step": 570 }, { "epoch": 0.76, "logps_train/chosen": -51.39000701904297, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -68.27716827392578, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.3172490894794464, "rewards_train/margins": 1.2996536791324615, "rewards_train/rejected": -0.9824045896530151, "step": 570 }, { "epoch": 0.76, "logps_train/chosen": -55.66794967651367, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -77.81556701660156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3058575987815857, "rewards_train/margins": 1.2678866982460022, "rewards_train/rejected": -1.573744297027588, "step": 571 }, { "epoch": 0.76, "learning_rate": 4.7368620939864395e-07, "loss": 0.49, "step": 572 }, { "epoch": 0.76, "logps_train/chosen": -55.084468841552734, "logps_train/ref_chosen": -56.75, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -67.74577331542969, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.1571781039237976, "rewards_train/margins": 1.2743340134620667, "rewards_train/rejected": -1.117155909538269, "step": 572 }, { "epoch": 0.76, "logps_train/chosen": -38.189327239990234, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -37.25, "logps_train/rejected": -46.556495666503906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.5357547998428345, "rewards_train/margins": 1.4703106880187988, "rewards_train/rejected": -0.9345558881759644, "step": 573 }, { "epoch": 0.76, "learning_rate": 4.7344013744663137e-07, "loss": 0.3834, "step": 574 }, { "epoch": 0.76, "logps_train/chosen": -79.90370178222656, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -95.92884826660156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5768181085586548, "rewards_train/margins": 2.593140721321106, "rewards_train/rejected": -2.016322612762451, "step": 574 }, { "epoch": 0.76, "logps_train/chosen": -68.4347152709961, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -73.60333251953125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.22115738689899445, "rewards_train/margins": 1.2821438759565353, "rewards_train/rejected": -1.5033012628555298, "step": 575 }, { "epoch": 0.76, "learning_rate": 4.7319298480676393e-07, "loss": 0.3204, "step": 576 }, { "epoch": 0.76, "logps_train/chosen": -73.40341186523438, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -71.17469787597656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.4700283408164978, "rewards_train/margins": 0.9177539944648743, "rewards_train/rejected": -1.387782335281372, "step": 576 }, { "epoch": 0.77, "logps_train/chosen": -59.3175163269043, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -69.69569396972656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.93175208568573, "rewards_train/margins": 0.5768803358078003, "rewards_train/rejected": -1.5086324214935303, "step": 577 }, { "epoch": 0.77, "learning_rate": 4.7294475267441703e-07, "loss": 0.5989, "step": 578 }, { "epoch": 0.77, "logps_train/chosen": -41.835750579833984, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -58.98059844970703, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3351374864578247, "rewards_train/margins": 1.003546953201294, "rewards_train/rejected": -1.3386844396591187, "step": 578 }, { "epoch": 0.77, "logps_train/chosen": -49.31288146972656, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -74.24522399902344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19175879657268524, "rewards_train/margins": 1.8944060057401657, "rewards_train/rejected": -1.7026472091674805, "step": 579 }, { "epoch": 0.77, "learning_rate": 4.726954422501873e-07, "loss": 0.3095, "step": 580 }, { "epoch": 0.77, "logps_train/chosen": -51.600826263427734, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -62.637237548828125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.41437941789627075, "rewards_train/margins": 0.5985633730888367, "rewards_train/rejected": -1.0129427909851074, "step": 580 }, { "epoch": 0.77, "logps_train/chosen": -57.84700393676758, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -108.85273742675781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.551237165927887, "rewards_train/margins": 2.521667182445526, "rewards_train/rejected": -1.9704300165176392, "step": 581 }, { "epoch": 0.77, "learning_rate": 4.724450547398864e-07, "loss": 0.4508, "step": 582 }, { "epoch": 0.77, "logps_train/chosen": -63.33903503417969, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -76.01646423339844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.16202855110168457, "rewards_train/margins": 1.342742681503296, "rewards_train/rejected": -1.5047712326049805, "step": 582 }, { "epoch": 0.77, "logps_train/chosen": -68.54399108886719, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -75.78659057617188, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.29502445459365845, "rewards_train/margins": 0.4758225083351135, "rewards_train/rejected": -0.770846962928772, "step": 583 }, { "epoch": 0.78, "learning_rate": 4.7219359135453554e-07, "loss": 0.4989, "step": 584 }, { "epoch": 0.78, "logps_train/chosen": -44.65009307861328, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -70.82889556884766, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4951471984386444, "rewards_train/margins": 1.8256931006908417, "rewards_train/rejected": -1.3305459022521973, "step": 584 }, { "epoch": 0.78, "logps_train/chosen": -62.56684112548828, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -63.533782958984375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.04487829655408859, "rewards_train/margins": 0.42325635999441147, "rewards_train/rejected": -0.3783780634403229, "step": 585 }, { "epoch": 0.78, "learning_rate": 4.719410533103595e-07, "loss": 0.5448, "step": 586 }, { "epoch": 0.78, "logps_train/chosen": -66.92024230957031, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -98.66958618164062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.49235033988952637, "rewards_train/margins": 2.4733721017837524, "rewards_train/rejected": -1.981021761894226, "step": 586 }, { "epoch": 0.78, "logps_train/chosen": -56.734954833984375, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -77.53395080566406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.04212941229343414, "rewards_train/margins": 1.367399349808693, "rewards_train/rejected": -1.3252699375152588, "step": 587 }, { "epoch": 0.78, "learning_rate": 4.7168744182878065e-07, "loss": 0.3397, "step": 588 }, { "epoch": 0.78, "logps_train/chosen": -65.14064025878906, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -89.91858673095703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19062352180480957, "rewards_train/margins": 2.379357099533081, "rewards_train/rejected": -2.1887335777282715, "step": 588 }, { "epoch": 0.78, "logps_train/chosen": -67.61961364746094, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -76.6443099975586, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.039305105805397034, "rewards_train/margins": 1.3173136562108994, "rewards_train/rejected": -1.3566187620162964, "step": 589 }, { "epoch": 0.78, "learning_rate": 4.7143275813641336e-07, "loss": 0.3752, "step": 590 }, { "epoch": 0.78, "logps_train/chosen": -54.337440490722656, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -73.7213134765625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.050931595265865326, "rewards_train/margins": 1.2258866801857948, "rewards_train/rejected": -1.2768182754516602, "step": 590 }, { "epoch": 0.78, "logps_train/chosen": -57.397369384765625, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -63.53017807006836, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.06416883319616318, "rewards_train/margins": 1.4984368309378624, "rewards_train/rejected": -1.4342679977416992, "step": 591 }, { "epoch": 0.79, "learning_rate": 4.711770034650575e-07, "loss": 0.3846, "step": 592 }, { "epoch": 0.79, "logps_train/chosen": -48.90729522705078, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -61.438758850097656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.030363978818058968, "rewards_train/margins": 1.6310763973742723, "rewards_train/rejected": -1.6007124185562134, "step": 592 }, { "epoch": 0.79, "logps_train/chosen": -69.3033447265625, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -90.70845794677734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.29466527700424194, "rewards_train/margins": 2.1362144351005554, "rewards_train/rejected": -1.8415491580963135, "step": 593 }, { "epoch": 0.79, "learning_rate": 4.7092017905169315e-07, "loss": 0.4115, "step": 594 }, { "epoch": 0.79, "logps_train/chosen": -50.840511322021484, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -60.229576110839844, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.39420759677886963, "rewards_train/margins": 0.7975000143051147, "rewards_train/rejected": -1.1917076110839844, "step": 594 }, { "epoch": 0.79, "logps_train/chosen": -75.55046081542969, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -111.72606658935547, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.0347975492477417, "rewards_train/margins": 2.419904589653015, "rewards_train/rejected": -2.3851070404052734, "step": 595 }, { "epoch": 0.79, "learning_rate": 4.7066228613847405e-07, "loss": 0.6544, "step": 596 }, { "epoch": 0.79, "logps_train/chosen": -32.979820251464844, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -36.75, "logps_train/rejected": -44.78044891357422, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.26998698711395264, "rewards_train/margins": 1.061313509941101, "rewards_train/rejected": -0.7913265228271484, "step": 596 }, { "epoch": 0.79, "logps_train/chosen": -74.37362670898438, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -80.77278137207031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.04908075928688049, "rewards_train/margins": 0.9125727117061615, "rewards_train/rejected": -0.961653470993042, "step": 597 }, { "epoch": 0.79, "learning_rate": 4.704033259727219e-07, "loss": 0.4005, "step": 598 }, { "epoch": 0.79, "logps_train/chosen": -62.520835876464844, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -93.00614166259766, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.1364586502313614, "rewards_train/margins": 0.7469682544469833, "rewards_train/rejected": -0.8834269046783447, "step": 598 }, { "epoch": 0.8, "logps_train/chosen": -46.40895080566406, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -100.99507904052734, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5051011443138123, "rewards_train/margins": 2.7905468344688416, "rewards_train/rejected": -2.2854456901550293, "step": 599 }, { "epoch": 0.8, "learning_rate": 4.701432998069205e-07, "loss": 0.3991, "step": 600 }, { "epoch": 0.8, "logps_train/chosen": -44.63404083251953, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -73.91123962402344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.3678460717201233, "rewards_train/margins": 1.933970034122467, "rewards_train/rejected": -1.5661239624023438, "step": 600 }, { "epoch": 0.8, "logps_train/chosen": -53.41337203979492, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -64.40904235839844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.011649668216705322, "rewards_train/margins": 1.3730050921440125, "rewards_train/rejected": -1.3846547603607178, "step": 601 }, { "epoch": 0.8, "learning_rate": 4.69882208898709e-07, "loss": 0.4513, "step": 602 }, { "epoch": 0.8, "logps_train/chosen": -66.7323226928711, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -77.54193115234375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.1904197335243225, "rewards_train/margins": 1.1668007969856262, "rewards_train/rejected": -1.3572205305099487, "step": 602 }, { "epoch": 0.8, "logps_train/chosen": -47.96442413330078, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -86.06922912597656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.667229175567627, "rewards_train/margins": 1.8507150411605835, "rewards_train/rejected": -1.1834858655929565, "step": 603 }, { "epoch": 0.8, "learning_rate": 4.696200545108767e-07, "loss": 0.3027, "step": 604 }, { "epoch": 0.8, "logps_train/chosen": -46.41473388671875, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -60.637451171875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.07428610324859619, "rewards_train/margins": 0.7341855764389038, "rewards_train/rejected": -0.8084716796875, "step": 604 }, { "epoch": 0.8, "logps_train/chosen": -64.93851470947266, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -88.19393920898438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.9686485528945923, "rewards_train/margins": 2.5130425691604614, "rewards_train/rejected": -1.5443940162658691, "step": 605 }, { "epoch": 0.8, "learning_rate": 4.693568379113562e-07, "loss": 0.4029, "step": 606 }, { "epoch": 0.8, "logps_train/chosen": -87.84767150878906, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -89.39312744140625, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.05820435285568237, "rewards_train/margins": 1.8061079382896423, "rewards_train/rejected": -1.8643122911453247, "step": 606 }, { "epoch": 0.81, "logps_train/chosen": -75.93603515625, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -112.70449829101562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7161625623703003, "rewards_train/margins": 3.1756755113601685, "rewards_train/rejected": -2.459512948989868, "step": 607 }, { "epoch": 0.81, "learning_rate": 4.6909256037321775e-07, "loss": 0.3524, "step": 608 }, { "epoch": 0.81, "logps_train/chosen": -84.64514923095703, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -95.0, "logps_train/rejected": -114.46732330322266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4057977497577667, "rewards_train/margins": 2.3462802469730377, "rewards_train/rejected": -1.940482497215271, "step": 608 }, { "epoch": 0.81, "logps_train/chosen": -70.9254150390625, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -96.82177734375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4920283257961273, "rewards_train/margins": 2.728503078222275, "rewards_train/rejected": -2.2364747524261475, "step": 609 }, { "epoch": 0.81, "learning_rate": 4.688272231746629e-07, "loss": 0.2993, "step": 610 }, { "epoch": 0.81, "logps_train/chosen": -67.98867797851562, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -74.17568969726562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.22222566604614258, "rewards_train/margins": 0.9976067543029785, "rewards_train/rejected": -0.7753810882568359, "step": 610 }, { "epoch": 0.81, "logps_train/chosen": -79.0730972290039, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -113.5, "logps_train/rejected": -142.41741943359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8614403605461121, "rewards_train/margins": 3.7375559210777283, "rewards_train/rejected": -2.876115560531616, "step": 611 }, { "epoch": 0.81, "learning_rate": 4.6856082759901825e-07, "loss": 0.3766, "step": 612 }, { "epoch": 0.81, "logps_train/chosen": -56.294185638427734, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -74.82308959960938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07761275768280029, "rewards_train/margins": 1.2661720514297485, "rewards_train/rejected": -1.1885592937469482, "step": 612 }, { "epoch": 0.81, "logps_train/chosen": -62.906375885009766, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -84.80770111083984, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.5328003168106079, "rewards_train/margins": 2.062007784843445, "rewards_train/rejected": -1.529207468032837, "step": 613 }, { "epoch": 0.82, "learning_rate": 4.682933749347296e-07, "loss": 0.3217, "step": 614 }, { "epoch": 0.82, "logps_train/chosen": -47.23320007324219, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -53.342132568359375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.014179930090904236, "rewards_train/margins": 1.314409002661705, "rewards_train/rejected": -1.3002290725708008, "step": 614 }, { "epoch": 0.82, "logps_train/chosen": -47.72241973876953, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -73.51026153564453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9652582406997681, "rewards_train/margins": 1.9389408230781555, "rewards_train/rejected": -0.9736825823783875, "step": 615 }, { "epoch": 0.82, "learning_rate": 4.6802486647535505e-07, "loss": 0.4198, "step": 616 }, { "epoch": 0.82, "logps_train/chosen": -59.407447814941406, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -84.48947143554688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.7764426469802856, "rewards_train/margins": 2.597265601158142, "rewards_train/rejected": -1.8208229541778564, "step": 616 }, { "epoch": 0.82, "logps_train/chosen": -66.02926635742188, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -80.5020980834961, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3130892813205719, "rewards_train/margins": 1.5679869949817657, "rewards_train/rejected": -1.2548977136611938, "step": 617 }, { "epoch": 0.82, "learning_rate": 4.677553035195594e-07, "loss": 0.2156, "step": 618 }, { "epoch": 0.82, "logps_train/chosen": -38.977821350097656, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -58.88930130004883, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3470008671283722, "rewards_train/margins": 0.7302102744579315, "rewards_train/rejected": -1.0772111415863037, "step": 618 }, { "epoch": 0.82, "logps_train/chosen": -51.70515060424805, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -73.06501007080078, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.2525317072868347, "rewards_train/margins": 2.369188964366913, "rewards_train/rejected": -2.116657257080078, "step": 619 }, { "epoch": 0.82, "learning_rate": 4.6748468737110764e-07, "loss": 0.482, "step": 620 }, { "epoch": 0.82, "logps_train/chosen": -81.810791015625, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -103.40557861328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6767330169677734, "rewards_train/margins": 2.3571348190307617, "rewards_train/rejected": -1.6804018020629883, "step": 620 }, { "epoch": 0.82, "logps_train/chosen": -61.70752716064453, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -96.13740539550781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5980952978134155, "rewards_train/margins": 2.403046488761902, "rewards_train/rejected": -1.8049511909484863, "step": 621 }, { "epoch": 0.83, "learning_rate": 4.672130193388585e-07, "loss": 0.2022, "step": 622 }, { "epoch": 0.83, "logps_train/chosen": -62.559471130371094, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -66.36294555664062, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.03485351800918579, "rewards_train/margins": 1.0897228121757507, "rewards_train/rejected": -1.1245763301849365, "step": 622 }, { "epoch": 0.83, "logps_train/chosen": -51.45391845703125, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -87.12033081054688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6155455708503723, "rewards_train/margins": 2.421328842639923, "rewards_train/rejected": -1.8057832717895508, "step": 623 }, { "epoch": 0.83, "learning_rate": 4.6694030073675826e-07, "loss": 0.3773, "step": 624 }, { "epoch": 0.83, "logps_train/chosen": -84.90255737304688, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -122.0093002319336, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1941189467906952, "rewards_train/margins": 2.598174065351486, "rewards_train/rejected": -2.404055118560791, "step": 624 }, { "epoch": 0.83, "logps_train/chosen": -57.79358673095703, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -79.19303131103516, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2270151674747467, "rewards_train/margins": 1.1704131066799164, "rewards_train/rejected": -1.397428274154663, "step": 625 }, { "epoch": 0.83, "learning_rate": 4.666665328838344e-07, "loss": 0.376, "step": 626 }, { "epoch": 0.83, "logps_train/chosen": -38.393798828125, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -66.88373565673828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30327117443084717, "rewards_train/margins": 1.9635199308395386, "rewards_train/rejected": -1.6602487564086914, "step": 626 }, { "epoch": 0.83, "logps_train/chosen": -52.85305404663086, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -78.20223999023438, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.7006322145462036, "rewards_train/margins": 2.7958565950393677, "rewards_train/rejected": -2.095224380493164, "step": 627 }, { "epoch": 0.83, "learning_rate": 4.663917171041893e-07, "loss": 0.2327, "step": 628 }, { "epoch": 0.83, "logps_train/chosen": -44.11957550048828, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -72.79949951171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14194902777671814, "rewards_train/margins": 1.2289305627346039, "rewards_train/rejected": -1.0869815349578857, "step": 628 }, { "epoch": 0.84, "logps_train/chosen": -65.86761474609375, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -74.30508422851562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0835514068603516, "rewards_train/margins": 2.748434543609619, "rewards_train/rejected": -1.6648831367492676, "step": 629 }, { "epoch": 0.84, "learning_rate": 4.6611585472699344e-07, "loss": 0.2665, "step": 630 }, { "epoch": 0.84, "logps_train/chosen": -60.44417953491211, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -85.952880859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.388394296169281, "rewards_train/margins": 2.166495144367218, "rewards_train/rejected": -1.778100848197937, "step": 630 }, { "epoch": 0.84, "logps_train/chosen": -61.03472137451172, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -87.6741943359375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.7902776598930359, "rewards_train/margins": 2.656915843486786, "rewards_train/rejected": -1.86663818359375, "step": 631 }, { "epoch": 0.84, "learning_rate": 4.658389470864796e-07, "loss": 0.2038, "step": 632 }, { "epoch": 0.84, "logps_train/chosen": -86.6244125366211, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -92.52574157714844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.42037108540534973, "rewards_train/margins": 2.318257600069046, "rewards_train/rejected": -1.8978865146636963, "step": 632 }, { "epoch": 0.84, "logps_train/chosen": -43.07469177246094, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -70.0228271484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.37612465023994446, "rewards_train/margins": 1.9957411587238312, "rewards_train/rejected": -1.6196165084838867, "step": 633 }, { "epoch": 0.84, "learning_rate": 4.6556099552193583e-07, "loss": 0.2876, "step": 634 }, { "epoch": 0.84, "logps_train/chosen": -59.48136901855469, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -87.19058227539062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.02217533439397812, "rewards_train/margins": 1.0662338957190514, "rewards_train/rejected": -1.0440585613250732, "step": 634 }, { "epoch": 0.84, "logps_train/chosen": -61.226661682128906, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -87.68091583251953, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2226659059524536, "rewards_train/margins": 1.1673005819320679, "rewards_train/rejected": -1.3899664878845215, "step": 635 }, { "epoch": 0.84, "learning_rate": 4.6528200137769935e-07, "loss": 0.4699, "step": 636 }, { "epoch": 0.84, "logps_train/chosen": -77.63619232177734, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -99.56551361083984, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.19106817245483398, "rewards_train/margins": 1.1944947242736816, "rewards_train/rejected": -1.0034265518188477, "step": 636 }, { "epoch": 0.85, "logps_train/chosen": -43.92618179321289, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -71.39808654785156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5339444279670715, "rewards_train/margins": 2.8354724049568176, "rewards_train/rejected": -2.301527976989746, "step": 637 }, { "epoch": 0.85, "learning_rate": 4.650019660031498e-07, "loss": 0.3389, "step": 638 }, { "epoch": 0.85, "logps_train/chosen": -85.78387451171875, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -122.40510559082031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4106752872467041, "rewards_train/margins": 2.593374013900757, "rewards_train/rejected": -2.1826987266540527, "step": 638 }, { "epoch": 0.85, "logps_train/chosen": -78.40846252441406, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -91.34258270263672, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.2771225869655609, "rewards_train/margins": 1.4066929519176483, "rewards_train/rejected": -1.1295703649520874, "step": 639 }, { "epoch": 0.85, "learning_rate": 4.6472089075270296e-07, "loss": 0.3054, "step": 640 }, { "epoch": 0.85, "logps_train/chosen": -67.42459106445312, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -93.22029113769531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.008517671376466751, "rewards_train/margins": 1.6871870048344135, "rewards_train/rejected": -1.6786693334579468, "step": 640 }, { "epoch": 0.85, "logps_train/chosen": -56.728477478027344, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -84.64283752441406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.12909775972366333, "rewards_train/margins": 1.3039358258247375, "rewards_train/rejected": -1.4330335855484009, "step": 641 }, { "epoch": 0.85, "learning_rate": 4.6443877698580373e-07, "loss": 0.3351, "step": 642 }, { "epoch": 0.85, "logps_train/chosen": -45.55366897583008, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -77.58822631835938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6110394597053528, "rewards_train/margins": 2.5198622345924377, "rewards_train/rejected": -1.908822774887085, "step": 642 }, { "epoch": 0.85, "logps_train/chosen": -50.26631164550781, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -71.51134490966797, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.48274391889572144, "rewards_train/margins": 1.5401285290718079, "rewards_train/rejected": -1.0573846101760864, "step": 643 }, { "epoch": 0.86, "learning_rate": 4.641556260669204e-07, "loss": 0.2628, "step": 644 }, { "epoch": 0.86, "logps_train/chosen": -59.88041687011719, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -79.37681579589844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.32445842027664185, "rewards_train/margins": 1.2433894276618958, "rewards_train/rejected": -0.9189310073852539, "step": 644 }, { "epoch": 0.86, "logps_train/chosen": -58.12947463989258, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -87.97662353515625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.36439618468284607, "rewards_train/margins": 1.1671367585659027, "rewards_train/rejected": -0.8027405738830566, "step": 645 }, { "epoch": 0.86, "learning_rate": 4.638714393655372e-07, "loss": 0.4209, "step": 646 }, { "epoch": 0.86, "logps_train/chosen": -46.86067199707031, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -72.31005859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3084641098976135, "rewards_train/margins": 2.6027517914772034, "rewards_train/rejected": -2.29428768157959, "step": 646 }, { "epoch": 0.86, "logps_train/chosen": -63.42823028564453, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -72.93348693847656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.3899889588356018, "rewards_train/margins": 1.8958377242088318, "rewards_train/rejected": -1.50584876537323, "step": 647 }, { "epoch": 0.86, "learning_rate": 4.63586218256148e-07, "loss": 0.2343, "step": 648 }, { "epoch": 0.86, "logps_train/chosen": -75.07272338867188, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -90.54558563232422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 1.2747584581375122, "rewards_train/margins": 3.251191735267639, "rewards_train/rejected": -1.976433277130127, "step": 648 }, { "epoch": 0.86, "logps_train/chosen": -36.89295196533203, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -72.14701080322266, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.5247675776481628, "rewards_train/margins": 1.6832186579704285, "rewards_train/rejected": -1.1584510803222656, "step": 649 }, { "epoch": 0.86, "learning_rate": 4.6329996411824967e-07, "loss": 0.34, "step": 650 }, { "epoch": 0.86, "logps_train/chosen": -77.40049743652344, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -90.66729736328125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.46698179841041565, "rewards_train/margins": 2.3087117969989777, "rewards_train/rejected": -1.841729998588562, "step": 650 }, { "epoch": 0.86, "logps_train/chosen": -44.888511657714844, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -72.77568054199219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.7658365964889526, "rewards_train/margins": 2.4144986867904663, "rewards_train/rejected": -1.6486620903015137, "step": 651 }, { "epoch": 0.87, "learning_rate": 4.630126783363357e-07, "loss": 0.3182, "step": 652 }, { "epoch": 0.87, "logps_train/chosen": -58.66350555419922, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -92.73113250732422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.03666294366121292, "rewards_train/margins": 2.4958256408572197, "rewards_train/rejected": -2.5324885845184326, "step": 652 }, { "epoch": 0.87, "logps_train/chosen": -50.652225494384766, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -84.22692108154297, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.7629026174545288, "rewards_train/margins": 2.276610255241394, "rewards_train/rejected": -1.5137076377868652, "step": 653 }, { "epoch": 0.87, "learning_rate": 4.62724362299889e-07, "loss": 0.2203, "step": 654 }, { "epoch": 0.87, "logps_train/chosen": -51.455596923828125, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -74.4891586303711, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.3560027480125427, "rewards_train/margins": 1.6361684203147888, "rewards_train/rejected": -1.280165672302246, "step": 654 }, { "epoch": 0.87, "logps_train/chosen": -58.885311126708984, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -83.89421081542969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.47709405422210693, "rewards_train/margins": 2.321202278137207, "rewards_train/rejected": -1.8441082239151, "step": 655 }, { "epoch": 0.87, "learning_rate": 4.6243501740337533e-07, "loss": 0.3013, "step": 656 }, { "epoch": 0.87, "logps_train/chosen": -76.16789245605469, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -95.82583618164062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5207105278968811, "rewards_train/margins": 3.203294575214386, "rewards_train/rejected": -2.682584047317505, "step": 656 }, { "epoch": 0.87, "logps_train/chosen": -38.421749114990234, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -34.5, "logps_train/rejected": -38.714141845703125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.7054815292358398, "rewards_train/margins": 1.1366615295410156, "rewards_train/rejected": -0.4311800003051758, "step": 657 }, { "epoch": 0.87, "learning_rate": 4.621446450462366e-07, "loss": 0.3042, "step": 658 }, { "epoch": 0.87, "logps_train/chosen": -49.40616226196289, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -68.30242919921875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.3336026072502136, "rewards_train/margins": 2.0818141102790833, "rewards_train/rejected": -1.7482115030288696, "step": 658 }, { "epoch": 0.88, "logps_train/chosen": -92.52722930908203, "logps_train/ref_chosen": -90.5, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -123.93988037109375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.18709762394428253, "rewards_train/margins": 2.013141080737114, "rewards_train/rejected": -2.2002387046813965, "step": 659 }, { "epoch": 0.88, "learning_rate": 4.618532466328845e-07, "loss": 0.3679, "step": 660 }, { "epoch": 0.88, "logps_train/chosen": -59.85950469970703, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -91.0672378540039, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.21600297093391418, "rewards_train/margins": 2.184445410966873, "rewards_train/rejected": -1.968442440032959, "step": 660 }, { "epoch": 0.88, "logps_train/chosen": -74.84912872314453, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -120.09780883789062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.2744620740413666, "rewards_train/margins": 2.246742933988571, "rewards_train/rejected": -1.9722808599472046, "step": 661 }, { "epoch": 0.88, "learning_rate": 4.6156082357269277e-07, "loss": 0.2698, "step": 662 }, { "epoch": 0.88, "logps_train/chosen": -42.33638381958008, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -57.59506607055664, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28979912400245667, "rewards_train/margins": 1.525086909532547, "rewards_train/rejected": -1.2352877855300903, "step": 662 }, { "epoch": 0.88, "logps_train/chosen": -70.05482482910156, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -84.63400268554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10701710730791092, "rewards_train/margins": 1.741121120750904, "rewards_train/rejected": -1.6341040134429932, "step": 663 }, { "epoch": 0.88, "learning_rate": 4.612673772799914e-07, "loss": 0.2698, "step": 664 }, { "epoch": 0.88, "logps_train/chosen": -50.864524841308594, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -77.48883056640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21901603043079376, "rewards_train/margins": 2.4303988963365555, "rewards_train/rejected": -2.2113828659057617, "step": 664 }, { "epoch": 0.88, "logps_train/chosen": -36.89140319824219, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -79.10737609863281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.26758599281311035, "rewards_train/margins": 2.490823984146118, "rewards_train/rejected": -2.223237991333008, "step": 665 }, { "epoch": 0.88, "learning_rate": 4.609729091740592e-07, "loss": 0.198, "step": 666 }, { "epoch": 0.88, "logps_train/chosen": -42.206085205078125, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -56.44530487060547, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.516891598701477, "rewards_train/margins": 1.7512658834457397, "rewards_train/rejected": -1.2343742847442627, "step": 666 }, { "epoch": 0.89, "logps_train/chosen": -55.27179718017578, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -100.38606262207031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.6599295139312744, "rewards_train/margins": 3.447364091873169, "rewards_train/rejected": -2.7874345779418945, "step": 667 }, { "epoch": 0.89, "learning_rate": 4.6067742067911685e-07, "loss": 0.2073, "step": 668 }, { "epoch": 0.89, "logps_train/chosen": -53.89271545410156, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -73.7257080078125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.23260357975959778, "rewards_train/margins": 1.7473616898059845, "rewards_train/rejected": -1.5147581100463867, "step": 668 }, { "epoch": 0.89, "logps_train/chosen": -70.37216186523438, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -100.627685546875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.2971585988998413, "rewards_train/margins": 2.5364898443222046, "rewards_train/rejected": -2.2393312454223633, "step": 669 }, { "epoch": 0.89, "learning_rate": 4.603809132243205e-07, "loss": 0.3615, "step": 670 }, { "epoch": 0.89, "logps_train/chosen": -59.09383773803711, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -85.25665283203125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.41329044103622437, "rewards_train/margins": 1.0139370560646057, "rewards_train/rejected": -1.42722749710083, "step": 670 }, { "epoch": 0.89, "logps_train/chosen": -71.18547821044922, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -96.42425537109375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5794854760169983, "rewards_train/margins": 1.5285653471946716, "rewards_train/rejected": -2.10805082321167, "step": 671 }, { "epoch": 0.89, "learning_rate": 4.6008338824375457e-07, "loss": 0.4862, "step": 672 }, { "epoch": 0.89, "logps_train/chosen": -102.10392761230469, "logps_train/ref_chosen": -104.5, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -121.85100555419922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2630443572998047, "rewards_train/margins": 2.5442392826080322, "rewards_train/rejected": -2.2811949253082275, "step": 672 }, { "epoch": 0.89, "logps_train/chosen": -80.49946594238281, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -101.16230773925781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3679158091545105, "rewards_train/margins": 1.3440185189247131, "rewards_train/rejected": -1.7119343280792236, "step": 673 }, { "epoch": 0.9, "learning_rate": 4.597848471764248e-07, "loss": 0.3464, "step": 674 }, { "epoch": 0.9, "logps_train/chosen": -40.62296676635742, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -72.34452819824219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7470782995223999, "rewards_train/margins": 2.914734721183777, "rewards_train/rejected": -2.167656421661377, "step": 674 }, { "epoch": 0.9, "logps_train/chosen": -86.5342788696289, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -104.23762512207031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08875927329063416, "rewards_train/margins": 2.117208808660507, "rewards_train/rejected": -2.028449535369873, "step": 675 }, { "epoch": 0.9, "learning_rate": 4.5948529146625115e-07, "loss": 0.2071, "step": 676 }, { "epoch": 0.9, "logps_train/chosen": -37.710025787353516, "logps_train/ref_chosen": -35.25, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -63.450538635253906, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.24678394198417664, "rewards_train/margins": 1.2150667011737823, "rewards_train/rejected": -1.461850643157959, "step": 676 }, { "epoch": 0.9, "logps_train/chosen": -90.51107025146484, "logps_train/ref_chosen": -96.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -127.83465576171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5727205276489258, "rewards_train/margins": 2.931185722351074, "rewards_train/rejected": -2.3584651947021484, "step": 677 }, { "epoch": 0.9, "learning_rate": 4.591847225620612e-07, "loss": 0.3685, "step": 678 }, { "epoch": 0.9, "logps_train/chosen": -82.14286804199219, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -96.72637176513672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5700880289077759, "rewards_train/margins": 3.014600396156311, "rewards_train/rejected": -2.444512367248535, "step": 678 }, { "epoch": 0.9, "logps_train/chosen": -70.38860321044922, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -82.55715942382812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.08651609718799591, "rewards_train/margins": 1.546543762087822, "rewards_train/rejected": -1.6330598592758179, "step": 679 }, { "epoch": 0.9, "learning_rate": 4.588831419175828e-07, "loss": 0.2551, "step": 680 }, { "epoch": 0.9, "logps_train/chosen": -55.20639419555664, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -96.28793334960938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.40757089853286743, "rewards_train/margins": 2.2351924777030945, "rewards_train/rejected": -1.827621579170227, "step": 680 }, { "epoch": 0.9, "logps_train/chosen": -47.29354476928711, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -74.71430969238281, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.369083046913147, "rewards_train/margins": 2.21238911151886, "rewards_train/rejected": -1.843306064605713, "step": 681 }, { "epoch": 0.91, "learning_rate": 4.585805509914374e-07, "loss": 0.3413, "step": 682 }, { "epoch": 0.91, "logps_train/chosen": -66.68551635742188, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -107.8899917602539, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3392612040042877, "rewards_train/margins": 3.1876353323459625, "rewards_train/rejected": -2.848374128341675, "step": 682 }, { "epoch": 0.91, "logps_train/chosen": -57.441612243652344, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -94.74931335449219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.30744242668151855, "rewards_train/margins": 2.2448320388793945, "rewards_train/rejected": -2.552274465560913, "step": 683 }, { "epoch": 0.91, "learning_rate": 4.582769512471324e-07, "loss": 0.1799, "step": 684 }, { "epoch": 0.91, "logps_train/chosen": -54.16114044189453, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -78.58146667480469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.49404197931289673, "rewards_train/margins": 2.8709381222724915, "rewards_train/rejected": -2.3768961429595947, "step": 684 }, { "epoch": 0.91, "logps_train/chosen": -77.51919555664062, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -68.96359252929688, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.3886374831199646, "rewards_train/margins": 0.7249093651771545, "rewards_train/rejected": -1.1135468482971191, "step": 685 }, { "epoch": 0.91, "learning_rate": 4.579723441530547e-07, "loss": 0.4346, "step": 686 }, { "epoch": 0.91, "logps_train/chosen": -34.131839752197266, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -66.95238494873047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.48525357246398926, "rewards_train/margins": 2.4324450492858887, "rewards_train/rejected": -1.9471914768218994, "step": 686 }, { "epoch": 0.91, "logps_train/chosen": -38.947418212890625, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -40.0, "logps_train/rejected": -59.30021286010742, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09119536727666855, "rewards_train/margins": 2.0321542248129845, "rewards_train/rejected": -1.940958857536316, "step": 687 }, { "epoch": 0.91, "learning_rate": 4.5766673118246333e-07, "loss": 0.2346, "step": 688 }, { "epoch": 0.91, "logps_train/chosen": -45.25869369506836, "logps_train/ref_chosen": -37.5, "logps_train/ref_rejected": -36.5, "logps_train/rejected": -51.026123046875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7766506671905518, "rewards_train/margins": 0.6806492805480957, "rewards_train/rejected": -1.4572999477386475, "step": 688 }, { "epoch": 0.92, "logps_train/chosen": -78.32164001464844, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -148.12022399902344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.08528921008110046, "rewards_train/margins": 3.92985799908638, "rewards_train/rejected": -4.0151472091674805, "step": 689 }, { "epoch": 0.92, "learning_rate": 4.573601138134823e-07, "loss": 0.2725, "step": 690 }, { "epoch": 0.92, "logps_train/chosen": -83.4992904663086, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -100.03118133544922, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.10149146616458893, "rewards_train/margins": 2.1328768581151962, "rewards_train/rejected": -2.234368324279785, "step": 690 }, { "epoch": 0.92, "logps_train/chosen": -74.4854736328125, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -93.02217102050781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.36729687452316284, "rewards_train/margins": 1.2177329659461975, "rewards_train/rejected": -1.5850298404693604, "step": 691 }, { "epoch": 0.92, "learning_rate": 4.570524935290934e-07, "loss": 0.4091, "step": 692 }, { "epoch": 0.92, "logps_train/chosen": -34.78716278076172, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -79.95111083984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10839307308197021, "rewards_train/margins": 2.5113171339035034, "rewards_train/rejected": -2.402924060821533, "step": 692 }, { "epoch": 0.92, "logps_train/chosen": -74.54714965820312, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -63.44025421142578, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.4172154664993286, "rewards_train/margins": 1.0455600023269653, "rewards_train/rejected": -1.462775468826294, "step": 693 }, { "epoch": 0.92, "learning_rate": 4.5674387181712904e-07, "loss": 0.3039, "step": 694 }, { "epoch": 0.92, "logps_train/chosen": -49.48997116088867, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -73.1671142578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.45295602083206177, "rewards_train/margins": 2.5520885586738586, "rewards_train/rejected": -2.099132537841797, "step": 694 }, { "epoch": 0.92, "logps_train/chosen": -84.90861511230469, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -119.9660415649414, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5013256072998047, "rewards_train/margins": 3.685429811477661, "rewards_train/rejected": -3.1841042041778564, "step": 695 }, { "epoch": 0.92, "learning_rate": 4.564342501702654e-07, "loss": 0.1737, "step": 696 }, { "epoch": 0.92, "logps_train/chosen": -56.27851104736328, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -81.65951538085938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.11535114049911499, "rewards_train/margins": 1.8834131360054016, "rewards_train/rejected": -1.9987642765045166, "step": 696 }, { "epoch": 0.93, "logps_train/chosen": -61.96138000488281, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -109.97370147705078, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.052387937903404236, "rewards_train/margins": 2.344982400536537, "rewards_train/rejected": -2.3973703384399414, "step": 697 }, { "epoch": 0.93, "learning_rate": 4.5612363008601474e-07, "loss": 0.3469, "step": 698 }, { "epoch": 0.93, "logps_train/chosen": -42.722694396972656, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -111.35914611816406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.22616808116436005, "rewards_train/margins": 3.968332216143608, "rewards_train/rejected": -3.742164134979248, "step": 698 }, { "epoch": 0.93, "logps_train/chosen": -50.69742202758789, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -41.75, "logps_train/rejected": -53.74729537963867, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.053238146007061005, "rewards_train/margins": 1.148834951221943, "rewards_train/rejected": -1.202073097229004, "step": 699 }, { "epoch": 0.93, "learning_rate": 4.5581201306671835e-07, "loss": 0.2556, "step": 700 }, { "epoch": 0.93, "logps_train/chosen": -80.26610565185547, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -100.37316131591797, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9812982082366943, "rewards_train/margins": 0.9841432571411133, "rewards_train/rejected": -1.9654414653778076, "step": 700 }, { "epoch": 0.93, "logps_train/chosen": -46.84473419189453, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -67.64947509765625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.14365153014659882, "rewards_train/margins": 2.397661730647087, "rewards_train/rejected": -2.2540102005004883, "step": 701 }, { "epoch": 0.93, "learning_rate": 4.5549940061953934e-07, "loss": 0.4381, "step": 702 }, { "epoch": 0.93, "logps_train/chosen": -76.68373107910156, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -106.86540985107422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.018372446298599243, "rewards_train/margins": 2.3791061341762543, "rewards_train/rejected": -2.3974785804748535, "step": 702 }, { "epoch": 0.93, "logps_train/chosen": -92.51097869873047, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -120.3010482788086, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.14968341588974, "rewards_train/margins": 3.161038815975189, "rewards_train/rejected": -3.011355400085449, "step": 703 }, { "epoch": 0.93, "learning_rate": 4.551857942564553e-07, "loss": 0.3566, "step": 704 }, { "epoch": 0.93, "logps_train/chosen": -64.683837890625, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -74.9739990234375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.09763213992118835, "rewards_train/margins": 1.050109714269638, "rewards_train/rejected": -0.9524775743484497, "step": 704 }, { "epoch": 0.94, "logps_train/chosen": -73.48666381835938, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -98.82264709472656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.22015084326267242, "rewards_train/margins": 2.1777390092611313, "rewards_train/rejected": -2.3978898525238037, "step": 705 }, { "epoch": 0.94, "learning_rate": 4.548711954942509e-07, "loss": 0.397, "step": 706 }, { "epoch": 0.94, "logps_train/chosen": -48.78385925292969, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -74.55316925048828, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.02786429226398468, "rewards_train/margins": 1.659743383526802, "rewards_train/rejected": -1.6318790912628174, "step": 706 }, { "epoch": 0.94, "logps_train/chosen": -58.168251037597656, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -74.8335952758789, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5363562107086182, "rewards_train/margins": 1.3251283168792725, "rewards_train/rejected": -1.8614845275878906, "step": 707 }, { "epoch": 0.94, "learning_rate": 4.545556058545108e-07, "loss": 0.4259, "step": 708 }, { "epoch": 0.94, "logps_train/chosen": -123.55560302734375, "logps_train/ref_chosen": -120.5, "logps_train/ref_rejected": -112.5, "logps_train/rejected": -138.19094848632812, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.2821231484413147, "rewards_train/margins": 2.266659438610077, "rewards_train/rejected": -2.5487825870513916, "step": 708 }, { "epoch": 0.94, "logps_train/chosen": -56.70548629760742, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -93.85681915283203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.920857846736908, "rewards_train/margins": 2.8893527388572693, "rewards_train/rejected": -1.9684948921203613, "step": 709 }, { "epoch": 0.94, "learning_rate": 4.542390268636119e-07, "loss": 0.3146, "step": 710 }, { "epoch": 0.94, "logps_train/chosen": -61.79959487915039, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -110.82610321044922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2856650650501251, "rewards_train/margins": 3.362416058778763, "rewards_train/rejected": -3.0767509937286377, "step": 710 }, { "epoch": 0.94, "logps_train/chosen": -52.4349479675293, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -84.93408203125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.14400531351566315, "rewards_train/margins": 1.994445100426674, "rewards_train/rejected": -1.8504397869110107, "step": 711 }, { "epoch": 0.95, "learning_rate": 4.5392146005271635e-07, "loss": 0.2052, "step": 712 }, { "epoch": 0.95, "logps_train/chosen": -50.81437683105469, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -64.52072143554688, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.06895294785499573, "rewards_train/margins": 1.2913378775119781, "rewards_train/rejected": -1.2223849296569824, "step": 712 }, { "epoch": 0.95, "logps_train/chosen": -42.27362823486328, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -74.12132263183594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3402157425880432, "rewards_train/margins": 3.3289108872413635, "rewards_train/rejected": -2.9886951446533203, "step": 713 }, { "epoch": 0.95, "learning_rate": 4.536029069577639e-07, "loss": 0.3486, "step": 714 }, { "epoch": 0.95, "logps_train/chosen": -41.063987731933594, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -55.044532775878906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.17172624170780182, "rewards_train/margins": 1.1371169835329056, "rewards_train/rejected": -0.9653907418251038, "step": 714 }, { "epoch": 0.95, "logps_train/chosen": -87.18270874023438, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -100.08426666259766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.04717755317687988, "rewards_train/margins": 2.071796178817749, "rewards_train/rejected": -2.118973731994629, "step": 715 }, { "epoch": 0.95, "learning_rate": 4.532833691194647e-07, "loss": 0.4849, "step": 716 }, { "epoch": 0.95, "logps_train/chosen": -95.56340026855469, "logps_train/ref_chosen": -105.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -111.3342056274414, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.928034782409668, "rewards_train/margins": 3.498955249786377, "rewards_train/rejected": -2.570920467376709, "step": 716 }, { "epoch": 0.95, "logps_train/chosen": -52.635711669921875, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -84.30975341796875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.21611656248569489, "rewards_train/margins": 2.9564668089151382, "rewards_train/rejected": -2.7403502464294434, "step": 717 }, { "epoch": 0.95, "learning_rate": 4.5296284808329146e-07, "loss": 0.4621, "step": 718 }, { "epoch": 0.95, "logps_train/chosen": -69.68258666992188, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -94.92339324951172, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.18017911911010742, "rewards_train/margins": 2.367830753326416, "rewards_train/rejected": -2.1876516342163086, "step": 718 }, { "epoch": 0.95, "logps_train/chosen": -64.23617553710938, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -90.95034790039062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24357017874717712, "rewards_train/margins": 2.9589172303676605, "rewards_train/rejected": -2.7153470516204834, "step": 719 }, { "epoch": 0.96, "learning_rate": 4.526413453994723e-07, "loss": 0.2166, "step": 720 }, { "epoch": 0.96, "logps_train/chosen": -40.586029052734375, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -63.831321716308594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.3554595708847046, "rewards_train/margins": 1.4214046001434326, "rewards_train/rejected": -1.065945029258728, "step": 720 }, { "epoch": 0.96, "logps_train/chosen": -84.15662384033203, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -108.63420867919922, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.2937871813774109, "rewards_train/margins": 1.6274459958076477, "rewards_train/rejected": -1.9212331771850586, "step": 721 }, { "epoch": 0.96, "learning_rate": 4.523188626229834e-07, "loss": 0.5752, "step": 722 }, { "epoch": 0.96, "logps_train/chosen": -61.15065383911133, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -84.80239868164062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.24104198813438416, "rewards_train/margins": 1.2579480707645416, "rewards_train/rejected": -1.4989900588989258, "step": 722 }, { "epoch": 0.96, "logps_train/chosen": -46.31986999511719, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -84.28214263916016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4547317922115326, "rewards_train/margins": 2.214196056127548, "rewards_train/rejected": -1.7594642639160156, "step": 723 }, { "epoch": 0.96, "learning_rate": 4.5199540131354075e-07, "loss": 0.4115, "step": 724 }, { "epoch": 0.96, "logps_train/chosen": -66.86719512939453, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -89.36840057373047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3429681062698364, "rewards_train/margins": 2.328245520591736, "rewards_train/rejected": -1.9852774143218994, "step": 724 }, { "epoch": 0.96, "logps_train/chosen": -84.69651794433594, "logps_train/ref_chosen": -83.5, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -117.97013092041016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.12277615070343018, "rewards_train/margins": 2.4367364645004272, "rewards_train/rejected": -2.5595126152038574, "step": 725 }, { "epoch": 0.96, "learning_rate": 4.5167096303559356e-07, "loss": 0.2787, "step": 726 }, { "epoch": 0.96, "logps_train/chosen": -48.453407287597656, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -74.62812805175781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.08362184464931488, "rewards_train/margins": 2.262785330414772, "rewards_train/rejected": -2.346407175064087, "step": 726 }, { "epoch": 0.97, "logps_train/chosen": -52.40443801879883, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -68.63796997070312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4413379728794098, "rewards_train/margins": 0.8755845725536346, "rewards_train/rejected": -1.3169225454330444, "step": 727 }, { "epoch": 0.97, "learning_rate": 4.5134554935831604e-07, "loss": 0.3611, "step": 728 }, { "epoch": 0.97, "logps_train/chosen": -87.60906982421875, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -97.5, "logps_train/rejected": -121.25372314453125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.04840691387653351, "rewards_train/margins": 2.3191534727811813, "rewards_train/rejected": -2.367560386657715, "step": 728 }, { "epoch": 0.97, "logps_train/chosen": -69.85812377929688, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -77.41719055175781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4936249852180481, "rewards_train/margins": 1.409032166004181, "rewards_train/rejected": -1.902657151222229, "step": 729 }, { "epoch": 0.97, "learning_rate": 4.5101916185560005e-07, "loss": 0.3447, "step": 730 }, { "epoch": 0.97, "logps_train/chosen": -82.50523376464844, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -104.87754821777344, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -1.1114604473114014, "rewards_train/margins": -0.17214328050613403, "rewards_train/rejected": -0.9393171668052673, "step": 730 }, { "epoch": 0.97, "logps_train/chosen": -62.417701721191406, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -99.60670471191406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8504173755645752, "rewards_train/margins": 3.742337942123413, "rewards_train/rejected": -2.891920566558838, "step": 731 }, { "epoch": 0.97, "learning_rate": 4.506918021060474e-07, "loss": 0.5334, "step": 732 }, { "epoch": 0.97, "logps_train/chosen": -61.03002166748047, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -101.71241760253906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1899665743112564, "rewards_train/margins": 3.1283959299325943, "rewards_train/rejected": -2.938429355621338, "step": 732 }, { "epoch": 0.97, "logps_train/chosen": -70.00407409667969, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -104.87008666992188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.18396775424480438, "rewards_train/margins": 2.7678515166044235, "rewards_train/rejected": -2.583883762359619, "step": 733 }, { "epoch": 0.97, "learning_rate": 4.5036347169296227e-07, "loss": 0.2146, "step": 734 }, { "epoch": 0.97, "logps_train/chosen": -70.09794616699219, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -91.58123779296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4808884263038635, "rewards_train/margins": 1.6428598761558533, "rewards_train/rejected": -2.123748302459717, "step": 734 }, { "epoch": 0.98, "logps_train/chosen": -62.288578033447266, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -81.8724365234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7055174112319946, "rewards_train/margins": 2.3990107774734497, "rewards_train/rejected": -1.693493366241455, "step": 735 }, { "epoch": 0.98, "learning_rate": 4.500341722043436e-07, "loss": 0.2486, "step": 736 }, { "epoch": 0.98, "logps_train/chosen": -66.16824340820312, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -85.70519256591797, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.31369999051094055, "rewards_train/margins": 1.7849442660808563, "rewards_train/rejected": -2.098644256591797, "step": 736 }, { "epoch": 0.98, "logps_train/chosen": -34.342872619628906, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -64.12893676757812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5120015144348145, "rewards_train/margins": 2.242473602294922, "rewards_train/rejected": -1.7304720878601074, "step": 737 }, { "epoch": 0.98, "learning_rate": 4.4970390523287714e-07, "loss": 0.3299, "step": 738 }, { "epoch": 0.98, "logps_train/chosen": -55.41838836669922, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -112.56553649902344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.0659736692905426, "rewards_train/margins": 2.5209650099277496, "rewards_train/rejected": -2.454991340637207, "step": 738 }, { "epoch": 0.98, "logps_train/chosen": -51.32921600341797, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -80.92022705078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8358285427093506, "rewards_train/margins": 1.9247263669967651, "rewards_train/rejected": -1.0888978242874146, "step": 739 }, { "epoch": 0.98, "learning_rate": 4.493726723759284e-07, "loss": 0.2085, "step": 740 }, { "epoch": 0.98, "logps_train/chosen": -53.95283126831055, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -88.43611145019531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.10315460711717606, "rewards_train/margins": 1.8795777037739754, "rewards_train/rejected": -1.7764230966567993, "step": 740 }, { "epoch": 0.98, "logps_train/chosen": -60.53403854370117, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -117.7838134765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7934710383415222, "rewards_train/margins": 3.574976861476898, "rewards_train/rejected": -2.781505823135376, "step": 741 }, { "epoch": 0.99, "learning_rate": 4.490404752355339e-07, "loss": 0.1912, "step": 742 }, { "epoch": 0.99, "logps_train/chosen": -55.65306854248047, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -64.404052734375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.08313048630952835, "rewards_train/margins": 0.9758799150586128, "rewards_train/rejected": -0.8927494287490845, "step": 742 }, { "epoch": 0.99, "logps_train/chosen": -47.5407829284668, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -68.78717041015625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.2595471143722534, "rewards_train/margins": 1.4488579034805298, "rewards_train/rejected": -1.7084050178527832, "step": 743 }, { "epoch": 0.99, "learning_rate": 4.487073154183944e-07, "loss": 0.4031, "step": 744 }, { "epoch": 0.99, "logps_train/chosen": -74.14268493652344, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -69.08543395996094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.22793997824192047, "rewards_train/margins": -0.0350215882062912, "rewards_train/rejected": -0.19291839003562927, "step": 744 }, { "epoch": 0.99, "logps_train/chosen": -78.53152465820312, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -104.10023498535156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.20622184872627258, "rewards_train/margins": 2.5287459194660187, "rewards_train/rejected": -2.322524070739746, "step": 745 }, { "epoch": 0.99, "learning_rate": 4.4837319453586664e-07, "loss": 0.6077, "step": 746 }, { "epoch": 0.99, "logps_train/chosen": -65.99571228027344, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -101.71450805664062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.31246185302734375, "rewards_train/margins": 2.214848041534424, "rewards_train/rejected": -2.5273098945617676, "step": 746 }, { "epoch": 0.99, "logps_train/chosen": -67.29641723632812, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -74.00518798828125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2702668309211731, "rewards_train/margins": 1.7958771586418152, "rewards_train/rejected": -2.0661439895629883, "step": 747 }, { "epoch": 0.99, "learning_rate": 4.4803811420395566e-07, "loss": 0.3117, "step": 748 }, { "epoch": 0.99, "logps_train/chosen": -84.3549575805664, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -129.38941955566406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6551293134689331, "rewards_train/margins": 3.9331332445144653, "rewards_train/rejected": -3.2780039310455322, "step": 748 }, { "epoch": 0.99, "logps_train/chosen": -39.928504943847656, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -63.72937774658203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09933708608150482, "rewards_train/margins": 2.127352699637413, "rewards_train/rejected": -2.028015613555908, "step": 749 }, { "epoch": 1.0, "learning_rate": 4.477020760433069e-07, "loss": 0.1322, "step": 750 }, { "epoch": 1.0, "logps_train/chosen": -62.46615219116211, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -93.07612609863281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.00807209312915802, "rewards_train/margins": 1.75006003677845, "rewards_train/rejected": -1.741987943649292, "step": 750 }, { "epoch": 1.0, "logps_train/chosen": -48.001670837402344, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -73.4656982421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.021707601845264435, "rewards_train/margins": 2.780777521431446, "rewards_train/rejected": -2.7590699195861816, "step": 751 }, { "epoch": 1.0, "learning_rate": 4.473650816791984e-07, "loss": 0.2478, "step": 752 }, { "epoch": 1.0, "logps_train/chosen": -70.86872863769531, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -102.60623931884766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.20609530806541443, "rewards_train/margins": 2.0167196691036224, "rewards_train/rejected": -1.810624361038208, "step": 752 }, { "epoch": 1.0, "logps_train/chosen": -72.06849670410156, "logps_train/ref_chosen": -83.5, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -123.32846069335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1486189365386963, "rewards_train/margins": 4.213496208190918, "rewards_train/rejected": -3.0648772716522217, "step": 753 }, { "epoch": 1.0, "learning_rate": 4.47027132741533e-07, "loss": 0.2285, "step": 754 }, { "epoch": 1.0, "logps_train/chosen": -89.99444580078125, "logps_train/ref_chosen": -92.0, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -135.48133850097656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20211690664291382, "rewards_train/margins": 3.2643126845359802, "rewards_train/rejected": -3.0621957778930664, "step": 754 }, { "epoch": 1.0, "logps_train/chosen": -82.13720703125, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -136.76634216308594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7550290822982788, "rewards_train/margins": 4.859788298606873, "rewards_train/rejected": -4.104759216308594, "step": 755 }, { "epoch": 1.0, "learning_rate": 4.4668823086483056e-07, "loss": 0.1162, "step": 756 }, { "epoch": 1.0, "logps_train/chosen": -65.73158264160156, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -94.82227325439453, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.2627795934677124, "rewards_train/margins": 3.087194323539734, "rewards_train/rejected": -2.8244147300720215, "step": 756 }, { "epoch": 1.01, "logps_train/chosen": -58.29868698120117, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -66.34871673583984, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.29981881380081177, "rewards_train/margins": 1.3237531781196594, "rewards_train/rejected": -1.0239343643188477, "step": 757 }, { "epoch": 1.01, "learning_rate": 4.4634837768821963e-07, "loss": 0.2896, "step": 758 }, { "epoch": 1.01, "logps_train/chosen": -68.19380187988281, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -84.99763488769531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.13500526547431946, "rewards_train/margins": 1.7491337954998016, "rewards_train/rejected": -1.884139060974121, "step": 758 }, { "epoch": 1.01, "logps_train/chosen": -58.27888870239258, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -100.84491729736328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3580487370491028, "rewards_train/margins": 3.1316030621528625, "rewards_train/rejected": -2.7735543251037598, "step": 759 }, { "epoch": 1.01, "learning_rate": 4.4600757485543006e-07, "loss": 0.1651, "step": 760 }, { "epoch": 1.01, "logps_train/chosen": -88.1324234008789, "logps_train/ref_chosen": -100.0, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -115.2786865234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2055070400238037, "rewards_train/margins": 3.667750120162964, "rewards_train/rejected": -2.46224308013916, "step": 760 }, { "epoch": 1.01, "logps_train/chosen": -70.9462661743164, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -101.74241638183594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09013915061950684, "rewards_train/margins": 2.7737557888031006, "rewards_train/rejected": -2.6836166381835938, "step": 761 }, { "epoch": 1.01, "learning_rate": 4.456658240147846e-07, "loss": 0.1019, "step": 762 }, { "epoch": 1.01, "logps_train/chosen": -57.915916442871094, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -79.24769592285156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.21034187078475952, "rewards_train/margins": 2.0753656029701233, "rewards_train/rejected": -2.285707473754883, "step": 762 }, { "epoch": 1.01, "logps_train/chosen": -46.743648529052734, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -79.69302368164062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.3154788315296173, "rewards_train/margins": 2.466812163591385, "rewards_train/rejected": -2.1513333320617676, "step": 763 }, { "epoch": 1.01, "learning_rate": 4.4532312681919127e-07, "loss": 0.2739, "step": 764 }, { "epoch": 1.01, "logps_train/chosen": -50.862674713134766, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -83.14360809326172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1151735782623291, "rewards_train/margins": 2.1413750648498535, "rewards_train/rejected": -2.2565486431121826, "step": 764 }, { "epoch": 1.02, "logps_train/chosen": -55.300506591796875, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -91.79389953613281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.2855740189552307, "rewards_train/margins": 2.6727762818336487, "rewards_train/rejected": -2.387202262878418, "step": 765 }, { "epoch": 1.02, "learning_rate": 4.449794849261351e-07, "loss": 0.2408, "step": 766 }, { "epoch": 1.02, "logps_train/chosen": -50.523902893066406, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -79.38288879394531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.014890104532241821, "rewards_train/margins": 2.151524156332016, "rewards_train/rejected": -2.166414260864258, "step": 766 }, { "epoch": 1.02, "logps_train/chosen": -75.68093872070312, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -108.38290405273438, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.22253058850765228, "rewards_train/margins": 2.5108210891485214, "rewards_train/rejected": -2.288290500640869, "step": 767 }, { "epoch": 1.02, "learning_rate": 4.4463489999767047e-07, "loss": 0.2223, "step": 768 }, { "epoch": 1.02, "logps_train/chosen": -48.434234619140625, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -74.44261169433594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4074857831001282, "rewards_train/margins": 2.009432017803192, "rewards_train/rejected": -2.4169178009033203, "step": 768 }, { "epoch": 1.02, "logps_train/chosen": -42.807106018066406, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -60.74381637573242, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.15053941309452057, "rewards_train/margins": 1.4507022053003311, "rewards_train/rejected": -1.3001627922058105, "step": 769 }, { "epoch": 1.02, "learning_rate": 4.442893737004124e-07, "loss": 0.3038, "step": 770 }, { "epoch": 1.02, "logps_train/chosen": -37.18920135498047, "logps_train/ref_chosen": -40.25, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -60.47882080078125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.31232985854148865, "rewards_train/margins": 1.7059153020381927, "rewards_train/rejected": -1.393585443496704, "step": 770 }, { "epoch": 1.02, "logps_train/chosen": -42.942779541015625, "logps_train/ref_chosen": -44.0, "logps_train/ref_rejected": -41.5, "logps_train/rejected": -61.62098693847656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10572205483913422, "rewards_train/margins": 2.1170397847890854, "rewards_train/rejected": -2.011317729949951, "step": 771 }, { "epoch": 1.03, "learning_rate": 4.439429077055294e-07, "loss": 0.3675, "step": 772 }, { "epoch": 1.03, "logps_train/chosen": -50.876564025878906, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -85.4740219116211, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1873438060283661, "rewards_train/margins": 2.519121140241623, "rewards_train/rejected": -2.331777334213257, "step": 772 }, { "epoch": 1.03, "logps_train/chosen": -65.5296859741211, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -72.11442565917969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4720316231250763, "rewards_train/margins": 1.8662865459918976, "rewards_train/rejected": -1.3942549228668213, "step": 773 }, { "epoch": 1.03, "learning_rate": 4.4359550368873463e-07, "loss": 0.2622, "step": 774 }, { "epoch": 1.03, "logps_train/chosen": -82.28711700439453, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -107.65868377685547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6009758710861206, "rewards_train/margins": 2.4785631895065308, "rewards_train/rejected": -1.8775873184204102, "step": 774 }, { "epoch": 1.03, "logps_train/chosen": -65.40483093261719, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -102.3169174194336, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.26303285360336304, "rewards_train/margins": 1.7728497385978699, "rewards_train/rejected": -1.5098168849945068, "step": 775 }, { "epoch": 1.03, "learning_rate": 4.4324716333027814e-07, "loss": 0.3129, "step": 776 }, { "epoch": 1.03, "logps_train/chosen": -64.79135131835938, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -126.04985046386719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.48102080821990967, "rewards_train/margins": 3.9922555685043335, "rewards_train/rejected": -3.511234760284424, "step": 776 }, { "epoch": 1.03, "logps_train/chosen": -34.996681213378906, "logps_train/ref_chosen": -37.0, "logps_train/ref_rejected": -42.0, "logps_train/rejected": -62.42568588256836, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.209706649184227, "rewards_train/margins": 2.2413378804922104, "rewards_train/rejected": -2.0316312313079834, "step": 777 }, { "epoch": 1.03, "learning_rate": 4.428978883149386e-07, "loss": 0.1648, "step": 778 }, { "epoch": 1.03, "logps_train/chosen": -36.58696746826172, "logps_train/ref_chosen": -37.0, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -60.58026123046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04520928114652634, "rewards_train/margins": 1.5059699937701225, "rewards_train/rejected": -1.4607607126235962, "step": 778 }, { "epoch": 1.03, "logps_train/chosen": -62.581119537353516, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -94.9194564819336, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.47548168897628784, "rewards_train/margins": 2.951802670955658, "rewards_train/rejected": -2.47632098197937, "step": 779 }, { "epoch": 1.04, "learning_rate": 4.425476803320153e-07, "loss": 0.2512, "step": 780 }, { "epoch": 1.04, "logps_train/chosen": -58.24970626831055, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -95.43296813964844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6062796115875244, "rewards_train/margins": 2.908951997756958, "rewards_train/rejected": -2.3026723861694336, "step": 780 }, { "epoch": 1.04, "logps_train/chosen": -44.945281982421875, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -95.61111450195312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.007815830409526825, "rewards_train/margins": 2.7704892084002495, "rewards_train/rejected": -2.7626733779907227, "step": 781 }, { "epoch": 1.04, "learning_rate": 4.421965410753201e-07, "loss": 0.1053, "step": 782 }, { "epoch": 1.04, "logps_train/chosen": -48.387107849121094, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -68.50040435791016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.9409765005111694, "rewards_train/margins": 3.0230478048324585, "rewards_train/rejected": -2.082071304321289, "step": 782 }, { "epoch": 1.04, "logps_train/chosen": -74.5488052368164, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -110.64103698730469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5529320240020752, "rewards_train/margins": 2.904536485671997, "rewards_train/rejected": -2.351604461669922, "step": 783 }, { "epoch": 1.04, "learning_rate": 4.418444722431687e-07, "loss": 0.164, "step": 784 }, { "epoch": 1.04, "logps_train/chosen": -40.978477478027344, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -74.71595001220703, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4229532480239868, "rewards_train/margins": 2.5535322427749634, "rewards_train/rejected": -2.1305789947509766, "step": 784 }, { "epoch": 1.04, "logps_train/chosen": -66.73492431640625, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -95.45710754394531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.0086488276720047, "rewards_train/margins": 1.5128434151411057, "rewards_train/rejected": -1.5214922428131104, "step": 785 }, { "epoch": 1.04, "learning_rate": 4.41491475538373e-07, "loss": 0.2578, "step": 786 }, { "epoch": 1.04, "logps_train/chosen": -65.47206115722656, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -74.70218658447266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4887310266494751, "rewards_train/margins": 1.338637351989746, "rewards_train/rejected": -0.849906325340271, "step": 786 }, { "epoch": 1.05, "logps_train/chosen": -38.711307525634766, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -64.75392150878906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.39468950033187866, "rewards_train/margins": 2.166956841945648, "rewards_train/rejected": -1.7722673416137695, "step": 787 }, { "epoch": 1.05, "learning_rate": 4.411375526682326e-07, "loss": 0.3052, "step": 788 }, { "epoch": 1.05, "logps_train/chosen": -56.16551208496094, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -70.11296844482422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.054832443594932556, "rewards_train/margins": 1.6361520439386368, "rewards_train/rejected": -1.6909844875335693, "step": 788 }, { "epoch": 1.05, "logps_train/chosen": -36.383453369140625, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -76.16281127929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.235611230134964, "rewards_train/margins": 1.9259819686412811, "rewards_train/rejected": -2.161593198776245, "step": 789 }, { "epoch": 1.05, "learning_rate": 4.4078270534452644e-07, "loss": 0.2719, "step": 790 }, { "epoch": 1.05, "logps_train/chosen": -83.35650634765625, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -94.11077880859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08544264733791351, "rewards_train/margins": 2.3142934888601303, "rewards_train/rejected": -2.228850841522217, "step": 790 }, { "epoch": 1.05, "logps_train/chosen": -48.94058609008789, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -80.95195007324219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1692226231098175, "rewards_train/margins": 3.1191051304340363, "rewards_train/rejected": -2.9498825073242188, "step": 791 }, { "epoch": 1.05, "learning_rate": 4.4042693528350487e-07, "loss": 0.1622, "step": 792 }, { "epoch": 1.05, "logps_train/chosen": -58.48086166381836, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -59.32624053955078, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.694961428642273, "rewards_train/margins": 0.8486001491546631, "rewards_train/rejected": -1.543561577796936, "step": 792 }, { "epoch": 1.05, "logps_train/chosen": -59.933555603027344, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -78.39978790283203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2878941297531128, "rewards_train/margins": 2.332560420036316, "rewards_train/rejected": -2.044666290283203, "step": 793 }, { "epoch": 1.05, "learning_rate": 4.4007024420588103e-07, "loss": 0.3822, "step": 794 }, { "epoch": 1.05, "logps_train/chosen": -93.20623779296875, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -111.58499145507812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.017498541623353958, "rewards_train/margins": 2.68006319925189, "rewards_train/rejected": -2.697561740875244, "step": 794 }, { "epoch": 1.06, "logps_train/chosen": -63.818111419677734, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -94.49789428710938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4624147117137909, "rewards_train/margins": 2.370016783475876, "rewards_train/rejected": -1.907602071762085, "step": 795 }, { "epoch": 1.06, "learning_rate": 4.397126338368227e-07, "loss": 0.2426, "step": 796 }, { "epoch": 1.06, "logps_train/chosen": -39.92469787597656, "logps_train/ref_chosen": -38.75, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -70.1273193359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.12723538279533386, "rewards_train/margins": 1.8843248188495636, "rewards_train/rejected": -2.0115602016448975, "step": 796 }, { "epoch": 1.06, "logps_train/chosen": -84.03326416015625, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -136.01007080078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1310492902994156, "rewards_train/margins": 4.457055702805519, "rewards_train/rejected": -4.3260064125061035, "step": 797 }, { "epoch": 1.06, "learning_rate": 4.393541059059437e-07, "loss": 0.15, "step": 798 }, { "epoch": 1.06, "logps_train/chosen": -45.70124053955078, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -67.06489562988281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10643841326236725, "rewards_train/margins": 1.7265999466180801, "rewards_train/rejected": -1.620161533355713, "step": 798 }, { "epoch": 1.06, "logps_train/chosen": -40.22917175292969, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -52.668575286865234, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9114576578140259, "rewards_train/margins": 2.1478464603424072, "rewards_train/rejected": -1.2363888025283813, "step": 799 }, { "epoch": 1.06, "learning_rate": 4.38994662147296e-07, "loss": 0.2226, "step": 800 }, { "epoch": 1.06, "logps_train/chosen": -78.17513275146484, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -92.73866271972656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2918616831302643, "rewards_train/margins": 3.039556175470352, "rewards_train/rejected": -2.747694492340088, "step": 800 }, { "epoch": 1.06, "logps_train/chosen": -65.75807189941406, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -68.25862121582031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19763082265853882, "rewards_train/margins": 1.5164613127708435, "rewards_train/rejected": -1.3188304901123047, "step": 801 }, { "epoch": 1.07, "learning_rate": 4.3863430429936087e-07, "loss": 0.1679, "step": 802 }, { "epoch": 1.07, "logps_train/chosen": -66.94666290283203, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -103.9609375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4467398524284363, "rewards_train/margins": 2.7303337454795837, "rewards_train/rejected": -2.2835938930511475, "step": 802 }, { "epoch": 1.07, "logps_train/chosen": -68.72746276855469, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -121.6444091796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7897540926933289, "rewards_train/margins": 3.685445487499237, "rewards_train/rejected": -2.895691394805908, "step": 803 }, { "epoch": 1.07, "learning_rate": 4.382730341050408e-07, "loss": 0.1476, "step": 804 }, { "epoch": 1.07, "logps_train/chosen": -21.226465225219727, "logps_train/ref_chosen": -25.0, "logps_train/ref_rejected": -31.5, "logps_train/rejected": -53.17919158935547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.37774407863616943, "rewards_train/margins": 2.546249270439148, "rewards_train/rejected": -2.1685051918029785, "step": 804 }, { "epoch": 1.07, "logps_train/chosen": -76.6697998046875, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -116.48828887939453, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4963010847568512, "rewards_train/margins": 3.3318497240543365, "rewards_train/rejected": -2.8355486392974854, "step": 805 }, { "epoch": 1.07, "learning_rate": 4.379108533116507e-07, "loss": 0.1468, "step": 806 }, { "epoch": 1.07, "logps_train/chosen": -69.28903198242188, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -119.68470764160156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.0029266774654388428, "rewards_train/margins": 3.4421073496341705, "rewards_train/rejected": -3.4450340270996094, "step": 806 }, { "epoch": 1.07, "logps_train/chosen": -34.827537536621094, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -66.54564666748047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5211523771286011, "rewards_train/margins": 2.8805512189865112, "rewards_train/rejected": -2.35939884185791, "step": 807 }, { "epoch": 1.07, "learning_rate": 4.3754776367090974e-07, "loss": 0.1153, "step": 808 }, { "epoch": 1.07, "logps_train/chosen": -52.845703125, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -95.40000915527344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19902321696281433, "rewards_train/margins": 2.624961346387863, "rewards_train/rejected": -2.425938129425049, "step": 808 }, { "epoch": 1.07, "logps_train/chosen": -101.28802490234375, "logps_train/ref_chosen": -111.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -113.39967346191406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9540099501609802, "rewards_train/margins": 4.187727391719818, "rewards_train/rejected": -3.233717441558838, "step": 809 }, { "epoch": 1.08, "learning_rate": 4.37183766938933e-07, "loss": 0.12, "step": 810 }, { "epoch": 1.08, "logps_train/chosen": -61.20787811279297, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -103.71305847167969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6956185102462769, "rewards_train/margins": 3.5073541402816772, "rewards_train/rejected": -2.8117356300354004, "step": 810 }, { "epoch": 1.08, "logps_train/chosen": -80.150390625, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -105.05763244628906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10183099657297134, "rewards_train/margins": 2.36579779535532, "rewards_train/rejected": -2.2639667987823486, "step": 811 }, { "epoch": 1.08, "learning_rate": 4.368188648762227e-07, "loss": 0.1604, "step": 812 }, { "epoch": 1.08, "logps_train/chosen": -58.470069885253906, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -86.8299331665039, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2936176657676697, "rewards_train/margins": 3.0047358870506287, "rewards_train/rejected": -2.711118221282959, "step": 812 }, { "epoch": 1.08, "logps_train/chosen": -51.30886459350586, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -77.62238311767578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32926976680755615, "rewards_train/margins": 2.5910202264785767, "rewards_train/rejected": -2.2617504596710205, "step": 813 }, { "epoch": 1.08, "learning_rate": 4.364530592476595e-07, "loss": 0.1232, "step": 814 }, { "epoch": 1.08, "logps_train/chosen": -48.05463409423828, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -87.86077117919922, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.8900444507598877, "rewards_train/margins": 3.1332504749298096, "rewards_train/rejected": -2.243206024169922, "step": 814 }, { "epoch": 1.08, "logps_train/chosen": -52.04695129394531, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -87.9521484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6781174540519714, "rewards_train/margins": 3.2077078223228455, "rewards_train/rejected": -2.529590368270874, "step": 815 }, { "epoch": 1.08, "learning_rate": 4.3608635182249465e-07, "loss": 0.1106, "step": 816 }, { "epoch": 1.08, "logps_train/chosen": -39.63652038574219, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -84.72727966308594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33439499139785767, "rewards_train/margins": 3.0946224331855774, "rewards_train/rejected": -2.7602274417877197, "step": 816 }, { "epoch": 1.08, "logps_train/chosen": -72.6854476928711, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -103.39086151123047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.006455421447754, "rewards_train/margins": 3.720541477203369, "rewards_train/rejected": -2.7140860557556152, "step": 817 }, { "epoch": 1.09, "learning_rate": 4.3571874437434074e-07, "loss": 0.134, "step": 818 }, { "epoch": 1.09, "logps_train/chosen": -44.015933990478516, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -85.70500946044922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.014422163367271423, "rewards_train/margins": 2.806798204779625, "rewards_train/rejected": -2.7923760414123535, "step": 818 }, { "epoch": 1.09, "logps_train/chosen": -70.45325469970703, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -90.65718078613281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.08592414855957031, "rewards_train/margins": 2.2282047271728516, "rewards_train/rejected": -2.1422805786132812, "step": 819 }, { "epoch": 1.09, "learning_rate": 4.3535023868116363e-07, "loss": 0.1947, "step": 820 }, { "epoch": 1.09, "logps_train/chosen": -43.031463623046875, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -72.33087921142578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24060384929180145, "rewards_train/margins": 2.478379413485527, "rewards_train/rejected": -2.2377755641937256, "step": 820 }, { "epoch": 1.09, "logps_train/chosen": -51.21966552734375, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -78.81327056884766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.021966993808746338, "rewards_train/margins": 2.6968597769737244, "rewards_train/rejected": -2.7188267707824707, "step": 821 }, { "epoch": 1.09, "learning_rate": 4.349808365252733e-07, "loss": 0.1072, "step": 822 }, { "epoch": 1.09, "logps_train/chosen": -70.05642700195312, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -96.88026428222656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8474821448326111, "rewards_train/margins": 3.1808213591575623, "rewards_train/rejected": -2.333339214324951, "step": 822 }, { "epoch": 1.09, "logps_train/chosen": -91.50241088867188, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -118.50920104980469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.12602216005325317, "rewards_train/margins": 2.7553674578666687, "rewards_train/rejected": -2.881389617919922, "step": 823 }, { "epoch": 1.09, "learning_rate": 4.3461053969331573e-07, "loss": 0.1176, "step": 824 }, { "epoch": 1.09, "logps_train/chosen": -51.377254486083984, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -92.90863800048828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1153994798660278, "rewards_train/margins": 3.503137946128845, "rewards_train/rejected": -2.3877384662628174, "step": 824 }, { "epoch": 1.1, "logps_train/chosen": -65.79637145996094, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -144.54788208007812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.6281752586364746, "rewards_train/margins": 3.4423394203186035, "rewards_train/rejected": -2.814164161682129, "step": 825 }, { "epoch": 1.1, "learning_rate": 4.3423934997626426e-07, "loss": 0.287, "step": 826 }, { "epoch": 1.1, "logps_train/chosen": -41.493492126464844, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -69.26010131835938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3498693108558655, "rewards_train/margins": 2.947754681110382, "rewards_train/rejected": -2.5978853702545166, "step": 826 }, { "epoch": 1.1, "logps_train/chosen": -42.88385772705078, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -67.99557495117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9366143345832825, "rewards_train/margins": 2.9814847111701965, "rewards_train/rejected": -2.044870376586914, "step": 827 }, { "epoch": 1.1, "learning_rate": 4.338672691694104e-07, "loss": 0.1421, "step": 828 }, { "epoch": 1.1, "logps_train/chosen": -49.87078094482422, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -40.0, "logps_train/rejected": -61.25836181640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3457341492176056, "rewards_train/margins": 2.4739144146442413, "rewards_train/rejected": -2.1281802654266357, "step": 828 }, { "epoch": 1.1, "logps_train/chosen": -57.14312744140625, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -92.68283081054688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8138123750686646, "rewards_train/margins": 3.8305329084396362, "rewards_train/rejected": -3.0167205333709717, "step": 829 }, { "epoch": 1.1, "learning_rate": 4.334942990723558e-07, "loss": 0.1216, "step": 830 }, { "epoch": 1.1, "logps_train/chosen": -84.87217712402344, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -151.1685791015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20750823616981506, "rewards_train/margins": 3.9384283125400543, "rewards_train/rejected": -3.7309200763702393, "step": 830 }, { "epoch": 1.1, "logps_train/chosen": -32.686134338378906, "logps_train/ref_chosen": -38.25, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -67.4171142578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5641993284225464, "rewards_train/margins": 2.791848063468933, "rewards_train/rejected": -2.2276487350463867, "step": 831 }, { "epoch": 1.1, "learning_rate": 4.3312044148900293e-07, "loss": 0.1063, "step": 832 }, { "epoch": 1.1, "logps_train/chosen": -39.43988800048828, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -70.92292785644531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7673392295837402, "rewards_train/margins": 3.0377566814422607, "rewards_train/rejected": -2.2704174518585205, "step": 832 }, { "epoch": 1.11, "logps_train/chosen": -53.904075622558594, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -98.24420928955078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4009988307952881, "rewards_train/margins": 3.741044521331787, "rewards_train/rejected": -3.340045690536499, "step": 833 }, { "epoch": 1.11, "learning_rate": 4.327456982275469e-07, "loss": 0.0821, "step": 834 }, { "epoch": 1.11, "logps_train/chosen": -53.978431701660156, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -104.92790222167969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.713094174861908, "rewards_train/margins": 3.704321801662445, "rewards_train/rejected": -2.991227626800537, "step": 834 }, { "epoch": 1.11, "logps_train/chosen": -65.21916198730469, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -75.34265899658203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4640212059020996, "rewards_train/margins": 2.6803178787231445, "rewards_train/rejected": -2.216296672821045, "step": 835 }, { "epoch": 1.11, "learning_rate": 4.323700711004665e-07, "loss": 0.0967, "step": 836 }, { "epoch": 1.11, "logps_train/chosen": -89.44677734375, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -162.52613830566406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.42719754576683044, "rewards_train/margins": 4.5673108994960785, "rewards_train/rejected": -4.140113353729248, "step": 836 }, { "epoch": 1.11, "logps_train/chosen": -59.180702209472656, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -91.57246398925781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8491173982620239, "rewards_train/margins": 3.270426392555237, "rewards_train/rejected": -2.421308994293213, "step": 837 }, { "epoch": 1.11, "learning_rate": 4.319935619245153e-07, "loss": 0.0547, "step": 838 }, { "epoch": 1.11, "logps_train/chosen": -36.24166488647461, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -36.0, "logps_train/rejected": -61.67096710205078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4555209279060364, "rewards_train/margins": 3.0151954293251038, "rewards_train/rejected": -2.5596745014190674, "step": 838 }, { "epoch": 1.11, "logps_train/chosen": -51.77582931518555, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -92.5146713256836, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7142292261123657, "rewards_train/margins": 3.675071597099304, "rewards_train/rejected": -2.9608423709869385, "step": 839 }, { "epoch": 1.12, "learning_rate": 4.31616172520713e-07, "loss": 0.0824, "step": 840 }, { "epoch": 1.12, "logps_train/chosen": -47.10211944580078, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -92.96401977539062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0233820676803589, "rewards_train/margins": 3.482284188270569, "rewards_train/rejected": -2.45890212059021, "step": 840 }, { "epoch": 1.12, "logps_train/chosen": -68.681396484375, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -85.32242584228516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7521727085113525, "rewards_train/margins": 3.278165578842163, "rewards_train/rejected": -2.5259928703308105, "step": 841 }, { "epoch": 1.12, "learning_rate": 4.312379047143365e-07, "loss": 0.0508, "step": 842 }, { "epoch": 1.12, "logps_train/chosen": -43.5899658203125, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -70.38139343261719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5937379598617554, "rewards_train/margins": 2.970548987388611, "rewards_train/rejected": -2.3768110275268555, "step": 842 }, { "epoch": 1.12, "logps_train/chosen": -52.637733459472656, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -78.90478515625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.8252888321876526, "rewards_train/margins": 3.165767252445221, "rewards_train/rejected": -2.3404784202575684, "step": 843 }, { "epoch": 1.12, "learning_rate": 4.3085876033491146e-07, "loss": 0.1228, "step": 844 }, { "epoch": 1.12, "logps_train/chosen": -35.52029800415039, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -60.43755340576172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0276578664779663, "rewards_train/margins": 3.487038016319275, "rewards_train/rejected": -2.4593801498413086, "step": 844 }, { "epoch": 1.12, "logps_train/chosen": -39.0677604675293, "logps_train/ref_chosen": -43.25, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -80.95925903320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4244741201400757, "rewards_train/margins": 3.4446195363998413, "rewards_train/rejected": -3.0201454162597656, "step": 845 }, { "epoch": 1.12, "learning_rate": 4.3047874121620284e-07, "loss": 0.0585, "step": 846 }, { "epoch": 1.12, "logps_train/chosen": -82.25259399414062, "logps_train/ref_chosen": -95.5, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -141.22845458984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.3055022954940796, "rewards_train/margins": 4.597878575325012, "rewards_train/rejected": -3.2923762798309326, "step": 846 }, { "epoch": 1.12, "logps_train/chosen": -59.74720764160156, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -107.6315689086914, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28934168815612793, "rewards_train/margins": 3.738436222076416, "rewards_train/rejected": -3.449094533920288, "step": 847 }, { "epoch": 1.13, "learning_rate": 4.3009784919620655e-07, "loss": 0.1233, "step": 848 }, { "epoch": 1.13, "logps_train/chosen": -41.59056854248047, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -103.746826171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7096932530403137, "rewards_train/margins": 4.278126060962677, "rewards_train/rejected": -3.5684328079223633, "step": 848 }, { "epoch": 1.13, "logps_train/chosen": -72.16027069091797, "logps_train/ref_chosen": -90.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -116.64436340332031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.8222544193267822, "rewards_train/margins": 5.135128736495972, "rewards_train/rejected": -3.3128743171691895, "step": 849 }, { "epoch": 1.13, "learning_rate": 4.2971608611714026e-07, "loss": 0.0486, "step": 850 }, { "epoch": 1.13, "logps_train/chosen": -69.67103576660156, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -104.18919372558594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.651646614074707, "rewards_train/margins": 3.6025967597961426, "rewards_train/rejected": -2.9509501457214355, "step": 850 }, { "epoch": 1.13, "logps_train/chosen": -51.434364318847656, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -109.86626434326172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.07546746730804443, "rewards_train/margins": 3.0947519540786743, "rewards_train/rejected": -3.1702194213867188, "step": 851 }, { "epoch": 1.13, "learning_rate": 4.2933345382543474e-07, "loss": 0.1412, "step": 852 }, { "epoch": 1.13, "logps_train/chosen": -55.59050750732422, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -71.9244384765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6682929992675781, "rewards_train/margins": 2.223236560821533, "rewards_train/rejected": -1.554943561553955, "step": 852 }, { "epoch": 1.13, "logps_train/chosen": -56.12427520751953, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -80.46771240234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.053978681564331, "rewards_train/margins": 4.1577818393707275, "rewards_train/rejected": -3.1038031578063965, "step": 853 }, { "epoch": 1.13, "learning_rate": 4.2894995417172463e-07, "loss": 0.0992, "step": 854 }, { "epoch": 1.13, "logps_train/chosen": -109.62224578857422, "logps_train/ref_chosen": -111.5, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -142.90023803710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16590020060539246, "rewards_train/margins": 4.015298575162888, "rewards_train/rejected": -3.849398374557495, "step": 854 }, { "epoch": 1.14, "logps_train/chosen": -56.15511703491211, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -110.76558685302734, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.41573822498321533, "rewards_train/margins": 3.667296528816223, "rewards_train/rejected": -3.251558303833008, "step": 855 }, { "epoch": 1.14, "learning_rate": 4.2856558901083966e-07, "loss": 0.0626, "step": 856 }, { "epoch": 1.14, "logps_train/chosen": -41.163368225097656, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -84.08399963378906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0316120386123657, "rewards_train/margins": 3.74313747882843, "rewards_train/rejected": -2.7115254402160645, "step": 856 }, { "epoch": 1.14, "logps_train/chosen": -70.54540252685547, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -138.29885864257812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6470222473144531, "rewards_train/margins": 4.305031776428223, "rewards_train/rejected": -3.6580095291137695, "step": 857 }, { "epoch": 1.14, "learning_rate": 4.281803602017957e-07, "loss": 0.0722, "step": 858 }, { "epoch": 1.14, "logps_train/chosen": -92.97169494628906, "logps_train/ref_chosen": -105.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -101.36845397949219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.270798921585083, "rewards_train/margins": 4.256472110748291, "rewards_train/rejected": -2.985673189163208, "step": 858 }, { "epoch": 1.14, "logps_train/chosen": -73.10415649414062, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -98.9078369140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9216163754463196, "rewards_train/margins": 4.317087471485138, "rewards_train/rejected": -3.3954710960388184, "step": 859 }, { "epoch": 1.14, "learning_rate": 4.2779426960778587e-07, "loss": 0.0547, "step": 860 }, { "epoch": 1.14, "logps_train/chosen": -27.713716506958008, "logps_train/ref_chosen": -37.75, "logps_train/ref_rejected": -36.75, "logps_train/rejected": -56.51048278808594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9958158731460571, "rewards_train/margins": 2.969520688056946, "rewards_train/rejected": -1.9737048149108887, "step": 860 }, { "epoch": 1.14, "logps_train/chosen": -38.01380920410156, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -76.73248291015625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.5540881156921387, "rewards_train/margins": 3.2945237159729004, "rewards_train/rejected": -2.7404356002807617, "step": 861 }, { "epoch": 1.14, "learning_rate": 4.27407319096171e-07, "loss": 0.1109, "step": 862 }, { "epoch": 1.14, "logps_train/chosen": -48.41714096069336, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -89.53971862792969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.38377442955970764, "rewards_train/margins": 3.9854022562503815, "rewards_train/rejected": -3.601627826690674, "step": 862 }, { "epoch": 1.15, "logps_train/chosen": -46.59536361694336, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -90.42716979980469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1974947154521942, "rewards_train/margins": 3.5042745769023895, "rewards_train/rejected": -3.3067798614501953, "step": 863 }, { "epoch": 1.15, "learning_rate": 4.270195105384714e-07, "loss": 0.0837, "step": 864 }, { "epoch": 1.15, "logps_train/chosen": -64.23978424072266, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -97.1863784790039, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6635218858718872, "rewards_train/margins": 3.844659924507141, "rewards_train/rejected": -3.181138038635254, "step": 864 }, { "epoch": 1.15, "logps_train/chosen": -62.37693786621094, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -112.25022888183594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9805680513381958, "rewards_train/margins": 3.9509037733078003, "rewards_train/rejected": -2.9703357219696045, "step": 865 }, { "epoch": 1.15, "learning_rate": 4.2663084581035727e-07, "loss": 0.0839, "step": 866 }, { "epoch": 1.15, "logps_train/chosen": -66.78048706054688, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -85.95433044433594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6172642707824707, "rewards_train/margins": 3.6236352920532227, "rewards_train/rejected": -3.006371021270752, "step": 866 }, { "epoch": 1.15, "logps_train/chosen": -46.817909240722656, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -64.48454284667969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20570902526378632, "rewards_train/margins": 2.3432260006666183, "rewards_train/rejected": -2.137516975402832, "step": 867 }, { "epoch": 1.15, "learning_rate": 4.262413267916396e-07, "loss": 0.1671, "step": 868 }, { "epoch": 1.15, "logps_train/chosen": -108.32286071777344, "logps_train/ref_chosen": -118.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -156.0655975341797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9739636182785034, "rewards_train/margins": 5.496148467063904, "rewards_train/rejected": -4.5221848487854, "step": 868 }, { "epoch": 1.15, "logps_train/chosen": -50.23903274536133, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -90.75271606445312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8834207057952881, "rewards_train/margins": 3.5180675983428955, "rewards_train/rejected": -2.6346468925476074, "step": 869 }, { "epoch": 1.16, "learning_rate": 4.2585095536626143e-07, "loss": 0.0436, "step": 870 }, { "epoch": 1.16, "logps_train/chosen": -69.28868103027344, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -135.10012817382812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.285194456577301, "rewards_train/margins": 4.782706797122955, "rewards_train/rejected": -4.497512340545654, "step": 870 }, { "epoch": 1.16, "logps_train/chosen": -49.640625, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -94.07460021972656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17890629172325134, "rewards_train/margins": 3.473866790533066, "rewards_train/rejected": -3.2949604988098145, "step": 871 }, { "epoch": 1.16, "learning_rate": 4.2545973342228837e-07, "loss": 0.0504, "step": 872 }, { "epoch": 1.16, "logps_train/chosen": -49.62659454345703, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -82.99362182617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4873405396938324, "rewards_train/margins": 3.574202388525009, "rewards_train/rejected": -3.0868618488311768, "step": 872 }, { "epoch": 1.16, "logps_train/chosen": -87.91971588134766, "logps_train/ref_chosen": -97.0, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -133.903076171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.890059769153595, "rewards_train/margins": 4.710054457187653, "rewards_train/rejected": -3.8199946880340576, "step": 873 }, { "epoch": 1.16, "learning_rate": 4.2506766285189976e-07, "loss": 0.0391, "step": 874 }, { "epoch": 1.16, "logps_train/chosen": -62.36461639404297, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -108.12089538574219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2940070927143097, "rewards_train/margins": 3.9748471081256866, "rewards_train/rejected": -3.680840015411377, "step": 874 }, { "epoch": 1.16, "logps_train/chosen": -59.10348892211914, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -78.975830078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7521509528160095, "rewards_train/margins": 3.5458282828330994, "rewards_train/rejected": -2.79367733001709, "step": 875 }, { "epoch": 1.16, "learning_rate": 4.246747455513794e-07, "loss": 0.0529, "step": 876 }, { "epoch": 1.16, "logps_train/chosen": -45.67261505126953, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -102.71471405029297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8608638048171997, "rewards_train/margins": 4.508898138999939, "rewards_train/rejected": -3.6480343341827393, "step": 876 }, { "epoch": 1.16, "logps_train/chosen": -49.87294387817383, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -89.14911651611328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.079892873764038, "rewards_train/margins": 4.429179430007935, "rewards_train/rejected": -3.3492865562438965, "step": 877 }, { "epoch": 1.17, "learning_rate": 4.242809834211063e-07, "loss": 0.0352, "step": 878 }, { "epoch": 1.17, "logps_train/chosen": -42.957130432128906, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -73.38704681396484, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2386818528175354, "rewards_train/margins": 2.2734598517417908, "rewards_train/rejected": -2.512141704559326, "step": 878 }, { "epoch": 1.17, "logps_train/chosen": -56.34748840332031, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -92.78182983398438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16681388020515442, "rewards_train/margins": 3.2121849358081818, "rewards_train/rejected": -3.0453710556030273, "step": 879 }, { "epoch": 1.17, "learning_rate": 4.238863783655456e-07, "loss": 0.0929, "step": 880 }, { "epoch": 1.17, "logps_train/chosen": -61.931610107421875, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -97.60562133789062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1459019184112549, "rewards_train/margins": 4.078338623046875, "rewards_train/rejected": -2.93243670463562, "step": 880 }, { "epoch": 1.17, "logps_train/chosen": -76.86953735351562, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -112.93692779541016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2130463868379593, "rewards_train/margins": 3.3524423390626907, "rewards_train/rejected": -3.1393959522247314, "step": 881 }, { "epoch": 1.17, "learning_rate": 4.234909322932393e-07, "loss": 0.0861, "step": 882 }, { "epoch": 1.17, "logps_train/chosen": -53.91347122192383, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -85.40789031982422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.466465562582016, "rewards_train/margins": 3.154129832983017, "rewards_train/rejected": -2.687664270401001, "step": 882 }, { "epoch": 1.17, "logps_train/chosen": -62.2425537109375, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -92.77987670898438, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.34761956334114075, "rewards_train/margins": 3.3787329494953156, "rewards_train/rejected": -3.031113386154175, "step": 883 }, { "epoch": 1.17, "learning_rate": 4.230946471167971e-07, "loss": 0.0892, "step": 884 }, { "epoch": 1.17, "logps_train/chosen": -65.4237060546875, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -117.238037109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31231626868247986, "rewards_train/margins": 4.46736940741539, "rewards_train/rejected": -4.15505313873291, "step": 884 }, { "epoch": 1.18, "logps_train/chosen": -81.63449096679688, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -134.44198608398438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.694363534450531, "rewards_train/margins": 4.832311570644379, "rewards_train/rejected": -4.137948036193848, "step": 885 }, { "epoch": 1.18, "learning_rate": 4.22697524752887e-07, "loss": 0.0369, "step": 886 }, { "epoch": 1.18, "logps_train/chosen": -72.50392150878906, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -92.62897491455078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0265616178512573, "rewards_train/margins": 3.4859436750411987, "rewards_train/rejected": -2.4593820571899414, "step": 886 }, { "epoch": 1.18, "logps_train/chosen": -68.845947265625, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -91.5, "logps_train/rejected": -133.44363403320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6997804641723633, "rewards_train/margins": 4.884769439697266, "rewards_train/rejected": -4.184988975524902, "step": 887 }, { "epoch": 1.18, "learning_rate": 4.2229956712222625e-07, "loss": 0.0456, "step": 888 }, { "epoch": 1.18, "logps_train/chosen": -49.1992301940918, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -86.04940032958984, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6058585047721863, "rewards_train/margins": 3.7475168108940125, "rewards_train/rejected": -3.141658306121826, "step": 888 }, { "epoch": 1.18, "logps_train/chosen": -60.015098571777344, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -95.0704345703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2156777381896973, "rewards_train/margins": 4.1063148975372314, "rewards_train/rejected": -2.890637159347534, "step": 889 }, { "epoch": 1.18, "learning_rate": 4.21900776149572e-07, "loss": 0.0681, "step": 890 }, { "epoch": 1.18, "logps_train/chosen": -85.4272232055664, "logps_train/ref_chosen": -101.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -120.84565734863281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.5658717155456543, "rewards_train/margins": 5.718797206878662, "rewards_train/rejected": -4.152925491333008, "step": 890 }, { "epoch": 1.18, "logps_train/chosen": -77.84867858886719, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -139.948974609375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1088826656341553, "rewards_train/margins": 6.110029458999634, "rewards_train/rejected": -5.0011467933654785, "step": 891 }, { "epoch": 1.18, "learning_rate": 4.2150115376371165e-07, "loss": 0.043, "step": 892 }, { "epoch": 1.18, "logps_train/chosen": -51.44811248779297, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -93.90492248535156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4833136796951294, "rewards_train/margins": 4.401930928230286, "rewards_train/rejected": -3.9186172485351562, "step": 892 }, { "epoch": 1.19, "logps_train/chosen": -90.07806396484375, "logps_train/ref_chosen": -97.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -124.75885772705078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6998106241226196, "rewards_train/margins": 4.914758801460266, "rewards_train/rejected": -4.2149481773376465, "step": 893 }, { "epoch": 1.19, "learning_rate": 4.2110070189745405e-07, "loss": 0.0382, "step": 894 }, { "epoch": 1.19, "logps_train/chosen": -67.50318145751953, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -96.06961059570312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.5653066635131836, "rewards_train/margins": 4.381642818450928, "rewards_train/rejected": -2.816336154937744, "step": 894 }, { "epoch": 1.19, "logps_train/chosen": -49.06819152832031, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -97.88206481933594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0728683471679688, "rewards_train/margins": 4.471231460571289, "rewards_train/rejected": -3.3983631134033203, "step": 895 }, { "epoch": 1.19, "learning_rate": 4.2069942248761984e-07, "loss": 0.0444, "step": 896 }, { "epoch": 1.19, "logps_train/chosen": -39.21394348144531, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -86.92652893066406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.42899614572525024, "rewards_train/margins": 3.643524706363678, "rewards_train/rejected": -3.2145285606384277, "step": 896 }, { "epoch": 1.19, "logps_train/chosen": -49.27470397949219, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -97.69639587402344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.4055376052856445, "rewards_train/margins": 4.906427621841431, "rewards_train/rejected": -3.500890016555786, "step": 897 }, { "epoch": 1.19, "learning_rate": 4.2029731747503215e-07, "loss": 0.0921, "step": 898 }, { "epoch": 1.19, "logps_train/chosen": -42.635555267333984, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -68.8574447631836, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7536320686340332, "rewards_train/margins": 3.321408271789551, "rewards_train/rejected": -2.5677762031555176, "step": 898 }, { "epoch": 1.19, "logps_train/chosen": -65.1983642578125, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -117.09519958496094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0629760026931763, "rewards_train/margins": 5.020152449607849, "rewards_train/rejected": -3.957176446914673, "step": 899 }, { "epoch": 1.2, "learning_rate": 4.198943888045072e-07, "loss": 0.0711, "step": 900 }, { "epoch": 1.2, "logps_train/chosen": -72.8696060180664, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -122.85391998291016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9849143624305725, "rewards_train/margins": 5.079681932926178, "rewards_train/rejected": -4.0947675704956055, "step": 900 }, { "epoch": 1.2, "logps_train/chosen": -67.7914047241211, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -104.09950256347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0138282775878906, "rewards_train/margins": 4.64018440246582, "rewards_train/rejected": -3.6263561248779297, "step": 901 }, { "epoch": 1.2, "learning_rate": 4.194906384248449e-07, "loss": 0.0427, "step": 902 }, { "epoch": 1.2, "logps_train/chosen": -69.12123107910156, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -129.08428955078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.9535012245178223, "rewards_train/margins": 6.556071758270264, "rewards_train/rejected": -4.602570533752441, "step": 902 }, { "epoch": 1.2, "logps_train/chosen": -48.77479934692383, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -63.59720993041992, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.425645112991333, "rewards_train/margins": 3.7392725944519043, "rewards_train/rejected": -2.3136274814605713, "step": 903 }, { "epoch": 1.2, "learning_rate": 4.190860682888194e-07, "loss": 0.0277, "step": 904 }, { "epoch": 1.2, "logps_train/chosen": -42.04167556762695, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -76.12769317626953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35286375880241394, "rewards_train/margins": 3.1812581717967987, "rewards_train/rejected": -2.8283944129943848, "step": 904 }, { "epoch": 1.2, "logps_train/chosen": -57.059051513671875, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -128.88531494140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1472200155258179, "rewards_train/margins": 5.173252463340759, "rewards_train/rejected": -4.026032447814941, "step": 905 }, { "epoch": 1.2, "learning_rate": 4.186806803531697e-07, "loss": 0.0597, "step": 906 }, { "epoch": 1.2, "logps_train/chosen": -30.11660385131836, "logps_train/ref_chosen": -34.5, "logps_train/ref_rejected": -42.75, "logps_train/rejected": -66.88246154785156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.45025354623794556, "rewards_train/margins": 2.873753845691681, "rewards_train/rejected": -2.4235002994537354, "step": 906 }, { "epoch": 1.2, "logps_train/chosen": -51.4620361328125, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -101.95337677001953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.014953434467315674, "rewards_train/margins": 4.089759528636932, "rewards_train/rejected": -4.104712963104248, "step": 907 }, { "epoch": 1.21, "learning_rate": 4.1827447657859024e-07, "loss": 0.0841, "step": 908 }, { "epoch": 1.21, "logps_train/chosen": -52.18760681152344, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -100.23721313476562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.42342692613601685, "rewards_train/margins": 4.464335262775421, "rewards_train/rejected": -4.040908336639404, "step": 908 }, { "epoch": 1.21, "logps_train/chosen": -35.78356170654297, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -82.17020416259766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6942998766899109, "rewards_train/margins": 3.747257649898529, "rewards_train/rejected": -3.052957773208618, "step": 909 }, { "epoch": 1.21, "learning_rate": 4.178674589297212e-07, "loss": 0.0391, "step": 910 }, { "epoch": 1.21, "logps_train/chosen": -72.89419555664062, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -136.70672607421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3840184807777405, "rewards_train/margins": 5.401565611362457, "rewards_train/rejected": -5.017547130584717, "step": 910 }, { "epoch": 1.21, "logps_train/chosen": -39.258819580078125, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -86.83905029296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6694308519363403, "rewards_train/margins": 3.982242226600647, "rewards_train/rejected": -3.3128113746643066, "step": 911 }, { "epoch": 1.21, "learning_rate": 4.174596293751391e-07, "loss": 0.0796, "step": 912 }, { "epoch": 1.21, "logps_train/chosen": -81.449462890625, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -141.7239990234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1539310216903687, "rewards_train/margins": 4.428674101829529, "rewards_train/rejected": -3.27474308013916, "step": 912 }, { "epoch": 1.21, "logps_train/chosen": -55.227561950683594, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -97.64739990234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.3116190433502197, "rewards_train/margins": 4.6638593673706055, "rewards_train/rejected": -3.3522403240203857, "step": 913 }, { "epoch": 1.21, "learning_rate": 4.1705098988734767e-07, "loss": 0.0277, "step": 914 }, { "epoch": 1.21, "logps_train/chosen": -44.630889892578125, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -88.82794189453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4822237193584442, "rewards_train/margins": 3.8165800273418427, "rewards_train/rejected": -3.3343563079833984, "step": 914 }, { "epoch": 1.22, "logps_train/chosen": -38.097747802734375, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -82.90342712402344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1652250587940216, "rewards_train/margins": 3.004395514726639, "rewards_train/rejected": -2.839170455932617, "step": 915 }, { "epoch": 1.22, "learning_rate": 4.166415424427675e-07, "loss": 0.0809, "step": 916 }, { "epoch": 1.22, "logps_train/chosen": -53.195701599121094, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -86.19358825683594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1261327862739563, "rewards_train/margins": 2.937678635120392, "rewards_train/rejected": -2.8115458488464355, "step": 916 }, { "epoch": 1.22, "logps_train/chosen": -54.32292175292969, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -92.09500122070312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.02447935938835144, "rewards_train/margins": 3.611582785844803, "rewards_train/rejected": -3.6360621452331543, "step": 917 }, { "epoch": 1.22, "learning_rate": 4.162312890217272e-07, "loss": 0.102, "step": 918 }, { "epoch": 1.22, "logps_train/chosen": -61.09229278564453, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -76.89395141601562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08847537636756897, "rewards_train/margins": 2.3045794069767, "rewards_train/rejected": -2.216104030609131, "step": 918 }, { "epoch": 1.22, "logps_train/chosen": -50.561588287353516, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -92.11100769042969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5789974331855774, "rewards_train/margins": 4.27916032075882, "rewards_train/rejected": -3.700162887573242, "step": 919 }, { "epoch": 1.22, "learning_rate": 4.1582023160845343e-07, "loss": 0.1111, "step": 920 }, { "epoch": 1.22, "logps_train/chosen": -55.1916389465332, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -109.59123992919922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9136486649513245, "rewards_train/margins": 4.693476021289825, "rewards_train/rejected": -3.779827356338501, "step": 920 }, { "epoch": 1.22, "logps_train/chosen": -40.989498138427734, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -77.27867126464844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8307376503944397, "rewards_train/margins": 3.8484485745429993, "rewards_train/rejected": -3.0177109241485596, "step": 921 }, { "epoch": 1.22, "learning_rate": 4.154083721910615e-07, "loss": 0.0287, "step": 922 }, { "epoch": 1.22, "logps_train/chosen": -63.161231994628906, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -85.92396545410156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.07237371802330017, "rewards_train/margins": 2.9204141199588776, "rewards_train/rejected": -2.9927878379821777, "step": 922 }, { "epoch": 1.23, "logps_train/chosen": -70.62162780761719, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -142.40940856933594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9472123384475708, "rewards_train/margins": 4.892840504646301, "rewards_train/rejected": -3.9456281661987305, "step": 923 }, { "epoch": 1.23, "learning_rate": 4.149957127615457e-07, "loss": 0.0843, "step": 924 }, { "epoch": 1.23, "logps_train/chosen": -63.873023986816406, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -96.33056640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15332257747650146, "rewards_train/margins": 3.598683714866638, "rewards_train/rejected": -3.4453611373901367, "step": 924 }, { "epoch": 1.23, "logps_train/chosen": -35.75093078613281, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -94.66659545898438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7197617888450623, "rewards_train/margins": 4.382905662059784, "rewards_train/rejected": -3.6631438732147217, "step": 925 }, { "epoch": 1.23, "learning_rate": 4.145822553157695e-07, "loss": 0.0711, "step": 926 }, { "epoch": 1.23, "logps_train/chosen": -44.981101989746094, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -88.52735900878906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7448585033416748, "rewards_train/margins": 4.309313058853149, "rewards_train/rejected": -3.5644545555114746, "step": 926 }, { "epoch": 1.23, "logps_train/chosen": -48.46117401123047, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -82.884765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6343510150909424, "rewards_train/margins": 3.315014600753784, "rewards_train/rejected": -2.680663585662842, "step": 927 }, { "epoch": 1.23, "learning_rate": 4.141680018534563e-07, "loss": 0.0584, "step": 928 }, { "epoch": 1.23, "logps_train/chosen": -62.465728759765625, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -94.887939453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9940519332885742, "rewards_train/margins": 4.067220687866211, "rewards_train/rejected": -3.0731687545776367, "step": 928 }, { "epoch": 1.23, "logps_train/chosen": -53.429527282714844, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -99.75729370117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1289224624633789, "rewards_train/margins": 4.048401355743408, "rewards_train/rejected": -3.9194788932800293, "step": 929 }, { "epoch": 1.24, "learning_rate": 4.137529543781794e-07, "loss": 0.0342, "step": 930 }, { "epoch": 1.24, "logps_train/chosen": -38.94437026977539, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -82.13406372070312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0071253180503845215, "rewards_train/margins": 3.622875154018402, "rewards_train/rejected": -3.6157498359680176, "step": 930 }, { "epoch": 1.24, "logps_train/chosen": -70.44154357910156, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -117.73814392089844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1511577367782593, "rewards_train/margins": 4.334346890449524, "rewards_train/rejected": -3.1831891536712646, "step": 931 }, { "epoch": 1.24, "learning_rate": 4.1333711489735224e-07, "loss": 0.0601, "step": 932 }, { "epoch": 1.24, "logps_train/chosen": -34.402015686035156, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -73.76387023925781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5648766756057739, "rewards_train/margins": 3.2592321634292603, "rewards_train/rejected": -2.6943554878234863, "step": 932 }, { "epoch": 1.24, "logps_train/chosen": -46.12580490112305, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -42.0, "logps_train/rejected": -67.19065856933594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.932732105255127, "rewards_train/margins": 3.4642982482910156, "rewards_train/rejected": -2.5315661430358887, "step": 933 }, { "epoch": 1.24, "learning_rate": 4.12920485422219e-07, "loss": 0.0891, "step": 934 }, { "epoch": 1.24, "logps_train/chosen": -40.333343505859375, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -74.30396270751953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2391263246536255, "rewards_train/margins": 3.0122958421707153, "rewards_train/rejected": -2.77316951751709, "step": 934 }, { "epoch": 1.24, "logps_train/chosen": -66.84693908691406, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -158.54598999023438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.4332746267318726, "rewards_train/margins": 5.669124960899353, "rewards_train/rejected": -4.2358503341674805, "step": 935 }, { "epoch": 1.24, "learning_rate": 4.1250306796784486e-07, "loss": 0.0859, "step": 936 }, { "epoch": 1.24, "logps_train/chosen": -59.74602508544922, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -42.25, "logps_train/rejected": -78.83283233642578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20821015536785126, "rewards_train/margins": 3.857118770480156, "rewards_train/rejected": -3.6489086151123047, "step": 936 }, { "epoch": 1.24, "logps_train/chosen": -59.689247131347656, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -115.943115234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0295121669769287, "rewards_train/margins": 4.8488242626190186, "rewards_train/rejected": -3.81931209564209, "step": 937 }, { "epoch": 1.25, "learning_rate": 4.120848645531059e-07, "loss": 0.0394, "step": 938 }, { "epoch": 1.25, "logps_train/chosen": -62.38712692260742, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -110.88436889648438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2691000699996948, "rewards_train/margins": 4.173161625862122, "rewards_train/rejected": -3.9040615558624268, "step": 938 }, { "epoch": 1.25, "logps_train/chosen": -46.03522491455078, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -94.3001937866211, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.723039984703064, "rewards_train/margins": 4.403059124946594, "rewards_train/rejected": -3.6800191402435303, "step": 939 }, { "epoch": 1.25, "learning_rate": 4.116658772006797e-07, "loss": 0.0427, "step": 940 }, { "epoch": 1.25, "logps_train/chosen": -47.388282775878906, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -82.37748718261719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7486715316772461, "rewards_train/margins": 3.9637644290924072, "rewards_train/rejected": -3.215092897415161, "step": 940 }, { "epoch": 1.25, "logps_train/chosen": -79.65668487548828, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -135.0927734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.882768988609314, "rewards_train/margins": 5.68110978603363, "rewards_train/rejected": -4.798340797424316, "step": 941 }, { "epoch": 1.25, "learning_rate": 4.1124610793703554e-07, "loss": 0.0444, "step": 942 }, { "epoch": 1.25, "logps_train/chosen": -58.79033660888672, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -99.20828247070312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0990912914276123, "rewards_train/margins": 3.752732515335083, "rewards_train/rejected": -3.6536412239074707, "step": 942 }, { "epoch": 1.25, "logps_train/chosen": -70.18021392822266, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -97.68711853027344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8538536429405212, "rewards_train/margins": 3.2772530913352966, "rewards_train/rejected": -2.4233994483947754, "step": 943 }, { "epoch": 1.25, "learning_rate": 4.108255587924241e-07, "loss": 0.0913, "step": 944 }, { "epoch": 1.25, "logps_train/chosen": -29.704769134521484, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -42.25, "logps_train/rejected": -63.272972106933594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9445928335189819, "rewards_train/margins": 3.0562652349472046, "rewards_train/rejected": -2.1116724014282227, "step": 944 }, { "epoch": 1.25, "logps_train/chosen": -59.856346130371094, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -145.3099822998047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6221776008605957, "rewards_train/margins": 5.682863712310791, "rewards_train/rejected": -5.060686111450195, "step": 945 }, { "epoch": 1.26, "learning_rate": 4.1040423180086835e-07, "loss": 0.0693, "step": 946 }, { "epoch": 1.26, "logps_train/chosen": -30.28253173828125, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -39.75, "logps_train/rejected": -71.4852294921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.605145275592804, "rewards_train/margins": 3.775543510913849, "rewards_train/rejected": -3.170398235321045, "step": 946 }, { "epoch": 1.26, "logps_train/chosen": -65.02572631835938, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -73.66020202636719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2982089519500732, "rewards_train/margins": 3.4173545837402344, "rewards_train/rejected": -2.119145631790161, "step": 947 }, { "epoch": 1.26, "learning_rate": 4.0998212900015343e-07, "loss": 0.0795, "step": 948 }, { "epoch": 1.26, "logps_train/chosen": -50.15205383300781, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -115.87500762939453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9674117565155029, "rewards_train/margins": 4.726006984710693, "rewards_train/rejected": -3.7585952281951904, "step": 948 }, { "epoch": 1.26, "logps_train/chosen": -74.43000793457031, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -143.4873046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.192936658859253, "rewards_train/margins": 5.982293367385864, "rewards_train/rejected": -4.789356708526611, "step": 949 }, { "epoch": 1.26, "learning_rate": 4.095592524318165e-07, "loss": 0.0247, "step": 950 }, { "epoch": 1.26, "logps_train/chosen": -52.825592041015625, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -111.33204650878906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.04271543025970459, "rewards_train/margins": 3.9686142206192017, "rewards_train/rejected": -4.011329650878906, "step": 950 }, { "epoch": 1.26, "logps_train/chosen": -62.18234634399414, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -97.35087585449219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5028594136238098, "rewards_train/margins": 3.570759356021881, "rewards_train/rejected": -3.0678999423980713, "step": 951 }, { "epoch": 1.26, "learning_rate": 4.0913560414113725e-07, "loss": 0.0887, "step": 952 }, { "epoch": 1.26, "logps_train/chosen": -76.3884506225586, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -121.98712158203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7404517531394958, "rewards_train/margins": 4.998538911342621, "rewards_train/rejected": -4.258087158203125, "step": 952 }, { "epoch": 1.27, "logps_train/chosen": -34.61338806152344, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -39.0, "logps_train/rejected": -62.511173248291016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3394424617290497, "rewards_train/margins": 2.681575506925583, "rewards_train/rejected": -2.342133045196533, "step": 953 }, { "epoch": 1.27, "learning_rate": 4.0871118617712785e-07, "loss": 0.1339, "step": 954 }, { "epoch": 1.27, "logps_train/chosen": -68.00106811523438, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -101.47566223144531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.44364291429519653, "rewards_train/margins": 3.934958755970001, "rewards_train/rejected": -3.4913158416748047, "step": 954 }, { "epoch": 1.27, "logps_train/chosen": -41.08306121826172, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -42.75, "logps_train/rejected": -77.29725646972656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.49677225947380066, "rewards_train/margins": 3.9405607879161835, "rewards_train/rejected": -3.443788528442383, "step": 955 }, { "epoch": 1.27, "learning_rate": 4.082860005925231e-07, "loss": 0.0729, "step": 956 }, { "epoch": 1.27, "logps_train/chosen": -56.42616653442383, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -122.04458618164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5136333703994751, "rewards_train/margins": 5.108717560768127, "rewards_train/rejected": -4.595084190368652, "step": 956 }, { "epoch": 1.27, "logps_train/chosen": -62.269317626953125, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -107.33318328857422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6949433088302612, "rewards_train/margins": 4.119667887687683, "rewards_train/rejected": -3.424724578857422, "step": 957 }, { "epoch": 1.27, "learning_rate": 4.0786004944377043e-07, "loss": 0.0289, "step": 958 }, { "epoch": 1.27, "logps_train/chosen": -42.874794006347656, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -39.75, "logps_train/rejected": -63.91020584106445, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9953328371047974, "rewards_train/margins": 3.4183846712112427, "rewards_train/rejected": -2.4230518341064453, "step": 958 }, { "epoch": 1.27, "logps_train/chosen": -42.67525100708008, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -79.3193359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3605998158454895, "rewards_train/margins": 3.5761268734931946, "rewards_train/rejected": -3.215527057647705, "step": 959 }, { "epoch": 1.27, "learning_rate": 4.0743333479102e-07, "loss": 0.0704, "step": 960 }, { "epoch": 1.27, "logps_train/chosen": -66.95230102539062, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -112.90501403808594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5758639574050903, "rewards_train/margins": 4.456990361213684, "rewards_train/rejected": -3.8811264038085938, "step": 960 }, { "epoch": 1.28, "logps_train/chosen": -37.265350341796875, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -38.75, "logps_train/rejected": -59.17976379394531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27112123370170593, "rewards_train/margins": 2.303160399198532, "rewards_train/rejected": -2.032039165496826, "step": 961 }, { "epoch": 1.28, "learning_rate": 4.0700585869811465e-07, "loss": 0.108, "step": 962 }, { "epoch": 1.28, "logps_train/chosen": -40.67923355102539, "logps_train/ref_chosen": -42.5, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -75.93785095214844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.19348713755607605, "rewards_train/margins": 2.821647137403488, "rewards_train/rejected": -2.628159999847412, "step": 962 }, { "epoch": 1.28, "logps_train/chosen": -54.76226043701172, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -131.7702178955078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2643990516662598, "rewards_train/margins": 5.339857578277588, "rewards_train/rejected": -4.075458526611328, "step": 963 }, { "epoch": 1.28, "learning_rate": 4.0657762323258014e-07, "loss": 0.0975, "step": 964 }, { "epoch": 1.28, "logps_train/chosen": -83.19999694824219, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -124.19242858886719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.978438138961792, "rewards_train/margins": 4.899243593215942, "rewards_train/rejected": -3.9208054542541504, "step": 964 }, { "epoch": 1.28, "logps_train/chosen": -73.39173126220703, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -113.95267486572266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08270162343978882, "rewards_train/margins": 3.8014068007469177, "rewards_train/rejected": -3.718705177307129, "step": 965 }, { "epoch": 1.28, "learning_rate": 4.061486304656149e-07, "loss": 0.0427, "step": 966 }, { "epoch": 1.28, "logps_train/chosen": -49.0689582824707, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -98.26251983642578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8099011182785034, "rewards_train/margins": 4.797090411186218, "rewards_train/rejected": -3.987189292907715, "step": 966 }, { "epoch": 1.28, "logps_train/chosen": -66.22107696533203, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -121.72024536132812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8544548153877258, "rewards_train/margins": 5.161636173725128, "rewards_train/rejected": -4.307181358337402, "step": 967 }, { "epoch": 1.29, "learning_rate": 4.057188824720801e-07, "loss": 0.0643, "step": 968 }, { "epoch": 1.29, "logps_train/chosen": -80.0987319946289, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -117.8000717163086, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.4901275038719177, "rewards_train/margins": 3.670134961605072, "rewards_train/rejected": -3.1800074577331543, "step": 968 }, { "epoch": 1.29, "logps_train/chosen": -51.99189758300781, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -46.25, "logps_train/rejected": -73.40035247802734, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2125288248062134, "rewards_train/margins": 3.9181896448135376, "rewards_train/rejected": -2.705660820007324, "step": 969 }, { "epoch": 1.29, "learning_rate": 4.052883813304897e-07, "loss": 0.1029, "step": 970 }, { "epoch": 1.29, "logps_train/chosen": -41.81707000732422, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -108.32887268066406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8354808688163757, "rewards_train/margins": 4.230868756771088, "rewards_train/rejected": -3.395387887954712, "step": 970 }, { "epoch": 1.29, "logps_train/chosen": -70.6596908569336, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -91.08563995361328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5168430805206299, "rewards_train/margins": 3.537907123565674, "rewards_train/rejected": -3.021064043045044, "step": 971 }, { "epoch": 1.29, "learning_rate": 4.048571291230003e-07, "loss": 0.071, "step": 972 }, { "epoch": 1.29, "logps_train/chosen": -36.96552276611328, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -35.25, "logps_train/rejected": -63.379859924316406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6413381099700928, "rewards_train/margins": 3.4558866024017334, "rewards_train/rejected": -2.8145484924316406, "step": 972 }, { "epoch": 1.29, "logps_train/chosen": -62.48179626464844, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -95.2090835571289, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1712360382080078, "rewards_train/margins": 4.900933265686035, "rewards_train/rejected": -3.7296972274780273, "step": 973 }, { "epoch": 1.29, "learning_rate": 4.0442512793540107e-07, "loss": 0.1032, "step": 974 }, { "epoch": 1.29, "logps_train/chosen": -42.25353240966797, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -37.25, "logps_train/rejected": -52.122840881347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2648809850215912, "rewards_train/margins": 1.7584150731563568, "rewards_train/rejected": -1.4935340881347656, "step": 974 }, { "epoch": 1.29, "logps_train/chosen": -80.43029022216797, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -153.0605010986328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08822091668844223, "rewards_train/margins": 5.933333240449429, "rewards_train/rejected": -5.845112323760986, "step": 975 }, { "epoch": 1.3, "learning_rate": 4.0399237985710365e-07, "loss": 0.1042, "step": 976 }, { "epoch": 1.3, "logps_train/chosen": -42.721588134765625, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -95.30130767822266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.16502982378005981, "rewards_train/margins": 3.906507194042206, "rewards_train/rejected": -4.071537017822266, "step": 976 }, { "epoch": 1.3, "logps_train/chosen": -50.351463317871094, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -101.41901397705078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9757912158966064, "rewards_train/margins": 4.138005495071411, "rewards_train/rejected": -3.1622142791748047, "step": 977 }, { "epoch": 1.3, "learning_rate": 4.0355888698113227e-07, "loss": 0.0927, "step": 978 }, { "epoch": 1.3, "logps_train/chosen": -73.92927551269531, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -141.10940551757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23988446593284607, "rewards_train/margins": 4.886761277914047, "rewards_train/rejected": -4.646876811981201, "step": 978 }, { "epoch": 1.3, "logps_train/chosen": -33.883792877197266, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -44.75, "logps_train/rejected": -76.19090270996094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6881831884384155, "rewards_train/margins": 3.8439923524856567, "rewards_train/rejected": -3.155809164047241, "step": 979 }, { "epoch": 1.3, "learning_rate": 4.0312465140411323e-07, "loss": 0.0837, "step": 980 }, { "epoch": 1.3, "logps_train/chosen": -74.60830688476562, "logps_train/ref_chosen": -90.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -101.61077880859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.5969820022583008, "rewards_train/margins": 4.809622049331665, "rewards_train/rejected": -3.2126400470733643, "step": 980 }, { "epoch": 1.3, "logps_train/chosen": -45.12177276611328, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -107.99531555175781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3663383722305298, "rewards_train/margins": 4.236182332038879, "rewards_train/rejected": -3.8698439598083496, "step": 981 }, { "epoch": 1.3, "learning_rate": 4.02689675226265e-07, "loss": 0.0304, "step": 982 }, { "epoch": 1.3, "logps_train/chosen": -71.71450805664062, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -108.8749771118164, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8519866466522217, "rewards_train/margins": 3.9590156078338623, "rewards_train/rejected": -3.1070289611816406, "step": 982 }, { "epoch": 1.31, "logps_train/chosen": -44.77873992919922, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -90.83544921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6157783269882202, "rewards_train/margins": 3.8610416650772095, "rewards_train/rejected": -3.2452633380889893, "step": 983 }, { "epoch": 1.31, "learning_rate": 4.022539605513882e-07, "loss": 0.0651, "step": 984 }, { "epoch": 1.31, "logps_train/chosen": -69.95819854736328, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -108.64181518554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.46886688470840454, "rewards_train/margins": 3.9062523245811462, "rewards_train/rejected": -4.375119209289551, "step": 984 }, { "epoch": 1.31, "logps_train/chosen": -53.61906814575195, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -96.06985473632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6142650246620178, "rewards_train/margins": 3.8251567482948303, "rewards_train/rejected": -3.2108917236328125, "step": 985 }, { "epoch": 1.31, "learning_rate": 4.01817509486855e-07, "loss": 0.1021, "step": 986 }, { "epoch": 1.31, "logps_train/chosen": -70.08271026611328, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -116.93687438964844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9011039137840271, "rewards_train/margins": 5.472917020320892, "rewards_train/rejected": -4.571813106536865, "step": 986 }, { "epoch": 1.31, "logps_train/chosen": -106.01170349121094, "logps_train/ref_chosen": -111.0, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -136.1728515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5050796866416931, "rewards_train/margins": 4.554005563259125, "rewards_train/rejected": -4.048925876617432, "step": 987 }, { "epoch": 1.31, "learning_rate": 4.013803241435995e-07, "loss": 0.0495, "step": 988 }, { "epoch": 1.31, "logps_train/chosen": -39.71161651611328, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -34.75, "logps_train/rejected": -60.27724075317383, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2882133722305298, "rewards_train/margins": 3.8319531679153442, "rewards_train/rejected": -2.5437397956848145, "step": 988 }, { "epoch": 1.31, "logps_train/chosen": -69.28598022460938, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -120.88604736328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12687040865421295, "rewards_train/margins": 3.9264133125543594, "rewards_train/rejected": -3.7995429039001465, "step": 989 }, { "epoch": 1.31, "learning_rate": 4.009424066361068e-07, "loss": 0.0698, "step": 990 }, { "epoch": 1.31, "logps_train/chosen": -49.20613479614258, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -100.90812683105469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2520427405834198, "rewards_train/margins": 4.060043066740036, "rewards_train/rejected": -3.808000326156616, "step": 990 }, { "epoch": 1.32, "logps_train/chosen": -59.874046325683594, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -83.2501220703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6211891174316406, "rewards_train/margins": 3.183701992034912, "rewards_train/rejected": -2.5625128746032715, "step": 991 }, { "epoch": 1.32, "learning_rate": 4.0050375908240354e-07, "loss": 0.0626, "step": 992 }, { "epoch": 1.32, "logps_train/chosen": -37.99253463745117, "logps_train/ref_chosen": -43.25, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -87.21085357666016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5360980033874512, "rewards_train/margins": 3.845561981201172, "rewards_train/rejected": -3.3094639778137207, "step": 992 }, { "epoch": 1.32, "logps_train/chosen": -41.14564514160156, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -87.1806640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6236007213592529, "rewards_train/margins": 4.1629555225372314, "rewards_train/rejected": -3.5393548011779785, "step": 993 }, { "epoch": 1.32, "learning_rate": 4.0006438360404706e-07, "loss": 0.0665, "step": 994 }, { "epoch": 1.32, "logps_train/chosen": -46.028621673583984, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -87.7632827758789, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9674502611160278, "rewards_train/margins": 4.587528586387634, "rewards_train/rejected": -3.6200783252716064, "step": 994 }, { "epoch": 1.32, "logps_train/chosen": -56.307796478271484, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -104.26795959472656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.17390775680542, "rewards_train/margins": 4.7124223709106445, "rewards_train/rejected": -3.5385146141052246, "step": 995 }, { "epoch": 1.32, "learning_rate": 3.9962428232611557e-07, "loss": 0.0223, "step": 996 }, { "epoch": 1.32, "logps_train/chosen": -53.28760528564453, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -90.47726440429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23881781101226807, "rewards_train/margins": 3.460908532142639, "rewards_train/rejected": -3.222090721130371, "step": 996 }, { "epoch": 1.32, "logps_train/chosen": -53.569358825683594, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -82.82012176513672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.3407207727432251, "rewards_train/margins": 2.681327223777771, "rewards_train/rejected": -2.340606451034546, "step": 997 }, { "epoch": 1.33, "learning_rate": 3.991834573771975e-07, "loss": 0.152, "step": 998 }, { "epoch": 1.33, "logps_train/chosen": -53.26763153076172, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -101.80069732666016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5872995853424072, "rewards_train/margins": 4.925963640213013, "rewards_train/rejected": -4.3386640548706055, "step": 998 }, { "epoch": 1.33, "logps_train/chosen": -51.77572250366211, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -97.53910827636719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1880526542663574, "rewards_train/margins": 4.655244827270508, "rewards_train/rejected": -3.4671921730041504, "step": 999 }, { "epoch": 1.33, "learning_rate": 3.9874191088938145e-07, "loss": 0.0473, "step": 1000 }, { "epoch": 1.33, "logps_train/chosen": -67.94255828857422, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -109.68108367919922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.42918145656585693, "rewards_train/margins": 3.4801026582717896, "rewards_train/rejected": -3.0509212017059326, "step": 1000 }, { "epoch": 1.33, "logps_train/chosen": -53.12202453613281, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -89.68846130371094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.049516201019287, "rewards_train/margins": 4.687112092971802, "rewards_train/rejected": -3.6375958919525146, "step": 1001 }, { "epoch": 1.33, "learning_rate": 3.9829964499824584e-07, "loss": 0.0681, "step": 1002 }, { "epoch": 1.33, "logps_train/chosen": -45.417903900146484, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -136.85548400878906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7019596099853516, "rewards_train/margins": 5.3281331062316895, "rewards_train/rejected": -4.626173496246338, "step": 1002 }, { "epoch": 1.33, "logps_train/chosen": -80.79769897460938, "logps_train/ref_chosen": -92.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -133.72149658203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1256986856460571, "rewards_train/margins": 4.87206757068634, "rewards_train/rejected": -3.746368885040283, "step": 1003 }, { "epoch": 1.33, "learning_rate": 3.9785666184284845e-07, "loss": 0.0213, "step": 1004 }, { "epoch": 1.33, "logps_train/chosen": -56.036407470703125, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -94.72097778320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.4760464429855347, "rewards_train/margins": 5.1372071504592896, "rewards_train/rejected": -3.661160707473755, "step": 1004 }, { "epoch": 1.33, "logps_train/chosen": -83.94972229003906, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -139.4557647705078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15659064054489136, "rewards_train/margins": 5.177167117595673, "rewards_train/rejected": -5.020576477050781, "step": 1005 }, { "epoch": 1.34, "learning_rate": 3.974129635657162e-07, "loss": 0.0134, "step": 1006 }, { "epoch": 1.34, "logps_train/chosen": -70.18133544921875, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -111.02115631103516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0607725381851196, "rewards_train/margins": 4.70077908039093, "rewards_train/rejected": -3.6400065422058105, "step": 1006 }, { "epoch": 1.34, "logps_train/chosen": -50.34516525268555, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -115.85397338867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9792284965515137, "rewards_train/margins": 4.222438097000122, "rewards_train/rejected": -3.2432096004486084, "step": 1007 }, { "epoch": 1.34, "learning_rate": 3.969685523128349e-07, "loss": 0.0694, "step": 1008 }, { "epoch": 1.34, "logps_train/chosen": -52.69221496582031, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -98.31047058105469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8526535034179688, "rewards_train/margins": 4.3962016105651855, "rewards_train/rejected": -3.543548107147217, "step": 1008 }, { "epoch": 1.34, "logps_train/chosen": -45.293094635009766, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -101.41578674316406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3120967149734497, "rewards_train/margins": 4.282581686973572, "rewards_train/rejected": -3.970484972000122, "step": 1009 }, { "epoch": 1.34, "learning_rate": 3.9652343023363854e-07, "loss": 0.0461, "step": 1010 }, { "epoch": 1.34, "logps_train/chosen": -65.95108795166016, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -109.56602478027344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4259849488735199, "rewards_train/margins": 4.029852956533432, "rewards_train/rejected": -3.603868007659912, "step": 1010 }, { "epoch": 1.34, "logps_train/chosen": -54.834251403808594, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -94.55369567871094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6900125741958618, "rewards_train/margins": 4.3539756536483765, "rewards_train/rejected": -3.6639630794525146, "step": 1011 }, { "epoch": 1.34, "learning_rate": 3.960775994809992e-07, "loss": 0.0807, "step": 1012 }, { "epoch": 1.34, "logps_train/chosen": -36.85002517700195, "logps_train/ref_chosen": -34.75, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -71.35670471191406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2029712200164795, "rewards_train/margins": 3.1733243465423584, "rewards_train/rejected": -3.376295566558838, "step": 1012 }, { "epoch": 1.35, "logps_train/chosen": -60.6200065612793, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -89.31283569335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.059874415397644, "rewards_train/margins": 3.7856889963150024, "rewards_train/rejected": -2.7258145809173584, "step": 1013 }, { "epoch": 1.35, "learning_rate": 3.956310622112165e-07, "loss": 0.072, "step": 1014 }, { "epoch": 1.35, "logps_train/chosen": -67.36927795410156, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -101.2596664428711, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9169785976409912, "rewards_train/margins": 4.32224178314209, "rewards_train/rejected": -3.4052631855010986, "step": 1014 }, { "epoch": 1.35, "logps_train/chosen": -41.60442352294922, "logps_train/ref_chosen": -43.25, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -96.79115295410156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1582099199295044, "rewards_train/margins": 3.336934447288513, "rewards_train/rejected": -3.178724527359009, "step": 1015 }, { "epoch": 1.35, "learning_rate": 3.9518382058400704e-07, "loss": 0.1142, "step": 1016 }, { "epoch": 1.35, "logps_train/chosen": -52.19397735595703, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -83.73917388916016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6188836097717285, "rewards_train/margins": 3.8568637371063232, "rewards_train/rejected": -3.2379801273345947, "step": 1016 }, { "epoch": 1.35, "logps_train/chosen": -65.00798034667969, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -91.54377746582031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6523267030715942, "rewards_train/margins": 3.672329068183899, "rewards_train/rejected": -3.0200023651123047, "step": 1017 }, { "epoch": 1.35, "learning_rate": 3.947358767624944e-07, "loss": 0.0424, "step": 1018 }, { "epoch": 1.35, "logps_train/chosen": -59.24694061279297, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -107.57969665527344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2692509591579437, "rewards_train/margins": 4.517064183950424, "rewards_train/rejected": -4.2478132247924805, "step": 1018 }, { "epoch": 1.35, "logps_train/chosen": -89.07691955566406, "logps_train/ref_chosen": -92.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -104.98500061035156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3313709497451782, "rewards_train/margins": 3.690808892250061, "rewards_train/rejected": -3.359437942504883, "step": 1019 }, { "epoch": 1.35, "learning_rate": 3.942872329131982e-07, "loss": 0.0533, "step": 1020 }, { "epoch": 1.35, "logps_train/chosen": -48.38117218017578, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -42.0, "logps_train/rejected": -68.29963684082031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.07946094125509262, "rewards_train/margins": 2.702588878571987, "rewards_train/rejected": -2.6231279373168945, "step": 1020 }, { "epoch": 1.36, "logps_train/chosen": -39.482635498046875, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -68.35737609863281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7329864501953125, "rewards_train/margins": 2.9062247276306152, "rewards_train/rejected": -2.1732382774353027, "step": 1021 }, { "epoch": 1.36, "learning_rate": 3.9383789120602373e-07, "loss": 0.1518, "step": 1022 }, { "epoch": 1.36, "logps_train/chosen": -67.55949401855469, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -110.52125549316406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4628005921840668, "rewards_train/margins": 4.716488927602768, "rewards_train/rejected": -4.253688335418701, "step": 1022 }, { "epoch": 1.36, "logps_train/chosen": -59.67716979980469, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -122.22808837890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18892377614974976, "rewards_train/margins": 5.23321670293808, "rewards_train/rejected": -5.04429292678833, "step": 1023 }, { "epoch": 1.36, "learning_rate": 3.9338785381425176e-07, "loss": 0.0433, "step": 1024 }, { "epoch": 1.36, "logps_train/chosen": -90.24518585205078, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -139.3506622314453, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.10360658168792725, "rewards_train/margins": 3.993359684944153, "rewards_train/rejected": -3.8897531032562256, "step": 1024 }, { "epoch": 1.36, "logps_train/chosen": -35.725196838378906, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -82.62857818603516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9130270481109619, "rewards_train/margins": 4.032134771347046, "rewards_train/rejected": -3.119107723236084, "step": 1025 }, { "epoch": 1.36, "learning_rate": 3.929371229145275e-07, "loss": 0.2019, "step": 1026 }, { "epoch": 1.36, "logps_train/chosen": -74.83785247802734, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -119.43509674072266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4130899906158447, "rewards_train/margins": 4.7034752368927, "rewards_train/rejected": -4.2903852462768555, "step": 1026 }, { "epoch": 1.36, "logps_train/chosen": -42.357696533203125, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -84.29924774169922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.38591018319129944, "rewards_train/margins": 3.8814598619937897, "rewards_train/rejected": -3.4955496788024902, "step": 1027 }, { "epoch": 1.37, "learning_rate": 3.924857006868508e-07, "loss": 0.031, "step": 1028 }, { "epoch": 1.37, "logps_train/chosen": -62.99864196777344, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -113.484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7876359224319458, "rewards_train/margins": 4.931385159492493, "rewards_train/rejected": -4.143749237060547, "step": 1028 }, { "epoch": 1.37, "logps_train/chosen": -39.19382858276367, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -67.39474487304688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6212420463562012, "rewards_train/margins": 3.7962634563446045, "rewards_train/rejected": -3.1750214099884033, "step": 1029 }, { "epoch": 1.37, "learning_rate": 3.9203358931456474e-07, "loss": 0.0315, "step": 1030 }, { "epoch": 1.37, "logps_train/chosen": -51.22350311279297, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -103.72412872314453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19639964401721954, "rewards_train/margins": 3.426625445485115, "rewards_train/rejected": -3.2302258014678955, "step": 1030 }, { "epoch": 1.37, "logps_train/chosen": -55.18017578125, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -104.96635437011719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4007323384284973, "rewards_train/margins": 4.317681133747101, "rewards_train/rejected": -3.9169487953186035, "step": 1031 }, { "epoch": 1.37, "learning_rate": 3.915807909843457e-07, "loss": 0.0399, "step": 1032 }, { "epoch": 1.37, "logps_train/chosen": -51.672515869140625, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -97.20822143554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.49446725845336914, "rewards_train/margins": 4.267633438110352, "rewards_train/rejected": -3.7731661796569824, "step": 1032 }, { "epoch": 1.37, "logps_train/chosen": -76.35983276367188, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -106.61408996582031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2027553915977478, "rewards_train/margins": 3.2094350457191467, "rewards_train/rejected": -3.4121904373168945, "step": 1033 }, { "epoch": 1.37, "learning_rate": 3.911273078861926e-07, "loss": 0.1049, "step": 1034 }, { "epoch": 1.37, "logps_train/chosen": -59.87396240234375, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -102.09280395507812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17705705761909485, "rewards_train/margins": 3.7832129299640656, "rewards_train/rejected": -3.6061558723449707, "step": 1034 }, { "epoch": 1.37, "logps_train/chosen": -36.76939392089844, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -69.31784057617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0308732986450195, "rewards_train/margins": 3.701328754425049, "rewards_train/rejected": -2.6704554557800293, "step": 1035 }, { "epoch": 1.38, "learning_rate": 3.906731422134164e-07, "loss": 0.0946, "step": 1036 }, { "epoch": 1.38, "logps_train/chosen": -54.1653938293457, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -84.251708984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6131480932235718, "rewards_train/margins": 3.5605849027633667, "rewards_train/rejected": -2.947436809539795, "step": 1036 }, { "epoch": 1.38, "logps_train/chosen": -67.73870086669922, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -106.1065444946289, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4706614911556244, "rewards_train/margins": 5.057487815618515, "rewards_train/rejected": -4.586826324462891, "step": 1037 }, { "epoch": 1.38, "learning_rate": 3.9021829616262913e-07, "loss": 0.0614, "step": 1038 }, { "epoch": 1.38, "logps_train/chosen": -63.47399139404297, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -127.60704040527344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3811319172382355, "rewards_train/margins": 4.873085767030716, "rewards_train/rejected": -4.4919538497924805, "step": 1038 }, { "epoch": 1.38, "logps_train/chosen": -59.90660858154297, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -136.5363006591797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25465142726898193, "rewards_train/margins": 5.894218802452087, "rewards_train/rejected": -5.6395673751831055, "step": 1039 }, { "epoch": 1.38, "learning_rate": 3.8976277193373377e-07, "loss": 0.0246, "step": 1040 }, { "epoch": 1.38, "logps_train/chosen": -52.551116943359375, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -115.63165283203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7495759725570679, "rewards_train/margins": 4.58617889881134, "rewards_train/rejected": -3.8366029262542725, "step": 1040 }, { "epoch": 1.38, "logps_train/chosen": -22.925827026367188, "logps_train/ref_chosen": -27.5, "logps_train/ref_rejected": -35.5, "logps_train/rejected": -62.66178894042969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4554642140865326, "rewards_train/margins": 3.1657839715480804, "rewards_train/rejected": -2.710319757461548, "step": 1041 }, { "epoch": 1.38, "learning_rate": 3.8930657172991316e-07, "loss": 0.0512, "step": 1042 }, { "epoch": 1.38, "logps_train/chosen": -53.44230651855469, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -93.5882568359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6480851173400879, "rewards_train/margins": 4.873707294464111, "rewards_train/rejected": -4.225622177124023, "step": 1042 }, { "epoch": 1.39, "logps_train/chosen": -59.77875900268555, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -100.12228393554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25073757767677307, "rewards_train/margins": 4.154762893915176, "rewards_train/rejected": -3.9040253162384033, "step": 1043 }, { "epoch": 1.39, "learning_rate": 3.888496977576198e-07, "loss": 0.0637, "step": 1044 }, { "epoch": 1.39, "logps_train/chosen": -55.947975158691406, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -112.9357681274414, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4817647635936737, "rewards_train/margins": 5.142529457807541, "rewards_train/rejected": -4.660764694213867, "step": 1044 }, { "epoch": 1.39, "logps_train/chosen": -69.31583404541016, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -63.5, "logps_train/rejected": -102.5622787475586, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5824788808822632, "rewards_train/margins": 4.487144112586975, "rewards_train/rejected": -3.904665231704712, "step": 1045 }, { "epoch": 1.39, "learning_rate": 3.883921522265646e-07, "loss": 0.0261, "step": 1046 }, { "epoch": 1.39, "logps_train/chosen": -70.0384521484375, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -113.40721130371094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18209242820739746, "rewards_train/margins": 4.174376487731934, "rewards_train/rejected": -3.992284059524536, "step": 1046 }, { "epoch": 1.39, "logps_train/chosen": -58.87603759765625, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -87.86286926269531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7436462640762329, "rewards_train/margins": 3.533059000968933, "rewards_train/rejected": -2.7894127368927, "step": 1047 }, { "epoch": 1.39, "learning_rate": 3.87933937349707e-07, "loss": 0.0678, "step": 1048 }, { "epoch": 1.39, "logps_train/chosen": -58.437591552734375, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -90.81820678710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29061585664749146, "rewards_train/margins": 3.9052491784095764, "rewards_train/rejected": -3.614633321762085, "step": 1048 }, { "epoch": 1.39, "logps_train/chosen": -43.98640441894531, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -52.75, "logps_train/rejected": -81.9910888671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27401575446128845, "rewards_train/margins": 3.1918743550777435, "rewards_train/rejected": -2.917858600616455, "step": 1049 }, { "epoch": 1.39, "learning_rate": 3.874750553432433e-07, "loss": 0.0729, "step": 1050 }, { "epoch": 1.39, "logps_train/chosen": -62.12126922607422, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -138.9076385498047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6417795419692993, "rewards_train/margins": 5.654418587684631, "rewards_train/rejected": -5.012639045715332, "step": 1050 }, { "epoch": 1.4, "logps_train/chosen": -69.0708236694336, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -97.9734878540039, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.04023642838001251, "rewards_train/margins": 2.8766434639692307, "rewards_train/rejected": -2.916879892349243, "step": 1051 }, { "epoch": 1.4, "learning_rate": 3.870155084265967e-07, "loss": 0.069, "step": 1052 }, { "epoch": 1.4, "logps_train/chosen": -61.396236419677734, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -106.94522094726562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5814701318740845, "rewards_train/margins": 4.19044554233551, "rewards_train/rejected": -3.608975410461426, "step": 1052 }, { "epoch": 1.4, "logps_train/chosen": -65.21392059326172, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -127.26935577392578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.390326589345932, "rewards_train/margins": 4.41960534453392, "rewards_train/rejected": -4.029278755187988, "step": 1053 }, { "epoch": 1.4, "learning_rate": 3.865552988224062e-07, "loss": 0.0455, "step": 1054 }, { "epoch": 1.4, "logps_train/chosen": -39.886680603027344, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -82.67132568359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8449255228042603, "rewards_train/margins": 3.7524880170822144, "rewards_train/rejected": -2.907562494277954, "step": 1054 }, { "epoch": 1.4, "logps_train/chosen": -72.55946350097656, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -135.72982788085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6815534830093384, "rewards_train/margins": 5.444380402565002, "rewards_train/rejected": -4.762826919555664, "step": 1055 }, { "epoch": 1.4, "learning_rate": 3.86094428756516e-07, "loss": 0.079, "step": 1056 }, { "epoch": 1.4, "logps_train/chosen": -69.78994750976562, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -99.00350952148438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.328036904335022, "rewards_train/margins": 3.648309111595154, "rewards_train/rejected": -3.320272207260132, "step": 1056 }, { "epoch": 1.4, "logps_train/chosen": -73.80904388427734, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -91.90657043457031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5300331115722656, "rewards_train/margins": 3.8800649642944336, "rewards_train/rejected": -3.350031852722168, "step": 1057 }, { "epoch": 1.41, "learning_rate": 3.856329004579647e-07, "loss": 0.0279, "step": 1058 }, { "epoch": 1.41, "logps_train/chosen": -67.9787368774414, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -91.948486328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9634546637535095, "rewards_train/margins": 3.448147237300873, "rewards_train/rejected": -2.4846925735473633, "step": 1058 }, { "epoch": 1.41, "logps_train/chosen": -56.517417907714844, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -78.15455627441406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2271642684936523, "rewards_train/margins": 3.354339122772217, "rewards_train/rejected": -2.1271748542785645, "step": 1059 }, { "epoch": 1.41, "learning_rate": 3.8517071615897443e-07, "loss": 0.1296, "step": 1060 }, { "epoch": 1.41, "logps_train/chosen": -84.07479858398438, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -139.5961151123047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1758396327495575, "rewards_train/margins": 4.849397450685501, "rewards_train/rejected": -5.025237083435059, "step": 1060 }, { "epoch": 1.41, "logps_train/chosen": -57.02593994140625, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -105.87884521484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5033937096595764, "rewards_train/margins": 4.244012415409088, "rewards_train/rejected": -3.7406187057495117, "step": 1061 }, { "epoch": 1.41, "learning_rate": 3.847078780949401e-07, "loss": 0.0518, "step": 1062 }, { "epoch": 1.41, "logps_train/chosen": -71.30848693847656, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -104.53205871582031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4847757816314697, "rewards_train/margins": 4.8950135707855225, "rewards_train/rejected": -4.410237789154053, "step": 1062 }, { "epoch": 1.41, "logps_train/chosen": -47.513587951660156, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -69.87454223632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.03418812155723572, "rewards_train/margins": 2.6255482137203217, "rewards_train/rejected": -2.591360092163086, "step": 1063 }, { "epoch": 1.41, "learning_rate": 3.8424438850441885e-07, "loss": 0.0852, "step": 1064 }, { "epoch": 1.41, "logps_train/chosen": -73.03597259521484, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -123.27498626708984, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.572184145450592, "rewards_train/margins": 4.182495176792145, "rewards_train/rejected": -3.6103110313415527, "step": 1064 }, { "epoch": 1.41, "logps_train/chosen": -58.52642059326172, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -111.962646484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.09561094641685486, "rewards_train/margins": 4.01393523812294, "rewards_train/rejected": -4.109546184539795, "step": 1065 }, { "epoch": 1.42, "learning_rate": 3.837802496291186e-07, "loss": 0.0634, "step": 1066 }, { "epoch": 1.42, "logps_train/chosen": -87.14884948730469, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -140.74317932128906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19448992609977722, "rewards_train/margins": 4.715682476758957, "rewards_train/rejected": -4.52119255065918, "step": 1066 }, { "epoch": 1.42, "logps_train/chosen": -55.47509765625, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -103.63142395019531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1962401121854782, "rewards_train/margins": 4.593757793307304, "rewards_train/rejected": -4.397517681121826, "step": 1067 }, { "epoch": 1.42, "learning_rate": 3.8331546371388797e-07, "loss": 0.026, "step": 1068 }, { "epoch": 1.42, "logps_train/chosen": -46.59416198730469, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -73.27717590332031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2671464681625366, "rewards_train/margins": 3.237052083015442, "rewards_train/rejected": -2.9699056148529053, "step": 1068 }, { "epoch": 1.42, "logps_train/chosen": -81.00228881835938, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -108.65495300292969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7060210704803467, "rewards_train/margins": 4.946516752243042, "rewards_train/rejected": -4.240495681762695, "step": 1069 }, { "epoch": 1.42, "learning_rate": 3.828500330067047e-07, "loss": 0.053, "step": 1070 }, { "epoch": 1.42, "logps_train/chosen": -60.29121017456055, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -86.41423034667969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5646289587020874, "rewards_train/margins": 3.957614779472351, "rewards_train/rejected": -3.3929858207702637, "step": 1070 }, { "epoch": 1.42, "logps_train/chosen": -36.730682373046875, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -70.3037109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4820095896720886, "rewards_train/margins": 2.9131621718406677, "rewards_train/rejected": -2.431152582168579, "step": 1071 }, { "epoch": 1.42, "learning_rate": 3.823839597586654e-07, "loss": 0.0692, "step": 1072 }, { "epoch": 1.42, "logps_train/chosen": -33.606929779052734, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -90.87196350097656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5670413374900818, "rewards_train/margins": 4.056581437587738, "rewards_train/rejected": -3.4895401000976562, "step": 1072 }, { "epoch": 1.42, "logps_train/chosen": -36.388885498046875, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -87.69721221923828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8017363548278809, "rewards_train/margins": 3.945675849914551, "rewards_train/rejected": -3.14393949508667, "step": 1073 }, { "epoch": 1.43, "learning_rate": 3.8191724622397424e-07, "loss": 0.0336, "step": 1074 }, { "epoch": 1.43, "logps_train/chosen": -71.81846618652344, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -126.84252166748047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4345601499080658, "rewards_train/margins": 5.360218375921249, "rewards_train/rejected": -4.925658226013184, "step": 1074 }, { "epoch": 1.43, "logps_train/chosen": -33.8405876159668, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -36.75, "logps_train/rejected": -61.79624938964844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4596914052963257, "rewards_train/margins": 2.975253462791443, "rewards_train/rejected": -2.515562057495117, "step": 1075 }, { "epoch": 1.43, "learning_rate": 3.8144989465993237e-07, "loss": 0.0405, "step": 1076 }, { "epoch": 1.43, "logps_train/chosen": -69.2583236694336, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -123.81806945800781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09135481715202332, "rewards_train/margins": 4.496599167585373, "rewards_train/rejected": -4.40524435043335, "step": 1076 }, { "epoch": 1.43, "logps_train/chosen": -68.01119995117188, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -100.36489868164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06294223666191101, "rewards_train/margins": 4.075995177030563, "rewards_train/rejected": -4.013052940368652, "step": 1077 }, { "epoch": 1.43, "learning_rate": 3.809819073269265e-07, "loss": 0.0309, "step": 1078 }, { "epoch": 1.43, "logps_train/chosen": -109.69962310791016, "logps_train/ref_chosen": -115.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -137.47056579589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5191001892089844, "rewards_train/margins": 4.344281196594238, "rewards_train/rejected": -3.825181007385254, "step": 1078 }, { "epoch": 1.43, "logps_train/chosen": -37.2322998046875, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -39.0, "logps_train/rejected": -68.06069946289062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2470826506614685, "rewards_train/margins": 3.1429962515830994, "rewards_train/rejected": -2.895913600921631, "step": 1079 }, { "epoch": 1.43, "learning_rate": 3.8051328648841854e-07, "loss": 0.0729, "step": 1080 }, { "epoch": 1.43, "logps_train/chosen": -45.251182556152344, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -88.29017639160156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6326940655708313, "rewards_train/margins": 3.7273367047309875, "rewards_train/rejected": -3.0946426391601562, "step": 1080 }, { "epoch": 1.44, "logps_train/chosen": -73.56950378417969, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -111.57798767089844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1430499106645584, "rewards_train/margins": 4.233661368489265, "rewards_train/rejected": -4.090611457824707, "step": 1081 }, { "epoch": 1.44, "learning_rate": 3.8004403441093436e-07, "loss": 0.0348, "step": 1082 }, { "epoch": 1.44, "logps_train/chosen": -77.24303436279297, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -126.18246459960938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3858528733253479, "rewards_train/margins": 4.23847371339798, "rewards_train/rejected": -3.852620840072632, "step": 1082 }, { "epoch": 1.44, "logps_train/chosen": -47.20745849609375, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -102.12366485595703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5440980195999146, "rewards_train/margins": 4.372089743614197, "rewards_train/rejected": -3.8279917240142822, "step": 1083 }, { "epoch": 1.44, "learning_rate": 3.7957415336405284e-07, "loss": 0.0587, "step": 1084 }, { "epoch": 1.44, "logps_train/chosen": -28.81665802001953, "logps_train/ref_chosen": -28.375, "logps_train/ref_rejected": -32.0, "logps_train/rejected": -56.57765197753906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.04182225465774536, "rewards_train/margins": 2.4061774611473083, "rewards_train/rejected": -2.4479997158050537, "step": 1084 }, { "epoch": 1.44, "logps_train/chosen": -66.69900512695312, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -93.84065246582031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33009931445121765, "rewards_train/margins": 3.148539274930954, "rewards_train/rejected": -2.8184399604797363, "step": 1085 }, { "epoch": 1.44, "learning_rate": 3.79103645620395e-07, "loss": 0.1178, "step": 1086 }, { "epoch": 1.44, "logps_train/chosen": -66.89918518066406, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -119.32633972167969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2725811302661896, "rewards_train/margins": 4.717714816331863, "rewards_train/rejected": -4.445133686065674, "step": 1086 }, { "epoch": 1.44, "logps_train/chosen": -73.22869110107422, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -119.14118194580078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.40568163990974426, "rewards_train/margins": 4.545936554670334, "rewards_train/rejected": -4.951618194580078, "step": 1087 }, { "epoch": 1.44, "learning_rate": 3.786325134556128e-07, "loss": 0.0214, "step": 1088 }, { "epoch": 1.44, "logps_train/chosen": -41.32347106933594, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -62.88675308227539, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5395280122756958, "rewards_train/margins": 3.261797070503235, "rewards_train/rejected": -2.722269058227539, "step": 1088 }, { "epoch": 1.45, "logps_train/chosen": -56.772151947021484, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -109.71208953857422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9141911864280701, "rewards_train/margins": 4.727587401866913, "rewards_train/rejected": -3.8133962154388428, "step": 1089 }, { "epoch": 1.45, "learning_rate": 3.781607591483784e-07, "loss": 0.036, "step": 1090 }, { "epoch": 1.45, "logps_train/chosen": -52.89753723144531, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -39.0, "logps_train/rejected": -72.97647094726562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33212143182754517, "rewards_train/margins": 3.725471556186676, "rewards_train/rejected": -3.393350124359131, "step": 1090 }, { "epoch": 1.45, "logps_train/chosen": -53.46153259277344, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -46.25, "logps_train/rejected": -85.15753173828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4772842526435852, "rewards_train/margins": 4.3735063672065735, "rewards_train/rejected": -3.8962221145629883, "step": 1091 }, { "epoch": 1.45, "learning_rate": 3.7768838498037293e-07, "loss": 0.084, "step": 1092 }, { "epoch": 1.45, "logps_train/chosen": -68.80516815185547, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -101.49811553955078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5296397805213928, "rewards_train/margins": 4.754451811313629, "rewards_train/rejected": -4.224812030792236, "step": 1092 }, { "epoch": 1.45, "logps_train/chosen": -108.41425323486328, "logps_train/ref_chosen": -109.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -132.552001953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07419991493225098, "rewards_train/margins": 4.013775110244751, "rewards_train/rejected": -3.9395751953125, "step": 1093 }, { "epoch": 1.45, "learning_rate": 3.772153932362756e-07, "loss": 0.0411, "step": 1094 }, { "epoch": 1.45, "logps_train/chosen": -60.97172546386719, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -105.09901428222656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7200145125389099, "rewards_train/margins": 4.739291131496429, "rewards_train/rejected": -4.0192766189575195, "step": 1094 }, { "epoch": 1.45, "logps_train/chosen": -73.78340911865234, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -105.59697723388672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8161907196044922, "rewards_train/margins": 4.154013633728027, "rewards_train/rejected": -3.337822914123535, "step": 1095 }, { "epoch": 1.46, "learning_rate": 3.767417862037525e-07, "loss": 0.0321, "step": 1096 }, { "epoch": 1.46, "logps_train/chosen": -89.45512390136719, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -131.07769775390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4271436929702759, "rewards_train/margins": 4.430226445198059, "rewards_train/rejected": -4.003082752227783, "step": 1096 }, { "epoch": 1.46, "logps_train/chosen": -83.60784912109375, "logps_train/ref_chosen": -95.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -113.8121566772461, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 1.13608980178833, "rewards_train/margins": 5.437617778778076, "rewards_train/rejected": -4.301527976989746, "step": 1097 }, { "epoch": 1.46, "learning_rate": 3.762675661734457e-07, "loss": 0.0868, "step": 1098 }, { "epoch": 1.46, "logps_train/chosen": -43.76305389404297, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -76.89112854003906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4572885036468506, "rewards_train/margins": 3.4643702507019043, "rewards_train/rejected": -3.0070817470550537, "step": 1098 }, { "epoch": 1.46, "logps_train/chosen": -58.350563049316406, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -91.43194580078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04258058965206146, "rewards_train/margins": 3.3553061336278915, "rewards_train/rejected": -3.31272554397583, "step": 1099 }, { "epoch": 1.46, "learning_rate": 3.7579273543896183e-07, "loss": 0.0738, "step": 1100 }, { "epoch": 1.46, "logps_train/chosen": -47.024139404296875, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -108.52749633789062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6407501101493835, "rewards_train/margins": 5.199066340923309, "rewards_train/rejected": -4.558316230773926, "step": 1100 }, { "epoch": 1.46, "logps_train/chosen": -57.39291000366211, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -111.84832000732422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3864902853965759, "rewards_train/margins": 4.5314783453941345, "rewards_train/rejected": -4.144988059997559, "step": 1101 }, { "epoch": 1.46, "learning_rate": 3.753172962968617e-07, "loss": 0.0155, "step": 1102 }, { "epoch": 1.46, "logps_train/chosen": -72.623779296875, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -121.871826171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.720434308052063, "rewards_train/margins": 4.813085675239563, "rewards_train/rejected": -4.0926513671875, "step": 1102 }, { "epoch": 1.46, "logps_train/chosen": -40.725425720214844, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -87.36402893066406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.34191077947616577, "rewards_train/margins": 3.592376172542572, "rewards_train/rejected": -3.2504653930664062, "step": 1103 }, { "epoch": 1.47, "learning_rate": 3.7484125104664835e-07, "loss": 0.0459, "step": 1104 }, { "epoch": 1.47, "logps_train/chosen": -58.49449157714844, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -119.99946594238281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 2.1857075691223145, "rewards_train/margins": 6.957529067993164, "rewards_train/rejected": -4.77182149887085, "step": 1104 }, { "epoch": 1.47, "logps_train/chosen": -58.1461296081543, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -96.14643096923828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6252306699752808, "rewards_train/margins": 4.167608141899109, "rewards_train/rejected": -3.542377471923828, "step": 1105 }, { "epoch": 1.47, "learning_rate": 3.743646019907566e-07, "loss": 0.0428, "step": 1106 }, { "epoch": 1.47, "logps_train/chosen": -59.70039367675781, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -83.75991821289062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.398710697889328, "rewards_train/margins": 3.0403279960155487, "rewards_train/rejected": -2.6416172981262207, "step": 1106 }, { "epoch": 1.47, "logps_train/chosen": -46.317649841308594, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -106.39736938476562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4276101887226105, "rewards_train/margins": 5.47359636425972, "rewards_train/rejected": -5.045986175537109, "step": 1107 }, { "epoch": 1.47, "learning_rate": 3.738873514345413e-07, "loss": 0.0597, "step": 1108 }, { "epoch": 1.47, "logps_train/chosen": -54.878684997558594, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -105.11205291748047, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.08415296673774719, "rewards_train/margins": 4.0453589260578156, "rewards_train/rejected": -3.9612059593200684, "step": 1108 }, { "epoch": 1.47, "logps_train/chosen": -55.508506774902344, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -116.87051391601562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2678989768028259, "rewards_train/margins": 4.824481904506683, "rewards_train/rejected": -4.556582927703857, "step": 1109 }, { "epoch": 1.47, "learning_rate": 3.7340950168626683e-07, "loss": 0.0841, "step": 1110 }, { "epoch": 1.47, "logps_train/chosen": -67.36357879638672, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -130.27218627929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4730176329612732, "rewards_train/margins": 5.434611260890961, "rewards_train/rejected": -4.9615936279296875, "step": 1110 }, { "epoch": 1.48, "logps_train/chosen": -64.18852233886719, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -119.49390411376953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5034622550010681, "rewards_train/margins": 4.341134488582611, "rewards_train/rejected": -3.837672233581543, "step": 1111 }, { "epoch": 1.48, "learning_rate": 3.7293105505709543e-07, "loss": 0.03, "step": 1112 }, { "epoch": 1.48, "logps_train/chosen": -43.375762939453125, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -77.70501708984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.49640795588493347, "rewards_train/margins": 3.145034521818161, "rewards_train/rejected": -2.6486265659332275, "step": 1112 }, { "epoch": 1.48, "logps_train/chosen": -74.84762573242188, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -104.9190673828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9949249029159546, "rewards_train/margins": 4.611831545829773, "rewards_train/rejected": -3.6169066429138184, "step": 1113 }, { "epoch": 1.48, "learning_rate": 3.724520138610762e-07, "loss": 0.0731, "step": 1114 }, { "epoch": 1.48, "logps_train/chosen": -108.8508529663086, "logps_train/ref_chosen": -116.5, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -134.4057159423828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7641333937644958, "rewards_train/margins": 4.127361238002777, "rewards_train/rejected": -3.3632278442382812, "step": 1114 }, { "epoch": 1.48, "logps_train/chosen": -24.966794967651367, "logps_train/ref_chosen": -35.25, "logps_train/ref_rejected": -29.75, "logps_train/rejected": -56.13177490234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0323245525360107, "rewards_train/margins": 3.6642518043518066, "rewards_train/rejected": -2.631927251815796, "step": 1115 }, { "epoch": 1.48, "learning_rate": 3.7197238041513415e-07, "loss": 0.0875, "step": 1116 }, { "epoch": 1.48, "logps_train/chosen": -46.70785903930664, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -82.52395629882812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.24031728506088257, "rewards_train/margins": 2.754266321659088, "rewards_train/rejected": -2.9945836067199707, "step": 1116 }, { "epoch": 1.48, "logps_train/chosen": -70.84528350830078, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -124.11961364746094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1279715597629547, "rewards_train/margins": 3.7868084013462067, "rewards_train/rejected": -3.658836841583252, "step": 1117 }, { "epoch": 1.48, "learning_rate": 3.714921570390583e-07, "loss": 0.0845, "step": 1118 }, { "epoch": 1.48, "logps_train/chosen": -49.310096740722656, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -121.67115783691406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.601021945476532, "rewards_train/margins": 5.2337629199028015, "rewards_train/rejected": -4.6327409744262695, "step": 1118 }, { "epoch": 1.49, "logps_train/chosen": -57.2186279296875, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -93.30982971191406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.685949444770813, "rewards_train/margins": 4.887245297431946, "rewards_train/rejected": -4.201295852661133, "step": 1119 }, { "epoch": 1.49, "learning_rate": 3.710113460554915e-07, "loss": 0.0156, "step": 1120 }, { "epoch": 1.49, "logps_train/chosen": -41.76527404785156, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -114.47797393798828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1312849223613739, "rewards_train/margins": 4.600957125425339, "rewards_train/rejected": -4.469672203063965, "step": 1120 }, { "epoch": 1.49, "logps_train/chosen": -48.3980712890625, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -89.03121948242188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7461304068565369, "rewards_train/margins": 3.494564712047577, "rewards_train/rejected": -2.74843430519104, "step": 1121 }, { "epoch": 1.49, "learning_rate": 3.705299497899181e-07, "loss": 0.0305, "step": 1122 }, { "epoch": 1.49, "logps_train/chosen": -30.208553314208984, "logps_train/ref_chosen": -30.375, "logps_train/ref_rejected": -31.0, "logps_train/rejected": -56.80779266357422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.014691617339849472, "rewards_train/margins": 2.5907833836972713, "rewards_train/rejected": -2.576091766357422, "step": 1122 }, { "epoch": 1.49, "logps_train/chosen": -86.70021057128906, "logps_train/ref_chosen": -92.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -148.09185791015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5143535137176514, "rewards_train/margins": 5.274319410324097, "rewards_train/rejected": -4.759965896606445, "step": 1123 }, { "epoch": 1.49, "learning_rate": 3.700479705706535e-07, "loss": 0.0876, "step": 1124 }, { "epoch": 1.49, "logps_train/chosen": -61.686363220214844, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -92.98202514648438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2673010230064392, "rewards_train/margins": 3.260034739971161, "rewards_train/rejected": -2.9927337169647217, "step": 1124 }, { "epoch": 1.49, "logps_train/chosen": -48.21138000488281, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -95.42237854003906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0976119339466095, "rewards_train/margins": 3.884381800889969, "rewards_train/rejected": -3.7867698669433594, "step": 1125 }, { "epoch": 1.5, "learning_rate": 3.6956541072883254e-07, "loss": 0.0842, "step": 1126 }, { "epoch": 1.5, "logps_train/chosen": -71.74105834960938, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -109.72960662841797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5962070822715759, "rewards_train/margins": 4.600418031215668, "rewards_train/rejected": -4.004210948944092, "step": 1126 }, { "epoch": 1.5, "logps_train/chosen": -71.23037719726562, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -103.26434326171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6082123517990112, "rewards_train/margins": 4.369021773338318, "rewards_train/rejected": -3.7608094215393066, "step": 1127 }, { "epoch": 1.5, "learning_rate": 3.6908227259839823e-07, "loss": 0.037, "step": 1128 }, { "epoch": 1.5, "logps_train/chosen": -53.96746826171875, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -105.89904022216797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6032530069351196, "rewards_train/margins": 4.166594386100769, "rewards_train/rejected": -3.5633413791656494, "step": 1128 }, { "epoch": 1.5, "logps_train/chosen": -50.995262145996094, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -86.5063705444336, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.36124521493911743, "rewards_train/margins": 4.03626674413681, "rewards_train/rejected": -4.397511959075928, "step": 1129 }, { "epoch": 1.5, "learning_rate": 3.685985585160907e-07, "loss": 0.0614, "step": 1130 }, { "epoch": 1.5, "logps_train/chosen": -68.00580596923828, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -132.22116088867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6009820699691772, "rewards_train/margins": 5.766848921775818, "rewards_train/rejected": -5.165866851806641, "step": 1130 }, { "epoch": 1.5, "logps_train/chosen": -55.612831115722656, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -96.86337280273438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8918418884277344, "rewards_train/margins": 3.401616096496582, "rewards_train/rejected": -2.5097742080688477, "step": 1131 }, { "epoch": 1.5, "learning_rate": 3.681142708214355e-07, "loss": 0.0621, "step": 1132 }, { "epoch": 1.5, "logps_train/chosen": -46.75406265258789, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -90.41736602783203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8402189016342163, "rewards_train/margins": 4.903830885887146, "rewards_train/rejected": -4.06361198425293, "step": 1132 }, { "epoch": 1.5, "logps_train/chosen": -51.25959014892578, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -111.66104888916016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3088918924331665, "rewards_train/margins": 4.733101963996887, "rewards_train/rejected": -4.424210071563721, "step": 1133 }, { "epoch": 1.51, "learning_rate": 3.6762941185673274e-07, "loss": 0.0321, "step": 1134 }, { "epoch": 1.51, "logps_train/chosen": -102.30294799804688, "logps_train/ref_chosen": -102.0, "logps_train/ref_rejected": -99.5, "logps_train/rejected": -160.617919921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.05216968059539795, "rewards_train/margins": 6.041653752326965, "rewards_train/rejected": -6.093823432922363, "step": 1134 }, { "epoch": 1.51, "logps_train/chosen": -60.56263732910156, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -123.65089416503906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.014048665761947632, "rewards_train/margins": 3.513512462377548, "rewards_train/rejected": -3.4994637966156006, "step": 1135 }, { "epoch": 1.51, "learning_rate": 3.6714398396704527e-07, "loss": 0.082, "step": 1136 }, { "epoch": 1.51, "logps_train/chosen": -49.99536895751953, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -96.3818359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.36296308040618896, "rewards_train/margins": 4.123021483421326, "rewards_train/rejected": -3.7600584030151367, "step": 1136 }, { "epoch": 1.51, "logps_train/chosen": -67.23214721679688, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -108.01008605957031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4090116620063782, "rewards_train/margins": 3.5334580540657043, "rewards_train/rejected": -3.124446392059326, "step": 1137 }, { "epoch": 1.51, "learning_rate": 3.666579895001877e-07, "loss": 0.0504, "step": 1138 }, { "epoch": 1.51, "logps_train/chosen": -52.253990173339844, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -106.87899780273438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7339762449264526, "rewards_train/margins": 4.970313906669617, "rewards_train/rejected": -4.236337661743164, "step": 1138 }, { "epoch": 1.51, "logps_train/chosen": -59.06680679321289, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -129.88473510742188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3948820233345032, "rewards_train/margins": 4.944293797016144, "rewards_train/rejected": -4.549411773681641, "step": 1139 }, { "epoch": 1.51, "learning_rate": 3.6617143080671513e-07, "loss": 0.061, "step": 1140 }, { "epoch": 1.51, "logps_train/chosen": -41.75344467163086, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -92.43074035644531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6394991278648376, "rewards_train/margins": 4.052494704723358, "rewards_train/rejected": -3.4129955768585205, "step": 1140 }, { "epoch": 1.52, "logps_train/chosen": -53.30633544921875, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -40.75, "logps_train/rejected": -72.99752807617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5053037405014038, "rewards_train/margins": 3.7284945249557495, "rewards_train/rejected": -3.2231907844543457, "step": 1141 }, { "epoch": 1.52, "learning_rate": 3.6568431023991133e-07, "loss": 0.0753, "step": 1142 }, { "epoch": 1.52, "logps_train/chosen": -98.69474029541016, "logps_train/ref_chosen": -96.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -166.5941925048828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2647863030433655, "rewards_train/margins": 5.191898763179779, "rewards_train/rejected": -5.4566850662231445, "step": 1142 }, { "epoch": 1.52, "logps_train/chosen": -67.47549438476562, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -104.95869445800781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8594820499420166, "rewards_train/margins": 4.888163805007935, "rewards_train/rejected": -4.028681755065918, "step": 1143 }, { "epoch": 1.52, "learning_rate": 3.651966301557777e-07, "loss": 0.0255, "step": 1144 }, { "epoch": 1.52, "logps_train/chosen": -78.78056335449219, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -133.0907440185547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4133502244949341, "rewards_train/margins": 4.402111649513245, "rewards_train/rejected": -3.9887614250183105, "step": 1144 }, { "epoch": 1.52, "logps_train/chosen": -50.216392517089844, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -99.7728271484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4322670102119446, "rewards_train/margins": 4.115800201892853, "rewards_train/rejected": -3.683533191680908, "step": 1145 }, { "epoch": 1.52, "learning_rate": 3.647083929130218e-07, "loss": 0.0633, "step": 1146 }, { "epoch": 1.52, "logps_train/chosen": -29.023555755615234, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -88.9283218383789, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.48670676350593567, "rewards_train/margins": 4.163914233446121, "rewards_train/rejected": -3.6772074699401855, "step": 1146 }, { "epoch": 1.52, "logps_train/chosen": -65.61752319335938, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -105.22210693359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4679350256919861, "rewards_train/margins": 4.622957646846771, "rewards_train/rejected": -4.155022621154785, "step": 1147 }, { "epoch": 1.52, "learning_rate": 3.6421960087304606e-07, "loss": 0.0265, "step": 1148 }, { "epoch": 1.52, "logps_train/chosen": -60.67645263671875, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -111.85208892822266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.34641727805137634, "rewards_train/margins": 5.425376266241074, "rewards_train/rejected": -5.078958988189697, "step": 1148 }, { "epoch": 1.53, "logps_train/chosen": -70.41081237792969, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -109.43710327148438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.45201873779296875, "rewards_train/margins": 3.971378803253174, "rewards_train/rejected": -4.423397541046143, "step": 1149 }, { "epoch": 1.53, "learning_rate": 3.63730256399936e-07, "loss": 0.0185, "step": 1150 }, { "epoch": 1.53, "logps_train/chosen": -63.833038330078125, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -95.2242660522461, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22763383388519287, "rewards_train/margins": 4.226622939109802, "rewards_train/rejected": -3.9989891052246094, "step": 1150 }, { "epoch": 1.53, "logps_train/chosen": -79.74911499023438, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -139.5867462158203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18446320295333862, "rewards_train/margins": 4.833762347698212, "rewards_train/rejected": -4.649299144744873, "step": 1151 }, { "epoch": 1.53, "learning_rate": 3.6324036186044916e-07, "loss": 0.0251, "step": 1152 }, { "epoch": 1.53, "logps_train/chosen": -54.11359405517578, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -117.70050811767578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6667654514312744, "rewards_train/margins": 5.0352537631988525, "rewards_train/rejected": -4.368488311767578, "step": 1152 }, { "epoch": 1.53, "logps_train/chosen": -40.71803283691406, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -84.96842956542969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7000718712806702, "rewards_train/margins": 4.435976564884186, "rewards_train/rejected": -3.7359046936035156, "step": 1153 }, { "epoch": 1.53, "learning_rate": 3.627499196240036e-07, "loss": 0.023, "step": 1154 }, { "epoch": 1.53, "logps_train/chosen": -55.13288116455078, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -62.07521057128906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.689836859703064, "rewards_train/margins": 2.4723581075668335, "rewards_train/rejected": -1.7825212478637695, "step": 1154 }, { "epoch": 1.53, "logps_train/chosen": -66.95547485351562, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -140.13360595703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6317959427833557, "rewards_train/margins": 5.068594038486481, "rewards_train/rejected": -4.436798095703125, "step": 1155 }, { "epoch": 1.54, "learning_rate": 3.622589320626662e-07, "loss": 0.0748, "step": 1156 }, { "epoch": 1.54, "logps_train/chosen": -85.71044921875, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -110.75030517578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.37739309668540955, "rewards_train/margins": 3.2602365911006927, "rewards_train/rejected": -2.882843494415283, "step": 1156 }, { "epoch": 1.54, "logps_train/chosen": -67.10986328125, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -96.31272888183594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4218258559703827, "rewards_train/margins": 3.9140367209911346, "rewards_train/rejected": -3.492210865020752, "step": 1157 }, { "epoch": 1.54, "learning_rate": 3.6176740155114156e-07, "loss": 0.0752, "step": 1158 }, { "epoch": 1.54, "logps_train/chosen": -70.64180755615234, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -92.64602661132812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7436319589614868, "rewards_train/margins": 3.8695627450942993, "rewards_train/rejected": -3.1259307861328125, "step": 1158 }, { "epoch": 1.54, "logps_train/chosen": -91.02542877197266, "logps_train/ref_chosen": -95.5, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -143.17457580566406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.45839452743530273, "rewards_train/margins": 5.1750712394714355, "rewards_train/rejected": -4.716676712036133, "step": 1159 }, { "epoch": 1.54, "learning_rate": 3.6127533046676e-07, "loss": 0.0629, "step": 1160 }, { "epoch": 1.54, "logps_train/chosen": -63.011863708496094, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -87.22319793701172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9050636291503906, "rewards_train/margins": 4.499258756637573, "rewards_train/rejected": -3.5941951274871826, "step": 1160 }, { "epoch": 1.54, "logps_train/chosen": -43.46173095703125, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -91.20061492919922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1538269966840744, "rewards_train/margins": 3.6908805817365646, "rewards_train/rejected": -3.5370535850524902, "step": 1161 }, { "epoch": 1.54, "learning_rate": 3.607827211894667e-07, "loss": 0.032, "step": 1162 }, { "epoch": 1.54, "logps_train/chosen": -53.33774185180664, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -88.04535675048828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.142788365483284, "rewards_train/margins": 4.003574326634407, "rewards_train/rejected": -3.860785961151123, "step": 1162 }, { "epoch": 1.54, "logps_train/chosen": -76.51295471191406, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -130.69015502929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07292267680168152, "rewards_train/margins": 4.807563751935959, "rewards_train/rejected": -4.734641075134277, "step": 1163 }, { "epoch": 1.55, "learning_rate": 3.6028957610180966e-07, "loss": 0.0356, "step": 1164 }, { "epoch": 1.55, "logps_train/chosen": -60.12653350830078, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -125.51658630371094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1357839107513428, "rewards_train/margins": 5.7608802318573, "rewards_train/rejected": -4.625096321105957, "step": 1164 }, { "epoch": 1.55, "logps_train/chosen": -38.338462829589844, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -37.0, "logps_train/rejected": -66.69853973388672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1171303987503052, "rewards_train/margins": 4.082296967506409, "rewards_train/rejected": -2.9651665687561035, "step": 1165 }, { "epoch": 1.55, "learning_rate": 3.597958975889285e-07, "loss": 0.0402, "step": 1166 }, { "epoch": 1.55, "logps_train/chosen": -88.50666809082031, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -130.41317749023438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.46629127860069275, "rewards_train/margins": 4.565651088953018, "rewards_train/rejected": -5.031942367553711, "step": 1166 }, { "epoch": 1.55, "logps_train/chosen": -47.40610885620117, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -87.6757583618164, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5031390190124512, "rewards_train/margins": 4.253527402877808, "rewards_train/rejected": -3.7503883838653564, "step": 1167 }, { "epoch": 1.55, "learning_rate": 3.593016880385425e-07, "loss": 0.0817, "step": 1168 }, { "epoch": 1.55, "logps_train/chosen": -57.20851516723633, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -113.05278015136719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6791485548019409, "rewards_train/margins": 5.009426951408386, "rewards_train/rejected": -4.330278396606445, "step": 1168 }, { "epoch": 1.55, "logps_train/chosen": -81.41835021972656, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -122.6648941040039, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7284771203994751, "rewards_train/margins": 4.707466244697571, "rewards_train/rejected": -3.9789891242980957, "step": 1169 }, { "epoch": 1.55, "learning_rate": 3.588069498409398e-07, "loss": 0.0236, "step": 1170 }, { "epoch": 1.55, "logps_train/chosen": -45.03921127319336, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -119.73301696777344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.45545387268066406, "rewards_train/margins": 5.338130950927734, "rewards_train/rejected": -4.88267707824707, "step": 1170 }, { "epoch": 1.56, "logps_train/chosen": -63.31031799316406, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -145.516357421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7498277425765991, "rewards_train/margins": 6.492089629173279, "rewards_train/rejected": -5.74226188659668, "step": 1171 }, { "epoch": 1.56, "learning_rate": 3.58311685388965e-07, "loss": 0.0123, "step": 1172 }, { "epoch": 1.56, "logps_train/chosen": -62.23689270019531, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -101.80392456054688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14076361060142517, "rewards_train/margins": 4.406313270330429, "rewards_train/rejected": -4.265549659729004, "step": 1172 }, { "epoch": 1.56, "logps_train/chosen": -71.29048156738281, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -110.56542205810547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18188969790935516, "rewards_train/margins": 4.810307189822197, "rewards_train/rejected": -4.628417491912842, "step": 1173 }, { "epoch": 1.56, "learning_rate": 3.578158970780082e-07, "loss": 0.0245, "step": 1174 }, { "epoch": 1.56, "logps_train/chosen": -62.290924072265625, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -101.00851440429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.06971733272075653, "rewards_train/margins": 3.6217594891786575, "rewards_train/rejected": -3.691476821899414, "step": 1174 }, { "epoch": 1.56, "logps_train/chosen": -48.831886291503906, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -100.26868438720703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.42149895429611206, "rewards_train/margins": 4.4999300837516785, "rewards_train/rejected": -4.078431129455566, "step": 1175 }, { "epoch": 1.56, "learning_rate": 3.573195873059932e-07, "loss": 0.0447, "step": 1176 }, { "epoch": 1.56, "logps_train/chosen": -40.008216857910156, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -76.95565795898438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22886571288108826, "rewards_train/margins": 2.8271656930446625, "rewards_train/rejected": -2.598299980163574, "step": 1176 }, { "epoch": 1.56, "logps_train/chosen": -42.883583068847656, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -85.88471221923828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9749228954315186, "rewards_train/margins": 4.632534742355347, "rewards_train/rejected": -3.657611846923828, "step": 1177 }, { "epoch": 1.56, "learning_rate": 3.568227584733656e-07, "loss": 0.1012, "step": 1178 }, { "epoch": 1.56, "logps_train/chosen": -37.951316833496094, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -73.7178955078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1142435073852539, "rewards_train/margins": 3.128220558166504, "rewards_train/rejected": -3.01397705078125, "step": 1178 }, { "epoch": 1.57, "logps_train/chosen": -50.12800598144531, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -100.78193664550781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.07608184218406677, "rewards_train/margins": 4.1896118223667145, "rewards_train/rejected": -4.265693664550781, "step": 1179 }, { "epoch": 1.57, "learning_rate": 3.5632541298308194e-07, "loss": 0.0523, "step": 1180 }, { "epoch": 1.57, "logps_train/chosen": -69.09870147705078, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -117.29118347167969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1557547152042389, "rewards_train/margins": 4.663778930902481, "rewards_train/rejected": -4.508024215698242, "step": 1180 }, { "epoch": 1.57, "logps_train/chosen": -63.69761657714844, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -126.79061889648438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8177383542060852, "rewards_train/margins": 5.1624258160591125, "rewards_train/rejected": -4.344687461853027, "step": 1181 }, { "epoch": 1.57, "learning_rate": 3.5582755324059727e-07, "loss": 0.0183, "step": 1182 }, { "epoch": 1.57, "logps_train/chosen": -87.4903564453125, "logps_train/ref_chosen": -96.0, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -144.6702880859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8415901064872742, "rewards_train/margins": 5.608618915081024, "rewards_train/rejected": -4.76702880859375, "step": 1182 }, { "epoch": 1.57, "logps_train/chosen": -60.89975357055664, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -107.13526916503906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5334621667861938, "rewards_train/margins": 4.8805824518203735, "rewards_train/rejected": -4.34712028503418, "step": 1183 }, { "epoch": 1.57, "learning_rate": 3.5532918165385394e-07, "loss": 0.0181, "step": 1184 }, { "epoch": 1.57, "logps_train/chosen": -78.36991119384766, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -137.6177215576172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7301962375640869, "rewards_train/margins": 5.438843011856079, "rewards_train/rejected": -4.708646774291992, "step": 1184 }, { "epoch": 1.57, "logps_train/chosen": -61.22022247314453, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -112.34793853759766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31469622254371643, "rewards_train/margins": 4.821365267038345, "rewards_train/rejected": -4.506669044494629, "step": 1185 }, { "epoch": 1.58, "learning_rate": 3.5483030063327e-07, "loss": 0.0148, "step": 1186 }, { "epoch": 1.58, "logps_train/chosen": -60.09109115600586, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -133.95828247070312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2237035036087036, "rewards_train/margins": 6.324219822883606, "rewards_train/rejected": -5.100516319274902, "step": 1186 }, { "epoch": 1.58, "logps_train/chosen": -95.78916931152344, "logps_train/ref_chosen": -98.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -143.90614318847656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22108271718025208, "rewards_train/margins": 5.030447036027908, "rewards_train/rejected": -4.809364318847656, "step": 1187 }, { "epoch": 1.58, "learning_rate": 3.543309125917272e-07, "loss": 0.0498, "step": 1188 }, { "epoch": 1.58, "logps_train/chosen": -50.32183074951172, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -46.25, "logps_train/rejected": -74.55839538574219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.12094205617904663, "rewards_train/margins": 2.9533440470695496, "rewards_train/rejected": -2.832401990890503, "step": 1188 }, { "epoch": 1.58, "logps_train/chosen": -46.14740753173828, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -105.67840576171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6493219137191772, "rewards_train/margins": 4.544506669044495, "rewards_train/rejected": -3.8951847553253174, "step": 1189 }, { "epoch": 1.58, "learning_rate": 3.5383101994455977e-07, "loss": 0.1152, "step": 1190 }, { "epoch": 1.58, "logps_train/chosen": -68.40486145019531, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -82.69146728515625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.23060722649097443, "rewards_train/margins": 3.468504622578621, "rewards_train/rejected": -3.2378973960876465, "step": 1190 }, { "epoch": 1.58, "logps_train/chosen": -30.870201110839844, "logps_train/ref_chosen": -38.25, "logps_train/ref_rejected": -30.625, "logps_train/rejected": -56.294700622558594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.732511043548584, "rewards_train/margins": 3.30143404006958, "rewards_train/rejected": -2.568922996520996, "step": 1191 }, { "epoch": 1.58, "learning_rate": 3.533306251095425e-07, "loss": 0.1014, "step": 1192 }, { "epoch": 1.58, "logps_train/chosen": -45.39210510253906, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -84.66703796386719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.00864112377166748, "rewards_train/margins": 3.762454390525818, "rewards_train/rejected": -3.7538132667541504, "step": 1192 }, { "epoch": 1.58, "logps_train/chosen": -88.29261779785156, "logps_train/ref_chosen": -100.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -102.97598266601562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2426128387451172, "rewards_train/margins": 4.42927360534668, "rewards_train/rejected": -3.1866607666015625, "step": 1193 }, { "epoch": 1.59, "learning_rate": 3.5282973050687875e-07, "loss": 0.0568, "step": 1194 }, { "epoch": 1.59, "logps_train/chosen": -63.27000427246094, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -108.3096923828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.33676597476005554, "rewards_train/margins": 3.7551415264606476, "rewards_train/rejected": -4.091907501220703, "step": 1194 }, { "epoch": 1.59, "logps_train/chosen": -58.997398376464844, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -90.92861938476562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4861975610256195, "rewards_train/margins": 4.091559022665024, "rewards_train/rejected": -3.6053614616394043, "step": 1195 }, { "epoch": 1.59, "learning_rate": 3.523283385591895e-07, "loss": 0.0535, "step": 1196 }, { "epoch": 1.59, "logps_train/chosen": -56.97357177734375, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -109.07212829589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3094664216041565, "rewards_train/margins": 4.800871789455414, "rewards_train/rejected": -5.11033821105957, "step": 1196 }, { "epoch": 1.59, "logps_train/chosen": -46.01885223388672, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -44.5, "logps_train/rejected": -82.29767608642578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.17610406875610352, "rewards_train/margins": 3.611085891723633, "rewards_train/rejected": -3.7871899604797363, "step": 1197 }, { "epoch": 1.59, "learning_rate": 3.518264516915008e-07, "loss": 0.0366, "step": 1198 }, { "epoch": 1.59, "logps_train/chosen": -49.95056915283203, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -88.54023742675781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.44166162610054016, "rewards_train/margins": 3.819123536348343, "rewards_train/rejected": -3.3774619102478027, "step": 1198 }, { "epoch": 1.59, "logps_train/chosen": -44.609657287597656, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -76.62826538085938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.26877841353416443, "rewards_train/margins": 2.64873543381691, "rewards_train/rejected": -2.917513847351074, "step": 1199 }, { "epoch": 1.59, "learning_rate": 3.513240723312326e-07, "loss": 0.2121, "step": 1200 }, { "epoch": 1.59, "logps_train/chosen": -62.72941589355469, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -98.19572448730469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5458085536956787, "rewards_train/margins": 3.8396003246307373, "rewards_train/rejected": -3.2937917709350586, "step": 1200 }, { "epoch": 1.59, "logps_train/chosen": -53.761287689208984, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -98.62472534179688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26215219497680664, "rewards_train/margins": 4.246500015258789, "rewards_train/rejected": -3.9843478202819824, "step": 1201 }, { "epoch": 1.6, "learning_rate": 3.5082120290818685e-07, "loss": 0.0451, "step": 1202 }, { "epoch": 1.6, "logps_train/chosen": -56.65771484375, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -97.57115173339844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6967286467552185, "rewards_train/margins": 4.6272823214530945, "rewards_train/rejected": -3.930553674697876, "step": 1202 }, { "epoch": 1.6, "logps_train/chosen": -76.63764190673828, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -95.0, "logps_train/rejected": -146.3726348876953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2862357199192047, "rewards_train/margins": 5.442249208688736, "rewards_train/rejected": -5.156013488769531, "step": 1203 }, { "epoch": 1.6, "learning_rate": 3.5031784585453564e-07, "loss": 0.0303, "step": 1204 }, { "epoch": 1.6, "logps_train/chosen": -62.111236572265625, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -106.24639129638672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.18924865126609802, "rewards_train/margins": 4.051015287637711, "rewards_train/rejected": -4.240263938903809, "step": 1204 }, { "epoch": 1.6, "logps_train/chosen": -86.25396728515625, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -127.8245849609375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.35430294275283813, "rewards_train/margins": 4.792999684810638, "rewards_train/rejected": -5.147302627563477, "step": 1205 }, { "epoch": 1.6, "learning_rate": 3.498140036048098e-07, "loss": 0.0505, "step": 1206 }, { "epoch": 1.6, "logps_train/chosen": -78.15673828125, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -131.0347137451172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28432610630989075, "rewards_train/margins": 4.1042037308216095, "rewards_train/rejected": -3.8198776245117188, "step": 1206 }, { "epoch": 1.6, "logps_train/chosen": -58.03449249267578, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -97.57174682617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5809259414672852, "rewards_train/margins": 4.359975576400757, "rewards_train/rejected": -3.7790496349334717, "step": 1207 }, { "epoch": 1.6, "learning_rate": 3.493096785958863e-07, "loss": 0.039, "step": 1208 }, { "epoch": 1.6, "logps_train/chosen": -75.44340515136719, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -129.421630859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.616596519947052, "rewards_train/margins": 5.293134033679962, "rewards_train/rejected": -4.67653751373291, "step": 1208 }, { "epoch": 1.61, "logps_train/chosen": -71.10272979736328, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -120.60340881347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07566455006599426, "rewards_train/margins": 4.165692359209061, "rewards_train/rejected": -4.090027809143066, "step": 1209 }, { "epoch": 1.61, "learning_rate": 3.488048732669776e-07, "loss": 0.0165, "step": 1210 }, { "epoch": 1.61, "logps_train/chosen": -59.339744567871094, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -127.5440902709961, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2285252809524536, "rewards_train/margins": 6.329809308052063, "rewards_train/rejected": -5.101284027099609, "step": 1210 }, { "epoch": 1.61, "logps_train/chosen": -59.625404357910156, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -99.11957550048828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4937094449996948, "rewards_train/margins": 4.430667281150818, "rewards_train/rejected": -3.936957836151123, "step": 1211 }, { "epoch": 1.61, "learning_rate": 3.4829959005961885e-07, "loss": 0.0523, "step": 1212 }, { "epoch": 1.61, "logps_train/chosen": -92.18133544921875, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -131.87074279785156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15999111533164978, "rewards_train/margins": 4.354878276586533, "rewards_train/rejected": -4.194887161254883, "step": 1212 }, { "epoch": 1.61, "logps_train/chosen": -66.0252685546875, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -125.59088134765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.25721460580825806, "rewards_train/margins": 4.74874871969223, "rewards_train/rejected": -5.005963325500488, "step": 1213 }, { "epoch": 1.61, "learning_rate": 3.4779383141765685e-07, "loss": 0.0283, "step": 1214 }, { "epoch": 1.61, "logps_train/chosen": -49.56360626220703, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -125.35612487792969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4983266592025757, "rewards_train/margins": 5.301126837730408, "rewards_train/rejected": -4.802800178527832, "step": 1214 }, { "epoch": 1.61, "logps_train/chosen": -51.14799499511719, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -98.65750122070312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3371535837650299, "rewards_train/margins": 4.070091158151627, "rewards_train/rejected": -3.7329375743865967, "step": 1215 }, { "epoch": 1.61, "learning_rate": 3.4728759978723756e-07, "loss": 0.0633, "step": 1216 }, { "epoch": 1.61, "logps_train/chosen": -41.30083084106445, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -93.78266906738281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.956244945526123, "rewards_train/margins": 4.874355792999268, "rewards_train/rejected": -3.9181108474731445, "step": 1216 }, { "epoch": 1.62, "logps_train/chosen": -75.86930847167969, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -97.98799133300781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9615062475204468, "rewards_train/margins": 4.86889922618866, "rewards_train/rejected": -3.907392978668213, "step": 1217 }, { "epoch": 1.62, "learning_rate": 3.4678089761679484e-07, "loss": 0.0284, "step": 1218 }, { "epoch": 1.62, "logps_train/chosen": -45.80632400512695, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -77.6366958618164, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6521801352500916, "rewards_train/margins": 3.8248338103294373, "rewards_train/rejected": -3.1726536750793457, "step": 1218 }, { "epoch": 1.62, "logps_train/chosen": -101.06536865234375, "logps_train/ref_chosen": -101.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -152.53598022460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.012212730944156647, "rewards_train/margins": 5.637685276567936, "rewards_train/rejected": -5.625472545623779, "step": 1219 }, { "epoch": 1.62, "learning_rate": 3.4627372735703816e-07, "loss": 0.0252, "step": 1220 }, { "epoch": 1.62, "logps_train/chosen": -39.80760955810547, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -88.94658660888672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7004889249801636, "rewards_train/margins": 5.056084990501404, "rewards_train/rejected": -4.35559606552124, "step": 1220 }, { "epoch": 1.62, "logps_train/chosen": -56.40376281738281, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -101.62013244628906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7744676470756531, "rewards_train/margins": 3.9200751185417175, "rewards_train/rejected": -3.1456074714660645, "step": 1221 }, { "epoch": 1.62, "learning_rate": 3.457660914609411e-07, "loss": 0.0717, "step": 1222 }, { "epoch": 1.62, "logps_train/chosen": -75.43949890136719, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -111.02293395996094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6974563598632812, "rewards_train/margins": 3.5106873512268066, "rewards_train/rejected": -2.8132309913635254, "step": 1222 }, { "epoch": 1.62, "logps_train/chosen": -69.11103820800781, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -105.63573455810547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1693650186061859, "rewards_train/margins": 4.284501165151596, "rewards_train/rejected": -4.11513614654541, "step": 1223 }, { "epoch": 1.63, "learning_rate": 3.452579923837292e-07, "loss": 0.0426, "step": 1224 }, { "epoch": 1.63, "logps_train/chosen": -75.84112548828125, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -105.89423370361328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1471378803253174, "rewards_train/margins": 4.413904905319214, "rewards_train/rejected": -3.2667670249938965, "step": 1224 }, { "epoch": 1.63, "logps_train/chosen": -37.039520263671875, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -95.42317962646484, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8288604021072388, "rewards_train/margins": 4.6032103300094604, "rewards_train/rejected": -3.7743499279022217, "step": 1225 }, { "epoch": 1.63, "learning_rate": 3.447494325828685e-07, "loss": 0.0413, "step": 1226 }, { "epoch": 1.63, "logps_train/chosen": -48.159095764160156, "logps_train/ref_chosen": -47.25, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -89.51729583740234, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.08934725821018219, "rewards_train/margins": 4.0967575162649155, "rewards_train/rejected": -4.186104774475098, "step": 1226 }, { "epoch": 1.63, "logps_train/chosen": -84.19065856933594, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -134.36306762695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2425037920475006, "rewards_train/margins": 5.409428924322128, "rewards_train/rejected": -5.651932716369629, "step": 1227 }, { "epoch": 1.63, "learning_rate": 3.442404145180528e-07, "loss": 0.0432, "step": 1228 }, { "epoch": 1.63, "logps_train/chosen": -38.32874298095703, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -38.5, "logps_train/rejected": -68.63412475585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6839224696159363, "rewards_train/margins": 3.6903037428855896, "rewards_train/rejected": -3.0063812732696533, "step": 1228 }, { "epoch": 1.63, "logps_train/chosen": -73.37425231933594, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -108.77545166015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29382458329200745, "rewards_train/margins": 3.9494947493076324, "rewards_train/rejected": -3.655670166015625, "step": 1229 }, { "epoch": 1.63, "learning_rate": 3.43730940651193e-07, "loss": 0.0368, "step": 1230 }, { "epoch": 1.63, "logps_train/chosen": -70.14106750488281, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -109.42921447753906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.18637317419052124, "rewards_train/margins": 4.055767118930817, "rewards_train/rejected": -4.242140293121338, "step": 1230 }, { "epoch": 1.63, "logps_train/chosen": -62.73509979248047, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -93.09678649902344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0218026638031006, "rewards_train/margins": 3.65804386138916, "rewards_train/rejected": -2.6362411975860596, "step": 1231 }, { "epoch": 1.64, "learning_rate": 3.4322101344640404e-07, "loss": 0.0791, "step": 1232 }, { "epoch": 1.64, "logps_train/chosen": -89.06459045410156, "logps_train/ref_chosen": -94.5, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -145.55136108398438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5497916340827942, "rewards_train/margins": 4.658833801746368, "rewards_train/rejected": -4.109042167663574, "step": 1232 }, { "epoch": 1.64, "logps_train/chosen": -69.12921142578125, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -145.77700805664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9370793700218201, "rewards_train/margins": 5.205406129360199, "rewards_train/rejected": -4.268326759338379, "step": 1233 }, { "epoch": 1.64, "learning_rate": 3.427106353699937e-07, "loss": 0.0509, "step": 1234 }, { "epoch": 1.64, "logps_train/chosen": -77.95858764648438, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -108.58003997802734, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4329679608345032, "rewards_train/margins": 3.6328489184379578, "rewards_train/rejected": -4.065816879272461, "step": 1234 }, { "epoch": 1.64, "logps_train/chosen": -76.71270751953125, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -132.99984741210938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.0556458979845047, "rewards_train/margins": 4.6990261524915695, "rewards_train/rejected": -4.754672050476074, "step": 1235 }, { "epoch": 1.64, "learning_rate": 3.421998088904504e-07, "loss": 0.0467, "step": 1236 }, { "epoch": 1.64, "logps_train/chosen": -43.599815368652344, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -39.25, "logps_train/rejected": -70.81554412841797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09880734980106354, "rewards_train/margins": 3.2440338283777237, "rewards_train/rejected": -3.14522647857666, "step": 1236 }, { "epoch": 1.64, "logps_train/chosen": -99.39512634277344, "logps_train/ref_chosen": -102.0, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -180.20960998535156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27220600843429565, "rewards_train/margins": 6.7302767634391785, "rewards_train/rejected": -6.458070755004883, "step": 1237 }, { "epoch": 1.64, "learning_rate": 3.416885364784313e-07, "loss": 0.0708, "step": 1238 }, { "epoch": 1.64, "logps_train/chosen": -79.3169174194336, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -120.51239013671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2026832103729248, "rewards_train/margins": 5.737516641616821, "rewards_train/rejected": -4.5348334312438965, "step": 1238 }, { "epoch": 1.65, "logps_train/chosen": -64.22528839111328, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -125.53378295898438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.03463807702064514, "rewards_train/margins": 4.275771468877792, "rewards_train/rejected": -4.3104095458984375, "step": 1239 }, { "epoch": 1.65, "learning_rate": 3.411768206067503e-07, "loss": 0.0428, "step": 1240 }, { "epoch": 1.65, "logps_train/chosen": -49.51910400390625, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -90.89720153808594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7508238554000854, "rewards_train/margins": 4.360856413841248, "rewards_train/rejected": -3.610032558441162, "step": 1240 }, { "epoch": 1.65, "logps_train/chosen": -53.82635498046875, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -91.87954711914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6658023595809937, "rewards_train/margins": 4.072490811347961, "rewards_train/rejected": -3.4066884517669678, "step": 1241 }, { "epoch": 1.65, "learning_rate": 3.4066466375036617e-07, "loss": 0.0831, "step": 1242 }, { "epoch": 1.65, "logps_train/chosen": -80.1483154296875, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -132.58181762695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5070439577102661, "rewards_train/margins": 4.799601197242737, "rewards_train/rejected": -4.292557239532471, "step": 1242 }, { "epoch": 1.65, "logps_train/chosen": -46.889129638671875, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -82.32633209228516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.07172523438930511, "rewards_train/margins": 3.551533356308937, "rewards_train/rejected": -3.623258590698242, "step": 1243 }, { "epoch": 1.65, "learning_rate": 3.401520683863706e-07, "loss": 0.0464, "step": 1244 }, { "epoch": 1.65, "logps_train/chosen": -33.233970642089844, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -82.40876770019531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4750401973724365, "rewards_train/margins": 4.036229848861694, "rewards_train/rejected": -3.561189651489258, "step": 1244 }, { "epoch": 1.65, "logps_train/chosen": -40.76352310180664, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -93.35983276367188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3650539517402649, "rewards_train/margins": 3.613537609577179, "rewards_train/rejected": -3.248483657836914, "step": 1245 }, { "epoch": 1.65, "learning_rate": 3.39639036993976e-07, "loss": 0.0773, "step": 1246 }, { "epoch": 1.65, "logps_train/chosen": -89.75995635986328, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -99.5, "logps_train/rejected": -142.31585693359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3599419593811035, "rewards_train/margins": 4.6329345703125, "rewards_train/rejected": -4.2729926109313965, "step": 1246 }, { "epoch": 1.66, "logps_train/chosen": -49.58206558227539, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -121.36693572998047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1794956773519516, "rewards_train/margins": 5.027509823441505, "rewards_train/rejected": -5.207005500793457, "step": 1247 }, { "epoch": 1.66, "learning_rate": 3.391255720545039e-07, "loss": 0.0413, "step": 1248 }, { "epoch": 1.66, "logps_train/chosen": -46.93727111816406, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -100.52733612060547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1140857934951782, "rewards_train/margins": 3.9293192625045776, "rewards_train/rejected": -2.8152334690093994, "step": 1248 }, { "epoch": 1.66, "logps_train/chosen": -71.68449401855469, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -143.488525390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08311371505260468, "rewards_train/margins": 5.366341635584831, "rewards_train/rejected": -5.283227920532227, "step": 1249 }, { "epoch": 1.66, "learning_rate": 3.386116760513724e-07, "loss": 0.0267, "step": 1250 }, { "epoch": 1.66, "logps_train/chosen": -43.33943176269531, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -97.82997131347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1644940972328186, "rewards_train/margins": 4.059990704059601, "rewards_train/rejected": -3.8954966068267822, "step": 1250 }, { "epoch": 1.66, "logps_train/chosen": -69.80758666992188, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -87.5067367553711, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5895535945892334, "rewards_train/margins": 3.7371022701263428, "rewards_train/rejected": -3.1475486755371094, "step": 1251 }, { "epoch": 1.66, "learning_rate": 3.380973514700849e-07, "loss": 0.0384, "step": 1252 }, { "epoch": 1.66, "logps_train/chosen": -61.50464630126953, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -116.66973876953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.21140184998512268, "rewards_train/margins": 4.689947217702866, "rewards_train/rejected": -4.901349067687988, "step": 1252 }, { "epoch": 1.66, "logps_train/chosen": -65.73272705078125, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -114.96029663085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.06350772082805634, "rewards_train/margins": 4.2254908829927444, "rewards_train/rejected": -4.288998603820801, "step": 1253 }, { "epoch": 1.67, "learning_rate": 3.375826007982172e-07, "loss": 0.0365, "step": 1254 }, { "epoch": 1.67, "logps_train/chosen": -82.12040710449219, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -165.12677001953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4738973379135132, "rewards_train/margins": 6.805324912071228, "rewards_train/rejected": -6.331427574157715, "step": 1254 }, { "epoch": 1.67, "logps_train/chosen": -41.567134857177734, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -81.79862213134766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3729740381240845, "rewards_train/margins": 3.8278361558914185, "rewards_train/rejected": -3.454862117767334, "step": 1255 }, { "epoch": 1.67, "learning_rate": 3.3706742652540635e-07, "loss": 0.0312, "step": 1256 }, { "epoch": 1.67, "logps_train/chosen": -68.29951477050781, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -137.4251708984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04973659664392471, "rewards_train/margins": 4.779754258692265, "rewards_train/rejected": -4.73001766204834, "step": 1256 }, { "epoch": 1.67, "logps_train/chosen": -34.21421813964844, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -72.66361999511719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6504534482955933, "rewards_train/margins": 3.2168158292770386, "rewards_train/rejected": -2.5663623809814453, "step": 1257 }, { "epoch": 1.67, "learning_rate": 3.3655183114333783e-07, "loss": 0.0653, "step": 1258 }, { "epoch": 1.67, "logps_train/chosen": -65.99408721923828, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -140.14695739746094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5646533966064453, "rewards_train/margins": 4.665286064147949, "rewards_train/rejected": -4.100632667541504, "step": 1258 }, { "epoch": 1.67, "logps_train/chosen": -58.06837844848633, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -124.74282836914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31347498297691345, "rewards_train/margins": 5.089125961065292, "rewards_train/rejected": -4.775650978088379, "step": 1259 }, { "epoch": 1.67, "learning_rate": 3.3603581714573414e-07, "loss": 0.0321, "step": 1260 }, { "epoch": 1.67, "logps_train/chosen": -55.80063247680664, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -117.1109619140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1379057168960571, "rewards_train/margins": 5.629470467567444, "rewards_train/rejected": -4.491564750671387, "step": 1260 }, { "epoch": 1.67, "logps_train/chosen": -78.77391815185547, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -123.04335021972656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3460455536842346, "rewards_train/margins": 4.267567694187164, "rewards_train/rejected": -3.9215221405029297, "step": 1261 }, { "epoch": 1.68, "learning_rate": 3.355193870283422e-07, "loss": 0.0735, "step": 1262 }, { "epoch": 1.68, "logps_train/chosen": -78.79950714111328, "logps_train/ref_chosen": -84.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -103.21495056152344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5762991905212402, "rewards_train/margins": 4.366544008255005, "rewards_train/rejected": -3.7902448177337646, "step": 1262 }, { "epoch": 1.68, "logps_train/chosen": -109.18474578857422, "logps_train/ref_chosen": -105.0, "logps_train/ref_rejected": -130.0, "logps_train/rejected": -193.97239685058594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4028494954109192, "rewards_train/margins": 6.012358367443085, "rewards_train/rejected": -6.415207862854004, "step": 1263 }, { "epoch": 1.68, "learning_rate": 3.3500254328892154e-07, "loss": 0.0396, "step": 1264 }, { "epoch": 1.68, "logps_train/chosen": -94.49217224121094, "logps_train/ref_chosen": -95.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -162.7421112060547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04453215003013611, "rewards_train/margins": 6.174992889165878, "rewards_train/rejected": -6.130460739135742, "step": 1264 }, { "epoch": 1.68, "logps_train/chosen": -101.43020629882812, "logps_train/ref_chosen": -100.5, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -166.46823120117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.09614566713571548, "rewards_train/margins": 5.439740143716335, "rewards_train/rejected": -5.535885810852051, "step": 1265 }, { "epoch": 1.68, "learning_rate": 3.3448528842723255e-07, "loss": 0.0297, "step": 1266 }, { "epoch": 1.68, "logps_train/chosen": -60.23590087890625, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -103.900146484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.34203481674194336, "rewards_train/margins": 4.57267427444458, "rewards_train/rejected": -4.230639457702637, "step": 1266 }, { "epoch": 1.68, "logps_train/chosen": -53.927001953125, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -87.64691162109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3165586292743683, "rewards_train/margins": 3.380663186311722, "rewards_train/rejected": -3.0641045570373535, "step": 1267 }, { "epoch": 1.68, "learning_rate": 3.3396762494502373e-07, "loss": 0.0633, "step": 1268 }, { "epoch": 1.68, "logps_train/chosen": -41.89490509033203, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -69.8431396484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2326774597167969, "rewards_train/margins": 3.6228513717651367, "rewards_train/rejected": -2.39017391204834, "step": 1268 }, { "epoch": 1.69, "logps_train/chosen": -69.00054168701172, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -96.66023254394531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.40033674240112305, "rewards_train/margins": 3.6028835773468018, "rewards_train/rejected": -3.2025468349456787, "step": 1269 }, { "epoch": 1.69, "learning_rate": 3.3344955534601993e-07, "loss": 0.1047, "step": 1270 }, { "epoch": 1.69, "logps_train/chosen": -45.46272659301758, "logps_train/ref_chosen": -44.0, "logps_train/ref_rejected": -41.25, "logps_train/rejected": -70.67433166503906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.14705386757850647, "rewards_train/margins": 2.8000667989254, "rewards_train/rejected": -2.9471206665039062, "step": 1270 }, { "epoch": 1.69, "logps_train/chosen": -43.96276092529297, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -92.99159240722656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.630286693572998, "rewards_train/margins": 4.413820743560791, "rewards_train/rejected": -3.783534049987793, "step": 1271 }, { "epoch": 1.69, "learning_rate": 3.329310821359103e-07, "loss": 0.0707, "step": 1272 }, { "epoch": 1.69, "logps_train/chosen": -43.77528381347656, "logps_train/ref_chosen": -46.5, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -98.27178955078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26544010639190674, "rewards_train/margins": 4.076994061470032, "rewards_train/rejected": -3.811553955078125, "step": 1272 }, { "epoch": 1.69, "logps_train/chosen": -41.8399658203125, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -87.28164672851562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2628786265850067, "rewards_train/margins": 4.003543347120285, "rewards_train/rejected": -3.7406647205352783, "step": 1273 }, { "epoch": 1.69, "learning_rate": 3.324122078223361e-07, "loss": 0.0237, "step": 1274 }, { "epoch": 1.69, "logps_train/chosen": -32.85054397583008, "logps_train/ref_chosen": -37.75, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -80.73304748535156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4895550310611725, "rewards_train/margins": 4.506219059228897, "rewards_train/rejected": -4.016664028167725, "step": 1274 }, { "epoch": 1.69, "logps_train/chosen": -89.41511535644531, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -143.1314697265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4475514590740204, "rewards_train/margins": 5.384525507688522, "rewards_train/rejected": -4.936974048614502, "step": 1275 }, { "epoch": 1.69, "learning_rate": 3.318929349148786e-07, "loss": 0.034, "step": 1276 }, { "epoch": 1.69, "logps_train/chosen": -62.960792541503906, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -118.03072357177734, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.4320461750030518, "rewards_train/margins": 5.686681509017944, "rewards_train/rejected": -4.254635334014893, "step": 1276 }, { "epoch": 1.7, "logps_train/chosen": -84.21917724609375, "logps_train/ref_chosen": -95.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -154.69448852539062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0749566555023193, "rewards_train/margins": 6.483468294143677, "rewards_train/rejected": -5.408511638641357, "step": 1277 }, { "epoch": 1.7, "learning_rate": 3.313732659250467e-07, "loss": 0.0187, "step": 1278 }, { "epoch": 1.7, "logps_train/chosen": -45.43123245239258, "logps_train/ref_chosen": -46.5, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -87.5999526977539, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11361478269100189, "rewards_train/margins": 2.879859670996666, "rewards_train/rejected": -2.766244888305664, "step": 1278 }, { "epoch": 1.7, "logps_train/chosen": -61.76457977294922, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -85.24491119384766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.09003237634897232, "rewards_train/margins": 3.2684433087706566, "rewards_train/rejected": -3.358475685119629, "step": 1279 }, { "epoch": 1.7, "learning_rate": 3.3085320336626515e-07, "loss": 0.1438, "step": 1280 }, { "epoch": 1.7, "logps_train/chosen": -61.346763610839844, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -119.7281494140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.42157384753227234, "rewards_train/margins": 4.979935377836227, "rewards_train/rejected": -4.558361530303955, "step": 1280 }, { "epoch": 1.7, "logps_train/chosen": -48.748931884765625, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -110.11878204345703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26416897773742676, "rewards_train/margins": 4.830734968185425, "rewards_train/rejected": -4.566565990447998, "step": 1281 }, { "epoch": 1.7, "learning_rate": 3.3033274975386233e-07, "loss": 0.0336, "step": 1282 }, { "epoch": 1.7, "logps_train/chosen": -46.555450439453125, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -81.43418884277344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7061734199523926, "rewards_train/margins": 3.6132636070251465, "rewards_train/rejected": -2.907090187072754, "step": 1282 }, { "epoch": 1.7, "logps_train/chosen": -61.14817810058594, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -112.43848419189453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.36721375584602356, "rewards_train/margins": 4.464187175035477, "rewards_train/rejected": -4.096973419189453, "step": 1283 }, { "epoch": 1.71, "learning_rate": 3.2981190760505765e-07, "loss": 0.0464, "step": 1284 }, { "epoch": 1.71, "logps_train/chosen": -60.14622497558594, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -114.14445495605469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20568996667861938, "rewards_train/margins": 4.806073725223541, "rewards_train/rejected": -4.600383758544922, "step": 1284 }, { "epoch": 1.71, "logps_train/chosen": -54.61853790283203, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -101.54710388183594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10298976302146912, "rewards_train/margins": 4.732309907674789, "rewards_train/rejected": -4.62932014465332, "step": 1285 }, { "epoch": 1.71, "learning_rate": 3.292906794389502e-07, "loss": 0.0315, "step": 1286 }, { "epoch": 1.71, "logps_train/chosen": -56.822174072265625, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -84.07611846923828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0806734561920166, "rewards_train/margins": 3.540238618850708, "rewards_train/rejected": -3.4595651626586914, "step": 1286 }, { "epoch": 1.71, "logps_train/chosen": -77.15886688232422, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -139.53407287597656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.37786340713500977, "rewards_train/margins": 5.300020694732666, "rewards_train/rejected": -4.922157287597656, "step": 1287 }, { "epoch": 1.71, "learning_rate": 3.287690677765055e-07, "loss": 0.0776, "step": 1288 }, { "epoch": 1.71, "logps_train/chosen": -62.614532470703125, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -90.66036987304688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.001828014850616455, "rewards_train/margins": 4.078802406787872, "rewards_train/rejected": -4.076974391937256, "step": 1288 }, { "epoch": 1.71, "logps_train/chosen": -60.615699768066406, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -118.24978637695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04936763644218445, "rewards_train/margins": 4.41497203707695, "rewards_train/rejected": -4.365604400634766, "step": 1289 }, { "epoch": 1.71, "learning_rate": 3.2824707514054433e-07, "loss": 0.0342, "step": 1290 }, { "epoch": 1.71, "logps_train/chosen": -74.69686889648438, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -108.50785827636719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.013126015663147, "rewards_train/margins": 4.848286747932434, "rewards_train/rejected": -3.835160732269287, "step": 1290 }, { "epoch": 1.71, "logps_train/chosen": -64.16402435302734, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -129.58990478515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7031289935112, "rewards_train/margins": 5.807431876659393, "rewards_train/rejected": -5.104302883148193, "step": 1291 }, { "epoch": 1.72, "learning_rate": 3.2772470405572994e-07, "loss": 0.0125, "step": 1292 }, { "epoch": 1.72, "logps_train/chosen": -50.25048065185547, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -92.09669494628906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21401438117027283, "rewards_train/margins": 4.4080584943294525, "rewards_train/rejected": -4.19404411315918, "step": 1292 }, { "epoch": 1.72, "logps_train/chosen": -81.95832824707031, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -131.42608642578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9760424494743347, "rewards_train/margins": 5.707713305950165, "rewards_train/rejected": -4.73167085647583, "step": 1293 }, { "epoch": 1.72, "learning_rate": 3.272019570485559e-07, "loss": 0.0133, "step": 1294 }, { "epoch": 1.72, "logps_train/chosen": -88.58802795410156, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -152.03662109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.036926984786987305, "rewards_train/margins": 4.297986268997192, "rewards_train/rejected": -4.33491325378418, "step": 1294 }, { "epoch": 1.72, "logps_train/chosen": -67.58811950683594, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -114.74232482910156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6646257638931274, "rewards_train/margins": 4.795107960700989, "rewards_train/rejected": -4.130482196807861, "step": 1295 }, { "epoch": 1.72, "learning_rate": 3.266788366473342e-07, "loss": 0.0345, "step": 1296 }, { "epoch": 1.72, "logps_train/chosen": -89.37545776367188, "logps_train/ref_chosen": -99.0, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -152.94842529296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9780793190002441, "rewards_train/margins": 5.938547611236572, "rewards_train/rejected": -4.960468292236328, "step": 1296 }, { "epoch": 1.72, "logps_train/chosen": -79.32011413574219, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -129.9368896484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.021113455295562744, "rewards_train/margins": 4.710116446018219, "rewards_train/rejected": -4.689002990722656, "step": 1297 }, { "epoch": 1.72, "learning_rate": 3.261553453821825e-07, "loss": 0.0172, "step": 1298 }, { "epoch": 1.72, "logps_train/chosen": -68.74508666992188, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -99.09082794189453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4676787257194519, "rewards_train/margins": 4.200198829174042, "rewards_train/rejected": -3.73252010345459, "step": 1298 }, { "epoch": 1.73, "logps_train/chosen": -83.87431335449219, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -131.5478515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30319344997406006, "rewards_train/margins": 4.748603701591492, "rewards_train/rejected": -4.445410251617432, "step": 1299 }, { "epoch": 1.73, "learning_rate": 3.2563148578501227e-07, "loss": 0.0485, "step": 1300 }, { "epoch": 1.73, "logps_train/chosen": -31.09895133972168, "logps_train/ref_chosen": -37.0, "logps_train/ref_rejected": -44.5, "logps_train/rejected": -75.96563720703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5858080387115479, "rewards_train/margins": 3.7446768283843994, "rewards_train/rejected": -3.1588687896728516, "step": 1300 }, { "epoch": 1.73, "logps_train/chosen": -51.630897521972656, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -83.62006378173828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5837852954864502, "rewards_train/margins": 3.0903234481811523, "rewards_train/rejected": -2.506538152694702, "step": 1301 }, { "epoch": 1.73, "learning_rate": 3.2510726038951646e-07, "loss": 0.0963, "step": 1302 }, { "epoch": 1.73, "logps_train/chosen": -87.4047622680664, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -113.35118865966797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08218032121658325, "rewards_train/margins": 4.093471348285675, "rewards_train/rejected": -4.011291027069092, "step": 1302 }, { "epoch": 1.73, "logps_train/chosen": -74.77301025390625, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -100.62956237792969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09349971264600754, "rewards_train/margins": 3.402939848601818, "rewards_train/rejected": -3.3094401359558105, "step": 1303 }, { "epoch": 1.73, "learning_rate": 3.2458267173115737e-07, "loss": 0.0907, "step": 1304 }, { "epoch": 1.73, "logps_train/chosen": -51.64720916748047, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -105.64906311035156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1561269462108612, "rewards_train/margins": 4.755654841661453, "rewards_train/rejected": -4.9117817878723145, "step": 1304 }, { "epoch": 1.73, "logps_train/chosen": -72.72596740722656, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -123.30240631103516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5460348129272461, "rewards_train/margins": 4.737330913543701, "rewards_train/rejected": -5.283365726470947, "step": 1305 }, { "epoch": 1.73, "learning_rate": 3.24057722347154e-07, "loss": 0.0497, "step": 1306 }, { "epoch": 1.73, "logps_train/chosen": -63.780242919921875, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -103.94377136230469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1569300889968872, "rewards_train/margins": 4.264010310173035, "rewards_train/rejected": -4.420940399169922, "step": 1306 }, { "epoch": 1.74, "logps_train/chosen": -77.91183471679688, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -114.68812561035156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.10993298143148422, "rewards_train/margins": 4.554192461073399, "rewards_train/rejected": -4.664125442504883, "step": 1307 }, { "epoch": 1.74, "learning_rate": 3.235324147764703e-07, "loss": 0.0471, "step": 1308 }, { "epoch": 1.74, "logps_train/chosen": -56.85551452636719, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -109.82720947265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.664448618888855, "rewards_train/margins": 4.522169470787048, "rewards_train/rejected": -3.8577208518981934, "step": 1308 }, { "epoch": 1.74, "logps_train/chosen": -42.04686737060547, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -58.00616455078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8031256794929504, "rewards_train/margins": 3.019074499607086, "rewards_train/rejected": -2.2159488201141357, "step": 1309 }, { "epoch": 1.74, "learning_rate": 3.230067515598024e-07, "loss": 0.0603, "step": 1310 }, { "epoch": 1.74, "logps_train/chosen": -40.37076950073242, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -88.0704345703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8988605737686157, "rewards_train/margins": 3.9996544122695923, "rewards_train/rejected": -3.1007938385009766, "step": 1310 }, { "epoch": 1.74, "logps_train/chosen": -25.2216796875, "logps_train/ref_chosen": -30.5, "logps_train/ref_rejected": -33.0, "logps_train/rejected": -59.214603424072266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5270507335662842, "rewards_train/margins": 3.1594483852386475, "rewards_train/rejected": -2.6323976516723633, "step": 1311 }, { "epoch": 1.74, "learning_rate": 3.224807352395666e-07, "loss": 0.0905, "step": 1312 }, { "epoch": 1.74, "logps_train/chosen": -34.91630172729492, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -76.10845947265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3021198511123657, "rewards_train/margins": 3.6141377687454224, "rewards_train/rejected": -3.3120179176330566, "step": 1312 }, { "epoch": 1.74, "logps_train/chosen": -46.99993133544922, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -81.25511932373047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.14686810970306396, "rewards_train/margins": 3.9263006448745728, "rewards_train/rejected": -4.073168754577637, "step": 1313 }, { "epoch": 1.75, "learning_rate": 3.219543683598871e-07, "loss": 0.0724, "step": 1314 }, { "epoch": 1.75, "logps_train/chosen": -45.52054977416992, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -92.04312896728516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3526324927806854, "rewards_train/margins": 3.9659297168254852, "rewards_train/rejected": -3.6132972240448, "step": 1314 }, { "epoch": 1.75, "logps_train/chosen": -72.036376953125, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -137.9612274169922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22448745369911194, "rewards_train/margins": 5.926860481500626, "rewards_train/rejected": -5.702373027801514, "step": 1315 }, { "epoch": 1.75, "learning_rate": 3.2142765346658365e-07, "loss": 0.054, "step": 1316 }, { "epoch": 1.75, "logps_train/chosen": -77.81909942626953, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -118.39262390136719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6170035004615784, "rewards_train/margins": 4.7422032952308655, "rewards_train/rejected": -4.125199794769287, "step": 1316 }, { "epoch": 1.75, "logps_train/chosen": -83.16867065429688, "logps_train/ref_chosen": -97.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -116.9627685546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.4487583637237549, "rewards_train/margins": 5.229410171508789, "rewards_train/rejected": -3.780651807785034, "step": 1317 }, { "epoch": 1.75, "learning_rate": 3.2090059310715883e-07, "loss": 0.0192, "step": 1318 }, { "epoch": 1.75, "logps_train/chosen": -61.10783004760742, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -96.2449722290039, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4907795488834381, "rewards_train/margins": 3.826214224100113, "rewards_train/rejected": -3.335434675216675, "step": 1318 }, { "epoch": 1.75, "logps_train/chosen": -36.37995910644531, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -52.75, "logps_train/rejected": -91.6414794921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1729416847229004, "rewards_train/margins": 4.052714824676514, "rewards_train/rejected": -3.8797731399536133, "step": 1319 }, { "epoch": 1.75, "learning_rate": 3.203731898307867e-07, "loss": 0.0541, "step": 1320 }, { "epoch": 1.75, "logps_train/chosen": -54.88543701171875, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -102.17388916015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4903629422187805, "rewards_train/margins": 4.4983771443367, "rewards_train/rejected": -4.00801420211792, "step": 1320 }, { "epoch": 1.75, "logps_train/chosen": -75.27879333496094, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -126.5898666381836, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5471201539039612, "rewards_train/margins": 5.068606436252594, "rewards_train/rejected": -4.521486282348633, "step": 1321 }, { "epoch": 1.76, "learning_rate": 3.1984544618829923e-07, "loss": 0.0172, "step": 1322 }, { "epoch": 1.76, "logps_train/chosen": -81.05743408203125, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -130.18179321289062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.09636864066123962, "rewards_train/margins": 4.442123562097549, "rewards_train/rejected": -4.538492202758789, "step": 1322 }, { "epoch": 1.76, "logps_train/chosen": -45.12523651123047, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -89.20333862304688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9437263011932373, "rewards_train/margins": 4.018747806549072, "rewards_train/rejected": -3.075021505355835, "step": 1323 }, { "epoch": 1.76, "learning_rate": 3.1931736473217517e-07, "loss": 0.0527, "step": 1324 }, { "epoch": 1.76, "logps_train/chosen": -53.421531677246094, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -116.16659545898438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.08121564239263535, "rewards_train/margins": 5.327631212770939, "rewards_train/rejected": -5.408846855163574, "step": 1324 }, { "epoch": 1.76, "logps_train/chosen": -54.0560188293457, "logps_train/ref_chosen": -56.75, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -91.33196258544922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26002299785614014, "rewards_train/margins": 3.7357975244522095, "rewards_train/rejected": -3.4757745265960693, "step": 1325 }, { "epoch": 1.76, "learning_rate": 3.1878894801652673e-07, "loss": 0.0372, "step": 1326 }, { "epoch": 1.76, "logps_train/chosen": -34.62523651123047, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -37.25, "logps_train/rejected": -66.30241394042969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8921639323234558, "rewards_train/margins": 3.801311194896698, "rewards_train/rejected": -2.909147262573242, "step": 1326 }, { "epoch": 1.76, "logps_train/chosen": -78.89883422851562, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -117.92451477050781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6773035526275635, "rewards_train/margins": 4.893192529678345, "rewards_train/rejected": -4.215888977050781, "step": 1327 }, { "epoch": 1.76, "learning_rate": 3.182601985970878e-07, "loss": 0.0394, "step": 1328 }, { "epoch": 1.76, "logps_train/chosen": -70.3931655883789, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -94.61600494384766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.41700220108032227, "rewards_train/margins": 3.1875672340393066, "rewards_train/rejected": -3.604569435119629, "step": 1328 }, { "epoch": 1.76, "logps_train/chosen": -67.95346069335938, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -103.56622314453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0749666839838028, "rewards_train/margins": 4.701901212334633, "rewards_train/rejected": -4.62693452835083, "step": 1329 }, { "epoch": 1.77, "learning_rate": 3.177311190312015e-07, "loss": 0.0359, "step": 1330 }, { "epoch": 1.77, "logps_train/chosen": -53.2960319519043, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -100.29216766357422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3296031057834625, "rewards_train/margins": 4.238676160573959, "rewards_train/rejected": -4.568279266357422, "step": 1330 }, { "epoch": 1.77, "logps_train/chosen": -41.40813446044922, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -75.69696044921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.29237616062164307, "rewards_train/margins": 2.7179449796676636, "rewards_train/rejected": -3.0103211402893066, "step": 1331 }, { "epoch": 1.77, "learning_rate": 3.172017118778075e-07, "loss": 0.0622, "step": 1332 }, { "epoch": 1.77, "logps_train/chosen": -51.72608184814453, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -94.88812255859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.04956158250570297, "rewards_train/margins": 3.7173761501908302, "rewards_train/rejected": -3.766937732696533, "step": 1332 }, { "epoch": 1.77, "logps_train/chosen": -39.467689514160156, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -94.80979919433594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7989343404769897, "rewards_train/margins": 5.029133200645447, "rewards_train/rejected": -4.230198860168457, "step": 1333 }, { "epoch": 1.77, "learning_rate": 3.166719796974301e-07, "loss": 0.0445, "step": 1334 }, { "epoch": 1.77, "logps_train/chosen": -58.66218185424805, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -140.8789520263672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4697190523147583, "rewards_train/margins": 5.642769932746887, "rewards_train/rejected": -5.173050880432129, "step": 1334 }, { "epoch": 1.77, "logps_train/chosen": -61.016143798828125, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -102.59158325195312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07026086747646332, "rewards_train/margins": 4.2325442880392075, "rewards_train/rejected": -4.162283420562744, "step": 1335 }, { "epoch": 1.77, "learning_rate": 3.161419250521654e-07, "loss": 0.025, "step": 1336 }, { "epoch": 1.77, "logps_train/chosen": -64.99016571044922, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -109.13330078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.060358256101608276, "rewards_train/margins": 4.165876120328903, "rewards_train/rejected": -4.105517864227295, "step": 1336 }, { "epoch": 1.78, "logps_train/chosen": -47.07199478149414, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -94.9906997680664, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2529566287994385, "rewards_train/margins": 3.999682903289795, "rewards_train/rejected": -3.7467262744903564, "step": 1337 }, { "epoch": 1.78, "learning_rate": 3.156115505056695e-07, "loss": 0.0522, "step": 1338 }, { "epoch": 1.78, "logps_train/chosen": -52.91387939453125, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -85.16996002197266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0101745128631592, "rewards_train/margins": 3.552170515060425, "rewards_train/rejected": -2.5419960021972656, "step": 1338 }, { "epoch": 1.78, "logps_train/chosen": -70.00985717773438, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -128.4418487548828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18338876962661743, "rewards_train/margins": 5.141636431217194, "rewards_train/rejected": -4.958247661590576, "step": 1339 }, { "epoch": 1.78, "learning_rate": 3.150808586231452e-07, "loss": 0.065, "step": 1340 }, { "epoch": 1.78, "logps_train/chosen": -53.93753433227539, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -104.31352233886719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32187145948410034, "rewards_train/margins": 4.325098216533661, "rewards_train/rejected": -4.0032267570495605, "step": 1340 }, { "epoch": 1.78, "logps_train/chosen": -69.31382751464844, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -113.26199340820312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.22669503092765808, "rewards_train/margins": 4.296379595994949, "rewards_train/rejected": -4.523074626922607, "step": 1341 }, { "epoch": 1.78, "learning_rate": 3.145498519713306e-07, "loss": 0.0498, "step": 1342 }, { "epoch": 1.78, "logps_train/chosen": -61.64352798461914, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -97.69657135009766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5583034753799438, "rewards_train/margins": 4.020148158073425, "rewards_train/rejected": -3.4618446826934814, "step": 1342 }, { "epoch": 1.78, "logps_train/chosen": -54.30946350097656, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -103.72923278808594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.04813361167907715, "rewards_train/margins": 4.229477167129517, "rewards_train/rejected": -4.277610778808594, "step": 1343 }, { "epoch": 1.78, "learning_rate": 3.1401853311848596e-07, "loss": 0.0545, "step": 1344 }, { "epoch": 1.78, "logps_train/chosen": -53.456260681152344, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -86.74205780029297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4582805633544922, "rewards_train/margins": 4.213736534118652, "rewards_train/rejected": -3.75545597076416, "step": 1344 }, { "epoch": 1.79, "logps_train/chosen": -50.258140563964844, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -74.61735534667969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.104720339179039, "rewards_train/margins": 2.8138508945703506, "rewards_train/rejected": -2.9185712337493896, "step": 1345 }, { "epoch": 1.79, "learning_rate": 3.1348690463438165e-07, "loss": 0.1014, "step": 1346 }, { "epoch": 1.79, "logps_train/chosen": -68.87425231933594, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -118.25076293945312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3375745415687561, "rewards_train/margins": 4.933354914188385, "rewards_train/rejected": -4.595780372619629, "step": 1346 }, { "epoch": 1.79, "logps_train/chosen": -50.996910095214844, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -82.05500030517578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4098471999168396, "rewards_train/margins": 2.964402973651886, "rewards_train/rejected": -3.3742501735687256, "step": 1347 }, { "epoch": 1.79, "learning_rate": 3.1295496909028543e-07, "loss": 0.1072, "step": 1348 }, { "epoch": 1.79, "logps_train/chosen": -86.40998840332031, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -153.81192016601562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0511549711227417, "rewards_train/margins": 5.5425368547439575, "rewards_train/rejected": -6.593691825866699, "step": 1348 }, { "epoch": 1.79, "logps_train/chosen": -30.314226150512695, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -36.75, "logps_train/rejected": -69.81155395507812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5365461111068726, "rewards_train/margins": 3.830983281135559, "rewards_train/rejected": -3.2944371700286865, "step": 1349 }, { "epoch": 1.79, "learning_rate": 3.1242272905895046e-07, "loss": 0.1951, "step": 1350 }, { "epoch": 1.79, "logps_train/chosen": -68.84539794921875, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -114.88736724853516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5037418007850647, "rewards_train/margins": 4.87685352563858, "rewards_train/rejected": -4.373111724853516, "step": 1350 }, { "epoch": 1.79, "logps_train/chosen": -59.06312561035156, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -139.22659301757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20931226015090942, "rewards_train/margins": 5.7147844433784485, "rewards_train/rejected": -5.505472183227539, "step": 1351 }, { "epoch": 1.8, "learning_rate": 3.118901871146022e-07, "loss": 0.0268, "step": 1352 }, { "epoch": 1.8, "logps_train/chosen": -46.557125091552734, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -124.31275177001953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4902836084365845, "rewards_train/margins": 5.107496857643127, "rewards_train/rejected": -4.617213249206543, "step": 1352 }, { "epoch": 1.8, "logps_train/chosen": -40.084022521972656, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -99.239013671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8228479623794556, "rewards_train/margins": 4.921749711036682, "rewards_train/rejected": -4.098901748657227, "step": 1353 }, { "epoch": 1.8, "learning_rate": 3.1135734583292673e-07, "loss": 0.0388, "step": 1354 }, { "epoch": 1.8, "logps_train/chosen": -51.53060531616211, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -91.71237182617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17662709951400757, "rewards_train/margins": 4.291614472866058, "rewards_train/rejected": -4.114987373352051, "step": 1354 }, { "epoch": 1.8, "logps_train/chosen": -66.9827651977539, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -119.54243469238281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.21546393632888794, "rewards_train/margins": 5.341806590557098, "rewards_train/rejected": -5.557270526885986, "step": 1355 }, { "epoch": 1.8, "learning_rate": 3.108242077910576e-07, "loss": 0.05, "step": 1356 }, { "epoch": 1.8, "logps_train/chosen": -49.62389373779297, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -102.19573211669922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5012826919555664, "rewards_train/margins": 3.2974185943603516, "rewards_train/rejected": -2.796135902404785, "step": 1356 }, { "epoch": 1.8, "logps_train/chosen": -44.61186599731445, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -85.93205261230469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10600095987319946, "rewards_train/margins": 3.443933069705963, "rewards_train/rejected": -3.3379321098327637, "step": 1357 }, { "epoch": 1.8, "learning_rate": 3.102907755675638e-07, "loss": 0.1483, "step": 1358 }, { "epoch": 1.8, "logps_train/chosen": -63.868865966796875, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -114.36643981933594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0756137371063232, "rewards_train/margins": 5.237258195877075, "rewards_train/rejected": -4.161644458770752, "step": 1358 }, { "epoch": 1.8, "logps_train/chosen": -86.20753479003906, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -117.86534118652344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10580922663211823, "rewards_train/margins": 4.81734324991703, "rewards_train/rejected": -4.711534023284912, "step": 1359 }, { "epoch": 1.81, "learning_rate": 3.097570517424373e-07, "loss": 0.0238, "step": 1360 }, { "epoch": 1.81, "logps_train/chosen": -78.66271209716797, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -138.128662109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.44349443912506104, "rewards_train/margins": 5.445423245429993, "rewards_train/rejected": -5.001928806304932, "step": 1360 }, { "epoch": 1.81, "logps_train/chosen": -87.16780090332031, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -95.0, "logps_train/rejected": -141.34060668945312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15353183448314667, "rewards_train/margins": 4.781342312693596, "rewards_train/rejected": -4.627810478210449, "step": 1361 }, { "epoch": 1.81, "learning_rate": 3.0922303889708007e-07, "loss": 0.0549, "step": 1362 }, { "epoch": 1.81, "logps_train/chosen": -74.5985107421875, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -122.53484344482422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12471897900104523, "rewards_train/margins": 4.932500675320625, "rewards_train/rejected": -4.80778169631958, "step": 1362 }, { "epoch": 1.81, "logps_train/chosen": -65.30064392089844, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -105.40562438964844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4910295903682709, "rewards_train/margins": 4.389404147863388, "rewards_train/rejected": -3.898374557495117, "step": 1363 }, { "epoch": 1.81, "learning_rate": 3.0868873961429225e-07, "loss": 0.0566, "step": 1364 }, { "epoch": 1.81, "logps_train/chosen": -83.2791748046875, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -113.5, "logps_train/rejected": -178.60520935058594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4408326745033264, "rewards_train/margins": 6.935729563236237, "rewards_train/rejected": -6.49489688873291, "step": 1364 }, { "epoch": 1.81, "logps_train/chosen": -56.840614318847656, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -104.32028198242188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.02296966314315796, "rewards_train/margins": 4.161247670650482, "rewards_train/rejected": -4.138278007507324, "step": 1365 }, { "epoch": 1.81, "learning_rate": 3.081541564782592e-07, "loss": 0.0369, "step": 1366 }, { "epoch": 1.81, "logps_train/chosen": -63.11859130859375, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -109.56233215332031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5115785598754883, "rewards_train/margins": 4.5162482261657715, "rewards_train/rejected": -4.004669666290283, "step": 1366 }, { "epoch": 1.82, "logps_train/chosen": -41.38029479980469, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -69.97128295898438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5994703769683838, "rewards_train/margins": 3.5626144409179688, "rewards_train/rejected": -2.963144063949585, "step": 1367 }, { "epoch": 1.82, "learning_rate": 3.0761929207453935e-07, "loss": 0.0316, "step": 1368 }, { "epoch": 1.82, "logps_train/chosen": -49.12171173095703, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -93.09168243408203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8253287076950073, "rewards_train/margins": 3.757153868675232, "rewards_train/rejected": -2.9318251609802246, "step": 1368 }, { "epoch": 1.82, "logps_train/chosen": -59.18602752685547, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -104.74286651611328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7985846996307373, "rewards_train/margins": 4.64474630355835, "rewards_train/rejected": -3.8461616039276123, "step": 1369 }, { "epoch": 1.82, "learning_rate": 3.0708414899005126e-07, "loss": 0.0386, "step": 1370 }, { "epoch": 1.82, "logps_train/chosen": -66.77318572998047, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -100.14771270751953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23869720101356506, "rewards_train/margins": 3.458155781030655, "rewards_train/rejected": -3.21945858001709, "step": 1370 }, { "epoch": 1.82, "logps_train/chosen": -35.78307342529297, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -86.46635437011719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.027526073157787323, "rewards_train/margins": 3.8073905184865, "rewards_train/rejected": -3.834916591644287, "step": 1371 }, { "epoch": 1.82, "learning_rate": 3.065487298130615e-07, "loss": 0.0487, "step": 1372 }, { "epoch": 1.82, "logps_train/chosen": -54.715187072753906, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -95.5133056640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.04847194254398346, "rewards_train/margins": 4.31301449239254, "rewards_train/rejected": -4.361486434936523, "step": 1372 }, { "epoch": 1.82, "logps_train/chosen": -81.32508850097656, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -126.1264877319336, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7253035306930542, "rewards_train/margins": 4.677796721458435, "rewards_train/rejected": -3.952493190765381, "step": 1373 }, { "epoch": 1.82, "learning_rate": 3.0601303713317193e-07, "loss": 0.0426, "step": 1374 }, { "epoch": 1.82, "logps_train/chosen": -65.03947448730469, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -117.6639175415039, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26489973068237305, "rewards_train/margins": 4.2225022315979, "rewards_train/rejected": -3.9576025009155273, "step": 1374 }, { "epoch": 1.83, "logps_train/chosen": -52.906349182128906, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -98.38458251953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9304587244987488, "rewards_train/margins": 5.2571980357170105, "rewards_train/rejected": -4.326739311218262, "step": 1375 }, { "epoch": 1.83, "learning_rate": 3.0547707354130734e-07, "loss": 0.0586, "step": 1376 }, { "epoch": 1.83, "logps_train/chosen": -53.370025634765625, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -110.36749267578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4239352345466614, "rewards_train/margins": 4.55443412065506, "rewards_train/rejected": -4.130498886108398, "step": 1376 }, { "epoch": 1.83, "logps_train/chosen": -85.61115264892578, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -148.0464630126953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12325990200042725, "rewards_train/margins": 5.131031394004822, "rewards_train/rejected": -5.0077714920043945, "step": 1377 }, { "epoch": 1.83, "learning_rate": 3.049408416297026e-07, "loss": 0.0178, "step": 1378 }, { "epoch": 1.83, "logps_train/chosen": -52.071075439453125, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -115.71488952636719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3452361524105072, "rewards_train/margins": 5.394850105047226, "rewards_train/rejected": -5.049613952636719, "step": 1378 }, { "epoch": 1.83, "logps_train/chosen": -41.01616287231445, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -87.26249694824219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04103497788310051, "rewards_train/margins": 3.7391598634421825, "rewards_train/rejected": -3.698124885559082, "step": 1379 }, { "epoch": 1.83, "learning_rate": 3.044043439918907e-07, "loss": 0.0449, "step": 1380 }, { "epoch": 1.83, "logps_train/chosen": -56.04023742675781, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -98.2576904296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.38191360235214233, "rewards_train/margins": 4.482682645320892, "rewards_train/rejected": -4.10076904296875, "step": 1380 }, { "epoch": 1.83, "logps_train/chosen": -42.693115234375, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -97.98914337158203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28459465503692627, "rewards_train/margins": 3.8905402421951294, "rewards_train/rejected": -3.605945587158203, "step": 1381 }, { "epoch": 1.84, "learning_rate": 3.038675832226893e-07, "loss": 0.0483, "step": 1382 }, { "epoch": 1.84, "logps_train/chosen": -66.4197998046875, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -93.89173889160156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.028331995010376, "rewards_train/margins": 4.651880502700806, "rewards_train/rejected": -3.6235485076904297, "step": 1382 }, { "epoch": 1.84, "logps_train/chosen": -61.731849670410156, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -104.83549499511719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2596275508403778, "rewards_train/margins": 3.9259893596172333, "rewards_train/rejected": -3.6663618087768555, "step": 1383 }, { "epoch": 1.84, "learning_rate": 3.0333056191818925e-07, "loss": 0.0232, "step": 1384 }, { "epoch": 1.84, "logps_train/chosen": -66.45545196533203, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -106.81305694580078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24820439517498016, "rewards_train/margins": 4.028728649020195, "rewards_train/rejected": -3.780524253845215, "step": 1384 }, { "epoch": 1.84, "logps_train/chosen": -85.05638122558594, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -109.67033386230469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5771746039390564, "rewards_train/margins": 4.189520061016083, "rewards_train/rejected": -3.6123454570770264, "step": 1385 }, { "epoch": 1.84, "learning_rate": 3.027932826757411e-07, "loss": 0.0873, "step": 1386 }, { "epoch": 1.84, "logps_train/chosen": -43.98091125488281, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -82.68589782714844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28550249338150024, "rewards_train/margins": 3.1714263558387756, "rewards_train/rejected": -2.8859238624572754, "step": 1386 }, { "epoch": 1.84, "logps_train/chosen": -60.19977569580078, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -119.281494140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.04966506361961365, "rewards_train/margins": 4.203484445810318, "rewards_train/rejected": -4.253149509429932, "step": 1387 }, { "epoch": 1.84, "learning_rate": 3.022557480939432e-07, "loss": 0.0857, "step": 1388 }, { "epoch": 1.84, "logps_train/chosen": -59.94446563720703, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -121.92713928222656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.09444694221019745, "rewards_train/margins": 4.720141604542732, "rewards_train/rejected": -4.81458854675293, "step": 1388 }, { "epoch": 1.84, "logps_train/chosen": -69.99658966064453, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -134.48577880859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9550284147262573, "rewards_train/margins": 5.450480818748474, "rewards_train/rejected": -4.495452404022217, "step": 1389 }, { "epoch": 1.85, "learning_rate": 3.017179607726288e-07, "loss": 0.0496, "step": 1390 }, { "epoch": 1.85, "logps_train/chosen": -46.09320068359375, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -91.91936492919922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31724274158477783, "rewards_train/margins": 4.6708985567092896, "rewards_train/rejected": -4.353655815124512, "step": 1390 }, { "epoch": 1.85, "logps_train/chosen": -88.20574951171875, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -150.81411743164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16848719120025635, "rewards_train/margins": 5.192086815834045, "rewards_train/rejected": -5.023599624633789, "step": 1391 }, { "epoch": 1.85, "learning_rate": 3.0117992331285346e-07, "loss": 0.0288, "step": 1392 }, { "epoch": 1.85, "logps_train/chosen": -76.96282958984375, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -123.3883056640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4216860234737396, "rewards_train/margins": 4.7558296620845795, "rewards_train/rejected": -4.33414363861084, "step": 1392 }, { "epoch": 1.85, "logps_train/chosen": -71.67112731933594, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -121.38059997558594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4161360263824463, "rewards_train/margins": 4.078564882278442, "rewards_train/rejected": -4.494700908660889, "step": 1393 }, { "epoch": 1.85, "learning_rate": 3.0064163831688274e-07, "loss": 0.0609, "step": 1394 }, { "epoch": 1.85, "logps_train/chosen": -58.07894515991211, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -111.1194839477539, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.26414453983306885, "rewards_train/margins": 3.816553473472595, "rewards_train/rejected": -4.080698013305664, "step": 1394 }, { "epoch": 1.85, "logps_train/chosen": -47.85907745361328, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -96.53277587890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.380498468875885, "rewards_train/margins": 4.1837761998176575, "rewards_train/rejected": -3.8032777309417725, "step": 1395 }, { "epoch": 1.85, "learning_rate": 3.001031083881791e-07, "loss": 0.0422, "step": 1396 }, { "epoch": 1.85, "logps_train/chosen": -49.59112548828125, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -97.49778747558594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5502627491950989, "rewards_train/margins": 4.206291019916534, "rewards_train/rejected": -3.6560282707214355, "step": 1396 }, { "epoch": 1.86, "logps_train/chosen": -52.57244110107422, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -110.08491516113281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.055255651473999, "rewards_train/margins": 5.044997453689575, "rewards_train/rejected": -3.989741802215576, "step": 1397 }, { "epoch": 1.86, "learning_rate": 2.995643361313901e-07, "loss": 0.0265, "step": 1398 }, { "epoch": 1.86, "logps_train/chosen": -52.32046127319336, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -116.98666381835938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.9452974796295166, "rewards_train/margins": 4.649042367935181, "rewards_train/rejected": -3.703744888305664, "step": 1398 }, { "epoch": 1.86, "logps_train/chosen": -48.648155212402344, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -90.46015930175781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12971559166908264, "rewards_train/margins": 4.2390128672122955, "rewards_train/rejected": -4.109297275543213, "step": 1399 }, { "epoch": 1.86, "learning_rate": 2.990253241523349e-07, "loss": 0.034, "step": 1400 }, { "epoch": 1.86, "logps_train/chosen": -61.207550048828125, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -99.67433166503906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6120576858520508, "rewards_train/margins": 4.791991233825684, "rewards_train/rejected": -4.179933547973633, "step": 1400 }, { "epoch": 1.86, "logps_train/chosen": -74.7815170288086, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -109.3410873413086, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.3038794994354248, "rewards_train/margins": 5.159863471984863, "rewards_train/rejected": -3.8559839725494385, "step": 1401 }, { "epoch": 1.86, "learning_rate": 2.9848607505799245e-07, "loss": 0.0403, "step": 1402 }, { "epoch": 1.86, "logps_train/chosen": -35.02925109863281, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -101.90641021728516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7111372947692871, "rewards_train/margins": 4.845528602600098, "rewards_train/rejected": -4.1343913078308105, "step": 1402 }, { "epoch": 1.86, "logps_train/chosen": -80.09613037109375, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -110.01890563964844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19741851091384888, "rewards_train/margins": 3.974309265613556, "rewards_train/rejected": -3.776890754699707, "step": 1403 }, { "epoch": 1.86, "learning_rate": 2.9794659145648814e-07, "loss": 0.0535, "step": 1404 }, { "epoch": 1.86, "logps_train/chosen": -50.37534713745117, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -93.46333312988281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21715278923511505, "rewards_train/margins": 3.934580758213997, "rewards_train/rejected": -3.717427968978882, "step": 1404 }, { "epoch": 1.87, "logps_train/chosen": -60.197166442871094, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -121.78974914550781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.19002953171730042, "rewards_train/margins": 5.2483201920986176, "rewards_train/rejected": -5.438349723815918, "step": 1405 }, { "epoch": 1.87, "learning_rate": 2.97406875957082e-07, "loss": 0.0632, "step": 1406 }, { "epoch": 1.87, "logps_train/chosen": -51.564117431640625, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -102.11001586914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6717129945755005, "rewards_train/margins": 3.973730683326721, "rewards_train/rejected": -3.3020176887512207, "step": 1406 }, { "epoch": 1.87, "logps_train/chosen": -49.958805084228516, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -96.32981872558594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5056820511817932, "rewards_train/margins": 3.9699142575263977, "rewards_train/rejected": -3.4642322063446045, "step": 1407 }, { "epoch": 1.87, "learning_rate": 2.968669311701555e-07, "loss": 0.0476, "step": 1408 }, { "epoch": 1.87, "logps_train/chosen": -54.87819290161133, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -117.6095962524414, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.877805769443512, "rewards_train/margins": 6.093452990055084, "rewards_train/rejected": -5.215647220611572, "step": 1408 }, { "epoch": 1.87, "logps_train/chosen": -79.81818389892578, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -124.37107849121094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15568169951438904, "rewards_train/margins": 5.692789167165756, "rewards_train/rejected": -5.537107467651367, "step": 1409 }, { "epoch": 1.87, "learning_rate": 2.963267597071988e-07, "loss": 0.0149, "step": 1410 }, { "epoch": 1.87, "logps_train/chosen": -32.90620422363281, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -34.5, "logps_train/rejected": -55.9189567565918, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.2570356130599976, "rewards_train/margins": 3.4086965322494507, "rewards_train/rejected": -2.151660919189453, "step": 1410 }, { "epoch": 1.87, "logps_train/chosen": -48.267337799072266, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -87.08444213867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.44748491048812866, "rewards_train/margins": 4.073897540569305, "rewards_train/rejected": -3.6264126300811768, "step": 1411 }, { "epoch": 1.88, "learning_rate": 2.9578636418079894e-07, "loss": 0.0714, "step": 1412 }, { "epoch": 1.88, "logps_train/chosen": -92.38131713867188, "logps_train/ref_chosen": -90.5, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -156.2255859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1725068986415863, "rewards_train/margins": 5.25630322098732, "rewards_train/rejected": -5.428810119628906, "step": 1412 }, { "epoch": 1.88, "logps_train/chosen": -63.55613708496094, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -116.3621826171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.15366077423095703, "rewards_train/margins": 4.344276428222656, "rewards_train/rejected": -4.497937202453613, "step": 1413 }, { "epoch": 1.88, "learning_rate": 2.952457472046261e-07, "loss": 0.0124, "step": 1414 }, { "epoch": 1.88, "logps_train/chosen": -74.3561782836914, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -148.5107879638672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32375743985176086, "rewards_train/margins": 5.1373365223407745, "rewards_train/rejected": -4.813579082489014, "step": 1414 }, { "epoch": 1.88, "logps_train/chosen": -44.059532165527344, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -79.52403259277344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11748401820659637, "rewards_train/margins": 3.5456690043210983, "rewards_train/rejected": -3.428184986114502, "step": 1415 }, { "epoch": 1.88, "learning_rate": 2.947049113934219e-07, "loss": 0.0369, "step": 1416 }, { "epoch": 1.88, "logps_train/chosen": -70.86552429199219, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -107.1171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.025947168469429016, "rewards_train/margins": 3.9083684235811234, "rewards_train/rejected": -3.8824212551116943, "step": 1416 }, { "epoch": 1.88, "logps_train/chosen": -48.52073669433594, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -99.45140075683594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.45339542627334595, "rewards_train/margins": 4.861035883426666, "rewards_train/rejected": -4.40764045715332, "step": 1417 }, { "epoch": 1.88, "learning_rate": 2.941638593629863e-07, "loss": 0.0284, "step": 1418 }, { "epoch": 1.88, "logps_train/chosen": -35.82040023803711, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -98.71434783935547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.37468594312667847, "rewards_train/margins": 4.558620631694794, "rewards_train/rejected": -4.183934688568115, "step": 1418 }, { "epoch": 1.88, "logps_train/chosen": -42.31189727783203, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -79.3904037475586, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5063104033470154, "rewards_train/margins": 4.03519481420517, "rewards_train/rejected": -3.5288844108581543, "step": 1419 }, { "epoch": 1.89, "learning_rate": 2.9362259373016483e-07, "loss": 0.0267, "step": 1420 }, { "epoch": 1.89, "logps_train/chosen": -60.00281524658203, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -122.89885711669922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18682795763015747, "rewards_train/margins": 5.225542366504669, "rewards_train/rejected": -5.038714408874512, "step": 1420 }, { "epoch": 1.89, "logps_train/chosen": -52.390525817871094, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -99.8812484741211, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3828224539756775, "rewards_train/margins": 4.5131348967552185, "rewards_train/rejected": -4.130312442779541, "step": 1421 }, { "epoch": 1.89, "learning_rate": 2.9308111711283633e-07, "loss": 0.026, "step": 1422 }, { "epoch": 1.89, "logps_train/chosen": -70.5267105102539, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -127.44424438476562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.2817041873931885, "rewards_train/margins": 5.202691316604614, "rewards_train/rejected": -4.920987129211426, "step": 1422 }, { "epoch": 1.89, "logps_train/chosen": -55.425819396972656, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -120.81999206542969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.046488210558891296, "rewards_train/margins": 4.937073782086372, "rewards_train/rejected": -4.983561992645264, "step": 1423 }, { "epoch": 1.89, "learning_rate": 2.925394321299002e-07, "loss": 0.12, "step": 1424 }, { "epoch": 1.89, "logps_train/chosen": -64.47172546386719, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -129.3775634765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09188999235630035, "rewards_train/margins": 5.49527134001255, "rewards_train/rejected": -5.40338134765625, "step": 1424 }, { "epoch": 1.89, "logps_train/chosen": -107.11774444580078, "logps_train/ref_chosen": -104.5, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -143.26275634765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.23833663761615753, "rewards_train/margins": 4.184032365679741, "rewards_train/rejected": -4.422369003295898, "step": 1425 }, { "epoch": 1.89, "learning_rate": 2.919975414012632e-07, "loss": 0.0337, "step": 1426 }, { "epoch": 1.89, "logps_train/chosen": -74.57425689697266, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -138.93374633789062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22460520267486572, "rewards_train/margins": 5.713682770729065, "rewards_train/rejected": -5.489077568054199, "step": 1426 }, { "epoch": 1.9, "logps_train/chosen": -39.20121383666992, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -93.17411804199219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8892538547515869, "rewards_train/margins": 5.139869928359985, "rewards_train/rejected": -4.250616073608398, "step": 1427 }, { "epoch": 1.9, "learning_rate": 2.9145544754782766e-07, "loss": 0.0245, "step": 1428 }, { "epoch": 1.9, "logps_train/chosen": -82.14473724365234, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -128.3704833984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.527713418006897, "rewards_train/margins": 4.969449162483215, "rewards_train/rejected": -4.441735744476318, "step": 1428 }, { "epoch": 1.9, "logps_train/chosen": -35.23230743408203, "logps_train/ref_chosen": -35.25, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -84.66827392578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0009880661964416504, "rewards_train/margins": 3.5846124291419983, "rewards_train/rejected": -3.5836243629455566, "step": 1429 }, { "epoch": 1.9, "learning_rate": 2.909131531914779e-07, "loss": 0.0461, "step": 1430 }, { "epoch": 1.9, "logps_train/chosen": -96.21036529541016, "logps_train/ref_chosen": -96.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -153.4139404296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.002792343497276306, "rewards_train/margins": 4.91918657720089, "rewards_train/rejected": -4.916394233703613, "step": 1430 }, { "epoch": 1.9, "logps_train/chosen": -79.72184753417969, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -122.22181701660156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8121906518936157, "rewards_train/margins": 5.806247353553772, "rewards_train/rejected": -4.994056701660156, "step": 1431 }, { "epoch": 1.9, "learning_rate": 2.9037066095506844e-07, "loss": 0.0229, "step": 1432 }, { "epoch": 1.9, "logps_train/chosen": -69.42741394042969, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -107.76435089111328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.009602285921573639, "rewards_train/margins": 4.163380362093449, "rewards_train/rejected": -4.153778076171875, "step": 1432 }, { "epoch": 1.9, "logps_train/chosen": -56.00580596923828, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -116.01048278808594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3276299834251404, "rewards_train/margins": 4.127505719661713, "rewards_train/rejected": -3.7998757362365723, "step": 1433 }, { "epoch": 1.9, "learning_rate": 2.898279734624105e-07, "loss": 0.0571, "step": 1434 }, { "epoch": 1.9, "logps_train/chosen": -42.48843002319336, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -98.35917663574219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8495945930480957, "rewards_train/margins": 5.057387351989746, "rewards_train/rejected": -4.20779275894165, "step": 1434 }, { "epoch": 1.91, "logps_train/chosen": -66.51753234863281, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -129.76156616210938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35605907440185547, "rewards_train/margins": 5.3915910720825195, "rewards_train/rejected": -5.035531997680664, "step": 1435 }, { "epoch": 1.91, "learning_rate": 2.8928509333825986e-07, "loss": 0.0115, "step": 1436 }, { "epoch": 1.91, "logps_train/chosen": -53.16557312011719, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -116.50257110595703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12016167491674423, "rewards_train/margins": 4.847762726247311, "rewards_train/rejected": -4.727601051330566, "step": 1436 }, { "epoch": 1.91, "logps_train/chosen": -52.30064392089844, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -98.35253143310547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6800918579101562, "rewards_train/margins": 5.034095764160156, "rewards_train/rejected": -4.35400390625, "step": 1437 }, { "epoch": 1.91, "learning_rate": 2.88742023208304e-07, "loss": 0.0195, "step": 1438 }, { "epoch": 1.91, "logps_train/chosen": -65.48428344726562, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -108.00885772705078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8148536682128906, "rewards_train/margins": 5.8329267501831055, "rewards_train/rejected": -5.018073081970215, "step": 1438 }, { "epoch": 1.91, "logps_train/chosen": -36.332275390625, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -86.35623931884766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2652098536491394, "rewards_train/margins": 4.152787387371063, "rewards_train/rejected": -3.887577533721924, "step": 1439 }, { "epoch": 1.91, "learning_rate": 2.881987656991491e-07, "loss": 0.0465, "step": 1440 }, { "epoch": 1.91, "logps_train/chosen": -41.27156066894531, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -40.0, "logps_train/rejected": -79.32283020019531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.14121852815151215, "rewards_train/margins": 3.80200232565403, "rewards_train/rejected": -3.943220853805542, "step": 1440 }, { "epoch": 1.91, "logps_train/chosen": -42.398807525634766, "logps_train/ref_chosen": -37.5, "logps_train/ref_rejected": -36.5, "logps_train/rejected": -76.52741241455078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.49066174030303955, "rewards_train/margins": 3.5167664289474487, "rewards_train/rejected": -4.007428169250488, "step": 1441 }, { "epoch": 1.92, "learning_rate": 2.8765532343830815e-07, "loss": 0.0917, "step": 1442 }, { "epoch": 1.92, "logps_train/chosen": -84.64006805419922, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -167.8251953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7171317338943481, "rewards_train/margins": 5.268512606620789, "rewards_train/rejected": -5.985644340515137, "step": 1442 }, { "epoch": 1.92, "logps_train/chosen": -79.37599182128906, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -125.7872085571289, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3108382821083069, "rewards_train/margins": 5.1208091378211975, "rewards_train/rejected": -4.809970855712891, "step": 1443 }, { "epoch": 1.92, "learning_rate": 2.8711169905418714e-07, "loss": 0.0132, "step": 1444 }, { "epoch": 1.92, "logps_train/chosen": -68.53081512451172, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -125.01752471923828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22816842794418335, "rewards_train/margins": 5.012734353542328, "rewards_train/rejected": -4.7845659255981445, "step": 1444 }, { "epoch": 1.92, "logps_train/chosen": -35.465431213378906, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -99.49406433105469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04056597501039505, "rewards_train/margins": 4.3977847173810005, "rewards_train/rejected": -4.3572187423706055, "step": 1445 }, { "epoch": 1.92, "learning_rate": 2.8656789517607326e-07, "loss": 0.0286, "step": 1446 }, { "epoch": 1.92, "logps_train/chosen": -68.00939178466797, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -82.16260528564453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2365611493587494, "rewards_train/margins": 3.5715712010860443, "rewards_train/rejected": -3.335010051727295, "step": 1446 }, { "epoch": 1.92, "logps_train/chosen": -50.444820404052734, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -90.63224792480469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3574712872505188, "rewards_train/margins": 4.203117668628693, "rewards_train/rejected": -3.845646381378174, "step": 1447 }, { "epoch": 1.92, "learning_rate": 2.860239144341217e-07, "loss": 0.041, "step": 1448 }, { "epoch": 1.92, "logps_train/chosen": -86.54007720947266, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -143.39755249023438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3381800055503845, "rewards_train/margins": 5.865435063838959, "rewards_train/rejected": -5.527255058288574, "step": 1448 }, { "epoch": 1.92, "logps_train/chosen": -50.87220764160156, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -99.9171142578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.42527899146080017, "rewards_train/margins": 4.24980291724205, "rewards_train/rejected": -3.82452392578125, "step": 1449 }, { "epoch": 1.93, "learning_rate": 2.8547975945934317e-07, "loss": 0.0531, "step": 1450 }, { "epoch": 1.93, "logps_train/chosen": -58.34137725830078, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -139.39535522460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30961209535598755, "rewards_train/margins": 5.649149239063263, "rewards_train/rejected": -5.339537143707275, "step": 1450 }, { "epoch": 1.93, "logps_train/chosen": -44.89744186401367, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -130.18356323242188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.00869344174861908, "rewards_train/margins": 5.633299097418785, "rewards_train/rejected": -5.624605655670166, "step": 1451 }, { "epoch": 1.93, "learning_rate": 2.8493543288359095e-07, "loss": 0.013, "step": 1452 }, { "epoch": 1.93, "logps_train/chosen": -46.73979568481445, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -41.75, "logps_train/rejected": -76.02203369140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.34252452850341797, "rewards_train/margins": 3.77207088470459, "rewards_train/rejected": -3.429546356201172, "step": 1452 }, { "epoch": 1.93, "logps_train/chosen": -69.0582275390625, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -136.95826721191406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13949009776115417, "rewards_train/margins": 5.763442009687424, "rewards_train/rejected": -5.6239519119262695, "step": 1453 }, { "epoch": 1.93, "learning_rate": 2.843909373395484e-07, "loss": 0.0283, "step": 1454 }, { "epoch": 1.93, "logps_train/chosen": -45.812522888183594, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -89.51861572265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2468721866607666, "rewards_train/margins": 4.687796354293823, "rewards_train/rejected": -4.440924167633057, "step": 1454 }, { "epoch": 1.93, "logps_train/chosen": -70.66246032714844, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -134.38975524902344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5837537050247192, "rewards_train/margins": 5.7336665391922, "rewards_train/rejected": -5.1499128341674805, "step": 1455 }, { "epoch": 1.93, "learning_rate": 2.838462754607159e-07, "loss": 0.0389, "step": 1456 }, { "epoch": 1.93, "logps_train/chosen": -91.08067321777344, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -143.36907958984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2927139401435852, "rewards_train/margins": 5.610871136188507, "rewards_train/rejected": -5.318157196044922, "step": 1456 }, { "epoch": 1.93, "logps_train/chosen": -61.23676681518555, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -96.6188735961914, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.442338764667511, "rewards_train/margins": 3.5593042969703674, "rewards_train/rejected": -3.1169655323028564, "step": 1457 }, { "epoch": 1.94, "learning_rate": 2.8330144988139884e-07, "loss": 0.0644, "step": 1458 }, { "epoch": 1.94, "logps_train/chosen": -71.24273681640625, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -125.11076354980469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.004241734743118286, "rewards_train/margins": 5.030943185091019, "rewards_train/rejected": -5.0267014503479, "step": 1458 }, { "epoch": 1.94, "logps_train/chosen": -44.77756881713867, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -98.69107055664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4284931719303131, "rewards_train/margins": 4.474163204431534, "rewards_train/rejected": -4.045670032501221, "step": 1459 }, { "epoch": 1.94, "learning_rate": 2.8275646323669357e-07, "loss": 0.0413, "step": 1460 }, { "epoch": 1.94, "logps_train/chosen": -54.52716064453125, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -99.99333190917969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1722475290298462, "rewards_train/margins": 4.205211043357849, "rewards_train/rejected": -4.377458572387695, "step": 1460 }, { "epoch": 1.94, "logps_train/chosen": -113.48202514648438, "logps_train/ref_chosen": -120.5, "logps_train/ref_rejected": -112.5, "logps_train/rejected": -172.98388671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7252349853515625, "rewards_train/margins": 6.7533111572265625, "rewards_train/rejected": -6.028076171875, "step": 1461 }, { "epoch": 1.94, "learning_rate": 2.822113181624761e-07, "loss": 0.053, "step": 1462 }, { "epoch": 1.94, "logps_train/chosen": -55.837181091308594, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -113.34998321533203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.0076881647109985, "rewards_train/margins": 4.925499081611633, "rewards_train/rejected": -3.9178109169006348, "step": 1462 }, { "epoch": 1.94, "logps_train/chosen": -63.88433074951172, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -130.2223663330078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07719208300113678, "rewards_train/margins": 5.093569055199623, "rewards_train/rejected": -5.016376972198486, "step": 1463 }, { "epoch": 1.94, "learning_rate": 2.8166601729538846e-07, "loss": 0.015, "step": 1464 }, { "epoch": 1.94, "logps_train/chosen": -53.39549255371094, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -101.66741180419922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04795091599225998, "rewards_train/margins": 3.571723870933056, "rewards_train/rejected": -3.523772954940796, "step": 1464 }, { "epoch": 1.95, "logps_train/chosen": -45.021942138671875, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -89.75994873046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6481965780258179, "rewards_train/margins": 4.394503951072693, "rewards_train/rejected": -3.746307373046875, "step": 1465 }, { "epoch": 1.95, "learning_rate": 2.811205632728262e-07, "loss": 0.064, "step": 1466 }, { "epoch": 1.95, "logps_train/chosen": -38.482234954833984, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -99.0985107421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7193544507026672, "rewards_train/margins": 6.205767452716827, "rewards_train/rejected": -5.48641300201416, "step": 1466 }, { "epoch": 1.95, "logps_train/chosen": -38.944766998291016, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -83.85580444335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.38364824652671814, "rewards_train/margins": 4.2301667630672455, "rewards_train/rejected": -3.8465185165405273, "step": 1467 }, { "epoch": 1.95, "learning_rate": 2.805749587329256e-07, "loss": 0.0198, "step": 1468 }, { "epoch": 1.95, "logps_train/chosen": -80.60104370117188, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -133.9772491455078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.610989511013031, "rewards_train/margins": 6.119261682033539, "rewards_train/rejected": -5.508272171020508, "step": 1468 }, { "epoch": 1.95, "logps_train/chosen": -91.69876098632812, "logps_train/ref_chosen": -105.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -134.55972290039062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.3144994974136353, "rewards_train/margins": 6.207972168922424, "rewards_train/rejected": -4.893472671508789, "step": 1469 }, { "epoch": 1.95, "learning_rate": 2.800292063145509e-07, "loss": 0.0241, "step": 1470 }, { "epoch": 1.95, "logps_train/chosen": -52.74502182006836, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -104.6856689453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20518550276756287, "rewards_train/margins": 4.98312720656395, "rewards_train/rejected": -4.777941703796387, "step": 1470 }, { "epoch": 1.95, "logps_train/chosen": -71.55609130859375, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -116.09407043457031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.007171005010604858, "rewards_train/margins": 4.297548919916153, "rewards_train/rejected": -4.304719924926758, "step": 1471 }, { "epoch": 1.95, "learning_rate": 2.7948330865728173e-07, "loss": 0.0603, "step": 1472 }, { "epoch": 1.95, "logps_train/chosen": -65.50558471679688, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -109.54410552978516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11662900447845459, "rewards_train/margins": 4.691351771354675, "rewards_train/rejected": -4.574722766876221, "step": 1472 }, { "epoch": 1.96, "logps_train/chosen": -40.497886657714844, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -89.035888671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3642734885215759, "rewards_train/margins": 3.9506747126579285, "rewards_train/rejected": -3.5864012241363525, "step": 1473 }, { "epoch": 1.96, "learning_rate": 2.789372684014e-07, "loss": 0.0409, "step": 1474 }, { "epoch": 1.96, "logps_train/chosen": -82.15108489990234, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -147.2525177001953, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.09323322772979736, "rewards_train/margins": 5.689831614494324, "rewards_train/rejected": -5.783064842224121, "step": 1474 }, { "epoch": 1.96, "logps_train/chosen": -63.462379455566406, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -110.44020080566406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4722146987915039, "rewards_train/margins": 3.590555191040039, "rewards_train/rejected": -4.062769889831543, "step": 1475 }, { "epoch": 1.96, "learning_rate": 2.783910881878774e-07, "loss": 0.1127, "step": 1476 }, { "epoch": 1.96, "logps_train/chosen": -47.33250427246094, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -104.92821502685547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35346829891204834, "rewards_train/margins": 4.177540421485901, "rewards_train/rejected": -3.8240721225738525, "step": 1476 }, { "epoch": 1.96, "logps_train/chosen": -70.73235321044922, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -113.67223358154297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.043547987937927246, "rewards_train/margins": 4.37211287021637, "rewards_train/rejected": -4.415660858154297, "step": 1477 }, { "epoch": 1.96, "learning_rate": 2.778447706583625e-07, "loss": 0.0605, "step": 1478 }, { "epoch": 1.96, "logps_train/chosen": -81.16079711914062, "logps_train/ref_chosen": -83.5, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -144.43612670898438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23079533874988556, "rewards_train/margins": 5.436908200383186, "rewards_train/rejected": -5.206112861633301, "step": 1478 }, { "epoch": 1.96, "logps_train/chosen": -47.40415954589844, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -96.43424224853516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.02130260318517685, "rewards_train/margins": 4.548321150243282, "rewards_train/rejected": -4.5270185470581055, "step": 1479 }, { "epoch": 1.97, "learning_rate": 2.7729831845516804e-07, "loss": 0.0251, "step": 1480 }, { "epoch": 1.97, "logps_train/chosen": -51.33964538574219, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -93.49244689941406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3348585367202759, "rewards_train/margins": 3.467511296272278, "rewards_train/rejected": -3.8023698329925537, "step": 1480 }, { "epoch": 1.97, "logps_train/chosen": -82.95388793945312, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -97.5, "logps_train/rejected": -149.5021209716797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.41711124777793884, "rewards_train/margins": 5.609510749578476, "rewards_train/rejected": -5.192399501800537, "step": 1481 }, { "epoch": 1.97, "learning_rate": 2.7675173422125806e-07, "loss": 0.0483, "step": 1482 }, { "epoch": 1.97, "logps_train/chosen": -69.3114013671875, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -104.64030456542969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.43895331025123596, "rewards_train/margins": 4.186014264822006, "rewards_train/rejected": -4.624967575073242, "step": 1482 }, { "epoch": 1.97, "logps_train/chosen": -67.50462341308594, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -161.12814331054688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.38860011100769043, "rewards_train/margins": 6.9529759883880615, "rewards_train/rejected": -6.564375877380371, "step": 1483 }, { "epoch": 1.97, "learning_rate": 2.7620502060023534e-07, "loss": 0.0262, "step": 1484 }, { "epoch": 1.97, "logps_train/chosen": -65.65058135986328, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -117.5969009399414, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5271295309066772, "rewards_train/margins": 5.218069434165955, "rewards_train/rejected": -4.690939903259277, "step": 1484 }, { "epoch": 1.97, "logps_train/chosen": -68.64362335205078, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -127.66253662109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.571393609046936, "rewards_train/margins": 4.962047457695007, "rewards_train/rejected": -5.533441066741943, "step": 1485 }, { "epoch": 1.97, "learning_rate": 2.756581802363282e-07, "loss": 0.021, "step": 1486 }, { "epoch": 1.97, "logps_train/chosen": -72.97940063476562, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -140.24441528320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.11356444656848907, "rewards_train/margins": 6.007751986384392, "rewards_train/rejected": -6.121316432952881, "step": 1486 }, { "epoch": 1.97, "logps_train/chosen": -68.36685180664062, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -117.40311431884766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.30777865648269653, "rewards_train/margins": 4.398157775402069, "rewards_train/rejected": -4.705936431884766, "step": 1487 }, { "epoch": 1.98, "learning_rate": 2.751112157743782e-07, "loss": 0.0245, "step": 1488 }, { "epoch": 1.98, "logps_train/chosen": -64.34452819824219, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -101.63462829589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4999223053455353, "rewards_train/margins": 4.169634610414505, "rewards_train/rejected": -3.6697123050689697, "step": 1488 }, { "epoch": 1.98, "logps_train/chosen": -67.4752426147461, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -113.42302703857422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4443991184234619, "rewards_train/margins": 4.4260289669036865, "rewards_train/rejected": -4.870428085327148, "step": 1489 }, { "epoch": 1.98, "learning_rate": 2.74564129859827e-07, "loss": 0.0544, "step": 1490 }, { "epoch": 1.98, "logps_train/chosen": -36.124183654785156, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -84.19427490234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33387088775634766, "rewards_train/margins": 4.070876598358154, "rewards_train/rejected": -3.7370057106018066, "step": 1490 }, { "epoch": 1.98, "logps_train/chosen": -54.76097869873047, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -139.83721923828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1317143440246582, "rewards_train/margins": 5.313873291015625, "rewards_train/rejected": -5.182158946990967, "step": 1491 }, { "epoch": 1.98, "learning_rate": 2.7401692513870374e-07, "loss": 0.0814, "step": 1492 }, { "epoch": 1.98, "logps_train/chosen": -51.678688049316406, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -103.94388580322266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8008812665939331, "rewards_train/margins": 4.192144751548767, "rewards_train/rejected": -3.391263484954834, "step": 1492 }, { "epoch": 1.98, "logps_train/chosen": -52.904869079589844, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -110.48863220214844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20795035362243652, "rewards_train/margins": 4.189625978469849, "rewards_train/rejected": -3.981675624847412, "step": 1493 }, { "epoch": 1.98, "learning_rate": 2.7346960425761196e-07, "loss": 0.0263, "step": 1494 }, { "epoch": 1.98, "logps_train/chosen": -65.22968292236328, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -141.08441162109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32390648126602173, "rewards_train/margins": 5.43547397851944, "rewards_train/rejected": -5.111567497253418, "step": 1494 }, { "epoch": 1.99, "logps_train/chosen": -54.40276336669922, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -88.49136352539062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20816144347190857, "rewards_train/margins": 3.5096417367458344, "rewards_train/rejected": -3.301480293273926, "step": 1495 }, { "epoch": 1.99, "learning_rate": 2.7292216986371724e-07, "loss": 0.0567, "step": 1496 }, { "epoch": 1.99, "logps_train/chosen": -46.14875030517578, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -89.08484649658203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.12034371495246887, "rewards_train/margins": 3.617828816175461, "rewards_train/rejected": -3.7381725311279297, "step": 1496 }, { "epoch": 1.99, "logps_train/chosen": -60.46788787841797, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -106.10638427734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1395392417907715, "rewards_train/margins": 5.034552335739136, "rewards_train/rejected": -3.8950130939483643, "step": 1497 }, { "epoch": 1.99, "learning_rate": 2.723746246047343e-07, "loss": 0.057, "step": 1498 }, { "epoch": 1.99, "logps_train/chosen": -79.06304168701172, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -129.55767822265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1530708372592926, "rewards_train/margins": 5.021338850259781, "rewards_train/rejected": -4.868268013000488, "step": 1498 }, { "epoch": 1.99, "logps_train/chosen": -72.53982543945312, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -123.5213623046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9668728709220886, "rewards_train/margins": 3.7411234974861145, "rewards_train/rejected": -4.707996368408203, "step": 1499 }, { "epoch": 1.99, "learning_rate": 2.718269711289137e-07, "loss": 0.0348, "step": 1500 }, { "epoch": 1.99, "logps_train/chosen": -66.54116821289062, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -95.76277160644531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1947413980960846, "rewards_train/margins": 4.04716095328331, "rewards_train/rejected": -4.2419023513793945, "step": 1500 }, { "epoch": 1.99, "logps_train/chosen": -92.96195983886719, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -152.18441772460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.20557141304016113, "rewards_train/margins": 5.351933240890503, "rewards_train/rejected": -5.557504653930664, "step": 1501 }, { "epoch": 1.99, "learning_rate": 2.712792120850297e-07, "loss": 0.0568, "step": 1502 }, { "epoch": 1.99, "logps_train/chosen": -41.53184509277344, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -86.88002014160156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.060996681451797485, "rewards_train/margins": 4.282082885503769, "rewards_train/rejected": -4.343079566955566, "step": 1502 }, { "epoch": 2.0, "logps_train/chosen": -64.42831420898438, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -117.857421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1881433129310608, "rewards_train/margins": 4.031974732875824, "rewards_train/rejected": -4.220118045806885, "step": 1503 }, { "epoch": 2.0, "learning_rate": 2.7073135012236684e-07, "loss": 0.061, "step": 1504 }, { "epoch": 2.0, "logps_train/chosen": -54.92771530151367, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -93.7679214477539, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6708967089653015, "rewards_train/margins": 4.118395626544952, "rewards_train/rejected": -4.789292335510254, "step": 1504 }, { "epoch": 2.0, "logps_train/chosen": -73.65581512451172, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -137.98806762695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.07261277735233307, "rewards_train/margins": 5.2761950343847275, "rewards_train/rejected": -5.3488078117370605, "step": 1505 }, { "epoch": 2.0, "learning_rate": 2.7018338789070793e-07, "loss": 0.036, "step": 1506 }, { "epoch": 2.0, "logps_train/chosen": -80.58269500732422, "logps_train/ref_chosen": -83.5, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -146.76889038085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2971992492675781, "rewards_train/margins": 5.706118583679199, "rewards_train/rejected": -5.408919334411621, "step": 1506 }, { "epoch": 2.0, "logps_train/chosen": -98.68017578125, "logps_train/ref_chosen": -92.0, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -162.65444946289062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6664549112319946, "rewards_train/margins": 5.113051772117615, "rewards_train/rejected": -5.779506683349609, "step": 1507 }, { "epoch": 2.0, "learning_rate": 2.6963532804032027e-07, "loss": 0.0147, "step": 1508 }, { "epoch": 2.0, "logps_train/chosen": -91.02877044677734, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -167.73997497558594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.13412678241729736, "rewards_train/margins": 7.06799590587616, "rewards_train/rejected": -7.202122688293457, "step": 1508 }, { "epoch": 2.0, "logps_train/chosen": -70.12113952636719, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -120.04388427734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.17617684602737427, "rewards_train/margins": 5.170399367809296, "rewards_train/rejected": -5.34657621383667, "step": 1509 }, { "epoch": 2.01, "learning_rate": 2.690871732219435e-07, "loss": 0.01, "step": 1510 }, { "epoch": 2.01, "logps_train/chosen": -57.403995513916016, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -91.07113647460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.38928788900375366, "rewards_train/margins": 3.885464608669281, "rewards_train/rejected": -3.4961767196655273, "step": 1510 }, { "epoch": 2.01, "logps_train/chosen": -65.37919616699219, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -119.10092163085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14645498991012573, "rewards_train/margins": 5.4409219622612, "rewards_train/rejected": -5.294466972351074, "step": 1511 }, { "epoch": 2.01, "learning_rate": 2.685389260867765e-07, "loss": 0.0234, "step": 1512 }, { "epoch": 2.01, "logps_train/chosen": -63.641632080078125, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -124.69804382324219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1782255619764328, "rewards_train/margins": 4.980641320347786, "rewards_train/rejected": -5.158866882324219, "step": 1512 }, { "epoch": 2.01, "logps_train/chosen": -92.23426818847656, "logps_train/ref_chosen": -100.0, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -135.4663543701172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7953237891197205, "rewards_train/margins": 5.276334226131439, "rewards_train/rejected": -4.481010437011719, "step": 1513 }, { "epoch": 2.01, "learning_rate": 2.6799058928646477e-07, "loss": 0.0138, "step": 1514 }, { "epoch": 2.01, "logps_train/chosen": -77.5556640625, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -122.76179504394531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5708003640174866, "rewards_train/margins": 4.214753568172455, "rewards_train/rejected": -4.785553932189941, "step": 1514 }, { "epoch": 2.01, "logps_train/chosen": -59.10913848876953, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -105.50821685791016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3296639919281006, "rewards_train/margins": 4.58209490776062, "rewards_train/rejected": -4.911758899688721, "step": 1515 }, { "epoch": 2.01, "learning_rate": 2.6744216547308747e-07, "loss": 0.0169, "step": 1516 }, { "epoch": 2.01, "logps_train/chosen": -48.72401428222656, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -95.6735610961914, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11744228005409241, "rewards_train/margins": 3.8668293058872223, "rewards_train/rejected": -3.74938702583313, "step": 1516 }, { "epoch": 2.01, "logps_train/chosen": -53.20874786376953, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -106.03361511230469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3497812747955322, "rewards_train/margins": 4.195768117904663, "rewards_train/rejected": -4.545549392700195, "step": 1517 }, { "epoch": 2.02, "learning_rate": 2.668936572991444e-07, "loss": 0.0483, "step": 1518 }, { "epoch": 2.02, "logps_train/chosen": -59.403358459472656, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -110.39976501464844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.12471073865890503, "rewards_train/margins": 4.123078644275665, "rewards_train/rejected": -4.24778938293457, "step": 1518 }, { "epoch": 2.02, "logps_train/chosen": -49.50419616699219, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -103.47373962402344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08708065748214722, "rewards_train/margins": 4.662580192089081, "rewards_train/rejected": -4.575499534606934, "step": 1519 }, { "epoch": 2.02, "learning_rate": 2.663450674175437e-07, "loss": 0.0348, "step": 1520 }, { "epoch": 2.02, "logps_train/chosen": -79.5920181274414, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -134.07373046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.16857658326625824, "rewards_train/margins": 4.68879522383213, "rewards_train/rejected": -4.857371807098389, "step": 1520 }, { "epoch": 2.02, "logps_train/chosen": -49.621028900146484, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -107.77963256835938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5261653661727905, "rewards_train/margins": 5.2244545221328735, "rewards_train/rejected": -5.750619888305664, "step": 1521 }, { "epoch": 2.02, "learning_rate": 2.657963984815885e-07, "loss": 0.0372, "step": 1522 }, { "epoch": 2.02, "logps_train/chosen": -44.043548583984375, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -81.99150848388672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.026895053684711456, "rewards_train/margins": 3.4518268182873726, "rewards_train/rejected": -3.424931764602661, "step": 1522 }, { "epoch": 2.02, "logps_train/chosen": -34.81300735473633, "logps_train/ref_chosen": -40.25, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -84.9473876953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5499492883682251, "rewards_train/margins": 4.390390753746033, "rewards_train/rejected": -3.8404414653778076, "step": 1523 }, { "epoch": 2.02, "learning_rate": 2.6524765314496416e-07, "loss": 0.0793, "step": 1524 }, { "epoch": 2.02, "logps_train/chosen": -49.01211929321289, "logps_train/ref_chosen": -44.0, "logps_train/ref_rejected": -41.5, "logps_train/rejected": -76.61344909667969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5012120008468628, "rewards_train/margins": 3.0093518495559692, "rewards_train/rejected": -3.510563850402832, "step": 1524 }, { "epoch": 2.03, "logps_train/chosen": -51.36775207519531, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -104.84258270263672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13822507858276367, "rewards_train/margins": 4.406858444213867, "rewards_train/rejected": -4.2686333656311035, "step": 1525 }, { "epoch": 2.03, "learning_rate": 2.646988340617258e-07, "loss": 0.0973, "step": 1526 }, { "epoch": 2.03, "logps_train/chosen": -65.84739685058594, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -97.88469696044922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4402603805065155, "rewards_train/margins": 4.411542862653732, "rewards_train/rejected": -3.971282482147217, "step": 1526 }, { "epoch": 2.03, "logps_train/chosen": -84.15038299560547, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -128.93423461914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.414649099111557, "rewards_train/margins": 4.419790834188461, "rewards_train/rejected": -4.005141735076904, "step": 1527 }, { "epoch": 2.03, "learning_rate": 2.6414994388628525e-07, "loss": 0.0265, "step": 1528 }, { "epoch": 2.03, "logps_train/chosen": -64.61827087402344, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -133.75851440429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3416889011859894, "rewards_train/margins": 4.995665341615677, "rewards_train/rejected": -4.6539764404296875, "step": 1528 }, { "epoch": 2.03, "logps_train/chosen": -74.78620910644531, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -152.0070343017578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5184650421142578, "rewards_train/margins": 5.588488578796387, "rewards_train/rejected": -6.1069536209106445, "step": 1529 }, { "epoch": 2.03, "learning_rate": 2.636009852733979e-07, "loss": 0.0469, "step": 1530 }, { "epoch": 2.03, "logps_train/chosen": -37.29755783081055, "logps_train/ref_chosen": -37.0, "logps_train/ref_rejected": -42.0, "logps_train/rejected": -82.67414855957031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.020380735397338867, "rewards_train/margins": 4.0360963344573975, "rewards_train/rejected": -4.056477069854736, "step": 1530 }, { "epoch": 2.03, "logps_train/chosen": -38.68263244628906, "logps_train/ref_chosen": -37.0, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -79.23599243164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.16435717046260834, "rewards_train/margins": 3.1619768291711807, "rewards_train/rejected": -3.326333999633789, "step": 1531 }, { "epoch": 2.03, "learning_rate": 2.630519608781505e-07, "loss": 0.061, "step": 1532 }, { "epoch": 2.03, "logps_train/chosen": -64.52886199951172, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -114.98080444335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2807074785232544, "rewards_train/margins": 4.763163685798645, "rewards_train/rejected": -4.482456207275391, "step": 1532 }, { "epoch": 2.04, "logps_train/chosen": -61.22148895263672, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -114.10621643066406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3091011941432953, "rewards_train/margins": 4.479097455739975, "rewards_train/rejected": -4.16999626159668, "step": 1533 }, { "epoch": 2.04, "learning_rate": 2.6250287335594746e-07, "loss": 0.052, "step": 1534 }, { "epoch": 2.04, "logps_train/chosen": -49.084747314453125, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -114.23832702636719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.40613073110580444, "rewards_train/margins": 4.219265043735504, "rewards_train/rejected": -4.625395774841309, "step": 1534 }, { "epoch": 2.04, "logps_train/chosen": -52.24105453491211, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -84.32142639160156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5555819869041443, "rewards_train/margins": 4.219756543636322, "rewards_train/rejected": -3.6641745567321777, "step": 1535 }, { "epoch": 2.04, "learning_rate": 2.6195372536249913e-07, "loss": 0.0416, "step": 1536 }, { "epoch": 2.04, "logps_train/chosen": -77.5198974609375, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -134.8009796142578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25582292675971985, "rewards_train/margins": 5.023420602083206, "rewards_train/rejected": -4.767597675323486, "step": 1536 }, { "epoch": 2.04, "logps_train/chosen": -43.38502502441406, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -93.40028381347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18229836225509644, "rewards_train/margins": 4.181310832500458, "rewards_train/rejected": -3.9990124702453613, "step": 1537 }, { "epoch": 2.04, "learning_rate": 2.614045195538078e-07, "loss": 0.0602, "step": 1538 }, { "epoch": 2.04, "logps_train/chosen": -69.01332092285156, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -118.67607116699219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.23648770153522491, "rewards_train/margins": 3.6068999022245407, "rewards_train/rejected": -3.8433876037597656, "step": 1538 }, { "epoch": 2.04, "logps_train/chosen": -61.692901611328125, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -98.7035903930664, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8666471242904663, "rewards_train/margins": 4.11669385433197, "rewards_train/rejected": -3.250046730041504, "step": 1539 }, { "epoch": 2.05, "learning_rate": 2.608552585861559e-07, "loss": 0.0606, "step": 1540 }, { "epoch": 2.05, "logps_train/chosen": -42.5904541015625, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -81.87644958496094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.006774693727493286, "rewards_train/margins": 3.491294652223587, "rewards_train/rejected": -3.4845199584960938, "step": 1540 }, { "epoch": 2.05, "logps_train/chosen": -55.27981185913086, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -93.18289184570312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.033737726509571075, "rewards_train/margins": 4.031714983284473, "rewards_train/rejected": -3.9979772567749023, "step": 1541 }, { "epoch": 2.05, "learning_rate": 2.6030594511609194e-07, "loss": 0.0557, "step": 1542 }, { "epoch": 2.05, "logps_train/chosen": -40.53589630126953, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -100.51864624023438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.650855302810669, "rewards_train/margins": 3.946321725845337, "rewards_train/rejected": -4.597177028656006, "step": 1542 }, { "epoch": 2.05, "logps_train/chosen": -89.83085632324219, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -113.07759094238281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5619921088218689, "rewards_train/margins": 3.563540995121002, "rewards_train/rejected": -4.125533103942871, "step": 1543 }, { "epoch": 2.05, "learning_rate": 2.5975658180041917e-07, "loss": 0.0726, "step": 1544 }, { "epoch": 2.05, "logps_train/chosen": -52.169281005859375, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -99.7624282836914, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.15364661812782288, "rewards_train/margins": 4.677283614873886, "rewards_train/rejected": -4.830930233001709, "step": 1544 }, { "epoch": 2.05, "logps_train/chosen": -56.277809143066406, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -89.40313720703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4746555984020233, "rewards_train/margins": 4.076595813035965, "rewards_train/rejected": -4.551251411437988, "step": 1545 }, { "epoch": 2.05, "learning_rate": 2.592071712961813e-07, "loss": 0.0489, "step": 1546 }, { "epoch": 2.05, "logps_train/chosen": -61.126136779785156, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -103.38435363769531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16863611340522766, "rewards_train/margins": 4.711758404970169, "rewards_train/rejected": -4.543122291564941, "step": 1546 }, { "epoch": 2.05, "logps_train/chosen": -96.93534851074219, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -133.3983154296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.390409380197525, "rewards_train/margins": 4.488484472036362, "rewards_train/rejected": -4.878893852233887, "step": 1547 }, { "epoch": 2.06, "learning_rate": 2.586577162606506e-07, "loss": 0.0174, "step": 1548 }, { "epoch": 2.06, "logps_train/chosen": -66.61924743652344, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -118.21217346191406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18230153620243073, "rewards_train/margins": 4.461330905556679, "rewards_train/rejected": -4.279029369354248, "step": 1548 }, { "epoch": 2.06, "logps_train/chosen": -42.583744049072266, "logps_train/ref_chosen": -38.75, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -89.92491149902344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.39314013719558716, "rewards_train/margins": 3.598178803920746, "rewards_train/rejected": -3.991318941116333, "step": 1549 }, { "epoch": 2.06, "learning_rate": 2.5810821935131456e-07, "loss": 0.0465, "step": 1550 }, { "epoch": 2.06, "logps_train/chosen": -93.02580261230469, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -159.73475646972656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.768204927444458, "rewards_train/margins": 5.930270433425903, "rewards_train/rejected": -6.698475360870361, "step": 1550 }, { "epoch": 2.06, "logps_train/chosen": -46.9946403503418, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -84.75387573242188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.022901445627212524, "rewards_train/margins": 3.3661580979824066, "rewards_train/rejected": -3.389059543609619, "step": 1551 }, { "epoch": 2.06, "learning_rate": 2.5755868322586327e-07, "loss": 0.0323, "step": 1552 }, { "epoch": 2.06, "logps_train/chosen": -42.54875946044922, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -67.339111328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6794989109039307, "rewards_train/margins": 3.3829407691955566, "rewards_train/rejected": -2.703441858291626, "step": 1552 }, { "epoch": 2.06, "logps_train/chosen": -83.93661499023438, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -108.704833984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2842859923839569, "rewards_train/margins": 4.060025244951248, "rewards_train/rejected": -4.344311237335205, "step": 1553 }, { "epoch": 2.06, "learning_rate": 2.570091105421765e-07, "loss": 0.0599, "step": 1554 }, { "epoch": 2.06, "logps_train/chosen": -66.35104370117188, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -87.71920776367188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13833293318748474, "rewards_train/margins": 3.4032226502895355, "rewards_train/rejected": -3.264889717102051, "step": 1554 }, { "epoch": 2.07, "logps_train/chosen": -71.66230773925781, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -126.78219604492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.024824365973472595, "rewards_train/margins": 4.540894761681557, "rewards_train/rejected": -4.565719127655029, "step": 1555 }, { "epoch": 2.07, "learning_rate": 2.564595039583109e-07, "loss": 0.0441, "step": 1556 }, { "epoch": 2.07, "logps_train/chosen": -73.87794494628906, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -147.35125732421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2747054100036621, "rewards_train/margins": 5.7410807609558105, "rewards_train/rejected": -5.466375350952148, "step": 1556 }, { "epoch": 2.07, "logps_train/chosen": -22.887451171875, "logps_train/ref_chosen": -25.0, "logps_train/ref_rejected": -31.5, "logps_train/rejected": -70.28204345703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21164549887180328, "rewards_train/margins": 4.090435400605202, "rewards_train/rejected": -3.8787899017333984, "step": 1557 }, { "epoch": 2.07, "learning_rate": 2.559098661324868e-07, "loss": 0.0318, "step": 1558 }, { "epoch": 2.07, "logps_train/chosen": -85.10887145996094, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -139.03176879882812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3476055860519409, "rewards_train/margins": 4.742290139198303, "rewards_train/rejected": -5.089895725250244, "step": 1558 }, { "epoch": 2.07, "logps_train/chosen": -73.78115844726562, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -144.37875366210938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4521392285823822, "rewards_train/margins": 5.4622980654239655, "rewards_train/rejected": -5.914437294006348, "step": 1559 }, { "epoch": 2.07, "learning_rate": 2.55360199723076e-07, "loss": 0.0374, "step": 1560 }, { "epoch": 2.07, "logps_train/chosen": -36.88570785522461, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -84.48214721679688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31533533334732056, "rewards_train/margins": 4.468383848667145, "rewards_train/rejected": -4.153048515319824, "step": 1560 }, { "epoch": 2.07, "logps_train/chosen": -54.943965911865234, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -118.80177307128906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.01080290973186493, "rewards_train/margins": 4.7553113251924515, "rewards_train/rejected": -4.766114234924316, "step": 1561 }, { "epoch": 2.07, "learning_rate": 2.5481050738858836e-07, "loss": 0.0241, "step": 1562 }, { "epoch": 2.07, "logps_train/chosen": -106.90838623046875, "logps_train/ref_chosen": -111.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -130.95278930664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3919735848903656, "rewards_train/margins": 5.3810034692287445, "rewards_train/rejected": -4.989029884338379, "step": 1562 }, { "epoch": 2.08, "logps_train/chosen": -69.67191314697266, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -122.857421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.15078496932983398, "rewards_train/margins": 4.575387001037598, "rewards_train/rejected": -4.726171970367432, "step": 1563 }, { "epoch": 2.08, "learning_rate": 2.54260791787659e-07, "loss": 0.0571, "step": 1564 }, { "epoch": 2.08, "logps_train/chosen": -85.62047576904297, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -128.25270080566406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.44517719745635986, "rewards_train/margins": 4.138296008110046, "rewards_train/rejected": -4.583473205566406, "step": 1564 }, { "epoch": 2.08, "logps_train/chosen": -60.68268966674805, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -104.92230224609375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07235600054264069, "rewards_train/margins": 4.592711225152016, "rewards_train/rejected": -4.520355224609375, "step": 1565 }, { "epoch": 2.08, "learning_rate": 2.5371105557903593e-07, "loss": 0.0471, "step": 1566 }, { "epoch": 2.08, "logps_train/chosen": -55.982749938964844, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -94.38992309570312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.13811860978603363, "rewards_train/margins": 3.8003846555948257, "rewards_train/rejected": -3.9385032653808594, "step": 1566 }, { "epoch": 2.08, "logps_train/chosen": -52.22447204589844, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -105.70120239257812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4730607867240906, "rewards_train/margins": 4.500310599803925, "rewards_train/rejected": -4.027249813079834, "step": 1567 }, { "epoch": 2.08, "learning_rate": 2.531613014215665e-07, "loss": 0.0341, "step": 1568 }, { "epoch": 2.08, "logps_train/chosen": -56.40673828125, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -103.34124755859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24213893711566925, "rewards_train/margins": 4.310638502240181, "rewards_train/rejected": -4.068499565124512, "step": 1568 }, { "epoch": 2.08, "logps_train/chosen": -44.017494201660156, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -100.64871215820312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.10370241105556488, "rewards_train/margins": 4.248668804764748, "rewards_train/rejected": -4.3523712158203125, "step": 1569 }, { "epoch": 2.08, "learning_rate": 2.52611531974185e-07, "loss": 0.0451, "step": 1570 }, { "epoch": 2.08, "logps_train/chosen": -76.9195785522461, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -117.16419219970703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.583042323589325, "rewards_train/margins": 4.674461543560028, "rewards_train/rejected": -4.091419219970703, "step": 1570 }, { "epoch": 2.09, "logps_train/chosen": -47.77491760253906, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -103.7301025390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3614758849143982, "rewards_train/margins": 4.233408987522125, "rewards_train/rejected": -4.594884872436523, "step": 1571 }, { "epoch": 2.09, "learning_rate": 2.520617498958997e-07, "loss": 0.0374, "step": 1572 }, { "epoch": 2.09, "logps_train/chosen": -75.93413543701172, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -109.85200500488281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.46216312050819397, "rewards_train/margins": 3.5996006429195404, "rewards_train/rejected": -4.061763763427734, "step": 1572 }, { "epoch": 2.09, "logps_train/chosen": -45.1778450012207, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -89.40332794189453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.025965616106987, "rewards_train/margins": 3.9709860533475876, "rewards_train/rejected": -3.9450204372406006, "step": 1573 }, { "epoch": 2.09, "learning_rate": 2.515119578457799e-07, "loss": 0.0676, "step": 1574 }, { "epoch": 2.09, "logps_train/chosen": -54.127647399902344, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -98.14857482910156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3127646744251251, "rewards_train/margins": 4.339593380689621, "rewards_train/rejected": -4.652358055114746, "step": 1574 }, { "epoch": 2.09, "logps_train/chosen": -76.50881958007812, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -113.28398132324219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.202243372797966, "rewards_train/margins": 4.175954148173332, "rewards_train/rejected": -3.973710775375366, "step": 1575 }, { "epoch": 2.09, "learning_rate": 2.5096215848294305e-07, "loss": 0.0214, "step": 1576 }, { "epoch": 2.09, "logps_train/chosen": -96.40092468261719, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -141.45005798339844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6158736944198608, "rewards_train/margins": 4.559601426124573, "rewards_train/rejected": -5.175475120544434, "step": 1576 }, { "epoch": 2.09, "logps_train/chosen": -55.775482177734375, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -109.15546417236328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6755768060684204, "rewards_train/margins": 4.687997937202454, "rewards_train/rejected": -4.012421131134033, "step": 1577 }, { "epoch": 2.1, "learning_rate": 2.504123544665423e-07, "loss": 0.0684, "step": 1578 }, { "epoch": 2.1, "logps_train/chosen": -68.91365051269531, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -171.9080047607422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3164474368095398, "rewards_train/margins": 5.86662358045578, "rewards_train/rejected": -5.55017614364624, "step": 1578 }, { "epoch": 2.1, "logps_train/chosen": -43.86370849609375, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -86.76457214355469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11284802854061127, "rewards_train/margins": 4.461180433630943, "rewards_train/rejected": -4.348332405090332, "step": 1579 }, { "epoch": 2.1, "learning_rate": 2.498625484557529e-07, "loss": 0.0259, "step": 1580 }, { "epoch": 2.1, "logps_train/chosen": -46.797454833984375, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -82.36903381347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.545254111289978, "rewards_train/margins": 4.027469515800476, "rewards_train/rejected": -3.482215404510498, "step": 1580 }, { "epoch": 2.1, "logps_train/chosen": -53.979408264160156, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -40.0, "logps_train/rejected": -74.20802307128906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.06512857228517532, "rewards_train/margins": 3.358017675578594, "rewards_train/rejected": -3.4231462478637695, "step": 1581 }, { "epoch": 2.1, "learning_rate": 2.4931274310975996e-07, "loss": 0.0775, "step": 1582 }, { "epoch": 2.1, "logps_train/chosen": -64.98932647705078, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -111.77056884765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.02919236570596695, "rewards_train/margins": 4.954686559736729, "rewards_train/rejected": -4.925494194030762, "step": 1582 }, { "epoch": 2.1, "logps_train/chosen": -94.63760375976562, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -172.94630432128906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.769033670425415, "rewards_train/margins": 5.139659643173218, "rewards_train/rejected": -5.908693313598633, "step": 1583 }, { "epoch": 2.1, "learning_rate": 2.487629410877453e-07, "loss": 0.027, "step": 1584 }, { "epoch": 2.1, "logps_train/chosen": -35.84782791137695, "logps_train/ref_chosen": -38.25, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -82.56978607177734, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24802957475185394, "rewards_train/margins": 3.9909456819295883, "rewards_train/rejected": -3.7429161071777344, "step": 1584 }, { "epoch": 2.1, "logps_train/chosen": -41.21533966064453, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -84.68006896972656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5897940993309021, "rewards_train/margins": 4.235926568508148, "rewards_train/rejected": -3.646132469177246, "step": 1585 }, { "epoch": 2.11, "learning_rate": 2.482131450488748e-07, "loss": 0.0533, "step": 1586 }, { "epoch": 2.11, "logps_train/chosen": -59.50300598144531, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -113.67447662353516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.15889409184455872, "rewards_train/margins": 4.72417876124382, "rewards_train/rejected": -4.883072853088379, "step": 1586 }, { "epoch": 2.11, "logps_train/chosen": -57.84709548950195, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -119.77457427978516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32622769474983215, "rewards_train/margins": 4.802123099565506, "rewards_train/rejected": -4.475895404815674, "step": 1587 }, { "epoch": 2.11, "learning_rate": 2.4766335765228523e-07, "loss": 0.0269, "step": 1588 }, { "epoch": 2.11, "logps_train/chosen": -71.62646484375, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -91.20127868652344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.17670877277851105, "rewards_train/margins": 3.625450536608696, "rewards_train/rejected": -3.802159309387207, "step": 1588 }, { "epoch": 2.11, "logps_train/chosen": -101.00092315673828, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -188.58261108398438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.728217601776123, "rewards_train/margins": 6.017544269561768, "rewards_train/rejected": -6.745761871337891, "step": 1589 }, { "epoch": 2.11, "learning_rate": 2.4711358155707167e-07, "loss": 0.0267, "step": 1590 }, { "epoch": 2.11, "logps_train/chosen": -64.10111236572266, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -107.20722961425781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35707587003707886, "rewards_train/margins": 4.341861665248871, "rewards_train/rejected": -3.984785795211792, "step": 1590 }, { "epoch": 2.11, "logps_train/chosen": -39.69798278808594, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -36.0, "logps_train/rejected": -82.97311401367188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10988955199718475, "rewards_train/margins": 4.799779459834099, "rewards_train/rejected": -4.689889907836914, "step": 1591 }, { "epoch": 2.11, "learning_rate": 2.465638194222745e-07, "loss": 0.0221, "step": 1592 }, { "epoch": 2.11, "logps_train/chosen": -56.59600830078125, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -110.76094818115234, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23221127688884735, "rewards_train/margins": 5.017681285738945, "rewards_train/rejected": -4.785470008850098, "step": 1592 }, { "epoch": 2.12, "logps_train/chosen": -49.70170211791992, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -107.25511932373047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7634236812591553, "rewards_train/margins": 4.651435375213623, "rewards_train/rejected": -3.8880116939544678, "step": 1593 }, { "epoch": 2.12, "learning_rate": 2.4601407390686653e-07, "loss": 0.0192, "step": 1594 }, { "epoch": 2.12, "logps_train/chosen": -76.11460876464844, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -103.22503662109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.00885099172592163, "rewards_train/margins": 4.325104653835297, "rewards_train/rejected": -4.316253662109375, "step": 1594 }, { "epoch": 2.12, "logps_train/chosen": -47.336090087890625, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -90.42121887207031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21912571787834167, "rewards_train/margins": 4.599919766187668, "rewards_train/rejected": -4.380794048309326, "step": 1595 }, { "epoch": 2.12, "learning_rate": 2.454643476697404e-07, "loss": 0.0165, "step": 1596 }, { "epoch": 2.12, "logps_train/chosen": -54.9783821105957, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -90.58058166503906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5912240743637085, "rewards_train/margins": 4.099281668663025, "rewards_train/rejected": -3.5080575942993164, "step": 1596 }, { "epoch": 2.12, "logps_train/chosen": -39.15232467651367, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -76.30804443359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6644551753997803, "rewards_train/margins": 4.7108848094940186, "rewards_train/rejected": -4.046429634094238, "step": 1597 }, { "epoch": 2.12, "learning_rate": 2.4491464336969515e-07, "loss": 0.0364, "step": 1598 }, { "epoch": 2.12, "logps_train/chosen": -45.36869430541992, "logps_train/ref_chosen": -43.25, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -97.99908447265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2056192010641098, "rewards_train/margins": 4.518507614731789, "rewards_train/rejected": -4.724126815795898, "step": 1598 }, { "epoch": 2.12, "logps_train/chosen": -91.71726989746094, "logps_train/ref_chosen": -95.5, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -158.6529998779297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3590348958969116, "rewards_train/margins": 5.393865942955017, "rewards_train/rejected": -5.0348310470581055, "step": 1599 }, { "epoch": 2.12, "learning_rate": 2.44364963665424e-07, "loss": 0.0442, "step": 1600 }, { "epoch": 2.12, "logps_train/chosen": -64.96986389160156, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -128.32882690429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.23292358219623566, "rewards_train/margins": 5.2858976572752, "rewards_train/rejected": -5.5188212394714355, "step": 1600 }, { "epoch": 2.13, "logps_train/chosen": -46.02593994140625, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -122.32823181152344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26615601778030396, "rewards_train/margins": 5.692728817462921, "rewards_train/rejected": -5.426572799682617, "step": 1601 }, { "epoch": 2.13, "learning_rate": 2.438153112155012e-07, "loss": 0.0122, "step": 1602 }, { "epoch": 2.13, "logps_train/chosen": -78.45476531982422, "logps_train/ref_chosen": -90.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -129.31253051757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.1928045749664307, "rewards_train/margins": 5.77249550819397, "rewards_train/rejected": -4.579690933227539, "step": 1602 }, { "epoch": 2.13, "logps_train/chosen": -75.72451782226562, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -119.27732849121094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04629838466644287, "rewards_train/margins": 4.506062865257263, "rewards_train/rejected": -4.45976448059082, "step": 1603 }, { "epoch": 2.13, "learning_rate": 2.4326568867836906e-07, "loss": 0.0353, "step": 1604 }, { "epoch": 2.13, "logps_train/chosen": -56.995269775390625, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -126.45884704589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6315581202507019, "rewards_train/margins": 4.197920620441437, "rewards_train/rejected": -4.829478740692139, "step": 1604 }, { "epoch": 2.13, "logps_train/chosen": -58.013954162597656, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -89.17253112792969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4259481132030487, "rewards_train/margins": 3.7057017982006073, "rewards_train/rejected": -3.2797536849975586, "step": 1605 }, { "epoch": 2.13, "learning_rate": 2.427160987123252e-07, "loss": 0.082, "step": 1606 }, { "epoch": 2.13, "logps_train/chosen": -63.029266357421875, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -95.93510437011719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.36347946524620056, "rewards_train/margins": 5.014021724462509, "rewards_train/rejected": -4.650542259216309, "step": 1606 }, { "epoch": 2.13, "logps_train/chosen": -115.98505401611328, "logps_train/ref_chosen": -111.5, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -157.87326049804688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4703811705112457, "rewards_train/margins": 4.87632092833519, "rewards_train/rejected": -5.3467020988464355, "step": 1607 }, { "epoch": 2.14, "learning_rate": 2.421665439755099e-07, "loss": 0.0143, "step": 1608 }, { "epoch": 2.14, "logps_train/chosen": -59.8670539855957, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -125.93307495117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.044544339179992676, "rewards_train/margins": 4.8128520250320435, "rewards_train/rejected": -4.768307685852051, "step": 1608 }, { "epoch": 2.14, "logps_train/chosen": -44.045658111572266, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -95.89444732666016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.743383526802063, "rewards_train/margins": 4.635953068733215, "rewards_train/rejected": -3.8925695419311523, "step": 1609 }, { "epoch": 2.14, "learning_rate": 2.4161702712589284e-07, "loss": 0.0215, "step": 1610 }, { "epoch": 2.14, "logps_train/chosen": -78.1297836303711, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -159.01239013671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.11141559481620789, "rewards_train/margins": 5.617948800325394, "rewards_train/rejected": -5.729364395141602, "step": 1610 }, { "epoch": 2.14, "logps_train/chosen": -97.22563934326172, "logps_train/ref_chosen": -105.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -116.85176086425781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8454047441482544, "rewards_train/margins": 5.379408955574036, "rewards_train/rejected": -4.534004211425781, "step": 1611 }, { "epoch": 2.14, "learning_rate": 2.410675508212606e-07, "loss": 0.0293, "step": 1612 }, { "epoch": 2.14, "logps_train/chosen": -78.36993408203125, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -114.35699462890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3950382173061371, "rewards_train/margins": 5.335425943136215, "rewards_train/rejected": -4.940387725830078, "step": 1612 }, { "epoch": 2.14, "logps_train/chosen": -31.0833797454834, "logps_train/ref_chosen": -37.75, "logps_train/ref_rejected": -36.75, "logps_train/rejected": -70.13264465332031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6588495969772339, "rewards_train/margins": 3.9947701692581177, "rewards_train/rejected": -3.335920572280884, "step": 1613 }, { "epoch": 2.14, "learning_rate": 2.405181177192035e-07, "loss": 0.0228, "step": 1614 }, { "epoch": 2.14, "logps_train/chosen": -41.733909606933594, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -90.10077667236328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1820778101682663, "rewards_train/margins": 4.259342595934868, "rewards_train/rejected": -4.077264785766602, "step": 1614 }, { "epoch": 2.14, "logps_train/chosen": -56.06945037841797, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -105.80155944824219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.38145679235458374, "rewards_train/margins": 4.846355974674225, "rewards_train/rejected": -5.227812767028809, "step": 1615 }, { "epoch": 2.15, "learning_rate": 2.399687304771031e-07, "loss": 0.0767, "step": 1616 }, { "epoch": 2.15, "logps_train/chosen": -50.83064270019531, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -106.90767669677734, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.22603324055671692, "rewards_train/margins": 4.7287969291210175, "rewards_train/rejected": -4.954830169677734, "step": 1616 }, { "epoch": 2.15, "logps_train/chosen": -66.17247009277344, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -110.68878936767578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4702523946762085, "rewards_train/margins": 5.001631140708923, "rewards_train/rejected": -4.531378746032715, "step": 1617 }, { "epoch": 2.15, "learning_rate": 2.39419391752119e-07, "loss": 0.0188, "step": 1618 }, { "epoch": 2.15, "logps_train/chosen": -68.6185302734375, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -127.30271911621094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3564090132713318, "rewards_train/margins": 4.831993043422699, "rewards_train/rejected": -4.475584030151367, "step": 1618 }, { "epoch": 2.15, "logps_train/chosen": -73.25057220458984, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -102.1708755493164, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.02974472939968109, "rewards_train/margins": 4.598280802369118, "rewards_train/rejected": -4.628025531768799, "step": 1619 }, { "epoch": 2.15, "learning_rate": 2.388701042011763e-07, "loss": 0.0376, "step": 1620 }, { "epoch": 2.15, "logps_train/chosen": -48.103763580322266, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -79.97691345214844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07712352275848389, "rewards_train/margins": 3.763876795768738, "rewards_train/rejected": -3.686753273010254, "step": 1620 }, { "epoch": 2.15, "logps_train/chosen": -113.75464630126953, "logps_train/ref_chosen": -118.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -177.00018310546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.430785208940506, "rewards_train/margins": 7.046427756547928, "rewards_train/rejected": -6.615642547607422, "step": 1621 }, { "epoch": 2.15, "learning_rate": 2.3832087048095239e-07, "loss": 0.0413, "step": 1622 }, { "epoch": 2.15, "logps_train/chosen": -55.05327606201172, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -106.33682250976562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.40199631452560425, "rewards_train/margins": 4.5950533747673035, "rewards_train/rejected": -4.193057060241699, "step": 1622 }, { "epoch": 2.16, "logps_train/chosen": -81.50466918945312, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -156.93157958984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9364046454429626, "rewards_train/margins": 5.744254648685455, "rewards_train/rejected": -6.680659294128418, "step": 1623 }, { "epoch": 2.16, "learning_rate": 2.3777169324786444e-07, "loss": 0.0237, "step": 1624 }, { "epoch": 2.16, "logps_train/chosen": -56.124969482421875, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -116.07621765136719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4695281982421875, "rewards_train/margins": 5.025592803955078, "rewards_train/rejected": -5.495121002197266, "step": 1624 }, { "epoch": 2.16, "logps_train/chosen": -54.168025970458984, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -96.92337799072266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.03319740295410156, "rewards_train/margins": 4.513035297393799, "rewards_train/rejected": -4.479837894439697, "step": 1625 }, { "epoch": 2.16, "learning_rate": 2.3722257515805648e-07, "loss": 0.0134, "step": 1626 }, { "epoch": 2.16, "logps_train/chosen": -96.09199523925781, "logps_train/ref_chosen": -97.0, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -152.91561889648438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07283129543066025, "rewards_train/margins": 5.794081829488277, "rewards_train/rejected": -5.721250534057617, "step": 1626 }, { "epoch": 2.16, "logps_train/chosen": -67.76876068115234, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -124.48509216308594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.246407151222229, "rewards_train/margins": 5.070851683616638, "rewards_train/rejected": -5.317258834838867, "step": 1627 }, { "epoch": 2.16, "learning_rate": 2.3667351886738627e-07, "loss": 0.0112, "step": 1628 }, { "epoch": 2.16, "logps_train/chosen": -64.56925964355469, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -90.9698257446289, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2055739164352417, "rewards_train/margins": 4.198650240898132, "rewards_train/rejected": -3.9930763244628906, "step": 1628 }, { "epoch": 2.16, "logps_train/chosen": -50.277252197265625, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -122.89326477050781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.40039992332458496, "rewards_train/margins": 6.066288709640503, "rewards_train/rejected": -5.665888786315918, "step": 1629 }, { "epoch": 2.16, "learning_rate": 2.3612452703141286e-07, "loss": 0.0255, "step": 1630 }, { "epoch": 2.16, "logps_train/chosen": -53.14012908935547, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -101.79084777832031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7531749606132507, "rewards_train/margins": 5.366635024547577, "rewards_train/rejected": -4.613460063934326, "step": 1630 }, { "epoch": 2.17, "logps_train/chosen": -47.96973419189453, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -91.8565902709961, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7399420738220215, "rewards_train/margins": 3.619154453277588, "rewards_train/rejected": -4.359096527099609, "step": 1631 }, { "epoch": 2.17, "learning_rate": 2.3557560230538347e-07, "loss": 0.0236, "step": 1632 }, { "epoch": 2.17, "logps_train/chosen": -63.61260986328125, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -110.71380615234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5596988201141357, "rewards_train/margins": 4.278869867324829, "rewards_train/rejected": -4.838568687438965, "step": 1632 }, { "epoch": 2.17, "logps_train/chosen": -66.30958557128906, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -113.25575256347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7081039547920227, "rewards_train/margins": 5.205554783344269, "rewards_train/rejected": -4.497450828552246, "step": 1633 }, { "epoch": 2.17, "learning_rate": 2.3502674734422078e-07, "loss": 0.0218, "step": 1634 }, { "epoch": 2.17, "logps_train/chosen": -84.64749908447266, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -128.42918395996094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5647495985031128, "rewards_train/margins": 4.123871922492981, "rewards_train/rejected": -4.688621520996094, "step": 1634 }, { "epoch": 2.17, "logps_train/chosen": -56.634986877441406, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -99.15913391113281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19431385397911072, "rewards_train/margins": 4.257102340459824, "rewards_train/rejected": -4.062788486480713, "step": 1635 }, { "epoch": 2.17, "learning_rate": 2.3447796480250998e-07, "loss": 0.0368, "step": 1636 }, { "epoch": 2.17, "logps_train/chosen": -66.58016967773438, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -110.55088806152344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.08614209294319153, "rewards_train/margins": 4.722071617841721, "rewards_train/rejected": -4.808213710784912, "step": 1636 }, { "epoch": 2.17, "logps_train/chosen": -73.31005859375, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -137.06768798828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.47631821036338806, "rewards_train/margins": 5.66170135140419, "rewards_train/rejected": -6.138019561767578, "step": 1637 }, { "epoch": 2.18, "learning_rate": 2.3392925733448615e-07, "loss": 0.0128, "step": 1638 }, { "epoch": 2.18, "logps_train/chosen": -89.06816864013672, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -151.74034118652344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.049004822969436646, "rewards_train/margins": 5.818779677152634, "rewards_train/rejected": -5.86778450012207, "step": 1638 }, { "epoch": 2.18, "logps_train/chosen": -77.03298950195312, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -105.98013305664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5736537575721741, "rewards_train/margins": 4.368150770664215, "rewards_train/rejected": -3.794497013092041, "step": 1639 }, { "epoch": 2.18, "learning_rate": 2.3338062759402104e-07, "loss": 0.0227, "step": 1640 }, { "epoch": 2.18, "logps_train/chosen": -79.04985046386719, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -91.5, "logps_train/rejected": -149.3660888671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.32060930132865906, "rewards_train/margins": 5.456624776124954, "rewards_train/rejected": -5.777234077453613, "step": 1640 }, { "epoch": 2.18, "logps_train/chosen": -53.42491912841797, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -101.69596862792969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18328961730003357, "rewards_train/margins": 4.889605611562729, "rewards_train/rejected": -4.706315994262695, "step": 1641 }, { "epoch": 2.18, "learning_rate": 2.328320782346107e-07, "loss": 0.0165, "step": 1642 }, { "epoch": 2.18, "logps_train/chosen": -66.21868133544922, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -107.97332000732422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5953192114830017, "rewards_train/margins": 4.776244580745697, "rewards_train/rejected": -4.180925369262695, "step": 1642 }, { "epoch": 2.18, "logps_train/chosen": -93.39530181884766, "logps_train/ref_chosen": -101.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -137.36459350585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7690638303756714, "rewards_train/margins": 6.5738829374313354, "rewards_train/rejected": -5.804819107055664, "step": 1643 }, { "epoch": 2.18, "learning_rate": 2.3228361190936222e-07, "loss": 0.0331, "step": 1644 }, { "epoch": 2.18, "logps_train/chosen": -86.44429016113281, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -161.20623779296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2493205964565277, "rewards_train/margins": 7.376194566488266, "rewards_train/rejected": -7.126873970031738, "step": 1644 }, { "epoch": 2.18, "logps_train/chosen": -57.833553314208984, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -109.65545654296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.15523049235343933, "rewards_train/margins": 5.338439971208572, "rewards_train/rejected": -5.493670463562012, "step": 1645 }, { "epoch": 2.19, "learning_rate": 2.3173523127098124e-07, "loss": 0.0098, "step": 1646 }, { "epoch": 2.19, "logps_train/chosen": -97.1297836303711, "logps_train/ref_chosen": -97.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -145.30487060546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.0053609758615493774, "rewards_train/margins": 6.2641888707876205, "rewards_train/rejected": -6.26954984664917, "step": 1646 }, { "epoch": 2.19, "logps_train/chosen": -71.72000885009766, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -108.19837951660156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.143624186515808, "rewards_train/margins": 5.172838091850281, "rewards_train/rejected": -4.029213905334473, "step": 1647 }, { "epoch": 2.19, "learning_rate": 2.311869389717588e-07, "loss": 0.0119, "step": 1648 }, { "epoch": 2.19, "logps_train/chosen": -52.44001770019531, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -111.6469497680664, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7356858253479004, "rewards_train/margins": 5.510537624359131, "rewards_train/rejected": -4.7748517990112305, "step": 1648 }, { "epoch": 2.19, "logps_train/chosen": -43.7813720703125, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -103.22969055175781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.027746468782424927, "rewards_train/margins": 4.8170983493328094, "rewards_train/rejected": -4.844844818115234, "step": 1649 }, { "epoch": 2.19, "learning_rate": 2.306387376635588e-07, "loss": 0.0245, "step": 1650 }, { "epoch": 2.19, "logps_train/chosen": -54.53162384033203, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -111.63616943359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8798456788063049, "rewards_train/margins": 5.77471262216568, "rewards_train/rejected": -4.894866943359375, "step": 1650 }, { "epoch": 2.19, "logps_train/chosen": -45.57465362548828, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -83.05009460449219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.45972204208374023, "rewards_train/margins": 4.446763038635254, "rewards_train/rejected": -3.9870409965515137, "step": 1651 }, { "epoch": 2.19, "learning_rate": 2.3009062999780515e-07, "loss": 0.0449, "step": 1652 }, { "epoch": 2.19, "logps_train/chosen": -71.48902893066406, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -134.68780517578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4339093565940857, "rewards_train/margins": 6.150344789028168, "rewards_train/rejected": -5.716435432434082, "step": 1652 }, { "epoch": 2.2, "logps_train/chosen": -79.80711364746094, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -140.88546752929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2911634147167206, "rewards_train/margins": 6.189084976911545, "rewards_train/rejected": -5.897921562194824, "step": 1653 }, { "epoch": 2.2, "learning_rate": 2.2954261862546853e-07, "loss": 0.0137, "step": 1654 }, { "epoch": 2.2, "logps_train/chosen": -74.31769561767578, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -119.30254364013672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.36119920015335083, "rewards_train/margins": 5.507860004901886, "rewards_train/rejected": -5.146660804748535, "step": 1654 }, { "epoch": 2.2, "logps_train/chosen": -73.23081970214844, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -146.81724548339844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.5425432920455933, "rewards_train/margins": 7.918408274650574, "rewards_train/rejected": -6.3758649826049805, "step": 1655 }, { "epoch": 2.2, "learning_rate": 2.2899470619705429e-07, "loss": 0.0193, "step": 1656 }, { "epoch": 2.2, "logps_train/chosen": -51.79977035522461, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -74.28446197509766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 1.123147964477539, "rewards_train/margins": 4.505500316619873, "rewards_train/rejected": -3.382352352142334, "step": 1656 }, { "epoch": 2.2, "logps_train/chosen": -45.744300842285156, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -88.53366088867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.017398618161678314, "rewards_train/margins": 4.051592089235783, "rewards_train/rejected": -4.068990707397461, "step": 1657 }, { "epoch": 2.2, "learning_rate": 2.2844689536258886e-07, "loss": 0.0409, "step": 1658 }, { "epoch": 2.2, "logps_train/chosen": -61.734397888183594, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -144.71124267578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6796855330467224, "rewards_train/margins": 6.288310945034027, "rewards_train/rejected": -5.608625411987305, "step": 1658 }, { "epoch": 2.2, "logps_train/chosen": -33.049800872802734, "logps_train/ref_chosen": -34.5, "logps_train/ref_rejected": -42.75, "logps_train/rejected": -80.02346801757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15693405270576477, "rewards_train/margins": 3.894535332918167, "rewards_train/rejected": -3.7376012802124023, "step": 1659 }, { "epoch": 2.2, "learning_rate": 2.2789918877160756e-07, "loss": 0.0296, "step": 1660 }, { "epoch": 2.2, "logps_train/chosen": -53.490760803222656, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -119.21926879882812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.21782618761062622, "rewards_train/margins": 5.613475978374481, "rewards_train/rejected": -5.831302165985107, "step": 1660 }, { "epoch": 2.21, "logps_train/chosen": -59.0009880065918, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -115.6325912475586, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.25791144371032715, "rewards_train/margins": 5.322535276412964, "rewards_train/rejected": -5.580446720123291, "step": 1661 }, { "epoch": 2.21, "learning_rate": 2.2735158907314143e-07, "loss": 0.0132, "step": 1662 }, { "epoch": 2.21, "logps_train/chosen": -40.65831756591797, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -96.56291198730469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2068241834640503, "rewards_train/margins": 4.699052691459656, "rewards_train/rejected": -4.4922285079956055, "step": 1662 }, { "epoch": 2.21, "logps_train/chosen": -79.8701171875, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -154.0372772216797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3135742247104645, "rewards_train/margins": 6.437028497457504, "rewards_train/rejected": -6.750602722167969, "step": 1663 }, { "epoch": 2.21, "learning_rate": 2.2680409891570448e-07, "loss": 0.0119, "step": 1664 }, { "epoch": 2.21, "logps_train/chosen": -41.70933532714844, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -101.9284439086914, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4243786931037903, "rewards_train/margins": 5.246129810810089, "rewards_train/rejected": -4.821751117706299, "step": 1664 }, { "epoch": 2.21, "logps_train/chosen": -87.85283660888672, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -156.11868286132812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.513593316078186, "rewards_train/margins": 5.227804780006409, "rewards_train/rejected": -4.714211463928223, "step": 1665 }, { "epoch": 2.21, "learning_rate": 2.2625672094728097e-07, "loss": 0.0401, "step": 1666 }, { "epoch": 2.21, "logps_train/chosen": -59.63440704345703, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -114.00321960449219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.8709344863891602, "rewards_train/margins": 5.858757019042969, "rewards_train/rejected": -4.987822532653809, "step": 1666 }, { "epoch": 2.21, "logps_train/chosen": -50.95458984375, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -104.30546569824219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.15014678239822388, "rewards_train/margins": 4.7319623827934265, "rewards_train/rejected": -4.88210916519165, "step": 1667 }, { "epoch": 2.22, "learning_rate": 2.2570945781531259e-07, "loss": 0.0139, "step": 1668 }, { "epoch": 2.22, "logps_train/chosen": -40.91083526611328, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -98.2369384765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.11608340591192245, "rewards_train/margins": 4.256439425051212, "rewards_train/rejected": -4.372522830963135, "step": 1668 }, { "epoch": 2.22, "logps_train/chosen": -55.73370361328125, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -102.60723876953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.12766724824905396, "rewards_train/margins": 4.325244128704071, "rewards_train/rejected": -4.452911376953125, "step": 1669 }, { "epoch": 2.22, "learning_rate": 2.2516231216668542e-07, "loss": 0.0407, "step": 1670 }, { "epoch": 2.22, "logps_train/chosen": -61.32091522216797, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -111.96902465820312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7242790460586548, "rewards_train/margins": 4.899185538291931, "rewards_train/rejected": -5.623464584350586, "step": 1670 }, { "epoch": 2.22, "logps_train/chosen": -65.0093994140625, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -87.73915100097656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3032352328300476, "rewards_train/margins": 2.9973891377449036, "rewards_train/rejected": -3.300624370574951, "step": 1671 }, { "epoch": 2.22, "learning_rate": 2.2461528664771763e-07, "loss": 0.0675, "step": 1672 }, { "epoch": 2.22, "logps_train/chosen": -55.681724548339844, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -106.15396118164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06698378920555115, "rewards_train/margins": 5.1714416444301605, "rewards_train/rejected": -5.104457855224609, "step": 1672 }, { "epoch": 2.22, "logps_train/chosen": -60.871726989746094, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -125.38886260986328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3456396460533142, "rewards_train/margins": 5.705228745937347, "rewards_train/rejected": -5.359589099884033, "step": 1673 }, { "epoch": 2.22, "learning_rate": 2.2406838390414605e-07, "loss": 0.0094, "step": 1674 }, { "epoch": 2.22, "logps_train/chosen": -45.904052734375, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -90.9768295288086, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33928221464157104, "rewards_train/margins": 4.726809203624725, "rewards_train/rejected": -4.387526988983154, "step": 1674 }, { "epoch": 2.22, "logps_train/chosen": -71.08259582519531, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -104.04598236083984, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8645099997520447, "rewards_train/margins": 3.940479338169098, "rewards_train/rejected": -4.804989337921143, "step": 1675 }, { "epoch": 2.23, "learning_rate": 2.23521606581114e-07, "loss": 0.0349, "step": 1676 }, { "epoch": 2.23, "logps_train/chosen": -75.70677185058594, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -157.05160522460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4386974573135376, "rewards_train/margins": 5.848544716835022, "rewards_train/rejected": -5.409847259521484, "step": 1676 }, { "epoch": 2.23, "logps_train/chosen": -68.19268035888672, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -112.60706329345703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2786432206630707, "rewards_train/margins": 4.794368177652359, "rewards_train/rejected": -5.07301139831543, "step": 1677 }, { "epoch": 2.23, "learning_rate": 2.229749573231579e-07, "loss": 0.0238, "step": 1678 }, { "epoch": 2.23, "logps_train/chosen": -40.02933120727539, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -108.81055450439453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2919215261936188, "rewards_train/margins": 5.369461923837662, "rewards_train/rejected": -5.077540397644043, "step": 1678 }, { "epoch": 2.23, "logps_train/chosen": -50.072044372558594, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -106.66670227050781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23576444387435913, "rewards_train/margins": 5.614153802394867, "rewards_train/rejected": -5.378389358520508, "step": 1679 }, { "epoch": 2.23, "learning_rate": 2.2242843877419493e-07, "loss": 0.0151, "step": 1680 }, { "epoch": 2.23, "logps_train/chosen": -50.86200714111328, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -96.57337951660156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3942679762840271, "rewards_train/margins": 4.443792760372162, "rewards_train/rejected": -4.049524784088135, "step": 1680 }, { "epoch": 2.23, "logps_train/chosen": -68.32810974121094, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -109.55654907226562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.40781405568122864, "rewards_train/margins": 4.947844535112381, "rewards_train/rejected": -4.540030479431152, "step": 1681 }, { "epoch": 2.23, "learning_rate": 2.2188205357751017e-07, "loss": 0.0222, "step": 1682 }, { "epoch": 2.23, "logps_train/chosen": -59.291446685791016, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -113.6025390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4572695195674896, "rewards_train/margins": 4.846734195947647, "rewards_train/rejected": -5.304003715515137, "step": 1682 }, { "epoch": 2.24, "logps_train/chosen": -41.99505615234375, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -95.7668685913086, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2979428470134735, "rewards_train/margins": 4.681087285280228, "rewards_train/rejected": -4.979030132293701, "step": 1683 }, { "epoch": 2.24, "learning_rate": 2.2133580437574352e-07, "loss": 0.0204, "step": 1684 }, { "epoch": 2.24, "logps_train/chosen": -74.65121459960938, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -137.10507202148438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7301912903785706, "rewards_train/margins": 5.850073873996735, "rewards_train/rejected": -5.119882583618164, "step": 1684 }, { "epoch": 2.24, "logps_train/chosen": -38.73481369018555, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -85.76426696777344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13159671425819397, "rewards_train/margins": 4.025992304086685, "rewards_train/rejected": -3.894395589828491, "step": 1685 }, { "epoch": 2.24, "learning_rate": 2.207896938108773e-07, "loss": 0.0397, "step": 1686 }, { "epoch": 2.24, "logps_train/chosen": -47.91719055175781, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -42.0, "logps_train/rejected": -79.89845275878906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.7535935044288635, "rewards_train/margins": 4.555939495563507, "rewards_train/rejected": -3.8023459911346436, "step": 1686 }, { "epoch": 2.24, "logps_train/chosen": -44.0987434387207, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -87.04654693603516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.13741333782672882, "rewards_train/margins": 3.910014793276787, "rewards_train/rejected": -4.047428131103516, "step": 1687 }, { "epoch": 2.24, "learning_rate": 2.2024372452422316e-07, "loss": 0.0416, "step": 1688 }, { "epoch": 2.24, "logps_train/chosen": -75.06198120117188, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -175.1716766357422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6117703914642334, "rewards_train/margins": 6.510187864303589, "rewards_train/rejected": -5.8984174728393555, "step": 1688 }, { "epoch": 2.24, "logps_train/chosen": -68.56094360351562, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -42.25, "logps_train/rejected": -94.68940734863281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6732823848724365, "rewards_train/margins": 4.561283349990845, "rewards_train/rejected": -5.234565734863281, "step": 1689 }, { "epoch": 2.24, "learning_rate": 2.1969789915640963e-07, "loss": 0.0179, "step": 1690 }, { "epoch": 2.24, "logps_train/chosen": -63.77099609375, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -132.34539794921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.6213377118110657, "rewards_train/margins": 6.080877602100372, "rewards_train/rejected": -5.459539890289307, "step": 1690 }, { "epoch": 2.25, "logps_train/chosen": -66.35791015625, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -123.77528381347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1279783844947815, "rewards_train/margins": 5.065175950527191, "rewards_train/rejected": -5.193154335021973, "step": 1691 }, { "epoch": 2.25, "learning_rate": 2.1915222034736893e-07, "loss": 0.0154, "step": 1692 } ], "logging_steps": 2, "max_steps": 3008, "num_train_epochs": 4, "save_steps": 188, "total_flos": 0.0, "trial_name": null, "trial_params": null }