{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7467859139183901, "eval_steps": 500, "global_step": 2672, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "logps_train/chosen": -28.009265899658203, "logps_train/ref_chosen": -27.875, "logps_train/ref_rejected": -34.25, "logps_train/rejected": -34.33583068847656, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.0073716905899345875, "rewards_train/margins": -0.0024997838772833347, "rewards_train/rejected": -0.004871906712651253, "step": 0 }, { "epoch": 0.0, "logps_train/chosen": -73.56502532958984, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -81.08031463623047, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.03540842980146408, "rewards_train/margins": -0.03733780421316624, "rewards_train/rejected": 0.001929374411702156, "step": 1 }, { "epoch": 0.0, "learning_rate": 7.462686567164179e-09, "loss": 0.7033, "step": 2 }, { "epoch": 0.0, "logps_train/chosen": -67.06131744384766, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -58.305389404296875, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.004414868541061878, "rewards_train/margins": -0.012702328152954578, "rewards_train/rejected": 0.017117196694016457, "step": 2 }, { "epoch": 0.0, "logps_train/chosen": -74.58602905273438, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -72.01849365234375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.004287677817046642, "rewards_train/margins": -0.014956123195588589, "rewards_train/rejected": 0.01924380101263523, "step": 3 }, { "epoch": 0.0, "learning_rate": 1.4925373134328357e-08, "loss": 0.7004, "step": 4 }, { "epoch": 0.0, "logps_train/chosen": -96.714111328125, "logps_train/ref_chosen": -96.5, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -94.39057159423828, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.017505139112472534, "rewards_train/margins": -0.02043974446132779, "rewards_train/rejected": 0.002934605348855257, "step": 4 }, { "epoch": 0.0, "logps_train/chosen": -30.27285385131836, "logps_train/ref_chosen": -30.25, "logps_train/ref_rejected": -31.125, "logps_train/rejected": -31.02100944519043, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.0027926864568144083, "rewards_train/margins": -0.00448125577531755, "rewards_train/rejected": 0.007273942232131958, "step": 5 }, { "epoch": 0.0, "learning_rate": 2.2388059701492534e-08, "loss": 0.7019, "step": 6 }, { "epoch": 0.0, "logps_train/chosen": -52.10371017456055, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -55.562103271484375, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.01896473951637745, "rewards_train/margins": -0.026817021891474724, "rewards_train/rejected": 0.007852282375097275, "step": 6 }, { "epoch": 0.0, "logps_train/chosen": -71.28314208984375, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -76.08761596679688, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.01777965947985649, "rewards_train/margins": 0.03591597452759743, "rewards_train/rejected": -0.018136315047740936, "step": 7 }, { "epoch": 0.0, "learning_rate": 2.9850746268656714e-08, "loss": 0.6922, "step": 8 }, { "epoch": 0.0, "logps_train/chosen": -76.19033813476562, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -79.52376556396484, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.0020599185954779387, "rewards_train/margins": 0.017619952326640487, "rewards_train/rejected": -0.015560033731162548, "step": 8 }, { "epoch": 0.0, "logps_train/chosen": -54.57809829711914, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -62.02259063720703, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.007614586036652327, "rewards_train/margins": 0.005191568750888109, "rewards_train/rejected": -0.012806154787540436, "step": 9 }, { "epoch": 0.0, "learning_rate": 3.731343283582089e-08, "loss": 0.687, "step": 10 }, { "epoch": 0.0, "logps_train/chosen": -70.93370819091797, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -66.47660064697266, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.001964431954547763, "rewards_train/margins": -0.003913437249138951, "rewards_train/rejected": 0.0019490052945911884, "step": 10 }, { "epoch": 0.0, "logps_train/chosen": -51.025726318359375, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -62.484683990478516, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.013052631169557571, "rewards_train/margins": 0.0199194997549057, "rewards_train/rejected": -0.006866868585348129, "step": 11 }, { "epoch": 0.0, "learning_rate": 4.477611940298507e-08, "loss": 0.6889, "step": 12 }, { "epoch": 0.0, "logps_train/chosen": -52.511112213134766, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -51.727561950683594, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.004747912287712097, "rewards_train/margins": 0.019887147471308708, "rewards_train/rejected": -0.015139235183596611, "step": 12 }, { "epoch": 0.0, "logps_train/chosen": -63.97336959838867, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -80.89689636230469, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.007350319065153599, "rewards_train/margins": 0.003485799767076969, "rewards_train/rejected": 0.0038645192980766296, "step": 13 }, { "epoch": 0.0, "learning_rate": 5.223880597014925e-08, "loss": 0.6875, "step": 14 }, { "epoch": 0.0, "logps_train/chosen": -31.11260223388672, "logps_train/ref_chosen": -31.0, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -47.75293731689453, "rewards_train/accuracies": 0.125, "rewards_train/chosen": -0.0066704899072647095, "rewards_train/margins": -0.021025076508522034, "rewards_train/rejected": 0.014354586601257324, "step": 14 }, { "epoch": 0.0, "logps_train/chosen": -62.03797912597656, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -76.19300842285156, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.00223541259765625, "rewards_train/margins": -0.006372492760419846, "rewards_train/rejected": 0.004137080162763596, "step": 15 }, { "epoch": 0.0, "learning_rate": 5.970149253731343e-08, "loss": 0.6999, "step": 16 }, { "epoch": 0.0, "logps_train/chosen": -45.3233528137207, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -52.9857177734375, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.008873285725712776, "rewards_train/margins": -0.014622984454035759, "rewards_train/rejected": 0.005749698728322983, "step": 16 }, { "epoch": 0.0, "logps_train/chosen": -43.823646545410156, "logps_train/ref_chosen": -43.75, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -45.151485443115234, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.015372293069958687, "rewards_train/margins": -0.006864186376333237, "rewards_train/rejected": -0.00850810669362545, "step": 17 }, { "epoch": 0.01, "learning_rate": 6.71641791044776e-08, "loss": 0.6986, "step": 18 }, { "epoch": 0.01, "logps_train/chosen": -46.67842102050781, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -45.35725021362305, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.017704779282212257, "rewards_train/margins": 0.009679672308266163, "rewards_train/rejected": 0.008025106973946095, "step": 18 }, { "epoch": 0.01, "logps_train/chosen": -55.24577331542969, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -60.35206604003906, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.0006179455667734146, "rewards_train/margins": 0.008089966606348753, "rewards_train/rejected": -0.007472021039575338, "step": 19 }, { "epoch": 0.01, "learning_rate": 7.462686567164178e-08, "loss": 0.6891, "step": 20 }, { "epoch": 0.01, "logps_train/chosen": -70.94389343261719, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -83.84043884277344, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.008793413639068604, "rewards_train/margins": -0.014788252301514149, "rewards_train/rejected": 0.005994838662445545, "step": 20 }, { "epoch": 0.01, "logps_train/chosen": -57.3712272644043, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -69.30262756347656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.010169684886932373, "rewards_train/margins": 0.016967594623565674, "rewards_train/rejected": -0.027137279510498047, "step": 21 }, { "epoch": 0.01, "learning_rate": 8.208955223880596e-08, "loss": 0.6928, "step": 22 }, { "epoch": 0.01, "logps_train/chosen": -45.63951873779297, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -71.93592071533203, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.013489503413438797, "rewards_train/margins": 0.002784440293908119, "rewards_train/rejected": 0.010705063119530678, "step": 22 }, { "epoch": 0.01, "logps_train/chosen": -43.61445617675781, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -50.658485412597656, "rewards_train/accuracies": 0.25, "rewards_train/chosen": 0.0010543353855609894, "rewards_train/margins": -0.006143870297819376, "rewards_train/rejected": 0.007198205683380365, "step": 23 }, { "epoch": 0.01, "learning_rate": 8.955223880597014e-08, "loss": 0.694, "step": 24 }, { "epoch": 0.01, "logps_train/chosen": -54.579505920410156, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -60.249229431152344, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.009708481840789318, "rewards_train/margins": -0.012520214542746544, "rewards_train/rejected": 0.002811732701957226, "step": 24 }, { "epoch": 0.01, "logps_train/chosen": -44.96370315551758, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -50.72367858886719, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.002693634247407317, "rewards_train/margins": 0.0003871179651468992, "rewards_train/rejected": -0.0030807522125542164, "step": 25 }, { "epoch": 0.01, "learning_rate": 9.701492537313432e-08, "loss": 0.6957, "step": 26 }, { "epoch": 0.01, "logps_train/chosen": -52.113609313964844, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -55.80499267578125, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.0025722477585077286, "rewards_train/margins": 0.009958336129784584, "rewards_train/rejected": -0.012530583888292313, "step": 26 }, { "epoch": 0.01, "logps_train/chosen": -78.57445526123047, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -71.58308410644531, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.011304108425974846, "rewards_train/margins": 0.008577450178563595, "rewards_train/rejected": 0.002726658247411251, "step": 27 }, { "epoch": 0.01, "learning_rate": 1.044776119402985e-07, "loss": 0.6886, "step": 28 }, { "epoch": 0.01, "logps_train/chosen": -48.44830322265625, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -42.25, "logps_train/rejected": -42.34334182739258, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.015338033437728882, "rewards_train/margins": -0.016941360663622618, "rewards_train/rejected": 0.0016033272258937359, "step": 28 }, { "epoch": 0.01, "logps_train/chosen": -72.24885559082031, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -65.16725158691406, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.024398073554039, "rewards_train/margins": -0.003961820155382156, "rewards_train/rejected": -0.020436253398656845, "step": 29 }, { "epoch": 0.01, "learning_rate": 1.1194029850746268e-07, "loss": 0.6985, "step": 30 }, { "epoch": 0.01, "logps_train/chosen": -48.282257080078125, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -51.52762985229492, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.025192007422447205, "rewards_train/margins": 0.00940013863146305, "rewards_train/rejected": 0.015791868790984154, "step": 30 }, { "epoch": 0.01, "logps_train/chosen": -69.8888931274414, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -71.6630859375, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.0028590518049895763, "rewards_train/margins": -0.012228913139551878, "rewards_train/rejected": 0.015087964944541454, "step": 31 }, { "epoch": 0.01, "learning_rate": 1.1940298507462686e-07, "loss": 0.6941, "step": 32 }, { "epoch": 0.01, "logps_train/chosen": -61.05165481567383, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -48.58993911743164, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.017588702961802483, "rewards_train/margins": 0.022090469021350145, "rewards_train/rejected": -0.004501766059547663, "step": 32 }, { "epoch": 0.01, "logps_train/chosen": -65.1072006225586, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -65.76799011230469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.0034931949339807034, "rewards_train/margins": -0.010873754974454641, "rewards_train/rejected": 0.007380560040473938, "step": 33 }, { "epoch": 0.01, "learning_rate": 1.2686567164179106e-07, "loss": 0.69, "step": 34 }, { "epoch": 0.01, "logps_train/chosen": -79.93659973144531, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -91.6839599609375, "rewards_train/accuracies": 0.125, "rewards_train/chosen": -0.030379492789506912, "rewards_train/margins": -0.06432736292481422, "rewards_train/rejected": 0.03394787013530731, "step": 34 }, { "epoch": 0.01, "logps_train/chosen": -70.9692611694336, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -69.50221252441406, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.005519765429198742, "rewards_train/margins": 0.0036860769614577293, "rewards_train/rejected": -0.009205842390656471, "step": 35 }, { "epoch": 0.01, "learning_rate": 1.343283582089552e-07, "loss": 0.7098, "step": 36 }, { "epoch": 0.01, "logps_train/chosen": -20.790733337402344, "logps_train/ref_chosen": -20.875, "logps_train/ref_rejected": -34.5, "logps_train/rejected": -34.51048278808594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.005765479989349842, "rewards_train/margins": 0.005568873370066285, "rewards_train/rejected": 0.00019660661928355694, "step": 36 }, { "epoch": 0.01, "logps_train/chosen": -60.28750991821289, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -69.29464721679688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.009041781537234783, "rewards_train/margins": 0.010185684077441692, "rewards_train/rejected": -0.0011439025402069092, "step": 37 }, { "epoch": 0.01, "learning_rate": 1.4179104477611938e-07, "loss": 0.6896, "step": 38 }, { "epoch": 0.01, "logps_train/chosen": -53.85614013671875, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -76.01130676269531, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.005633586551994085, "rewards_train/margins": -0.013291806448251009, "rewards_train/rejected": 0.007658219896256924, "step": 38 }, { "epoch": 0.01, "logps_train/chosen": -89.0808334350586, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -98.09664154052734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.011349696666002274, "rewards_train/margins": 0.018670076970010996, "rewards_train/rejected": -0.007320380304008722, "step": 39 }, { "epoch": 0.01, "learning_rate": 1.4925373134328355e-07, "loss": 0.692, "step": 40 }, { "epoch": 0.01, "logps_train/chosen": -23.839229583740234, "logps_train/ref_chosen": -24.0, "logps_train/ref_rejected": -25.875, "logps_train/rejected": -25.802457809448242, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.009827196598052979, "rewards_train/margins": 0.0015963371843099594, "rewards_train/rejected": 0.008230859413743019, "step": 40 }, { "epoch": 0.01, "logps_train/chosen": -69.20179748535156, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -86.10008239746094, "rewards_train/accuracies": 0.125, "rewards_train/chosen": -0.01432049460709095, "rewards_train/margins": -0.0254062432795763, "rewards_train/rejected": 0.011085748672485352, "step": 41 }, { "epoch": 0.01, "learning_rate": 1.5671641791044775e-07, "loss": 0.6993, "step": 42 }, { "epoch": 0.01, "logps_train/chosen": -61.325164794921875, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -78.37467956542969, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.002786031924188137, "rewards_train/margins": 0.01662126276642084, "rewards_train/rejected": -0.013835230842232704, "step": 42 }, { "epoch": 0.01, "logps_train/chosen": -61.91954040527344, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -53.99800491333008, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.011192599311470985, "rewards_train/margins": -0.01178260799497366, "rewards_train/rejected": 0.0005900086835026741, "step": 43 }, { "epoch": 0.01, "learning_rate": 1.6417910447761193e-07, "loss": 0.6922, "step": 44 }, { "epoch": 0.01, "logps_train/chosen": -61.60581588745117, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -85.86632537841797, "rewards_train/accuracies": 0.0, "rewards_train/chosen": -0.011753350496292114, "rewards_train/margins": -0.02785518765449524, "rewards_train/rejected": 0.016101837158203125, "step": 44 }, { "epoch": 0.01, "logps_train/chosen": -48.868995666503906, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -50.02741241455078, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.005552059970796108, "rewards_train/margins": -0.00515446113422513, "rewards_train/rejected": -0.00039759883657097816, "step": 45 }, { "epoch": 0.01, "learning_rate": 1.716417910447761e-07, "loss": 0.7018, "step": 46 }, { "epoch": 0.01, "logps_train/chosen": -59.111244201660156, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -67.70795440673828, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.010750353336334229, "rewards_train/margins": -0.0006807064637541771, "rewards_train/rejected": 0.011431059800088406, "step": 46 }, { "epoch": 0.01, "logps_train/chosen": -53.983726501464844, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -75.58424377441406, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.009602963924407959, "rewards_train/margins": -0.005280325189232826, "rewards_train/rejected": -0.004322638735175133, "step": 47 }, { "epoch": 0.01, "learning_rate": 1.7910447761194027e-07, "loss": 0.6947, "step": 48 }, { "epoch": 0.01, "logps_train/chosen": -56.18229675292969, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -61.73432922363281, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.00553455064073205, "rewards_train/margins": -0.013693366665393114, "rewards_train/rejected": 0.008158816024661064, "step": 48 }, { "epoch": 0.01, "logps_train/chosen": -58.342559814453125, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -61.901893615722656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.009103439748287201, "rewards_train/margins": 0.017651867121458054, "rewards_train/rejected": -0.008548427373170853, "step": 49 }, { "epoch": 0.01, "learning_rate": 1.8656716417910447e-07, "loss": 0.6924, "step": 50 }, { "epoch": 0.01, "logps_train/chosen": -75.02145385742188, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -74.90623474121094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.023635398596525192, "rewards_train/margins": 0.01660336134955287, "rewards_train/rejected": 0.0070320372469723225, "step": 50 }, { "epoch": 0.01, "logps_train/chosen": -75.75387573242188, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -73.04302978515625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.007620600052177906, "rewards_train/margins": 0.013486066833138466, "rewards_train/rejected": -0.00586546678096056, "step": 51 }, { "epoch": 0.01, "learning_rate": 1.9402985074626865e-07, "loss": 0.6858, "step": 52 }, { "epoch": 0.01, "logps_train/chosen": -57.362266540527344, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -62.26457595825195, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.0058142924681305885, "rewards_train/margins": -0.0009311474859714508, "rewards_train/rejected": 0.006745439954102039, "step": 52 }, { "epoch": 0.01, "logps_train/chosen": -76.48269653320312, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -67.6187973022461, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.016237949952483177, "rewards_train/margins": -0.030920982360839844, "rewards_train/rejected": 0.014683032408356667, "step": 53 }, { "epoch": 0.02, "learning_rate": 2.0149253731343282e-07, "loss": 0.7013, "step": 54 }, { "epoch": 0.02, "logps_train/chosen": -72.03915405273438, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -69.89439392089844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.003134148893877864, "rewards_train/margins": -0.01096053677611053, "rewards_train/rejected": 0.007826387882232666, "step": 54 }, { "epoch": 0.02, "logps_train/chosen": -85.34034729003906, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -86.39530181884766, "rewards_train/accuracies": 0.125, "rewards_train/chosen": -0.01801927760243416, "rewards_train/margins": -0.03161392919719219, "rewards_train/rejected": 0.013594651594758034, "step": 55 }, { "epoch": 0.02, "learning_rate": 2.08955223880597e-07, "loss": 0.7039, "step": 56 }, { "epoch": 0.02, "logps_train/chosen": -35.24477767944336, "logps_train/ref_chosen": -35.25, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -47.85071563720703, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.005209779366850853, "rewards_train/margins": -0.005617326125502586, "rewards_train/rejected": 0.01082710549235344, "step": 56 }, { "epoch": 0.02, "logps_train/chosen": -36.646060943603516, "logps_train/ref_chosen": -36.75, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -54.13223648071289, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.009026802144944668, "rewards_train/margins": 0.011508381925523281, "rewards_train/rejected": -0.0024815797805786133, "step": 57 }, { "epoch": 0.02, "learning_rate": 2.1641791044776117e-07, "loss": 0.692, "step": 58 }, { "epoch": 0.02, "logps_train/chosen": -63.958457946777344, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -68.56193542480469, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.004244078882038593, "rewards_train/margins": -0.023246058262884617, "rewards_train/rejected": 0.019001979380846024, "step": 58 }, { "epoch": 0.02, "logps_train/chosen": -70.13823699951172, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -75.07279205322266, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.01538639422506094, "rewards_train/margins": -0.01924017141573131, "rewards_train/rejected": 0.003853777190670371, "step": 59 }, { "epoch": 0.02, "learning_rate": 2.2388059701492537e-07, "loss": 0.7041, "step": 60 }, { "epoch": 0.02, "logps_train/chosen": -50.05101776123047, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -58.0960693359375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.024194950237870216, "rewards_train/margins": 0.008801520802080631, "rewards_train/rejected": 0.015393429435789585, "step": 60 }, { "epoch": 0.02, "logps_train/chosen": -62.10710906982422, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -51.14201736450195, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.007258105091750622, "rewards_train/margins": 0.012280058581382036, "rewards_train/rejected": -0.005021953489631414, "step": 61 }, { "epoch": 0.02, "learning_rate": 2.3134328358208954e-07, "loss": 0.6879, "step": 62 }, { "epoch": 0.02, "logps_train/chosen": -79.69609069824219, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -100.82936096191406, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.010859489440917969, "rewards_train/margins": 0.0031700842082500458, "rewards_train/rejected": 0.007689405232667923, "step": 62 }, { "epoch": 0.02, "logps_train/chosen": -50.9639778137207, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -54.70915222167969, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.0046273572370409966, "rewards_train/margins": -0.0018990645185112953, "rewards_train/rejected": 0.006526421755552292, "step": 63 }, { "epoch": 0.02, "learning_rate": 2.388059701492537e-07, "loss": 0.6933, "step": 64 }, { "epoch": 0.02, "logps_train/chosen": -54.156246185302734, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -77.61766052246094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.0162116177380085, "rewards_train/margins": 0.016259407624602318, "rewards_train/rejected": -4.7789886593818665e-05, "step": 64 }, { "epoch": 0.02, "logps_train/chosen": -56.522613525390625, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -60.18783950805664, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.01241797860711813, "rewards_train/margins": -0.0030089272186160088, "rewards_train/rejected": -0.009409051388502121, "step": 65 }, { "epoch": 0.02, "learning_rate": 2.4626865671641786e-07, "loss": 0.6905, "step": 66 }, { "epoch": 0.02, "logps_train/chosen": -36.38053512573242, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -33.75, "logps_train/rejected": -33.81393814086914, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.004655003547668457, "rewards_train/margins": -0.0015814690850675106, "rewards_train/rejected": -0.0030735344626009464, "step": 66 }, { "epoch": 0.02, "logps_train/chosen": -115.5427474975586, "logps_train/ref_chosen": -115.5, "logps_train/ref_rejected": -127.5, "logps_train/rejected": -127.41462707519531, "rewards_train/accuracies": 0.25, "rewards_train/chosen": 0.0004127498250454664, "rewards_train/margins": -0.026874830247834325, "rewards_train/rejected": 0.02728758007287979, "step": 67 }, { "epoch": 0.02, "learning_rate": 2.537313432835821e-07, "loss": 0.7003, "step": 68 }, { "epoch": 0.02, "logps_train/chosen": -80.20258331298828, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -74.4958267211914, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.00424259714782238, "rewards_train/margins": 0.0011994852684438229, "rewards_train/rejected": -0.005442082416266203, "step": 68 }, { "epoch": 0.02, "logps_train/chosen": -42.610416412353516, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -51.10821533203125, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.012200661934912205, "rewards_train/margins": 0.0036860527470707893, "rewards_train/rejected": 0.008514609187841415, "step": 69 }, { "epoch": 0.02, "learning_rate": 2.611940298507462e-07, "loss": 0.6925, "step": 70 }, { "epoch": 0.02, "logps_train/chosen": -39.376792907714844, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -27.125, "logps_train/rejected": -27.271896362304688, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.00021116758580319583, "rewards_train/margins": 0.01660984705085866, "rewards_train/rejected": -0.016398679465055466, "step": 70 }, { "epoch": 0.02, "logps_train/chosen": -74.99695587158203, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -83.50621032714844, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.013757944107055664, "rewards_train/margins": -0.018800931982696056, "rewards_train/rejected": 0.005042987875640392, "step": 71 }, { "epoch": 0.02, "learning_rate": 2.686567164179104e-07, "loss": 0.6938, "step": 72 }, { "epoch": 0.02, "logps_train/chosen": -85.22811889648438, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -84.8520278930664, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.01890520006418228, "rewards_train/margins": -0.010264983400702477, "rewards_train/rejected": -0.008640216663479805, "step": 72 }, { "epoch": 0.02, "logps_train/chosen": -42.7127685546875, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -62.7537956237793, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.017492689192295074, "rewards_train/margins": 0.019776830216869712, "rewards_train/rejected": -0.0022841410245746374, "step": 73 }, { "epoch": 0.02, "learning_rate": 2.761194029850746e-07, "loss": 0.6908, "step": 74 }, { "epoch": 0.02, "logps_train/chosen": -97.81999206542969, "logps_train/ref_chosen": -97.5, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -85.24695587158203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.015202022157609463, "rewards_train/margins": 0.0016810325905680656, "rewards_train/rejected": -0.01688305474817753, "step": 74 }, { "epoch": 0.02, "logps_train/chosen": -26.67871856689453, "logps_train/ref_chosen": -26.75, "logps_train/ref_rejected": -32.5, "logps_train/rejected": -32.54246520996094, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.004229029640555382, "rewards_train/margins": 0.013328035362064838, "rewards_train/rejected": -0.009099005721509457, "step": 75 }, { "epoch": 0.02, "learning_rate": 2.8358208955223876e-07, "loss": 0.6896, "step": 76 }, { "epoch": 0.02, "logps_train/chosen": -56.448753356933594, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -52.06647491455078, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.0031714322976768017, "rewards_train/margins": -0.01283704349771142, "rewards_train/rejected": 0.016008475795388222, "step": 76 }, { "epoch": 0.02, "logps_train/chosen": -51.58002471923828, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -57.23174285888672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.009184528142213821, "rewards_train/margins": 0.016293874010443687, "rewards_train/rejected": -0.007109345868229866, "step": 77 }, { "epoch": 0.02, "learning_rate": 2.9104477611940296e-07, "loss": 0.6923, "step": 78 }, { "epoch": 0.02, "logps_train/chosen": -47.17328643798828, "logps_train/ref_chosen": -47.25, "logps_train/ref_rejected": -34.5, "logps_train/rejected": -34.302825927734375, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.0014215768314898014, "rewards_train/margins": -0.008920720312744379, "rewards_train/rejected": 0.01034229714423418, "step": 78 }, { "epoch": 0.02, "logps_train/chosen": -88.59147644042969, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -86.37312316894531, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.01891244947910309, "rewards_train/margins": -0.030819376930594444, "rewards_train/rejected": 0.011906927451491356, "step": 79 }, { "epoch": 0.02, "learning_rate": 2.985074626865671e-07, "loss": 0.7041, "step": 80 }, { "epoch": 0.02, "logps_train/chosen": -37.71403503417969, "logps_train/ref_chosen": -37.75, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -42.998077392578125, "rewards_train/accuracies": 0.25, "rewards_train/chosen": 0.0020339488983154297, "rewards_train/margins": 0.001060450915247202, "rewards_train/rejected": 0.0009734979830682278, "step": 80 }, { "epoch": 0.02, "logps_train/chosen": -42.956787109375, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -43.76375198364258, "rewards_train/accuracies": 0.125, "rewards_train/chosen": 0.0027585094794631004, "rewards_train/margins": -0.021647651679813862, "rewards_train/rejected": 0.024406161159276962, "step": 81 }, { "epoch": 0.02, "learning_rate": 3.059701492537313e-07, "loss": 0.6986, "step": 82 }, { "epoch": 0.02, "logps_train/chosen": -52.473175048828125, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -58.66938018798828, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.014303699135780334, "rewards_train/margins": 0.0017494112253189087, "rewards_train/rejected": 0.012554287910461426, "step": 82 }, { "epoch": 0.02, "logps_train/chosen": -11.788549423217773, "logps_train/ref_chosen": -11.6875, "logps_train/ref_rejected": -14.6875, "logps_train/rejected": -14.75306224822998, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.008640158921480179, "rewards_train/margins": 0.00035746581852436066, "rewards_train/rejected": -0.00899762474000454, "step": 83 }, { "epoch": 0.02, "learning_rate": 3.134328358208955e-07, "loss": 0.6927, "step": 84 }, { "epoch": 0.02, "logps_train/chosen": -65.22269439697266, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -73.28553009033203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.0037143828812986612, "rewards_train/margins": -0.004116818541660905, "rewards_train/rejected": 0.00040243566036224365, "step": 84 }, { "epoch": 0.02, "logps_train/chosen": -33.343101501464844, "logps_train/ref_chosen": -33.5, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -52.500579833984375, "rewards_train/accuracies": 0.25, "rewards_train/chosen": 0.010611563920974731, "rewards_train/margins": 0.00031818635761737823, "rewards_train/rejected": 0.010293377563357353, "step": 85 }, { "epoch": 0.02, "learning_rate": 3.2089552238805965e-07, "loss": 0.6938, "step": 86 }, { "epoch": 0.02, "logps_train/chosen": -80.40919494628906, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -66.40739440917969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.022752420976758003, "rewards_train/margins": 0.032046287320554256, "rewards_train/rejected": -0.009293866343796253, "step": 86 }, { "epoch": 0.02, "logps_train/chosen": -80.67681884765625, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -98.67953491210938, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.015911972150206566, "rewards_train/margins": 0.02058399934321642, "rewards_train/rejected": -0.004672027193009853, "step": 87 }, { "epoch": 0.02, "learning_rate": 3.2835820895522385e-07, "loss": 0.6803, "step": 88 }, { "epoch": 0.02, "logps_train/chosen": -54.21306228637695, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -59.89112854003906, "rewards_train/accuracies": 0.125, "rewards_train/chosen": -0.0027515410911291838, "rewards_train/margins": -0.024185542715713382, "rewards_train/rejected": 0.021434001624584198, "step": 88 }, { "epoch": 0.02, "logps_train/chosen": -48.04336929321289, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -48.124664306640625, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.011758768931031227, "rewards_train/margins": 0.012621596455574036, "rewards_train/rejected": -0.024380365386605263, "step": 89 }, { "epoch": 0.03, "learning_rate": 3.3582089552238805e-07, "loss": 0.6965, "step": 90 }, { "epoch": 0.03, "logps_train/chosen": -74.77471923828125, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -90.97662353515625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.00803880300372839, "rewards_train/margins": 0.0036856932565569878, "rewards_train/rejected": -0.011724496260285378, "step": 90 }, { "epoch": 0.03, "logps_train/chosen": -43.102508544921875, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -40.36244201660156, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.0009797094389796257, "rewards_train/margins": -0.007697904482483864, "rewards_train/rejected": 0.00867761392146349, "step": 91 }, { "epoch": 0.03, "learning_rate": 3.432835820895522e-07, "loss": 0.6943, "step": 92 }, { "epoch": 0.03, "logps_train/chosen": -62.44756317138672, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -57.75743103027344, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.020868778228759766, "rewards_train/margins": 0.007549166679382324, "rewards_train/rejected": 0.013319611549377441, "step": 92 }, { "epoch": 0.03, "logps_train/chosen": -42.98347473144531, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -20.25, "logps_train/rejected": -20.200942993164062, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.003996431827545166, "rewards_train/margins": -0.002471828367561102, "rewards_train/rejected": 0.006468260195106268, "step": 93 }, { "epoch": 0.03, "learning_rate": 3.507462686567164e-07, "loss": 0.6918, "step": 94 }, { "epoch": 0.03, "logps_train/chosen": -47.029605865478516, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -63.656864166259766, "rewards_train/accuracies": 0.125, "rewards_train/chosen": -0.02981605753302574, "rewards_train/margins": -0.05631690286099911, "rewards_train/rejected": 0.026500845327973366, "step": 94 }, { "epoch": 0.03, "logps_train/chosen": -37.82858657836914, "logps_train/ref_chosen": -37.75, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -43.48558807373047, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.0007352705579251051, "rewards_train/margins": -0.0038309639785438776, "rewards_train/rejected": 0.004566234536468983, "step": 95 }, { "epoch": 0.03, "learning_rate": 3.5820895522388055e-07, "loss": 0.7088, "step": 96 }, { "epoch": 0.03, "logps_train/chosen": -90.15899658203125, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -109.81600952148438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.002459877170622349, "rewards_train/margins": 0.024685590527951717, "rewards_train/rejected": -0.02222571335732937, "step": 96 }, { "epoch": 0.03, "logps_train/chosen": -74.28402709960938, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -92.08799743652344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.016909385100007057, "rewards_train/margins": 0.022584796883165836, "rewards_train/rejected": -0.005675411783158779, "step": 97 }, { "epoch": 0.03, "learning_rate": 3.6567164179104475e-07, "loss": 0.6816, "step": 98 }, { "epoch": 0.03, "logps_train/chosen": -69.00167846679688, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -67.96739959716797, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.013449341058731079, "rewards_train/margins": -0.006944017950445414, "rewards_train/rejected": -0.0065053231082856655, "step": 98 }, { "epoch": 0.03, "logps_train/chosen": -54.449981689453125, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -59.801513671875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.01300974190235138, "rewards_train/margins": 0.008004629984498024, "rewards_train/rejected": 0.005005111917853355, "step": 99 }, { "epoch": 0.03, "learning_rate": 3.7313432835820895e-07, "loss": 0.6937, "step": 100 }, { "epoch": 0.03, "logps_train/chosen": -62.00507354736328, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -46.25, "logps_train/rejected": -46.26144790649414, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.00011680088937282562, "rewards_train/margins": -0.004440450109541416, "rewards_train/rejected": 0.0043236492201685905, "step": 100 }, { "epoch": 0.03, "logps_train/chosen": -17.897796630859375, "logps_train/ref_chosen": -17.875, "logps_train/ref_rejected": -28.375, "logps_train/rejected": -28.47299575805664, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.0002105669118463993, "rewards_train/margins": 0.009375241119414568, "rewards_train/rejected": -0.009164674207568169, "step": 101 }, { "epoch": 0.03, "learning_rate": 3.805970149253731e-07, "loss": 0.6908, "step": 102 }, { "epoch": 0.03, "logps_train/chosen": -64.4922866821289, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -86.14122009277344, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.00448232889175415, "rewards_train/margins": 0.01547945849597454, "rewards_train/rejected": -0.01099712960422039, "step": 102 }, { "epoch": 0.03, "logps_train/chosen": -59.390933990478516, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -63.25, "logps_train/rejected": -63.29236602783203, "rewards_train/accuracies": 0.125, "rewards_train/chosen": 0.007195710204541683, "rewards_train/margins": 0.007623705081641674, "rewards_train/rejected": -0.00042799487709999084, "step": 103 }, { "epoch": 0.03, "learning_rate": 3.880597014925373e-07, "loss": 0.6884, "step": 104 }, { "epoch": 0.03, "logps_train/chosen": -90.15403747558594, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -70.99836730957031, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.0037373658269643784, "rewards_train/margins": -0.005800902843475342, "rewards_train/rejected": 0.00953826867043972, "step": 104 }, { "epoch": 0.03, "logps_train/chosen": -101.05010223388672, "logps_train/ref_chosen": -101.0, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -96.0403060913086, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.00012723170220851898, "rewards_train/margins": 0.032711880281567574, "rewards_train/rejected": -0.03283911198377609, "step": 105 }, { "epoch": 0.03, "learning_rate": 3.9552238805970144e-07, "loss": 0.6871, "step": 106 }, { "epoch": 0.03, "logps_train/chosen": -74.55607604980469, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -73.98163604736328, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.0059983693063259125, "rewards_train/margins": 0.009547839872539043, "rewards_train/rejected": -0.015546209178864956, "step": 106 }, { "epoch": 0.03, "logps_train/chosen": -88.17259216308594, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -104.19441986083984, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.0030534989200532436, "rewards_train/margins": -0.00641061132773757, "rewards_train/rejected": 0.009464110247790813, "step": 107 }, { "epoch": 0.03, "learning_rate": 4.0298507462686564e-07, "loss": 0.6928, "step": 108 }, { "epoch": 0.03, "logps_train/chosen": -105.41600036621094, "logps_train/ref_chosen": -105.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -89.28736877441406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.007032524794340134, "rewards_train/margins": 0.03362088464200497, "rewards_train/rejected": -0.026588359847664833, "step": 108 }, { "epoch": 0.03, "logps_train/chosen": -41.34018325805664, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -41.07966995239258, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.01754409447312355, "rewards_train/margins": 0.02336266729980707, "rewards_train/rejected": -0.005818572826683521, "step": 109 }, { "epoch": 0.03, "learning_rate": 4.1044776119402984e-07, "loss": 0.6789, "step": 110 }, { "epoch": 0.03, "logps_train/chosen": -76.86898803710938, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -74.01725769042969, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.01905249059200287, "rewards_train/margins": -0.006267772987484932, "rewards_train/rejected": -0.012784717604517937, "step": 110 }, { "epoch": 0.03, "logps_train/chosen": -86.36917114257812, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -82.92460632324219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.008792471140623093, "rewards_train/margins": -0.008519744966179132, "rewards_train/rejected": -0.0002727261744439602, "step": 111 }, { "epoch": 0.03, "learning_rate": 4.17910447761194e-07, "loss": 0.6968, "step": 112 }, { "epoch": 0.03, "logps_train/chosen": -52.108375549316406, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -71.63522338867188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.012404504232108593, "rewards_train/margins": 0.010302287060767412, "rewards_train/rejected": 0.002102217171341181, "step": 112 }, { "epoch": 0.03, "logps_train/chosen": -65.410400390625, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -67.90836334228516, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.003759864717721939, "rewards_train/margins": 0.0019203806295990944, "rewards_train/rejected": -0.0056802453473210335, "step": 113 }, { "epoch": 0.03, "learning_rate": 4.253731343283582e-07, "loss": 0.6902, "step": 114 }, { "epoch": 0.03, "logps_train/chosen": -64.06271362304688, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -65.58354187011719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.01599377952516079, "rewards_train/margins": 0.029817133210599422, "rewards_train/rejected": -0.013823353685438633, "step": 114 }, { "epoch": 0.03, "logps_train/chosen": -72.0600814819336, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -61.60381317138672, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.025241797789931297, "rewards_train/margins": 0.01003727875649929, "rewards_train/rejected": 0.015204519033432007, "step": 115 }, { "epoch": 0.03, "learning_rate": 4.3283582089552234e-07, "loss": 0.6834, "step": 116 }, { "epoch": 0.03, "logps_train/chosen": -53.74980545043945, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -49.438079833984375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.01935528591275215, "rewards_train/margins": 0.009647496044635773, "rewards_train/rejected": 0.009707789868116379, "step": 116 }, { "epoch": 0.03, "logps_train/chosen": -44.96778869628906, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -56.75, "logps_train/rejected": -56.949012756347656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.004197769798338413, "rewards_train/margins": 0.025466148741543293, "rewards_train/rejected": -0.02126837894320488, "step": 117 }, { "epoch": 0.03, "learning_rate": 4.4029850746268654e-07, "loss": 0.6848, "step": 118 }, { "epoch": 0.03, "logps_train/chosen": -25.349895477294922, "logps_train/ref_chosen": -25.5, "logps_train/ref_rejected": -27.5, "logps_train/rejected": -27.3612060546875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.01569410413503647, "rewards_train/margins": 0.003084232099354267, "rewards_train/rejected": 0.012609872035682201, "step": 118 }, { "epoch": 0.03, "logps_train/chosen": -97.0093994140625, "logps_train/ref_chosen": -97.5, "logps_train/ref_rejected": -121.0, "logps_train/rejected": -121.2686538696289, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.02640366740524769, "rewards_train/margins": 0.057956697419285774, "rewards_train/rejected": -0.031553030014038086, "step": 119 }, { "epoch": 0.03, "learning_rate": 4.4776119402985074e-07, "loss": 0.6782, "step": 120 }, { "epoch": 0.03, "logps_train/chosen": -80.8241195678711, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -72.63106536865234, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.007431835867464542, "rewards_train/margins": 0.06506937090307474, "rewards_train/rejected": -0.0576375350356102, "step": 120 }, { "epoch": 0.03, "logps_train/chosen": -88.81282806396484, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -77.29328155517578, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.01754564791917801, "rewards_train/margins": 0.024217775091528893, "rewards_train/rejected": -0.0066721271723508835, "step": 121 }, { "epoch": 0.03, "learning_rate": 4.552238805970149e-07, "loss": 0.6719, "step": 122 }, { "epoch": 0.03, "logps_train/chosen": -59.9490966796875, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -59.63578414916992, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.026183905079960823, "rewards_train/margins": 0.016910739243030548, "rewards_train/rejected": 0.009273165836930275, "step": 122 }, { "epoch": 0.03, "logps_train/chosen": -41.998470306396484, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -55.53712463378906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.011090368032455444, "rewards_train/margins": 0.02329868171364069, "rewards_train/rejected": -0.012208313681185246, "step": 123 }, { "epoch": 0.03, "learning_rate": 4.626865671641791e-07, "loss": 0.6833, "step": 124 }, { "epoch": 0.03, "logps_train/chosen": -43.328521728515625, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -36.5, "logps_train/rejected": -36.337039947509766, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.022860581055283546, "rewards_train/margins": 0.017941400408744812, "rewards_train/rejected": 0.0049191806465387344, "step": 124 }, { "epoch": 0.03, "logps_train/chosen": -67.00100708007812, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -63.06188201904297, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.013180387206375599, "rewards_train/margins": 0.03597052674740553, "rewards_train/rejected": -0.02279013954102993, "step": 125 }, { "epoch": 0.04, "learning_rate": 4.701492537313433e-07, "loss": 0.6796, "step": 126 }, { "epoch": 0.04, "logps_train/chosen": -21.85276985168457, "logps_train/ref_chosen": -22.0, "logps_train/ref_rejected": -5.84375, "logps_train/rejected": -5.905564308166504, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.013258203864097595, "rewards_train/margins": 0.019097805954515934, "rewards_train/rejected": -0.005839602090418339, "step": 126 }, { "epoch": 0.04, "logps_train/chosen": -78.06990051269531, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -61.922767639160156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.015666604042053223, "rewards_train/margins": 0.016537124291062355, "rewards_train/rejected": -0.0008705202490091324, "step": 127 }, { "epoch": 0.04, "learning_rate": 4.776119402985074e-07, "loss": 0.684, "step": 128 }, { "epoch": 0.04, "logps_train/chosen": -54.64369201660156, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -62.104461669921875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.04009862244129181, "rewards_train/margins": 0.05611103028059006, "rewards_train/rejected": -0.01601240783929825, "step": 128 }, { "epoch": 0.04, "logps_train/chosen": -84.65701293945312, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -79.7423095703125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.027267076075077057, "rewards_train/margins": 0.01243535429239273, "rewards_train/rejected": 0.014831721782684326, "step": 129 }, { "epoch": 0.04, "learning_rate": 4.850746268656717e-07, "loss": 0.6764, "step": 130 }, { "epoch": 0.04, "logps_train/chosen": -33.27048873901367, "logps_train/ref_chosen": -33.25, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -49.85738754272461, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.0020977838430553675, "rewards_train/margins": 0.01118008024059236, "rewards_train/rejected": -0.013277864083647728, "step": 130 }, { "epoch": 0.04, "logps_train/chosen": -41.979339599609375, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -49.16737365722656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.013589856214821339, "rewards_train/margins": 0.02290544006973505, "rewards_train/rejected": -0.009315583854913712, "step": 131 }, { "epoch": 0.04, "learning_rate": 4.925373134328357e-07, "loss": 0.6848, "step": 132 }, { "epoch": 0.04, "logps_train/chosen": -86.48685455322266, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -89.16781616210938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.000589694594964385, "rewards_train/margins": 0.02039074362255633, "rewards_train/rejected": -0.020980438217520714, "step": 132 }, { "epoch": 0.04, "logps_train/chosen": -69.70428466796875, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -70.12745666503906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.025470493361353874, "rewards_train/margins": 0.0425125639885664, "rewards_train/rejected": -0.017042070627212524, "step": 133 }, { "epoch": 0.04, "learning_rate": 5e-07, "loss": 0.678, "step": 134 }, { "epoch": 0.04, "logps_train/chosen": -58.901615142822266, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -68.68511962890625, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.0001215348020195961, "rewards_train/margins": 0.016546146012842655, "rewards_train/rejected": -0.01642461121082306, "step": 134 }, { "epoch": 0.04, "logps_train/chosen": -62.59575653076172, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -65.994140625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.026752160862088203, "rewards_train/margins": 0.0330016715452075, "rewards_train/rejected": -0.006249510683119297, "step": 135 }, { "epoch": 0.04, "learning_rate": 4.999992338985357e-07, "loss": 0.6812, "step": 136 }, { "epoch": 0.04, "logps_train/chosen": -53.07342529296875, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -52.71923828125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.0067200311459600925, "rewards_train/margins": 0.023077736143022776, "rewards_train/rejected": -0.016357704997062683, "step": 136 }, { "epoch": 0.04, "logps_train/chosen": -51.292930603027344, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -50.846290588378906, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.029105406254529953, "rewards_train/margins": 0.025648584589362144, "rewards_train/rejected": 0.0034568216651678085, "step": 137 }, { "epoch": 0.04, "learning_rate": 4.999969355988384e-07, "loss": 0.6813, "step": 138 }, { "epoch": 0.04, "logps_train/chosen": -66.85665130615234, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -62.703922271728516, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.01667836867272854, "rewards_train/margins": 0.03902385197579861, "rewards_train/rejected": -0.02234548330307007, "step": 138 }, { "epoch": 0.04, "logps_train/chosen": -70.43873596191406, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -81.95301818847656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.0010485467500984669, "rewards_train/margins": 0.0041623483411967754, "rewards_train/rejected": -0.0031138015910983086, "step": 139 }, { "epoch": 0.04, "learning_rate": 4.999931051149936e-07, "loss": 0.6826, "step": 140 }, { "epoch": 0.04, "logps_train/chosen": -70.70523834228516, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -74.788818359375, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.013362860307097435, "rewards_train/margins": 0.032869674265384674, "rewards_train/rejected": -0.01950681395828724, "step": 140 }, { "epoch": 0.04, "logps_train/chosen": -37.05867004394531, "logps_train/ref_chosen": -37.25, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -50.647212982177734, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.024699414148926735, "rewards_train/margins": 0.007780121639370918, "rewards_train/rejected": 0.016919292509555817, "step": 141 }, { "epoch": 0.04, "learning_rate": 4.999877424704779e-07, "loss": 0.6833, "step": 142 }, { "epoch": 0.04, "logps_train/chosen": -59.707279205322266, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -81.51679992675781, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.021068911999464035, "rewards_train/margins": 0.046185821294784546, "rewards_train/rejected": -0.02511690929532051, "step": 142 }, { "epoch": 0.04, "logps_train/chosen": -62.5545768737793, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -51.992942810058594, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.004307818599045277, "rewards_train/margins": 0.01766448002308607, "rewards_train/rejected": -0.013356661424040794, "step": 143 }, { "epoch": 0.04, "learning_rate": 4.999808476981578e-07, "loss": 0.6769, "step": 144 }, { "epoch": 0.04, "logps_train/chosen": -63.847530364990234, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -59.586456298828125, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.004699907265603542, "rewards_train/margins": -0.015462937764823437, "rewards_train/rejected": 0.02016284503042698, "step": 144 }, { "epoch": 0.04, "logps_train/chosen": -62.57245635986328, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -62.590755462646484, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.00017600646242499352, "rewards_train/margins": -0.008717326913028955, "rewards_train/rejected": 0.008893333375453949, "step": 145 }, { "epoch": 0.04, "learning_rate": 4.9997242084029e-07, "loss": 0.6989, "step": 146 }, { "epoch": 0.04, "logps_train/chosen": -74.72541809082031, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -75.61740112304688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.0003094852436333895, "rewards_train/margins": 0.02454940532334149, "rewards_train/rejected": -0.0242399200797081, "step": 146 }, { "epoch": 0.04, "logps_train/chosen": -33.403953552246094, "logps_train/ref_chosen": -33.5, "logps_train/ref_rejected": -29.625, "logps_train/rejected": -29.620080947875977, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.0037940330803394318, "rewards_train/margins": 0.00794072262942791, "rewards_train/rejected": -0.004146689549088478, "step": 147 }, { "epoch": 0.04, "learning_rate": 4.999624619485213e-07, "loss": 0.6852, "step": 148 }, { "epoch": 0.04, "logps_train/chosen": -84.8711929321289, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -73.16226196289062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.022255651652812958, "rewards_train/margins": 0.05723201483488083, "rewards_train/rejected": -0.03497636318206787, "step": 148 }, { "epoch": 0.04, "logps_train/chosen": -89.43215942382812, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -90.07511138916016, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.019625797867774963, "rewards_train/margins": 0.011316696181893349, "rewards_train/rejected": 0.008309101685881615, "step": 149 }, { "epoch": 0.04, "learning_rate": 4.999509710838877e-07, "loss": 0.676, "step": 150 }, { "epoch": 0.04, "logps_train/chosen": -80.99070739746094, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -56.01921844482422, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.01577312871813774, "rewards_train/margins": 0.054804813116788864, "rewards_train/rejected": -0.03903168439865112, "step": 150 }, { "epoch": 0.04, "logps_train/chosen": -96.13180541992188, "logps_train/ref_chosen": -96.5, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -84.63067626953125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.03056936338543892, "rewards_train/margins": 0.02879274054430425, "rewards_train/rejected": 0.0017766228411346674, "step": 151 }, { "epoch": 0.04, "learning_rate": 4.999379483168148e-07, "loss": 0.6732, "step": 152 }, { "epoch": 0.04, "logps_train/chosen": -76.49810791015625, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -71.49435424804688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.024993985891342163, "rewards_train/margins": 0.07130471616983414, "rewards_train/rejected": -0.046310730278491974, "step": 152 }, { "epoch": 0.04, "logps_train/chosen": -75.715087890625, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -87.75228118896484, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.009741455316543579, "rewards_train/margins": 0.007625645026564598, "rewards_train/rejected": 0.002115810289978981, "step": 153 }, { "epoch": 0.04, "learning_rate": 4.999233937271163e-07, "loss": 0.6737, "step": 154 }, { "epoch": 0.04, "logps_train/chosen": -67.73640441894531, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -79.92431640625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.0232345350086689, "rewards_train/margins": 0.06644811853766441, "rewards_train/rejected": -0.043213583528995514, "step": 154 }, { "epoch": 0.04, "logps_train/chosen": -61.9312629699707, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -67.37472534179688, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.021815136075019836, "rewards_train/margins": 0.014268056489527225, "rewards_train/rejected": 0.007547079585492611, "step": 155 }, { "epoch": 0.04, "learning_rate": 4.999073074039949e-07, "loss": 0.6738, "step": 156 }, { "epoch": 0.04, "logps_train/chosen": -70.95030212402344, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -80.04240417480469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.044813372194767, "rewards_train/margins": 0.06311655044555664, "rewards_train/rejected": -0.018303178250789642, "step": 156 }, { "epoch": 0.04, "logps_train/chosen": -66.55253601074219, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -77.7836685180664, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.025996200740337372, "rewards_train/margins": 0.07428490370512009, "rewards_train/rejected": -0.048288702964782715, "step": 157 }, { "epoch": 0.04, "learning_rate": 4.998896894460405e-07, "loss": 0.6609, "step": 158 }, { "epoch": 0.04, "logps_train/chosen": -54.9752311706543, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -70.57875061035156, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.0013316869735717773, "rewards_train/margins": 0.010840454138815403, "rewards_train/rejected": -0.01217214111238718, "step": 158 }, { "epoch": 0.04, "logps_train/chosen": -47.71549987792969, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -56.431739807128906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.05091074854135513, "rewards_train/margins": 0.07377275638282299, "rewards_train/rejected": -0.022862007841467857, "step": 159 }, { "epoch": 0.04, "learning_rate": 4.998705399612302e-07, "loss": 0.6734, "step": 160 }, { "epoch": 0.04, "logps_train/chosen": -63.73102951049805, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -79.51908111572266, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.01244371011853218, "rewards_train/margins": 0.020601927302777767, "rewards_train/rejected": -0.008158217184245586, "step": 160 }, { "epoch": 0.04, "logps_train/chosen": -53.09801483154297, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -55.18875503540039, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.03355775773525238, "rewards_train/margins": 0.03407386876642704, "rewards_train/rejected": -0.0005161110311746597, "step": 161 }, { "epoch": 0.05, "learning_rate": 4.998498590669277e-07, "loss": 0.6805, "step": 162 }, { "epoch": 0.05, "logps_train/chosen": -38.73344421386719, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -45.12217330932617, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.026655657216906548, "rewards_train/margins": 0.025982449762523174, "rewards_train/rejected": 0.0006732074543833733, "step": 162 }, { "epoch": 0.05, "logps_train/chosen": -45.36915588378906, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -37.75, "logps_train/rejected": -38.06843948364258, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.016209345310926437, "rewards_train/margins": 0.03594367578625679, "rewards_train/rejected": -0.019734330475330353, "step": 163 }, { "epoch": 0.05, "learning_rate": 4.998276468898822e-07, "loss": 0.6786, "step": 164 }, { "epoch": 0.05, "logps_train/chosen": -73.39313507080078, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -82.00084686279297, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.011321071535348892, "rewards_train/margins": 0.06414046883583069, "rewards_train/rejected": -0.052819397300481796, "step": 164 }, { "epoch": 0.05, "logps_train/chosen": -69.62464141845703, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -72.80537414550781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.05394202470779419, "rewards_train/margins": 0.07510420121252537, "rewards_train/rejected": -0.02116217650473118, "step": 165 }, { "epoch": 0.05, "learning_rate": 4.99803903566228e-07, "loss": 0.66, "step": 166 }, { "epoch": 0.05, "logps_train/chosen": -34.17655563354492, "logps_train/ref_chosen": -34.5, "logps_train/ref_rejected": -34.75, "logps_train/rejected": -35.15182876586914, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.021407097578048706, "rewards_train/margins": 0.05534002184867859, "rewards_train/rejected": -0.03393292427062988, "step": 166 }, { "epoch": 0.05, "logps_train/chosen": -68.01045989990234, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -73.16434478759766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.05207864195108414, "rewards_train/margins": 0.0966385267674923, "rewards_train/rejected": -0.04455988481640816, "step": 167 }, { "epoch": 0.05, "learning_rate": 4.997786292414834e-07, "loss": 0.6576, "step": 168 }, { "epoch": 0.05, "logps_train/chosen": -22.09493637084961, "logps_train/ref_chosen": -22.25, "logps_train/ref_rejected": -28.375, "logps_train/rejected": -28.5806827545166, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.015311000868678093, "rewards_train/margins": 0.03900415636599064, "rewards_train/rejected": -0.023693155497312546, "step": 168 }, { "epoch": 0.05, "logps_train/chosen": -103.13506317138672, "logps_train/ref_chosen": -103.5, "logps_train/ref_rejected": -110.5, "logps_train/rejected": -110.44572448730469, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.03766520321369171, "rewards_train/margins": 0.0269642174243927, "rewards_train/rejected": 0.010700985789299011, "step": 169 }, { "epoch": 0.05, "learning_rate": 4.997518240705501e-07, "loss": 0.6786, "step": 170 }, { "epoch": 0.05, "logps_train/chosen": -68.88632202148438, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -77.92741394042969, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.006650085560977459, "rewards_train/margins": 0.03765397239476442, "rewards_train/rejected": -0.04430405795574188, "step": 170 }, { "epoch": 0.05, "logps_train/chosen": -73.45475769042969, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -83.04298400878906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.006086999084800482, "rewards_train/margins": 0.09085501125082374, "rewards_train/rejected": -0.08476801216602325, "step": 171 }, { "epoch": 0.05, "learning_rate": 4.99723488217712e-07, "loss": 0.6623, "step": 172 }, { "epoch": 0.05, "logps_train/chosen": -75.52366638183594, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -71.59895324707031, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.01169600524008274, "rewards_train/margins": 0.03565373457968235, "rewards_train/rejected": -0.02395772933959961, "step": 172 }, { "epoch": 0.05, "logps_train/chosen": -87.9087905883789, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -81.4451904296875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.011191445402801037, "rewards_train/margins": 0.03371789027005434, "rewards_train/rejected": -0.04490933567285538, "step": 173 }, { "epoch": 0.05, "learning_rate": 4.996936218566339e-07, "loss": 0.6769, "step": 174 }, { "epoch": 0.05, "logps_train/chosen": -87.11066436767578, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -98.43775177001953, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.055828407406806946, "rewards_train/margins": 0.10761188715696335, "rewards_train/rejected": -0.0517834797501564, "step": 174 }, { "epoch": 0.05, "logps_train/chosen": -67.88616943359375, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -60.13406753540039, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.05044550448656082, "rewards_train/margins": 0.08357885107398033, "rewards_train/rejected": -0.03313334658741951, "step": 175 }, { "epoch": 0.05, "learning_rate": 4.996622251703612e-07, "loss": 0.6478, "step": 176 }, { "epoch": 0.05, "logps_train/chosen": -42.33866882324219, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -58.38355255126953, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.0025344383902847767, "rewards_train/margins": 0.04460058035328984, "rewards_train/rejected": -0.042066141963005066, "step": 176 }, { "epoch": 0.05, "logps_train/chosen": -44.17629623413086, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -43.33050537109375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.025827443227171898, "rewards_train/margins": 0.05087011493742466, "rewards_train/rejected": -0.025042671710252762, "step": 177 }, { "epoch": 0.05, "learning_rate": 4.996292983513186e-07, "loss": 0.6706, "step": 178 }, { "epoch": 0.05, "logps_train/chosen": -61.72535705566406, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -72.10646057128906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.033714115619659424, "rewards_train/margins": 0.04436032846570015, "rewards_train/rejected": -0.010646212846040726, "step": 178 }, { "epoch": 0.05, "logps_train/chosen": -32.68050003051758, "logps_train/ref_chosen": -32.75, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -46.875099182128906, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.010270203463733196, "rewards_train/margins": 0.0329365162178874, "rewards_train/rejected": -0.022666312754154205, "step": 179 }, { "epoch": 0.05, "learning_rate": 4.99594841601308e-07, "loss": 0.675, "step": 180 }, { "epoch": 0.05, "logps_train/chosen": -26.811080932617188, "logps_train/ref_chosen": -26.75, "logps_train/ref_rejected": -23.875, "logps_train/rejected": -23.95778465270996, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.0036788261495530605, "rewards_train/margins": 0.007834318559616804, "rewards_train/rejected": -0.011513144709169865, "step": 180 }, { "epoch": 0.05, "logps_train/chosen": -72.8066177368164, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -73.80039978027344, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.028542205691337585, "rewards_train/margins": 0.046252816915512085, "rewards_train/rejected": -0.0177106112241745, "step": 181 }, { "epoch": 0.05, "learning_rate": 4.995588551315086e-07, "loss": 0.6809, "step": 182 }, { "epoch": 0.05, "logps_train/chosen": -52.16310119628906, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -79.22138214111328, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.03623219206929207, "rewards_train/margins": 0.011687222868204117, "rewards_train/rejected": -0.047919414937496185, "step": 182 }, { "epoch": 0.05, "logps_train/chosen": -84.68603515625, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -87.58245849609375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.03999017924070358, "rewards_train/margins": 0.05604833923280239, "rewards_train/rejected": -0.01605815999209881, "step": 183 }, { "epoch": 0.05, "learning_rate": 4.995213391624745e-07, "loss": 0.6773, "step": 184 }, { "epoch": 0.05, "logps_train/chosen": -77.65924072265625, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -106.4644775390625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.003998493775725365, "rewards_train/margins": 0.06392284296452999, "rewards_train/rejected": -0.059924349188804626, "step": 184 }, { "epoch": 0.05, "logps_train/chosen": -64.07603454589844, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -72.57796478271484, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.016615189611911774, "rewards_train/margins": 0.041013384237885475, "rewards_train/rejected": -0.0243981946259737, "step": 185 }, { "epoch": 0.05, "learning_rate": 4.994822939241341e-07, "loss": 0.6671, "step": 186 }, { "epoch": 0.05, "logps_train/chosen": -81.4778060913086, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -82.53826904296875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.030202491208910942, "rewards_train/margins": 0.1001871544867754, "rewards_train/rejected": -0.13038964569568634, "step": 186 }, { "epoch": 0.05, "logps_train/chosen": -56.910369873046875, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -58.9641227722168, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.0221464354544878, "rewards_train/margins": 0.08301190473139286, "rewards_train/rejected": -0.06086546927690506, "step": 187 }, { "epoch": 0.05, "learning_rate": 4.994417196557883e-07, "loss": 0.6515, "step": 188 }, { "epoch": 0.05, "logps_train/chosen": -98.9520263671875, "logps_train/ref_chosen": -100.0, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -98.87950134277344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.08292235434055328, "rewards_train/margins": 0.1208723932504654, "rewards_train/rejected": -0.03795003890991211, "step": 188 }, { "epoch": 0.05, "logps_train/chosen": -41.69307327270508, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -48.171234130859375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.025809943675994873, "rewards_train/margins": 0.03160558361560106, "rewards_train/rejected": -0.00579563993960619, "step": 189 }, { "epoch": 0.05, "learning_rate": 4.993996166061091e-07, "loss": 0.657, "step": 190 }, { "epoch": 0.05, "logps_train/chosen": -67.85884094238281, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -84.90817260742188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.04145927354693413, "rewards_train/margins": 0.17368273064494133, "rewards_train/rejected": -0.1322234570980072, "step": 190 }, { "epoch": 0.05, "logps_train/chosen": -74.41243743896484, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -71.4466323852539, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.032650504261255264, "rewards_train/margins": 0.019825126975774765, "rewards_train/rejected": -0.05247563123703003, "step": 191 }, { "epoch": 0.05, "learning_rate": 4.993559850331383e-07, "loss": 0.6496, "step": 192 }, { "epoch": 0.05, "logps_train/chosen": -73.65574645996094, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -72.97344207763672, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.03657417371869087, "rewards_train/margins": 0.08010983094573021, "rewards_train/rejected": -0.04353565722703934, "step": 192 }, { "epoch": 0.05, "logps_train/chosen": -81.60496520996094, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -68.50057983398438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.06127751246094704, "rewards_train/margins": 0.005968581885099411, "rewards_train/rejected": -0.06724609434604645, "step": 193 }, { "epoch": 0.05, "learning_rate": 4.993108252042853e-07, "loss": 0.6731, "step": 194 }, { "epoch": 0.05, "logps_train/chosen": -70.02510070800781, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -77.16969299316406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.017744647338986397, "rewards_train/margins": 0.013287629932165146, "rewards_train/rejected": -0.031032277271151543, "step": 194 }, { "epoch": 0.05, "logps_train/chosen": -29.98483657836914, "logps_train/ref_chosen": -30.125, "logps_train/ref_rejected": -33.0, "logps_train/rejected": -33.236244201660156, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.018703678622841835, "rewards_train/margins": 0.033734289929270744, "rewards_train/rejected": -0.01503061130642891, "step": 195 }, { "epoch": 0.05, "learning_rate": 4.992641373963267e-07, "loss": 0.6824, "step": 196 }, { "epoch": 0.05, "logps_train/chosen": -23.881338119506836, "logps_train/ref_chosen": -23.875, "logps_train/ref_rejected": -32.25, "logps_train/rejected": -32.399864196777344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.004418111406266689, "rewards_train/margins": 0.009591526351869106, "rewards_train/rejected": -0.014009637758135796, "step": 196 }, { "epoch": 0.06, "logps_train/chosen": -60.460777282714844, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -72.92411804199219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.03865598142147064, "rewards_train/margins": 0.012349851429462433, "rewards_train/rejected": -0.051005832850933075, "step": 197 }, { "epoch": 0.06, "learning_rate": 4.992159218954027e-07, "loss": 0.6882, "step": 198 }, { "epoch": 0.06, "logps_train/chosen": -59.503578186035156, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -68.3902587890625, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.002767347265034914, "rewards_train/margins": 0.03583629475906491, "rewards_train/rejected": -0.03306894749403, "step": 198 }, { "epoch": 0.06, "logps_train/chosen": -75.33126831054688, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -76.33258056640625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.06867402046918869, "rewards_train/margins": 0.026302769780158997, "rewards_train/rejected": -0.09497679024934769, "step": 199 }, { "epoch": 0.06, "learning_rate": 4.991661789970175e-07, "loss": 0.679, "step": 200 }, { "epoch": 0.06, "logps_train/chosen": -62.34615707397461, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -64.10647583007812, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.03335317596793175, "rewards_train/margins": 0.017437901347875595, "rewards_train/rejected": 0.015915274620056152, "step": 200 }, { "epoch": 0.06, "logps_train/chosen": -32.947509765625, "logps_train/ref_chosen": -33.25, "logps_train/ref_rejected": -36.75, "logps_train/rejected": -37.5911750793457, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.03171394020318985, "rewards_train/margins": 0.11231562495231628, "rewards_train/rejected": -0.08060168474912643, "step": 201 }, { "epoch": 0.06, "learning_rate": 4.991149090060357e-07, "loss": 0.666, "step": 202 }, { "epoch": 0.06, "logps_train/chosen": -52.979347229003906, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -66.29036712646484, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.053188204765319824, "rewards_train/margins": 0.1179671660065651, "rewards_train/rejected": -0.06477896124124527, "step": 202 }, { "epoch": 0.06, "logps_train/chosen": -40.81906509399414, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -40.9730339050293, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.010866811498999596, "rewards_train/margins": 0.08453742228448391, "rewards_train/rejected": -0.07367061078548431, "step": 203 }, { "epoch": 0.06, "learning_rate": 4.990621122366814e-07, "loss": 0.6465, "step": 204 }, { "epoch": 0.06, "logps_train/chosen": -72.531982421875, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -81.16976165771484, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.0594484768807888, "rewards_train/margins": -0.012833558022975922, "rewards_train/rejected": -0.04661491885781288, "step": 204 }, { "epoch": 0.06, "logps_train/chosen": -60.44779968261719, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -65.13034057617188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.017329489812254906, "rewards_train/margins": 0.1112231370061636, "rewards_train/rejected": -0.09389364719390869, "step": 205 }, { "epoch": 0.06, "learning_rate": 4.990077890125363e-07, "loss": 0.6729, "step": 206 }, { "epoch": 0.06, "logps_train/chosen": -68.99644470214844, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -71.91361999511719, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.013706970028579235, "rewards_train/margins": 0.046600774861872196, "rewards_train/rejected": -0.06030774489045143, "step": 206 }, { "epoch": 0.06, "logps_train/chosen": -80.01974487304688, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -83.99603271484375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.03044714406132698, "rewards_train/margins": 0.10719845816493034, "rewards_train/rejected": -0.07675131410360336, "step": 207 }, { "epoch": 0.06, "learning_rate": 4.98951939666537e-07, "loss": 0.6567, "step": 208 }, { "epoch": 0.06, "logps_train/chosen": -59.470733642578125, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -60.334495544433594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.00019833073019981384, "rewards_train/margins": 0.05414910987019539, "rewards_train/rejected": -0.0543474406003952, "step": 208 }, { "epoch": 0.06, "logps_train/chosen": -60.944374084472656, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -69.1351089477539, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.0071250684559345245, "rewards_train/margins": 0.053448259830474854, "rewards_train/rejected": -0.04632319137454033, "step": 209 }, { "epoch": 0.06, "learning_rate": 4.988945645409737e-07, "loss": 0.6679, "step": 210 }, { "epoch": 0.06, "logps_train/chosen": -74.97129821777344, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -78.39900970458984, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.08724522590637207, "rewards_train/margins": 0.17402145266532898, "rewards_train/rejected": -0.08677622675895691, "step": 210 }, { "epoch": 0.06, "logps_train/chosen": -37.18720245361328, "logps_train/ref_chosen": -37.0, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -43.60913848876953, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.01207956112921238, "rewards_train/margins": -0.010540693998336792, "rewards_train/rejected": -0.0015388671308755875, "step": 211 }, { "epoch": 0.06, "learning_rate": 4.988356639874877e-07, "loss": 0.6595, "step": 212 }, { "epoch": 0.06, "logps_train/chosen": -60.066497802734375, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -69.7305908203125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.06444393843412399, "rewards_train/margins": 0.08906573243439198, "rewards_train/rejected": -0.024621794000267982, "step": 212 }, { "epoch": 0.06, "logps_train/chosen": -39.71125030517578, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -45.317726135253906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.0685858279466629, "rewards_train/margins": 0.07217095792293549, "rewards_train/rejected": -0.1407567858695984, "step": 213 }, { "epoch": 0.06, "learning_rate": 4.987752383670694e-07, "loss": 0.6567, "step": 214 }, { "epoch": 0.06, "logps_train/chosen": -102.42160034179688, "logps_train/ref_chosen": -102.5, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -99.19660949707031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.018777890130877495, "rewards_train/margins": 0.07906324602663517, "rewards_train/rejected": -0.060285355895757675, "step": 214 }, { "epoch": 0.06, "logps_train/chosen": -74.23822784423828, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -64.2943344116211, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.024702059105038643, "rewards_train/margins": 0.09457517974078655, "rewards_train/rejected": -0.1192772388458252, "step": 215 }, { "epoch": 0.06, "learning_rate": 4.987132880500561e-07, "loss": 0.6532, "step": 216 }, { "epoch": 0.06, "logps_train/chosen": -100.21539306640625, "logps_train/ref_chosen": -100.5, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -87.05062866210938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04408545047044754, "rewards_train/margins": 0.20149267464876175, "rewards_train/rejected": -0.1574072241783142, "step": 216 }, { "epoch": 0.06, "logps_train/chosen": -73.68387603759766, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -69.42728424072266, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.059794262051582336, "rewards_train/margins": 0.052856430411338806, "rewards_train/rejected": -0.11265069246292114, "step": 217 }, { "epoch": 0.06, "learning_rate": 4.986498134161296e-07, "loss": 0.6343, "step": 218 }, { "epoch": 0.06, "logps_train/chosen": -87.93561553955078, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -100.5460205078125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.04434296116232872, "rewards_train/margins": 0.11963364854454994, "rewards_train/rejected": -0.16397660970687866, "step": 218 }, { "epoch": 0.06, "logps_train/chosen": -76.35858154296875, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -74.37849426269531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.0811704471707344, "rewards_train/margins": 0.09535088390111923, "rewards_train/rejected": -0.17652133107185364, "step": 219 }, { "epoch": 0.06, "learning_rate": 4.98584814854314e-07, "loss": 0.6434, "step": 220 }, { "epoch": 0.06, "logps_train/chosen": -70.59697723388672, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -84.3460922241211, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.04680757224559784, "rewards_train/margins": 0.09483274817466736, "rewards_train/rejected": -0.1416403204202652, "step": 220 }, { "epoch": 0.06, "logps_train/chosen": -44.067588806152344, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -45.22989273071289, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.010818934068083763, "rewards_train/margins": 0.036151934415102005, "rewards_train/rejected": -0.025333000347018242, "step": 221 }, { "epoch": 0.06, "learning_rate": 4.985182927629732e-07, "loss": 0.6628, "step": 222 }, { "epoch": 0.06, "logps_train/chosen": -59.0699577331543, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -52.7005500793457, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.019300485029816628, "rewards_train/margins": 0.09919190965592861, "rewards_train/rejected": -0.11849239468574524, "step": 222 }, { "epoch": 0.06, "logps_train/chosen": -76.71279907226562, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -89.87239837646484, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.05411018803715706, "rewards_train/margins": 0.15736595168709755, "rewards_train/rejected": -0.10325576364994049, "step": 223 }, { "epoch": 0.06, "learning_rate": 4.984502475498087e-07, "loss": 0.6353, "step": 224 }, { "epoch": 0.06, "logps_train/chosen": -61.84689712524414, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -64.61875915527344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.06618925929069519, "rewards_train/margins": 0.12904177606105804, "rewards_train/rejected": -0.06285251677036285, "step": 224 }, { "epoch": 0.06, "logps_train/chosen": -37.708656311035156, "logps_train/ref_chosen": -36.75, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -38.8215217590332, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.0836707204580307, "rewards_train/margins": 0.047309741377830505, "rewards_train/rejected": -0.1309804618358612, "step": 225 }, { "epoch": 0.06, "learning_rate": 4.983806796318566e-07, "loss": 0.6547, "step": 226 }, { "epoch": 0.06, "logps_train/chosen": -35.12958526611328, "logps_train/ref_chosen": -35.25, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -40.443180084228516, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.012431969866156578, "rewards_train/margins": 0.029650233685970306, "rewards_train/rejected": -0.01721826381981373, "step": 226 }, { "epoch": 0.06, "logps_train/chosen": -65.00015258789062, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -63.44427490234375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.01639087125658989, "rewards_train/margins": 0.09441191330552101, "rewards_train/rejected": -0.07802104204893112, "step": 227 }, { "epoch": 0.06, "learning_rate": 4.983095894354857e-07, "loss": 0.6635, "step": 228 }, { "epoch": 0.06, "logps_train/chosen": -69.90158081054688, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -77.40280151367188, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.015212714672088623, "rewards_train/margins": 0.08293427526950836, "rewards_train/rejected": -0.06772156059741974, "step": 228 }, { "epoch": 0.06, "logps_train/chosen": -72.91961669921875, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -76.48533630371094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.06702268868684769, "rewards_train/margins": 0.11711882799863815, "rewards_train/rejected": -0.050096139311790466, "step": 229 }, { "epoch": 0.06, "learning_rate": 4.982369773963945e-07, "loss": 0.6479, "step": 230 }, { "epoch": 0.06, "logps_train/chosen": -64.00090026855469, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -51.01182556152344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.006941433064639568, "rewards_train/margins": 0.0829286826774478, "rewards_train/rejected": -0.07598724961280823, "step": 230 }, { "epoch": 0.06, "logps_train/chosen": -55.70654296875, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -74.71240234375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.019970979541540146, "rewards_train/margins": 0.1654302142560482, "rewards_train/rejected": -0.14545923471450806, "step": 231 }, { "epoch": 0.06, "learning_rate": 4.981628439596082e-07, "loss": 0.6359, "step": 232 }, { "epoch": 0.06, "logps_train/chosen": -73.67041015625, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -93.5, "logps_train/rejected": -96.10404205322266, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.03256802633404732, "rewards_train/margins": 0.2894815690815449, "rewards_train/rejected": -0.25691354274749756, "step": 232 }, { "epoch": 0.07, "logps_train/chosen": -70.38328552246094, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -93.5, "logps_train/rejected": -96.76373291015625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.04887603595852852, "rewards_train/margins": 0.2696845047175884, "rewards_train/rejected": -0.31856054067611694, "step": 233 }, { "epoch": 0.07, "learning_rate": 4.980871895794771e-07, "loss": 0.5744, "step": 234 }, { "epoch": 0.07, "logps_train/chosen": -45.762168884277344, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -55.0173454284668, "rewards_train/accuracies": 0.375, "rewards_train/chosen": 0.02378300204873085, "rewards_train/margins": -0.015107657760381699, "rewards_train/rejected": 0.03889065980911255, "step": 234 }, { "epoch": 0.07, "logps_train/chosen": -49.5078239440918, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -44.71027374267578, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.08339961618185043, "rewards_train/margins": 0.07788651436567307, "rewards_train/rejected": -0.1612861305475235, "step": 235 }, { "epoch": 0.07, "learning_rate": 4.980100147196724e-07, "loss": 0.6802, "step": 236 }, { "epoch": 0.07, "logps_train/chosen": -39.74016571044922, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -35.5, "logps_train/rejected": -36.35296630859375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.0015552621334791183, "rewards_train/margins": 0.07983522303402424, "rewards_train/rejected": -0.08139048516750336, "step": 236 }, { "epoch": 0.07, "logps_train/chosen": -74.76644134521484, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -72.96209716796875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.015413643792271614, "rewards_train/margins": 0.07229947857558727, "rewards_train/rejected": -0.08771312236785889, "step": 237 }, { "epoch": 0.07, "learning_rate": 4.979313198531843e-07, "loss": 0.6575, "step": 238 }, { "epoch": 0.07, "logps_train/chosen": -63.63264083862305, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -70.26762390136719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.08458728343248367, "rewards_train/margins": 0.1047093328088522, "rewards_train/rejected": -0.020122049376368523, "step": 238 }, { "epoch": 0.07, "logps_train/chosen": -60.88203811645508, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -57.22417449951172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.011836709454655647, "rewards_train/margins": 0.0781101118773222, "rewards_train/rejected": -0.08994682133197784, "step": 239 }, { "epoch": 0.07, "learning_rate": 4.978511054623189e-07, "loss": 0.65, "step": 240 }, { "epoch": 0.07, "logps_train/chosen": -73.17652130126953, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -86.74775695800781, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.041332535445690155, "rewards_train/margins": 0.07313960790634155, "rewards_train/rejected": -0.0318070724606514, "step": 240 }, { "epoch": 0.07, "logps_train/chosen": -17.81582260131836, "logps_train/ref_chosen": -17.875, "logps_train/ref_rejected": -22.25, "logps_train/rejected": -23.40126609802246, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.005038968753069639, "rewards_train/margins": 0.11733342660591006, "rewards_train/rejected": -0.11229445785284042, "step": 241 }, { "epoch": 0.07, "learning_rate": 4.97769372038695e-07, "loss": 0.6524, "step": 242 }, { "epoch": 0.07, "logps_train/chosen": -78.91831970214844, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -67.61292266845703, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.10123417526483536, "rewards_train/margins": 0.1330341175198555, "rewards_train/rejected": -0.03179994225502014, "step": 242 }, { "epoch": 0.07, "logps_train/chosen": -66.00843048095703, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -74.28964233398438, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.01685863733291626, "rewards_train/margins": 0.031246401369571686, "rewards_train/rejected": -0.048105038702487946, "step": 243 }, { "epoch": 0.07, "learning_rate": 4.976861200832414e-07, "loss": 0.656, "step": 244 }, { "epoch": 0.07, "logps_train/chosen": -39.82799530029297, "logps_train/ref_chosen": -40.25, "logps_train/ref_rejected": -35.0, "logps_train/rejected": -35.4915771484375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.04122381657361984, "rewards_train/margins": 0.08457089215517044, "rewards_train/rejected": -0.0433470755815506, "step": 244 }, { "epoch": 0.07, "logps_train/chosen": -45.840084075927734, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -36.0, "logps_train/rejected": -36.725608825683594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.04528848081827164, "rewards_train/margins": 0.1261502429842949, "rewards_train/rejected": -0.08086176216602325, "step": 245 }, { "epoch": 0.07, "learning_rate": 4.976013501061937e-07, "loss": 0.6443, "step": 246 }, { "epoch": 0.07, "logps_train/chosen": -33.44615173339844, "logps_train/ref_chosen": -32.5, "logps_train/ref_rejected": -44.5, "logps_train/rejected": -46.146392822265625, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.10545528680086136, "rewards_train/margins": 0.04668416827917099, "rewards_train/rejected": -0.15213945508003235, "step": 246 }, { "epoch": 0.07, "logps_train/chosen": -85.43083190917969, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -101.01116943359375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.05519186705350876, "rewards_train/margins": 0.1408468261361122, "rewards_train/rejected": -0.19603869318962097, "step": 247 }, { "epoch": 0.07, "learning_rate": 4.975150626270911e-07, "loss": 0.6505, "step": 248 }, { "epoch": 0.07, "logps_train/chosen": -74.1454086303711, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -79.21577453613281, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.05557667464017868, "rewards_train/margins": 0.18652979284524918, "rewards_train/rejected": -0.1309531182050705, "step": 248 }, { "epoch": 0.07, "logps_train/chosen": -65.63245391845703, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -76.48052978515625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.09722961485385895, "rewards_train/margins": 0.06556923687458038, "rewards_train/rejected": -0.16279885172843933, "step": 249 }, { "epoch": 0.07, "learning_rate": 4.974272581747734e-07, "loss": 0.6403, "step": 250 }, { "epoch": 0.07, "logps_train/chosen": -125.66022491455078, "logps_train/ref_chosen": -124.5, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -115.13243103027344, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -0.11914749443531036, "rewards_train/margins": -0.0027793869376182556, "rewards_train/rejected": -0.11636810749769211, "step": 250 }, { "epoch": 0.07, "logps_train/chosen": -34.89984130859375, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -32.25, "logps_train/rejected": -32.87923049926758, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.07290676981210709, "rewards_train/margins": 0.14266571402549744, "rewards_train/rejected": -0.06975894421339035, "step": 251 }, { "epoch": 0.07, "learning_rate": 4.973379372873774e-07, "loss": 0.6682, "step": 252 }, { "epoch": 0.07, "logps_train/chosen": -54.228675842285156, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -55.161293029785156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.02830410562455654, "rewards_train/margins": 0.16044876538217068, "rewards_train/rejected": -0.13214465975761414, "step": 252 }, { "epoch": 0.07, "logps_train/chosen": -96.91339874267578, "logps_train/ref_chosen": -96.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -101.10609436035156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.09382425993680954, "rewards_train/margins": 0.12551989406347275, "rewards_train/rejected": -0.2193441540002823, "step": 253 }, { "epoch": 0.07, "learning_rate": 4.97247100512334e-07, "loss": 0.6293, "step": 254 }, { "epoch": 0.07, "logps_train/chosen": -65.65286254882812, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -83.21630096435547, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.1160673275589943, "rewards_train/margins": 0.30956657975912094, "rewards_train/rejected": -0.42563390731811523, "step": 254 }, { "epoch": 0.07, "logps_train/chosen": -61.75201416015625, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -62.04127883911133, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.016399795189499855, "rewards_train/margins": 0.20919951237738132, "rewards_train/rejected": -0.19279971718788147, "step": 255 }, { "epoch": 0.07, "learning_rate": 4.971547484063648e-07, "loss": 0.5835, "step": 256 }, { "epoch": 0.07, "logps_train/chosen": -58.18988037109375, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -63.04719543457031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.018511950969696045, "rewards_train/margins": 0.17791859805583954, "rewards_train/rejected": -0.1594066470861435, "step": 256 }, { "epoch": 0.07, "logps_train/chosen": -54.394195556640625, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -54.88017654418945, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.032455600798130035, "rewards_train/margins": 0.08062924444675446, "rewards_train/rejected": -0.04817364364862442, "step": 257 }, { "epoch": 0.07, "learning_rate": 4.970608815354785e-07, "loss": 0.634, "step": 258 }, { "epoch": 0.07, "logps_train/chosen": -77.7452392578125, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -103.17654418945312, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.09405508637428284, "rewards_train/margins": 0.15172407031059265, "rewards_train/rejected": -0.2457791566848755, "step": 258 }, { "epoch": 0.07, "logps_train/chosen": -12.239175796508789, "logps_train/ref_chosen": -12.0625, "logps_train/ref_rejected": -10.125, "logps_train/rejected": -10.303321838378906, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.017667582258582115, "rewards_train/margins": -0.0015444457530975342, "rewards_train/rejected": -0.01612313650548458, "step": 259 }, { "epoch": 0.07, "learning_rate": 4.969655004749673e-07, "loss": 0.6598, "step": 260 }, { "epoch": 0.07, "logps_train/chosen": -58.5343132019043, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -63.327796936035156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.07266954332590103, "rewards_train/margins": 0.009719453752040863, "rewards_train/rejected": -0.0823889970779419, "step": 260 }, { "epoch": 0.07, "logps_train/chosen": -36.77006149291992, "logps_train/ref_chosen": -37.25, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -44.284690856933594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.04052316024899483, "rewards_train/margins": 0.1600080542266369, "rewards_train/rejected": -0.11948489397764206, "step": 261 }, { "epoch": 0.07, "learning_rate": 4.96868605809404e-07, "loss": 0.6581, "step": 262 }, { "epoch": 0.07, "logps_train/chosen": -45.656219482421875, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -43.79771423339844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.06055013835430145, "rewards_train/margins": 0.14735297858715057, "rewards_train/rejected": -0.08680284023284912, "step": 262 }, { "epoch": 0.07, "logps_train/chosen": -77.39680480957031, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -65.07958984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.06546133756637573, "rewards_train/margins": 0.20655959844589233, "rewards_train/rejected": -0.27202093601226807, "step": 263 }, { "epoch": 0.07, "learning_rate": 4.967701981326376e-07, "loss": 0.6144, "step": 264 }, { "epoch": 0.07, "logps_train/chosen": -52.240684509277344, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -63.36178207397461, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.007962603121995926, "rewards_train/margins": 0.042383063584566116, "rewards_train/rejected": -0.03442046046257019, "step": 264 }, { "epoch": 0.07, "logps_train/chosen": -38.47343826293945, "logps_train/ref_chosen": -37.75, "logps_train/ref_rejected": -37.25, "logps_train/rejected": -39.063697814941406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.07917973399162292, "rewards_train/margins": 0.10121335089206696, "rewards_train/rejected": -0.18039308488368988, "step": 265 }, { "epoch": 0.07, "learning_rate": 4.966702780477901e-07, "loss": 0.6603, "step": 266 }, { "epoch": 0.07, "logps_train/chosen": -51.64854431152344, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -60.975067138671875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.0629773736000061, "rewards_train/margins": 0.1491558775305748, "rewards_train/rejected": -0.0861785039305687, "step": 266 }, { "epoch": 0.07, "logps_train/chosen": -68.49447631835938, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -69.90038299560547, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.05686892941594124, "rewards_train/margins": 0.07618732377886772, "rewards_train/rejected": -0.13305625319480896, "step": 267 }, { "epoch": 0.07, "learning_rate": 4.965688461672532e-07, "loss": 0.6429, "step": 268 }, { "epoch": 0.07, "logps_train/chosen": -80.12190246582031, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -71.27377319335938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.032893478870391846, "rewards_train/margins": 0.10581210255622864, "rewards_train/rejected": -0.13870558142662048, "step": 268 }, { "epoch": 0.08, "logps_train/chosen": -70.51513671875, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -88.27554321289062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.02465803734958172, "rewards_train/margins": 0.27164660580456257, "rewards_train/rejected": -0.2963046431541443, "step": 269 }, { "epoch": 0.08, "learning_rate": 4.964659031126836e-07, "loss": 0.6145, "step": 270 }, { "epoch": 0.08, "logps_train/chosen": -55.628578186035156, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -68.12471008300781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.08304059505462646, "rewards_train/margins": 0.21445679664611816, "rewards_train/rejected": -0.1314162015914917, "step": 270 }, { "epoch": 0.08, "logps_train/chosen": -66.52167510986328, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -71.10826110839844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.017754392698407173, "rewards_train/margins": 0.1602208036929369, "rewards_train/rejected": -0.14246641099452972, "step": 271 }, { "epoch": 0.08, "learning_rate": 4.963614495149999e-07, "loss": 0.6084, "step": 272 }, { "epoch": 0.08, "logps_train/chosen": -46.289825439453125, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -48.580875396728516, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.10007592290639877, "rewards_train/margins": -0.013765774667263031, "rewards_train/rejected": -0.08631014823913574, "step": 272 }, { "epoch": 0.08, "logps_train/chosen": -20.95589828491211, "logps_train/ref_chosen": -20.875, "logps_train/ref_rejected": -32.0, "logps_train/rejected": -33.178314208984375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.006576208863407373, "rewards_train/margins": 0.10148985078558326, "rewards_train/rejected": -0.10806605964899063, "step": 273 }, { "epoch": 0.08, "learning_rate": 4.962554860143786e-07, "loss": 0.6745, "step": 274 }, { "epoch": 0.08, "logps_train/chosen": -26.65819549560547, "logps_train/ref_chosen": -26.625, "logps_train/ref_rejected": -42.75, "logps_train/rejected": -44.0958366394043, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.0029288511723279953, "rewards_train/margins": 0.12711374647915363, "rewards_train/rejected": -0.13004259765148163, "step": 274 }, { "epoch": 0.08, "logps_train/chosen": -68.20321655273438, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -55.19052505493164, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.03594617545604706, "rewards_train/margins": 0.08369223028421402, "rewards_train/rejected": -0.11963840574026108, "step": 275 }, { "epoch": 0.08, "learning_rate": 4.9614801326025e-07, "loss": 0.6465, "step": 276 }, { "epoch": 0.08, "logps_train/chosen": -37.916133880615234, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -31.375, "logps_train/rejected": -33.3668098449707, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.002136575523763895, "rewards_train/margins": 0.19584876531735063, "rewards_train/rejected": -0.19371218979358673, "step": 276 }, { "epoch": 0.08, "logps_train/chosen": -106.4279556274414, "logps_train/ref_chosen": -105.0, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -104.23807525634766, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.12677955627441406, "rewards_train/margins": 0.32046520709991455, "rewards_train/rejected": -0.4472447633743286, "step": 277 }, { "epoch": 0.08, "learning_rate": 4.960390319112945e-07, "loss": 0.5839, "step": 278 }, { "epoch": 0.08, "logps_train/chosen": -67.5575942993164, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -85.05168151855469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.03388417139649391, "rewards_train/margins": 0.10136232897639275, "rewards_train/rejected": -0.13524650037288666, "step": 278 }, { "epoch": 0.08, "logps_train/chosen": -50.628753662109375, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -59.204593658447266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.0066254232078790665, "rewards_train/margins": 0.20289664529263973, "rewards_train/rejected": -0.2095220685005188, "step": 279 }, { "epoch": 0.08, "learning_rate": 4.959285426354381e-07, "loss": 0.6273, "step": 280 }, { "epoch": 0.08, "logps_train/chosen": -59.6366081237793, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -63.785099029541016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.06610220670700073, "rewards_train/margins": 0.1313529908657074, "rewards_train/rejected": -0.19745519757270813, "step": 280 }, { "epoch": 0.08, "logps_train/chosen": -45.673583984375, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -57.671844482421875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.021899394690990448, "rewards_train/margins": 0.21134945005178452, "rewards_train/rejected": -0.18945005536079407, "step": 281 }, { "epoch": 0.08, "learning_rate": 4.958165461098487e-07, "loss": 0.6204, "step": 282 }, { "epoch": 0.08, "logps_train/chosen": -47.9825553894043, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -50.561790466308594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.021766599267721176, "rewards_train/margins": 0.04613130912184715, "rewards_train/rejected": -0.06789790838956833, "step": 282 }, { "epoch": 0.08, "logps_train/chosen": -23.648746490478516, "logps_train/ref_chosen": -23.375, "logps_train/ref_rejected": -16.75, "logps_train/rejected": -17.18783187866211, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.02698415145277977, "rewards_train/margins": 0.017580367624759674, "rewards_train/rejected": -0.044564519077539444, "step": 283 }, { "epoch": 0.08, "learning_rate": 4.957030430209321e-07, "loss": 0.679, "step": 284 }, { "epoch": 0.08, "logps_train/chosen": -54.955718994140625, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -54.25, "logps_train/rejected": -54.96346664428711, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.016342464834451675, "rewards_train/margins": 0.08407585695385933, "rewards_train/rejected": -0.06773339211940765, "step": 284 }, { "epoch": 0.08, "logps_train/chosen": -51.135501861572266, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -58.292991638183594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.07956597208976746, "rewards_train/margins": 0.2786390483379364, "rewards_train/rejected": -0.35820502042770386, "step": 285 }, { "epoch": 0.08, "learning_rate": 4.955880340643274e-07, "loss": 0.6148, "step": 286 }, { "epoch": 0.08, "logps_train/chosen": -47.648345947265625, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -62.818199157714844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06387611478567123, "rewards_train/margins": 0.22928988188505173, "rewards_train/rejected": -0.1654137670993805, "step": 286 }, { "epoch": 0.08, "logps_train/chosen": -49.7845458984375, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -75.52261352539062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.05282023176550865, "rewards_train/margins": 0.1923610307276249, "rewards_train/rejected": -0.24518126249313354, "step": 287 }, { "epoch": 0.08, "learning_rate": 4.954715199449026e-07, "loss": 0.5995, "step": 288 }, { "epoch": 0.08, "logps_train/chosen": -27.164216995239258, "logps_train/ref_chosen": -27.375, "logps_train/ref_rejected": -29.75, "logps_train/rejected": -30.333599090576172, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.016781475394964218, "rewards_train/margins": 0.07279767468571663, "rewards_train/rejected": -0.05601619929075241, "step": 288 }, { "epoch": 0.08, "logps_train/chosen": -62.513511657714844, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -77.88041687011719, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.03540075942873955, "rewards_train/margins": 0.2973349802196026, "rewards_train/rejected": -0.26193422079086304, "step": 289 }, { "epoch": 0.08, "learning_rate": 4.95353501376751e-07, "loss": 0.615, "step": 290 }, { "epoch": 0.08, "logps_train/chosen": -59.4865837097168, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -62.95096969604492, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.1172129362821579, "rewards_train/margins": 0.037844955921173096, "rewards_train/rejected": -0.155057892203331, "step": 290 }, { "epoch": 0.08, "logps_train/chosen": -105.95268249511719, "logps_train/ref_chosen": -105.5, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -115.05427551269531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.06558090448379517, "rewards_train/margins": 0.15039342641830444, "rewards_train/rejected": -0.2159743309020996, "step": 291 }, { "epoch": 0.08, "learning_rate": 4.952339790831861e-07, "loss": 0.6507, "step": 292 }, { "epoch": 0.08, "logps_train/chosen": -59.823951721191406, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -64.58686065673828, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.038547221571207047, "rewards_train/margins": 0.1658419780433178, "rewards_train/rejected": -0.20438919961452484, "step": 292 }, { "epoch": 0.08, "logps_train/chosen": -56.13823699951172, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -68.07988739013672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.0319875106215477, "rewards_train/margins": 0.21579626947641373, "rewards_train/rejected": -0.24778378009796143, "step": 293 }, { "epoch": 0.08, "learning_rate": 4.951129537967377e-07, "loss": 0.6119, "step": 294 }, { "epoch": 0.08, "logps_train/chosen": -49.520912170410156, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -33.75, "logps_train/rejected": -35.58644485473633, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.007088212296366692, "rewards_train/margins": 0.19073268584907055, "rewards_train/rejected": -0.18364447355270386, "step": 294 }, { "epoch": 0.08, "logps_train/chosen": -67.98999786376953, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -70.90355682373047, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.003296487033367157, "rewards_train/margins": 0.15151257067918777, "rewards_train/rejected": -0.15480905771255493, "step": 295 }, { "epoch": 0.08, "learning_rate": 4.949904262591467e-07, "loss": 0.6205, "step": 296 }, { "epoch": 0.08, "logps_train/chosen": -52.28907012939453, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -51.18754196166992, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.02148505672812462, "rewards_train/margins": 0.1144569106400013, "rewards_train/rejected": -0.13594196736812592, "step": 296 }, { "epoch": 0.08, "logps_train/chosen": -34.808326721191406, "logps_train/ref_chosen": -34.25, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -37.986000061035156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.0535435788333416, "rewards_train/margins": 0.17733753100037575, "rewards_train/rejected": -0.23088110983371735, "step": 297 }, { "epoch": 0.08, "learning_rate": 4.948663972213615e-07, "loss": 0.6307, "step": 298 }, { "epoch": 0.08, "logps_train/chosen": -79.06289672851562, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -70.95683288574219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.05082162469625473, "rewards_train/margins": 0.30560410767793655, "rewards_train/rejected": -0.3564257323741913, "step": 298 }, { "epoch": 0.08, "logps_train/chosen": -33.221107482910156, "logps_train/ref_chosen": -33.0, "logps_train/ref_rejected": -21.875, "logps_train/rejected": -22.736474990844727, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.022855589166283607, "rewards_train/margins": 0.05798187665641308, "rewards_train/rejected": -0.08083746582269669, "step": 299 }, { "epoch": 0.08, "learning_rate": 4.947408674435326e-07, "loss": 0.614, "step": 300 }, { "epoch": 0.08, "logps_train/chosen": -94.10527038574219, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -107.86579895019531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.010526798665523529, "rewards_train/margins": 0.3217567577958107, "rewards_train/rejected": -0.33228355646133423, "step": 300 }, { "epoch": 0.08, "logps_train/chosen": -48.57219696044922, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -63.75008010864258, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.009186375886201859, "rewards_train/margins": 0.26476090028882027, "rewards_train/rejected": -0.2555745244026184, "step": 301 }, { "epoch": 0.08, "learning_rate": 4.946138376950086e-07, "loss": 0.5724, "step": 302 }, { "epoch": 0.08, "logps_train/chosen": -45.28582763671875, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -62.547607421875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.019989222288131714, "rewards_train/margins": 0.10234978795051575, "rewards_train/rejected": -0.12233901023864746, "step": 302 }, { "epoch": 0.08, "logps_train/chosen": -87.74296569824219, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -81.56104278564453, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.03542996197938919, "rewards_train/margins": 0.23336955159902573, "rewards_train/rejected": -0.2687995135784149, "step": 303 }, { "epoch": 0.08, "learning_rate": 4.944853087543306e-07, "loss": 0.6248, "step": 304 }, { "epoch": 0.08, "logps_train/chosen": -62.05071258544922, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -68.63275909423828, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.0007883422076702118, "rewards_train/margins": 0.0510755218565464, "rewards_train/rejected": -0.05028717964887619, "step": 304 }, { "epoch": 0.09, "logps_train/chosen": -84.78604125976562, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -95.69252014160156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.18797874450683594, "rewards_train/margins": 0.39611655473709106, "rewards_train/rejected": -0.584095299243927, "step": 305 }, { "epoch": 0.09, "learning_rate": 4.943552814092287e-07, "loss": 0.6015, "step": 306 }, { "epoch": 0.09, "logps_train/chosen": -66.4135971069336, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -101.26895141601562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.1567896604537964, "rewards_train/margins": 0.3613162040710449, "rewards_train/rejected": -0.5181058645248413, "step": 306 }, { "epoch": 0.09, "logps_train/chosen": -64.56097412109375, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -69.8199691772461, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.17797234654426575, "rewards_train/margins": 0.08468848466873169, "rewards_train/rejected": -0.26266083121299744, "step": 307 }, { "epoch": 0.09, "learning_rate": 4.942237564566155e-07, "loss": 0.6147, "step": 308 }, { "epoch": 0.09, "logps_train/chosen": -95.0046157836914, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -112.35191345214844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.0785864070057869, "rewards_train/margins": 0.27848007529973984, "rewards_train/rejected": -0.35706648230552673, "step": 308 }, { "epoch": 0.09, "logps_train/chosen": -44.23193359375, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -59.32194137573242, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.020947443321347237, "rewards_train/margins": 0.12267252989113331, "rewards_train/rejected": -0.10172508656978607, "step": 309 }, { "epoch": 0.09, "learning_rate": 4.94090734702583e-07, "loss": 0.6114, "step": 310 }, { "epoch": 0.09, "logps_train/chosen": -88.3235092163086, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -87.81587219238281, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.035227496176958084, "rewards_train/margins": 0.3707207925617695, "rewards_train/rejected": -0.3354932963848114, "step": 310 }, { "epoch": 0.09, "logps_train/chosen": -108.91020202636719, "logps_train/ref_chosen": -107.5, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -118.88829040527344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.12227039039134979, "rewards_train/margins": 0.19155924022197723, "rewards_train/rejected": -0.313829630613327, "step": 311 }, { "epoch": 0.09, "learning_rate": 4.939562169623964e-07, "loss": 0.5769, "step": 312 }, { "epoch": 0.09, "logps_train/chosen": -65.96340942382812, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -63.5, "logps_train/rejected": -67.19998168945312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.021529756486415863, "rewards_train/margins": 0.39426176995038986, "rewards_train/rejected": -0.372732013463974, "step": 312 }, { "epoch": 0.09, "logps_train/chosen": -62.58346176147461, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -65.02446746826172, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.12126119434833527, "rewards_train/margins": 0.1448579579591751, "rewards_train/rejected": -0.2661191523075104, "step": 313 }, { "epoch": 0.09, "learning_rate": 4.938202040604898e-07, "loss": 0.589, "step": 314 }, { "epoch": 0.09, "logps_train/chosen": -63.953369140625, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -85.4097671508789, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.13088364899158478, "rewards_train/margins": 0.24603049457073212, "rewards_train/rejected": -0.3769141435623169, "step": 314 }, { "epoch": 0.09, "logps_train/chosen": -92.89512634277344, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -95.0, "logps_train/rejected": -101.70654296875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.20591847598552704, "rewards_train/margins": 0.47957928478717804, "rewards_train/rejected": -0.6854977607727051, "step": 315 }, { "epoch": 0.09, "learning_rate": 4.936826968304603e-07, "loss": 0.5441, "step": 316 }, { "epoch": 0.09, "logps_train/chosen": -94.77378845214844, "logps_train/ref_chosen": -92.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -100.09722137451172, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.2539416551589966, "rewards_train/margins": 0.1714058220386505, "rewards_train/rejected": -0.4253474771976471, "step": 316 }, { "epoch": 0.09, "logps_train/chosen": -82.51100158691406, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -81.36175537109375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3784438371658325, "rewards_train/margins": 0.017107129096984863, "rewards_train/rejected": -0.3955509662628174, "step": 317 }, { "epoch": 0.09, "learning_rate": 4.935436961150639e-07, "loss": 0.6562, "step": 318 }, { "epoch": 0.09, "logps_train/chosen": -41.34101486206055, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -44.5, "logps_train/rejected": -45.80457305908203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.046601392328739166, "rewards_train/margins": 0.07291863113641739, "rewards_train/rejected": -0.11952002346515656, "step": 318 }, { "epoch": 0.09, "logps_train/chosen": -38.202938079833984, "logps_train/ref_chosen": -38.75, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -49.329376220703125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.06486262381076813, "rewards_train/margins": 0.13608145713806152, "rewards_train/rejected": -0.0712188333272934, "step": 319 }, { "epoch": 0.09, "learning_rate": 4.934032027662101e-07, "loss": 0.6493, "step": 320 }, { "epoch": 0.09, "logps_train/chosen": -37.03022766113281, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -47.34885787963867, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.12919463217258453, "rewards_train/margins": 0.05471464991569519, "rewards_train/rejected": -0.18390928208827972, "step": 320 }, { "epoch": 0.09, "logps_train/chosen": -67.58390045166016, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -67.95511627197266, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.29706206917762756, "rewards_train/margins": 0.11837100982666016, "rewards_train/rejected": -0.4154330790042877, "step": 321 }, { "epoch": 0.09, "learning_rate": 4.932612176449559e-07, "loss": 0.6543, "step": 322 }, { "epoch": 0.09, "logps_train/chosen": -28.910926818847656, "logps_train/ref_chosen": -29.625, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -52.609092712402344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0702354684472084, "rewards_train/margins": 0.15204314142465591, "rewards_train/rejected": -0.08180767297744751, "step": 322 }, { "epoch": 0.09, "logps_train/chosen": -53.39405822753906, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -51.8156852722168, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06371934711933136, "rewards_train/margins": 0.2859126329421997, "rewards_train/rejected": -0.22219328582286835, "step": 323 }, { "epoch": 0.09, "learning_rate": 4.931177416215015e-07, "loss": 0.5955, "step": 324 }, { "epoch": 0.09, "logps_train/chosen": -69.10913848876953, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -73.91200256347656, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.12517178058624268, "rewards_train/margins": 0.17325466871261597, "rewards_train/rejected": -0.29842644929885864, "step": 324 }, { "epoch": 0.09, "logps_train/chosen": -27.53668785095215, "logps_train/ref_chosen": -27.875, "logps_train/ref_rejected": -32.75, "logps_train/rejected": -33.74585723876953, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.03175593167543411, "rewards_train/margins": 0.12196669727563858, "rewards_train/rejected": -0.09021076560020447, "step": 325 }, { "epoch": 0.09, "learning_rate": 4.929727755751845e-07, "loss": 0.6329, "step": 326 }, { "epoch": 0.09, "logps_train/chosen": -78.49880981445312, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -82.13140869140625, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.12956848740577698, "rewards_train/margins": 0.08318158984184265, "rewards_train/rejected": -0.21275007724761963, "step": 326 }, { "epoch": 0.09, "logps_train/chosen": -55.102596282958984, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -63.5, "logps_train/rejected": -68.35601806640625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.11397083103656769, "rewards_train/margins": 0.36076705157756805, "rewards_train/rejected": -0.47473788261413574, "step": 327 }, { "epoch": 0.09, "learning_rate": 4.928263203944743e-07, "loss": 0.6162, "step": 328 }, { "epoch": 0.09, "logps_train/chosen": -79.43524169921875, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -139.0, "logps_train/rejected": -142.58819580078125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2659849226474762, "rewards_train/margins": 0.13189652562141418, "rewards_train/rejected": -0.3978814482688904, "step": 328 }, { "epoch": 0.09, "logps_train/chosen": -109.83356475830078, "logps_train/ref_chosen": -109.5, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -78.52315521240234, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.04702841490507126, "rewards_train/margins": 0.15997455269098282, "rewards_train/rejected": -0.20700296759605408, "step": 329 }, { "epoch": 0.09, "learning_rate": 4.926783769769671e-07, "loss": 0.6316, "step": 330 }, { "epoch": 0.09, "logps_train/chosen": -79.29072570800781, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -69.26737976074219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.018582772463560104, "rewards_train/margins": 0.0820390097796917, "rewards_train/rejected": -0.06345623731613159, "step": 330 }, { "epoch": 0.09, "logps_train/chosen": -74.45913696289062, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -82.04327392578125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.13341361284255981, "rewards_train/margins": 0.2119290828704834, "rewards_train/rejected": -0.3453426957130432, "step": 331 }, { "epoch": 0.09, "learning_rate": 4.925289462293807e-07, "loss": 0.6314, "step": 332 }, { "epoch": 0.09, "logps_train/chosen": -78.71355438232422, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -93.92138671875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.10963641852140427, "rewards_train/margins": 0.38992414623498917, "rewards_train/rejected": -0.49956056475639343, "step": 332 }, { "epoch": 0.09, "logps_train/chosen": -49.360774993896484, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -59.33673095703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.0645933449268341, "rewards_train/margins": 0.16575965285301208, "rewards_train/rejected": -0.2303529977798462, "step": 333 }, { "epoch": 0.09, "learning_rate": 4.923780290675475e-07, "loss": 0.5829, "step": 334 }, { "epoch": 0.09, "logps_train/chosen": -36.918785095214844, "logps_train/ref_chosen": -37.25, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -74.87519836425781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.03780872002243996, "rewards_train/margins": 0.24407846108078957, "rewards_train/rejected": -0.2062697410583496, "step": 334 }, { "epoch": 0.09, "logps_train/chosen": -60.549774169921875, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -63.61624526977539, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.0905240923166275, "rewards_train/margins": 0.12559273838996887, "rewards_train/rejected": -0.21611683070659637, "step": 335 }, { "epoch": 0.09, "learning_rate": 4.922256264164105e-07, "loss": 0.6111, "step": 336 }, { "epoch": 0.09, "logps_train/chosen": -49.96208190917969, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -69.14873504638672, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.08068116009235382, "rewards_train/margins": 0.1533328741788864, "rewards_train/rejected": -0.23401403427124023, "step": 336 }, { "epoch": 0.09, "logps_train/chosen": -75.82608032226562, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -96.15160369873047, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.0439356230199337, "rewards_train/margins": 0.25169334188103676, "rewards_train/rejected": -0.29562896490097046, "step": 337 }, { "epoch": 0.09, "learning_rate": 4.92071739210017e-07, "loss": 0.6082, "step": 338 }, { "epoch": 0.09, "logps_train/chosen": -70.35716247558594, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -74.14249420166016, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.2540755271911621, "rewards_train/margins": -0.005450919270515442, "rewards_train/rejected": -0.24862460792064667, "step": 338 }, { "epoch": 0.09, "logps_train/chosen": -31.80899429321289, "logps_train/ref_chosen": -31.75, "logps_train/ref_rejected": -29.125, "logps_train/rejected": -30.475730895996094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.008633816614747047, "rewards_train/margins": 0.12116583250463009, "rewards_train/rejected": -0.12979964911937714, "step": 339 }, { "epoch": 0.1, "learning_rate": 4.919163683915127e-07, "loss": 0.6779, "step": 340 }, { "epoch": 0.1, "logps_train/chosen": -23.782291412353516, "logps_train/ref_chosen": -22.375, "logps_train/ref_rejected": -25.25, "logps_train/rejected": -26.922977447509766, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.13618811964988708, "rewards_train/margins": 0.02671496570110321, "rewards_train/rejected": -0.1629030853509903, "step": 340 }, { "epoch": 0.1, "logps_train/chosen": -79.93000793457031, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -76.40746307373047, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.21702466905117035, "rewards_train/margins": 0.04422964155673981, "rewards_train/rejected": -0.26125431060791016, "step": 341 }, { "epoch": 0.1, "learning_rate": 4.91759514913136e-07, "loss": 0.6793, "step": 342 }, { "epoch": 0.1, "logps_train/chosen": -63.07942199707031, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -61.959495544433594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.06868436187505722, "rewards_train/margins": 0.17960888892412186, "rewards_train/rejected": -0.24829325079917908, "step": 342 }, { "epoch": 0.1, "logps_train/chosen": -68.63347625732422, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -74.12576293945312, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.10494876652956009, "rewards_train/margins": 0.557627372443676, "rewards_train/rejected": -0.6625761389732361, "step": 343 }, { "epoch": 0.1, "learning_rate": 4.916011797362123e-07, "loss": 0.5576, "step": 344 }, { "epoch": 0.1, "logps_train/chosen": -49.114723205566406, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -45.75, "logps_train/rejected": -47.364410400390625, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.040378522127866745, "rewards_train/margins": 0.11871876195073128, "rewards_train/rejected": -0.15909728407859802, "step": 344 }, { "epoch": 0.1, "logps_train/chosen": -22.078798294067383, "logps_train/ref_chosen": -22.125, "logps_train/ref_rejected": -27.125, "logps_train/rejected": -27.537137985229492, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -6.734486669301987e-05, "rewards_train/margins": 0.046224416233599186, "rewards_train/rejected": -0.046291761100292206, "step": 345 }, { "epoch": 0.1, "learning_rate": 4.914413638311482e-07, "loss": 0.6558, "step": 346 }, { "epoch": 0.1, "logps_train/chosen": -71.45697021484375, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -68.04414367675781, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.2310480773448944, "rewards_train/margins": -0.025852814316749573, "rewards_train/rejected": -0.20519526302814484, "step": 346 }, { "epoch": 0.1, "logps_train/chosen": -60.23451614379883, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -76.00108337402344, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.05177208036184311, "rewards_train/margins": 0.26786769181489944, "rewards_train/rejected": -0.31963977217674255, "step": 347 }, { "epoch": 0.1, "learning_rate": 4.912800681774253e-07, "loss": 0.6695, "step": 348 }, { "epoch": 0.1, "logps_train/chosen": -55.41716766357422, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -52.72454833984375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.20265452563762665, "rewards_train/margins": 0.09440977871417999, "rewards_train/rejected": -0.29706430435180664, "step": 348 }, { "epoch": 0.1, "logps_train/chosen": -87.39714050292969, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -87.9417724609375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2334636151790619, "rewards_train/margins": 0.24196359515190125, "rewards_train/rejected": -0.47542721033096313, "step": 349 }, { "epoch": 0.1, "learning_rate": 4.911172937635942e-07, "loss": 0.622, "step": 350 }, { "epoch": 0.1, "logps_train/chosen": -73.04360961914062, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -73.84121704101562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.14186100661754608, "rewards_train/margins": 0.2610108405351639, "rewards_train/rejected": -0.40287184715270996, "step": 350 }, { "epoch": 0.1, "logps_train/chosen": -81.9167709350586, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -85.91574096679688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.026052236557006836, "rewards_train/margins": 0.25341176986694336, "rewards_train/rejected": -0.2794640064239502, "step": 351 }, { "epoch": 0.1, "learning_rate": 4.909530415872685e-07, "loss": 0.5867, "step": 352 }, { "epoch": 0.1, "logps_train/chosen": -39.090415954589844, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -53.72549057006836, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.030021198093891144, "rewards_train/margins": 0.2693668082356453, "rewards_train/rejected": -0.23934561014175415, "step": 352 }, { "epoch": 0.1, "logps_train/chosen": -63.3445930480957, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -72.13990783691406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.11004532873630524, "rewards_train/margins": 0.36100123822689056, "rewards_train/rejected": -0.4710465669631958, "step": 353 }, { "epoch": 0.1, "learning_rate": 4.907873126551192e-07, "loss": 0.5637, "step": 354 }, { "epoch": 0.1, "logps_train/chosen": -65.56784057617188, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -39.25541305541992, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.0036589615046977997, "rewards_train/margins": 0.18203851953148842, "rewards_train/rejected": -0.18569748103618622, "step": 354 }, { "epoch": 0.1, "logps_train/chosen": -64.6380615234375, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -92.01697540283203, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.020056378096342087, "rewards_train/margins": 0.2699230797588825, "rewards_train/rejected": -0.2899794578552246, "step": 355 }, { "epoch": 0.1, "learning_rate": 4.906201079828676e-07, "loss": 0.5964, "step": 356 }, { "epoch": 0.1, "logps_train/chosen": -73.18698120117188, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -76.12444305419922, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.005715329200029373, "rewards_train/margins": 0.10956589505076408, "rewards_train/rejected": -0.10385056585073471, "step": 356 }, { "epoch": 0.1, "logps_train/chosen": -60.72767639160156, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -68.98823547363281, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.2723768949508667, "rewards_train/margins": 0.040704816579818726, "rewards_train/rejected": -0.3130817115306854, "step": 357 }, { "epoch": 0.1, "learning_rate": 4.904514285952794e-07, "loss": 0.6646, "step": 358 }, { "epoch": 0.1, "logps_train/chosen": -61.012474060058594, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -65.67412567138672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.016135694459080696, "rewards_train/margins": 0.6413607615977526, "rewards_train/rejected": -0.6252250671386719, "step": 358 }, { "epoch": 0.1, "logps_train/chosen": -50.726600646972656, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -52.99345397949219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.0968787670135498, "rewards_train/margins": 0.16848242282867432, "rewards_train/rejected": -0.2653611898422241, "step": 359 }, { "epoch": 0.1, "learning_rate": 4.902812755261591e-07, "loss": 0.5466, "step": 360 }, { "epoch": 0.1, "logps_train/chosen": -58.21422576904297, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -55.1922607421875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.04739930480718613, "rewards_train/margins": 0.25253967195749283, "rewards_train/rejected": -0.29993897676467896, "step": 360 }, { "epoch": 0.1, "logps_train/chosen": -76.33403778076172, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -51.67599105834961, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.021684732288122177, "rewards_train/margins": 0.2488439716398716, "rewards_train/rejected": -0.2705287039279938, "step": 361 }, { "epoch": 0.1, "learning_rate": 4.901096498183428e-07, "loss": 0.5973, "step": 362 }, { "epoch": 0.1, "logps_train/chosen": -27.518657684326172, "logps_train/ref_chosen": -26.875, "logps_train/ref_rejected": -24.75, "logps_train/rejected": -26.936294555664062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.062070704996585846, "rewards_train/margins": 0.15226199477910995, "rewards_train/rejected": -0.2143326997756958, "step": 362 }, { "epoch": 0.1, "logps_train/chosen": -35.827030181884766, "logps_train/ref_chosen": -36.75, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -56.23878479003906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0893670916557312, "rewards_train/margins": 0.6168099045753479, "rewards_train/rejected": -0.5274428129196167, "step": 363 }, { "epoch": 0.1, "learning_rate": 4.899365525236921e-07, "loss": 0.5601, "step": 364 }, { "epoch": 0.1, "logps_train/chosen": -77.91535186767578, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -85.88685607910156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0022149011492729187, "rewards_train/margins": 0.5287915393710136, "rewards_train/rejected": -0.5265766382217407, "step": 364 }, { "epoch": 0.1, "logps_train/chosen": -94.54623413085938, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -96.4705581665039, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.29993563890457153, "rewards_train/margins": 0.3475103974342346, "rewards_train/rejected": -0.6474460363388062, "step": 365 }, { "epoch": 0.1, "learning_rate": 4.897619847030876e-07, "loss": 0.5368, "step": 366 }, { "epoch": 0.1, "logps_train/chosen": -100.95994567871094, "logps_train/ref_chosen": -97.0, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -100.17292785644531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.4110339879989624, "rewards_train/margins": 0.18516451120376587, "rewards_train/rejected": -0.5961984992027283, "step": 366 }, { "epoch": 0.1, "logps_train/chosen": -63.71215057373047, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -68.8509521484375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.13527780771255493, "rewards_train/margins": 0.23848959803581238, "rewards_train/rejected": -0.3737674057483673, "step": 367 }, { "epoch": 0.1, "learning_rate": 4.895859474264228e-07, "loss": 0.6032, "step": 368 }, { "epoch": 0.1, "logps_train/chosen": -61.28565979003906, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -51.18120574951172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.037550076842308044, "rewards_train/margins": 0.20063889026641846, "rewards_train/rejected": -0.2381889671087265, "step": 368 }, { "epoch": 0.1, "logps_train/chosen": -68.31848907470703, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -82.45947265625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.033238910138607025, "rewards_train/margins": 0.3307482525706291, "rewards_train/rejected": -0.2975093424320221, "step": 369 }, { "epoch": 0.1, "learning_rate": 4.894084417725969e-07, "loss": 0.5941, "step": 370 }, { "epoch": 0.1, "logps_train/chosen": -108.39409637451172, "logps_train/ref_chosen": -105.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -108.1373291015625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.35815951228141785, "rewards_train/margins": 0.03682336211204529, "rewards_train/rejected": -0.39498287439346313, "step": 370 }, { "epoch": 0.1, "logps_train/chosen": -45.12579345703125, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -50.849544525146484, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.01613139919936657, "rewards_train/margins": 0.44864421896636486, "rewards_train/rejected": -0.4325128197669983, "step": 371 }, { "epoch": 0.1, "learning_rate": 4.892294688295088e-07, "loss": 0.6145, "step": 372 }, { "epoch": 0.1, "logps_train/chosen": -62.0494270324707, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -86.26089477539062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.1879505217075348, "rewards_train/margins": 0.37935033440589905, "rewards_train/rejected": -0.5673008561134338, "step": 372 }, { "epoch": 0.1, "logps_train/chosen": -99.60758209228516, "logps_train/ref_chosen": -94.5, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -99.30949401855469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5310704112052917, "rewards_train/margins": 0.18523025512695312, "rewards_train/rejected": -0.7163006663322449, "step": 373 }, { "epoch": 0.1, "learning_rate": 4.890490296940496e-07, "loss": 0.5772, "step": 374 }, { "epoch": 0.1, "logps_train/chosen": -79.73228454589844, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -81.31240844726562, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.23709505796432495, "rewards_train/margins": -0.09638087451457977, "rewards_train/rejected": -0.14071418344974518, "step": 374 }, { "epoch": 0.1, "logps_train/chosen": -68.92910766601562, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -86.09484100341797, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.08158338069915771, "rewards_train/margins": 0.3589552640914917, "rewards_train/rejected": -0.4405386447906494, "step": 375 }, { "epoch": 0.11, "learning_rate": 4.888671254720972e-07, "loss": 0.6605, "step": 376 }, { "epoch": 0.11, "logps_train/chosen": -44.20768737792969, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -52.269378662109375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.02493463270366192, "rewards_train/margins": 0.19913811795413494, "rewards_train/rejected": -0.17420348525047302, "step": 376 }, { "epoch": 0.11, "logps_train/chosen": -71.08357238769531, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -59.199913024902344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.22769232094287872, "rewards_train/margins": 0.21515028178691864, "rewards_train/rejected": -0.44284260272979736, "step": 377 }, { "epoch": 0.11, "learning_rate": 4.886837572785081e-07, "loss": 0.611, "step": 378 }, { "epoch": 0.11, "logps_train/chosen": -50.59214401245117, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -55.63287353515625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.0373397022485733, "rewards_train/margins": 0.38141684234142303, "rewards_train/rejected": -0.41875654458999634, "step": 378 }, { "epoch": 0.11, "logps_train/chosen": -36.52409744262695, "logps_train/ref_chosen": -33.75, "logps_train/ref_rejected": -30.375, "logps_train/rejected": -33.19350814819336, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.285368412733078, "rewards_train/margins": -0.009327977895736694, "rewards_train/rejected": -0.2760404348373413, "step": 379 }, { "epoch": 0.11, "learning_rate": 4.884989262371114e-07, "loss": 0.6264, "step": 380 }, { "epoch": 0.11, "logps_train/chosen": -37.08723068237305, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -38.5, "logps_train/rejected": -39.5583610534668, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.07593493163585663, "rewards_train/margins": 0.02108776569366455, "rewards_train/rejected": -0.09702269732952118, "step": 380 }, { "epoch": 0.11, "logps_train/chosen": -78.44664001464844, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -81.58367919921875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.10560118407011032, "rewards_train/margins": 0.40745461732149124, "rewards_train/rejected": -0.5130558013916016, "step": 381 }, { "epoch": 0.11, "learning_rate": 4.883126334807019e-07, "loss": 0.6138, "step": 382 }, { "epoch": 0.11, "logps_train/chosen": -74.96675872802734, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -77.48548889160156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3568321764469147, "rewards_train/margins": 0.15304431319236755, "rewards_train/rejected": -0.5098764896392822, "step": 382 }, { "epoch": 0.11, "logps_train/chosen": -79.03826141357422, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -82.42130279541016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.12234561890363693, "rewards_train/margins": 0.6360576674342155, "rewards_train/rejected": -0.5137120485305786, "step": 383 }, { "epoch": 0.11, "learning_rate": 4.881248801510328e-07, "loss": 0.5603, "step": 384 }, { "epoch": 0.11, "logps_train/chosen": -77.80622863769531, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -77.42886352539062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.042341478168964386, "rewards_train/margins": 0.4349202439188957, "rewards_train/rejected": -0.4772617220878601, "step": 384 }, { "epoch": 0.11, "logps_train/chosen": -46.81436538696289, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -49.15264129638672, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.08514738827943802, "rewards_train/margins": 0.05882773548364639, "rewards_train/rejected": -0.1439751237630844, "step": 385 }, { "epoch": 0.11, "learning_rate": 4.879356673988089e-07, "loss": 0.5965, "step": 386 }, { "epoch": 0.11, "logps_train/chosen": -33.68955612182617, "logps_train/ref_chosen": -34.25, "logps_train/ref_rejected": -23.75, "logps_train/rejected": -24.28402328491211, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.06151317059993744, "rewards_train/margins": 0.11745448410511017, "rewards_train/rejected": -0.05594131350517273, "step": 386 }, { "epoch": 0.11, "logps_train/chosen": -111.4737548828125, "logps_train/ref_chosen": -109.5, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -105.9298095703125, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.18487553298473358, "rewards_train/margins": 0.05068403482437134, "rewards_train/rejected": -0.23555956780910492, "step": 387 }, { "epoch": 0.11, "learning_rate": 4.877449963836794e-07, "loss": 0.6609, "step": 388 }, { "epoch": 0.11, "logps_train/chosen": -93.16571044921875, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -106.4239501953125, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.6222352385520935, "rewards_train/margins": 0.13461285829544067, "rewards_train/rejected": -0.7568480968475342, "step": 388 }, { "epoch": 0.11, "logps_train/chosen": -45.30501174926758, "logps_train/ref_chosen": -43.75, "logps_train/ref_rejected": -45.75, "logps_train/rejected": -48.52457046508789, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.16722005605697632, "rewards_train/margins": 0.10828381776809692, "rewards_train/rejected": -0.27550387382507324, "step": 389 }, { "epoch": 0.11, "learning_rate": 4.875528682742312e-07, "loss": 0.6591, "step": 390 }, { "epoch": 0.11, "logps_train/chosen": -119.17372131347656, "logps_train/ref_chosen": -117.5, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -114.12166595458984, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.19002825021743774, "rewards_train/margins": 0.4022170901298523, "rewards_train/rejected": -0.59224534034729, "step": 390 }, { "epoch": 0.11, "logps_train/chosen": -78.75126647949219, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -97.5, "logps_train/rejected": -103.34577941894531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.29938769340515137, "rewards_train/margins": 0.2758580446243286, "rewards_train/rejected": -0.57524573802948, "step": 391 }, { "epoch": 0.11, "learning_rate": 4.873592842479813e-07, "loss": 0.5755, "step": 392 }, { "epoch": 0.11, "logps_train/chosen": -73.84769439697266, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -80.33969116210938, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.07715222239494324, "rewards_train/margins": 0.42830124497413635, "rewards_train/rejected": -0.5054534673690796, "step": 392 }, { "epoch": 0.11, "logps_train/chosen": -61.32831954956055, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -74.7039794921875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.07267564535140991, "rewards_train/margins": 0.3262382745742798, "rewards_train/rejected": -0.3989139199256897, "step": 393 }, { "epoch": 0.11, "learning_rate": 4.871642454913696e-07, "loss": 0.5709, "step": 394 }, { "epoch": 0.11, "logps_train/chosen": -38.112831115722656, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -30.125, "logps_train/rejected": -32.09352493286133, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.004837889224290848, "rewards_train/margins": 0.19621382281184196, "rewards_train/rejected": -0.2010517120361328, "step": 394 }, { "epoch": 0.11, "logps_train/chosen": -60.59162902832031, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -61.164512634277344, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.3570142984390259, "rewards_train/margins": -0.11595363914966583, "rewards_train/rejected": -0.24106065928936005, "step": 395 }, { "epoch": 0.11, "learning_rate": 4.86967753199752e-07, "loss": 0.6936, "step": 396 }, { "epoch": 0.11, "logps_train/chosen": -54.332664489746094, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -62.183677673339844, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.09698053449392319, "rewards_train/margins": 0.4324652776122093, "rewards_train/rejected": -0.33548474311828613, "step": 396 }, { "epoch": 0.11, "logps_train/chosen": -67.56103515625, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -75.46615600585938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.1305174082517624, "rewards_train/margins": 0.43875421583652496, "rewards_train/rejected": -0.5692716240882874, "step": 397 }, { "epoch": 0.11, "learning_rate": 4.867698085773929e-07, "loss": 0.5435, "step": 398 }, { "epoch": 0.11, "logps_train/chosen": -74.09577941894531, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -91.5, "logps_train/rejected": -99.85299682617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.16739077866077423, "rewards_train/margins": 0.6913464516401291, "rewards_train/rejected": -0.8587372303009033, "step": 398 }, { "epoch": 0.11, "logps_train/chosen": -72.29281616210938, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -83.37358856201172, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.19061006605625153, "rewards_train/margins": 0.3238971084356308, "rewards_train/rejected": -0.5145071744918823, "step": 399 }, { "epoch": 0.11, "learning_rate": 4.865704128374575e-07, "loss": 0.5364, "step": 400 }, { "epoch": 0.11, "logps_train/chosen": -102.74913024902344, "logps_train/ref_chosen": -98.5, "logps_train/ref_rejected": -99.5, "logps_train/rejected": -107.50867462158203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.4030378460884094, "rewards_train/margins": 0.4126731753349304, "rewards_train/rejected": -0.8157110214233398, "step": 400 }, { "epoch": 0.11, "logps_train/chosen": -56.70863342285156, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -69.8048095703125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.21910548210144043, "rewards_train/margins": 0.12368011474609375, "rewards_train/rejected": -0.3427855968475342, "step": 401 }, { "epoch": 0.11, "learning_rate": 4.863695672020047e-07, "loss": 0.6287, "step": 402 }, { "epoch": 0.11, "logps_train/chosen": -102.02717590332031, "logps_train/ref_chosen": -99.5, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -113.07080841064453, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.24256093800067902, "rewards_train/margins": 0.6547542065382004, "rewards_train/rejected": -0.8973151445388794, "step": 402 }, { "epoch": 0.11, "logps_train/chosen": -53.998497009277344, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -73.76364135742188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.004244105890393257, "rewards_train/margins": 0.172120226547122, "rewards_train/rejected": -0.17636433243751526, "step": 403 }, { "epoch": 0.11, "learning_rate": 4.861672729019796e-07, "loss": 0.5442, "step": 404 }, { "epoch": 0.11, "logps_train/chosen": -59.30846405029297, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -62.81505584716797, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.0035284943878650665, "rewards_train/margins": 0.47038574889302254, "rewards_train/rejected": -0.46685725450515747, "step": 404 }, { "epoch": 0.11, "logps_train/chosen": -37.09577941894531, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -46.77699279785156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.14551538228988647, "rewards_train/margins": 0.1204647421836853, "rewards_train/rejected": -0.2659801244735718, "step": 405 }, { "epoch": 0.11, "learning_rate": 4.85963531177206e-07, "loss": 0.5697, "step": 406 }, { "epoch": 0.11, "logps_train/chosen": -67.38511657714844, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -84.91424560546875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.029918041080236435, "rewards_train/margins": 0.24138892069458961, "rewards_train/rejected": -0.27130696177482605, "step": 406 }, { "epoch": 0.11, "logps_train/chosen": -35.23558044433594, "logps_train/ref_chosen": -34.5, "logps_train/ref_rejected": -39.25, "logps_train/rejected": -42.9957160949707, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.08527660369873047, "rewards_train/margins": 0.28695130348205566, "rewards_train/rejected": -0.37222790718078613, "step": 407 }, { "epoch": 0.11, "learning_rate": 4.857583432763784e-07, "loss": 0.5887, "step": 408 }, { "epoch": 0.11, "logps_train/chosen": -37.568763732910156, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -34.75, "logps_train/rejected": -39.81353759765625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.1369543969631195, "rewards_train/margins": 0.3692041337490082, "rewards_train/rejected": -0.5061585307121277, "step": 408 }, { "epoch": 0.11, "logps_train/chosen": -45.81025695800781, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -42.25, "logps_train/rejected": -44.82408142089844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.22760777175426483, "rewards_train/margins": 0.01769109070301056, "rewards_train/rejected": -0.2452988624572754, "step": 409 }, { "epoch": 0.11, "learning_rate": 4.85551710457055e-07, "loss": 0.618, "step": 410 }, { "epoch": 0.11, "logps_train/chosen": -65.6756362915039, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -75.97378540039062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.20076701045036316, "rewards_train/margins": 0.34036150574684143, "rewards_train/rejected": -0.5411285161972046, "step": 410 }, { "epoch": 0.11, "logps_train/chosen": -52.825531005859375, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -58.52812194824219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.12093216925859451, "rewards_train/margins": 0.13520055264234543, "rewards_train/rejected": -0.25613272190093994, "step": 411 }, { "epoch": 0.12, "learning_rate": 4.853436339856493e-07, "loss": 0.598, "step": 412 }, { "epoch": 0.12, "logps_train/chosen": -83.06622314453125, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -78.86665344238281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.06873153150081635, "rewards_train/margins": 0.3175438195466995, "rewards_train/rejected": -0.38627535104751587, "step": 412 }, { "epoch": 0.12, "logps_train/chosen": -45.289215087890625, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -51.860042572021484, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.03924257308244705, "rewards_train/margins": 0.06938788667321205, "rewards_train/rejected": -0.030145313590765, "step": 413 }, { "epoch": 0.12, "learning_rate": 4.851341151374227e-07, "loss": 0.6123, "step": 414 }, { "epoch": 0.12, "logps_train/chosen": -88.4419937133789, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -110.30561828613281, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.09366677701473236, "rewards_train/margins": 0.9019871801137924, "rewards_train/rejected": -0.8083204030990601, "step": 414 }, { "epoch": 0.12, "logps_train/chosen": -40.033206939697266, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -46.84977722167969, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.04980486258864403, "rewards_train/margins": 0.02706756815314293, "rewards_train/rejected": -0.07687243074178696, "step": 415 }, { "epoch": 0.12, "learning_rate": 4.849231551964771e-07, "loss": 0.5419, "step": 416 }, { "epoch": 0.12, "logps_train/chosen": -51.32529830932617, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -57.75691604614258, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.13409240543842316, "rewards_train/margins": 0.33700962364673615, "rewards_train/rejected": -0.4711020290851593, "step": 416 }, { "epoch": 0.12, "logps_train/chosen": -37.329917907714844, "logps_train/ref_chosen": -37.5, "logps_train/ref_rejected": -37.25, "logps_train/rejected": -41.282283782958984, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.006852060556411743, "rewards_train/margins": 0.4186740517616272, "rewards_train/rejected": -0.41182199120521545, "step": 417 }, { "epoch": 0.12, "learning_rate": 4.84710755455746e-07, "loss": 0.5501, "step": 418 }, { "epoch": 0.12, "logps_train/chosen": -67.50133514404297, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -66.35977172851562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.21087563037872314, "rewards_train/margins": 0.4042034149169922, "rewards_train/rejected": -0.6150790452957153, "step": 418 }, { "epoch": 0.12, "logps_train/chosen": -27.96630859375, "logps_train/ref_chosen": -28.5, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -41.9415397644043, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.05063493549823761, "rewards_train/margins": 0.20885145664215088, "rewards_train/rejected": -0.15821652114391327, "step": 419 }, { "epoch": 0.12, "learning_rate": 4.844969172169875e-07, "loss": 0.5743, "step": 420 }, { "epoch": 0.12, "logps_train/chosen": -85.80335998535156, "logps_train/ref_chosen": -84.5, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -108.86079406738281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.12642979621887207, "rewards_train/margins": 0.6698055863380432, "rewards_train/rejected": -0.7962353825569153, "step": 420 }, { "epoch": 0.12, "logps_train/chosen": -60.522071838378906, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -92.09263610839844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.06099611148238182, "rewards_train/margins": 0.6867439188063145, "rewards_train/rejected": -0.7477400302886963, "step": 421 }, { "epoch": 0.12, "learning_rate": 4.842816417907758e-07, "loss": 0.4661, "step": 422 }, { "epoch": 0.12, "logps_train/chosen": -67.54059600830078, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -73.53221893310547, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2579660415649414, "rewards_train/margins": 0.5901778340339661, "rewards_train/rejected": -0.8481438755989075, "step": 422 }, { "epoch": 0.12, "logps_train/chosen": -54.35505676269531, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -73.50029754638672, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.25972452759742737, "rewards_train/margins": 0.36061760783195496, "rewards_train/rejected": -0.6203421354293823, "step": 423 }, { "epoch": 0.12, "learning_rate": 4.840649304964937e-07, "loss": 0.5545, "step": 424 }, { "epoch": 0.12, "logps_train/chosen": -27.04102325439453, "logps_train/ref_chosen": -25.75, "logps_train/ref_rejected": -25.875, "logps_train/rejected": -28.863727569580078, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.12929782271385193, "rewards_train/margins": 0.1678171157836914, "rewards_train/rejected": -0.29711493849754333, "step": 424 }, { "epoch": 0.12, "logps_train/chosen": -65.1640625, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -63.27779769897461, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.28906217217445374, "rewards_train/margins": 0.1396942138671875, "rewards_train/rejected": -0.42875638604164124, "step": 425 }, { "epoch": 0.12, "learning_rate": 4.838467846623237e-07, "loss": 0.6549, "step": 426 }, { "epoch": 0.12, "logps_train/chosen": -59.83967590332031, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -78.50164031982422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.11013912409543991, "rewards_train/margins": 0.7677593305706978, "rewards_train/rejected": -0.8778984546661377, "step": 426 }, { "epoch": 0.12, "logps_train/chosen": -66.80098724365234, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -85.57430267333984, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.209786057472229, "rewards_train/margins": 0.7109251022338867, "rewards_train/rejected": -0.9207111597061157, "step": 427 }, { "epoch": 0.12, "learning_rate": 4.836272056252406e-07, "loss": 0.4639, "step": 428 }, { "epoch": 0.12, "logps_train/chosen": -50.29276657104492, "logps_train/ref_chosen": -46.5, "logps_train/ref_rejected": -34.0, "logps_train/rejected": -39.00887680053711, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.36749064922332764, "rewards_train/margins": 0.1373031735420227, "rewards_train/rejected": -0.5047938227653503, "step": 428 }, { "epoch": 0.12, "logps_train/chosen": -54.68303680419922, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -67.43299865722656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.039557721465826035, "rewards_train/margins": 0.3359828256070614, "rewards_train/rejected": -0.29642510414123535, "step": 429 }, { "epoch": 0.12, "learning_rate": 4.83406194731003e-07, "loss": 0.6045, "step": 430 }, { "epoch": 0.12, "logps_train/chosen": -79.35588073730469, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -95.75279235839844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4887131154537201, "rewards_train/margins": 0.20609775185585022, "rewards_train/rejected": -0.6948108673095703, "step": 430 }, { "epoch": 0.12, "logps_train/chosen": -62.179195404052734, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -54.25, "logps_train/rejected": -61.67304992675781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3796379864215851, "rewards_train/margins": 0.36315545439720154, "rewards_train/rejected": -0.7427934408187866, "step": 431 }, { "epoch": 0.12, "learning_rate": 4.831837533341451e-07, "loss": 0.5812, "step": 432 }, { "epoch": 0.12, "logps_train/chosen": -60.57322692871094, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -76.04409790039062, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.10175628960132599, "rewards_train/margins": 0.07169632613658905, "rewards_train/rejected": -0.17345261573791504, "step": 432 }, { "epoch": 0.12, "logps_train/chosen": -72.44471740722656, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -90.85264587402344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.21126829087734222, "rewards_train/margins": 0.6040742546319962, "rewards_train/rejected": -0.8153425455093384, "step": 433 }, { "epoch": 0.12, "learning_rate": 4.829598827979682e-07, "loss": 0.5872, "step": 434 }, { "epoch": 0.12, "logps_train/chosen": -58.04673767089844, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -55.37931442260742, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.39217352867126465, "rewards_train/margins": -0.028070569038391113, "rewards_train/rejected": -0.36410295963287354, "step": 434 }, { "epoch": 0.12, "logps_train/chosen": -93.18022155761719, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -92.59236145019531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.1730998456478119, "rewards_train/margins": 0.17832356691360474, "rewards_train/rejected": -0.3514234125614166, "step": 435 }, { "epoch": 0.12, "learning_rate": 4.827345844945327e-07, "loss": 0.6736, "step": 436 }, { "epoch": 0.12, "logps_train/chosen": -63.54442596435547, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -51.33317565917969, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.38188374042510986, "rewards_train/margins": 0.1117851734161377, "rewards_train/rejected": -0.49366891384124756, "step": 436 }, { "epoch": 0.12, "logps_train/chosen": -44.57952880859375, "logps_train/ref_chosen": -44.0, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -54.54182052612305, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.06264039874076843, "rewards_train/margins": 0.1345101147890091, "rewards_train/rejected": -0.19715051352977753, "step": 437 }, { "epoch": 0.12, "learning_rate": 4.825078598046497e-07, "loss": 0.671, "step": 438 }, { "epoch": 0.12, "logps_train/chosen": -78.75662231445312, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -98.87887573242188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.02726789191365242, "rewards_train/margins": 0.9161319322884083, "rewards_train/rejected": -0.8888640403747559, "step": 438 }, { "epoch": 0.12, "logps_train/chosen": -61.60486602783203, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -70.74028778076172, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.11165854334831238, "rewards_train/margins": 0.6619797050952911, "rewards_train/rejected": -0.7736382484436035, "step": 439 }, { "epoch": 0.12, "learning_rate": 4.822797101178718e-07, "loss": 0.4382, "step": 440 }, { "epoch": 0.12, "logps_train/chosen": -101.79241943359375, "logps_train/ref_chosen": -98.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -78.33682250976562, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3050227761268616, "rewards_train/margins": 0.16459760069847107, "rewards_train/rejected": -0.46962037682533264, "step": 440 }, { "epoch": 0.12, "logps_train/chosen": -69.30650329589844, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -88.3681640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.34393131732940674, "rewards_train/margins": 0.91710364818573, "rewards_train/rejected": -1.2610349655151367, "step": 441 }, { "epoch": 0.12, "learning_rate": 4.820501368324858e-07, "loss": 0.5168, "step": 442 }, { "epoch": 0.12, "logps_train/chosen": -32.35873031616211, "logps_train/ref_chosen": -31.125, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -50.96668243408203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.12102940678596497, "rewards_train/margins": 0.28559964895248413, "rewards_train/rejected": -0.4066290557384491, "step": 442 }, { "epoch": 0.12, "logps_train/chosen": -41.715675354003906, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -42.75, "logps_train/rejected": -45.301876068115234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.08641133457422256, "rewards_train/margins": 0.1713152751326561, "rewards_train/rejected": -0.25772660970687866, "step": 443 }, { "epoch": 0.12, "learning_rate": 4.818191413555029e-07, "loss": 0.6137, "step": 444 }, { "epoch": 0.12, "logps_train/chosen": -88.72758483886719, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -91.88250732421875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.47598087787628174, "rewards_train/margins": 0.3681298494338989, "rewards_train/rejected": -0.8441107273101807, "step": 444 }, { "epoch": 0.12, "logps_train/chosen": -56.40264129638672, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -73.94459533691406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.004521720111370087, "rewards_train/margins": 0.3090783432126045, "rewards_train/rejected": -0.3136000633239746, "step": 445 }, { "epoch": 0.12, "learning_rate": 4.81586725102651e-07, "loss": 0.5719, "step": 446 }, { "epoch": 0.12, "logps_train/chosen": -101.0855712890625, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -114.37125396728516, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.6935184001922607, "rewards_train/margins": 0.3915567398071289, "rewards_train/rejected": -1.0850751399993896, "step": 446 }, { "epoch": 0.12, "logps_train/chosen": -69.82218170166016, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -96.4715576171875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2097572535276413, "rewards_train/margins": 0.7131803780794144, "rewards_train/rejected": -0.9229376316070557, "step": 447 }, { "epoch": 0.13, "learning_rate": 4.813528894983653e-07, "loss": 0.5464, "step": 448 }, { "epoch": 0.13, "logps_train/chosen": -67.9070053100586, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -54.96982192993164, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.1430443972349167, "rewards_train/margins": 0.3299145847558975, "rewards_train/rejected": -0.4729589819908142, "step": 448 }, { "epoch": 0.13, "logps_train/chosen": -64.23294830322266, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -74.89968872070312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.18677125871181488, "rewards_train/margins": 0.6061270385980606, "rewards_train/rejected": -0.7928982973098755, "step": 449 }, { "epoch": 0.13, "learning_rate": 4.811176359757807e-07, "loss": 0.5321, "step": 450 }, { "epoch": 0.13, "logps_train/chosen": -71.74510955810547, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -83.18284606933594, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.5309804677963257, "rewards_train/margins": 0.10624945163726807, "rewards_train/rejected": -0.6372299194335938, "step": 450 }, { "epoch": 0.13, "logps_train/chosen": -47.425254821777344, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -53.84074401855469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3173302412033081, "rewards_train/margins": 0.19262266159057617, "rewards_train/rejected": -0.5099529027938843, "step": 451 }, { "epoch": 0.13, "learning_rate": 4.808809659767213e-07, "loss": 0.6304, "step": 452 }, { "epoch": 0.13, "logps_train/chosen": -53.49700164794922, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -56.15985870361328, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.16766878962516785, "rewards_train/margins": 0.5490983426570892, "rewards_train/rejected": -0.7167671322822571, "step": 452 }, { "epoch": 0.13, "logps_train/chosen": -38.96504211425781, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -43.75, "logps_train/rejected": -46.623050689697266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08240190148353577, "rewards_train/margins": 0.36189451813697815, "rewards_train/rejected": -0.2794926166534424, "step": 453 }, { "epoch": 0.13, "learning_rate": 4.806428809516932e-07, "loss": 0.5151, "step": 454 }, { "epoch": 0.13, "logps_train/chosen": -69.56109619140625, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -75.6127700805664, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2150936722755432, "rewards_train/margins": 0.26571446657180786, "rewards_train/rejected": -0.4808081388473511, "step": 454 }, { "epoch": 0.13, "logps_train/chosen": -50.74924087524414, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -59.40789031982422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.09523653984069824, "rewards_train/margins": 0.5916464328765869, "rewards_train/rejected": -0.6868829727172852, "step": 455 }, { "epoch": 0.13, "learning_rate": 4.804033823598745e-07, "loss": 0.5153, "step": 456 }, { "epoch": 0.13, "logps_train/chosen": -90.73284912109375, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -100.54654693603516, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5598087310791016, "rewards_train/margins": 0.6948459148406982, "rewards_train/rejected": -1.2546546459197998, "step": 456 }, { "epoch": 0.13, "logps_train/chosen": -72.41488647460938, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -57.239173889160156, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.2641451358795166, "rewards_train/margins": 0.12110072374343872, "rewards_train/rejected": -0.3852458596229553, "step": 457 }, { "epoch": 0.13, "learning_rate": 4.801624716691072e-07, "loss": 0.5505, "step": 458 }, { "epoch": 0.13, "logps_train/chosen": -64.20964050292969, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -63.94740295410156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.26490911841392517, "rewards_train/margins": 0.20014378428459167, "rewards_train/rejected": -0.46505290269851685, "step": 458 }, { "epoch": 0.13, "logps_train/chosen": -65.96568298339844, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -68.7130355834961, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.08563080430030823, "rewards_train/margins": 0.24504747986793518, "rewards_train/rejected": -0.3306782841682434, "step": 459 }, { "epoch": 0.13, "learning_rate": 4.799201503558873e-07, "loss": 0.6115, "step": 460 }, { "epoch": 0.13, "logps_train/chosen": -101.25625610351562, "logps_train/ref_chosen": -97.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -108.38130187988281, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.4350007176399231, "rewards_train/margins": 0.5078170895576477, "rewards_train/rejected": -0.9428178071975708, "step": 460 }, { "epoch": 0.13, "logps_train/chosen": -45.537818908691406, "logps_train/ref_chosen": -43.75, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -48.5035400390625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.17721939086914062, "rewards_train/margins": 0.25516611337661743, "rewards_train/rejected": -0.43238550424575806, "step": 461 }, { "epoch": 0.13, "learning_rate": 4.796764199053568e-07, "loss": 0.579, "step": 462 }, { "epoch": 0.13, "logps_train/chosen": -55.23745346069336, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -80.50357055664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3159329295158386, "rewards_train/margins": 0.6236823201179504, "rewards_train/rejected": -0.9396152496337891, "step": 462 }, { "epoch": 0.13, "logps_train/chosen": -71.55741882324219, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -81.88558197021484, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3819139003753662, "rewards_train/margins": 0.06435957551002502, "rewards_train/rejected": -0.44627347588539124, "step": 463 }, { "epoch": 0.13, "learning_rate": 4.794312818112935e-07, "loss": 0.5781, "step": 464 }, { "epoch": 0.13, "logps_train/chosen": -58.86534118652344, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -80.2977294921875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.15889745950698853, "rewards_train/margins": 0.2706802785396576, "rewards_train/rejected": -0.4295777380466461, "step": 464 }, { "epoch": 0.13, "logps_train/chosen": -45.71202087402344, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -75.93025207519531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.13213971257209778, "rewards_train/margins": 0.49877622723579407, "rewards_train/rejected": -0.6309159398078918, "step": 465 }, { "epoch": 0.13, "learning_rate": 4.791847375761027e-07, "loss": 0.5414, "step": 466 }, { "epoch": 0.13, "logps_train/chosen": -50.770538330078125, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -59.130760192871094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3323274254798889, "rewards_train/margins": 0.16824871301651, "rewards_train/rejected": -0.5005761384963989, "step": 466 }, { "epoch": 0.13, "logps_train/chosen": -90.62741088867188, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -81.84719848632812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6760218143463135, "rewards_train/margins": 0.2461986541748047, "rewards_train/rejected": -0.9222204685211182, "step": 467 }, { "epoch": 0.13, "learning_rate": 4.789367887108076e-07, "loss": 0.6064, "step": 468 }, { "epoch": 0.13, "logps_train/chosen": -105.81695556640625, "logps_train/ref_chosen": -102.0, "logps_train/ref_rejected": -123.5, "logps_train/rejected": -136.29693603515625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3746643364429474, "rewards_train/margins": 0.882177084684372, "rewards_train/rejected": -1.2568414211273193, "step": 468 }, { "epoch": 0.13, "logps_train/chosen": -41.73709487915039, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -19.5, "logps_train/rejected": -21.092626571655273, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.10593586415052414, "rewards_train/margins": 0.05879557877779007, "rewards_train/rejected": -0.1647314429283142, "step": 469 }, { "epoch": 0.13, "learning_rate": 4.786874367350402e-07, "loss": 0.5404, "step": 470 }, { "epoch": 0.13, "logps_train/chosen": -42.11448669433594, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -53.18017578125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4178939759731293, "rewards_train/margins": 0.11828795075416565, "rewards_train/rejected": -0.5361819267272949, "step": 470 }, { "epoch": 0.13, "logps_train/chosen": -71.83967590332031, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -100.61727905273438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3722483515739441, "rewards_train/margins": 1.105105221271515, "rewards_train/rejected": -1.477353572845459, "step": 471 }, { "epoch": 0.13, "learning_rate": 4.784366831770317e-07, "loss": 0.4904, "step": 472 }, { "epoch": 0.13, "logps_train/chosen": -58.41155242919922, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -64.99591064453125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.286467969417572, "rewards_train/margins": 0.14593565464019775, "rewards_train/rejected": -0.4324036240577698, "step": 472 }, { "epoch": 0.13, "logps_train/chosen": -81.48255157470703, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -97.74531555175781, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.39356729388237, "rewards_train/margins": 0.349714070558548, "rewards_train/rejected": -0.743281364440918, "step": 473 }, { "epoch": 0.13, "learning_rate": 4.781845295736033e-07, "loss": 0.6057, "step": 474 }, { "epoch": 0.13, "logps_train/chosen": -24.26443099975586, "logps_train/ref_chosen": -22.875, "logps_train/ref_rejected": -34.25, "logps_train/rejected": -38.41557693481445, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.14040803909301758, "rewards_train/margins": 0.26535844802856445, "rewards_train/rejected": -0.40576648712158203, "step": 474 }, { "epoch": 0.13, "logps_train/chosen": -62.27745819091797, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -54.25, "logps_train/rejected": -56.919986724853516, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.20977705717086792, "rewards_train/margins": 0.05683097243309021, "rewards_train/rejected": -0.26660802960395813, "step": 475 }, { "epoch": 0.13, "learning_rate": 4.779309774701573e-07, "loss": 0.6405, "step": 476 }, { "epoch": 0.13, "logps_train/chosen": -54.82136535644531, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -71.8387451171875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.2172931730747223, "rewards_train/margins": 0.26462796330451965, "rewards_train/rejected": -0.48192113637924194, "step": 476 }, { "epoch": 0.13, "logps_train/chosen": -48.08940887451172, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -36.25, "logps_train/rejected": -41.057167053222656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.27642133831977844, "rewards_train/margins": 0.2077132761478424, "rewards_train/rejected": -0.48413461446762085, "step": 477 }, { "epoch": 0.13, "learning_rate": 4.776760284206667e-07, "loss": 0.6099, "step": 478 }, { "epoch": 0.13, "logps_train/chosen": -30.757614135742188, "logps_train/ref_chosen": -30.25, "logps_train/ref_rejected": -45.75, "logps_train/rejected": -52.735233306884766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.04529266804456711, "rewards_train/margins": 0.6586994007229805, "rewards_train/rejected": -0.7039920687675476, "step": 478 }, { "epoch": 0.13, "logps_train/chosen": -87.45452117919922, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -73.78488159179688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.23685763776302338, "rewards_train/margins": 0.2888965755701065, "rewards_train/rejected": -0.5257542133331299, "step": 479 }, { "epoch": 0.13, "learning_rate": 4.774196839876659e-07, "loss": 0.5428, "step": 480 }, { "epoch": 0.13, "logps_train/chosen": -66.86396789550781, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -84.89311218261719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6605181097984314, "rewards_train/margins": 0.8312340378761292, "rewards_train/rejected": -1.4917521476745605, "step": 480 }, { "epoch": 0.13, "logps_train/chosen": -28.001731872558594, "logps_train/ref_chosen": -26.125, "logps_train/ref_rejected": -34.0, "logps_train/rejected": -39.36369323730469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.18816140294075012, "rewards_train/margins": 0.35348114371299744, "rewards_train/rejected": -0.5416425466537476, "step": 481 }, { "epoch": 0.13, "learning_rate": 4.771619457422421e-07, "loss": 0.5028, "step": 482 }, { "epoch": 0.13, "logps_train/chosen": -52.00128936767578, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -51.218685150146484, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.15382777154445648, "rewards_train/margins": 0.1682240217924118, "rewards_train/rejected": -0.3220517933368683, "step": 482 }, { "epoch": 0.13, "logps_train/chosen": -60.037567138671875, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -60.619300842285156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.0021761134266853333, "rewards_train/margins": 0.6805120185017586, "rewards_train/rejected": -0.6783359050750732, "step": 483 }, { "epoch": 0.14, "learning_rate": 4.769028152640243e-07, "loss": 0.5578, "step": 484 }, { "epoch": 0.14, "logps_train/chosen": -94.0927734375, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -93.60713195800781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.48740169405937195, "rewards_train/margins": 0.5895219147205353, "rewards_train/rejected": -1.0769236087799072, "step": 484 }, { "epoch": 0.14, "logps_train/chosen": -70.69978332519531, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -92.28846740722656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6100178956985474, "rewards_train/margins": 0.5551564693450928, "rewards_train/rejected": -1.1651743650436401, "step": 485 }, { "epoch": 0.14, "learning_rate": 4.7664229414117437e-07, "loss": 0.4843, "step": 486 }, { "epoch": 0.14, "logps_train/chosen": -53.37134552001953, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -66.5076904296875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2820568084716797, "rewards_train/margins": 0.2972272038459778, "rewards_train/rejected": -0.5792840123176575, "step": 486 }, { "epoch": 0.14, "logps_train/chosen": -42.8087158203125, "logps_train/ref_chosen": -43.75, "logps_train/ref_rejected": -37.25, "logps_train/rejected": -39.998809814453125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.09525149315595627, "rewards_train/margins": 0.3722808286547661, "rewards_train/rejected": -0.2770293354988098, "step": 487 }, { "epoch": 0.14, "learning_rate": 4.7638038397037724e-07, "loss": 0.5738, "step": 488 }, { "epoch": 0.14, "logps_train/chosen": -34.80330276489258, "logps_train/ref_chosen": -33.75, "logps_train/ref_rejected": -39.0, "logps_train/rejected": -43.25440979003906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.11118964850902557, "rewards_train/margins": 0.30438844859600067, "rewards_train/rejected": -0.41557809710502625, "step": 488 }, { "epoch": 0.14, "logps_train/chosen": -50.295082092285156, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -68.56315612792969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2521645426750183, "rewards_train/margins": 0.5594242811203003, "rewards_train/rejected": -0.8115888237953186, "step": 489 }, { "epoch": 0.14, "learning_rate": 4.76117086356831e-07, "loss": 0.5276, "step": 490 }, { "epoch": 0.14, "logps_train/chosen": -31.872812271118164, "logps_train/ref_chosen": -31.375, "logps_train/ref_rejected": -28.625, "logps_train/rejected": -30.312978744506836, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.05349217355251312, "rewards_train/margins": 0.10944627225399017, "rewards_train/rejected": -0.1629384458065033, "step": 490 }, { "epoch": 0.14, "logps_train/chosen": -57.79804229736328, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -49.39804458618164, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.2505076229572296, "rewards_train/margins": 0.14193353056907654, "rewards_train/rejected": -0.39244115352630615, "step": 491 }, { "epoch": 0.14, "learning_rate": 4.758524029142372e-07, "loss": 0.6623, "step": 492 }, { "epoch": 0.14, "logps_train/chosen": -69.51055908203125, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -78.01091003417969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5385558605194092, "rewards_train/margins": 0.6375353336334229, "rewards_train/rejected": -1.176091194152832, "step": 492 }, { "epoch": 0.14, "logps_train/chosen": -71.8697509765625, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -83.75015258789062, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.0095336437225342, "rewards_train/margins": 0.502004861831665, "rewards_train/rejected": -1.5115385055541992, "step": 493 }, { "epoch": 0.14, "learning_rate": 4.7558633526479084e-07, "loss": 0.5098, "step": 494 }, { "epoch": 0.14, "logps_train/chosen": -22.331228256225586, "logps_train/ref_chosen": -21.375, "logps_train/ref_rejected": -28.875, "logps_train/rejected": -30.32866096496582, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.09894312918186188, "rewards_train/margins": 0.05130559206008911, "rewards_train/rejected": -0.150248721241951, "step": 494 }, { "epoch": 0.14, "logps_train/chosen": -62.72913360595703, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -40.75, "logps_train/rejected": -45.517066955566406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.19234707951545715, "rewards_train/margins": 0.2933442294597626, "rewards_train/rejected": -0.4856913089752197, "step": 495 }, { "epoch": 0.14, "learning_rate": 4.753188850391705e-07, "loss": 0.6269, "step": 496 }, { "epoch": 0.14, "logps_train/chosen": -29.741872787475586, "logps_train/ref_chosen": -30.25, "logps_train/ref_rejected": -41.5, "logps_train/rejected": -46.2149658203125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.055304765701293945, "rewards_train/margins": 0.5283637642860413, "rewards_train/rejected": -0.4730589985847473, "step": 496 }, { "epoch": 0.14, "logps_train/chosen": -64.40620422363281, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -66.39431762695312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.23051299154758453, "rewards_train/margins": 0.45579348504543304, "rewards_train/rejected": -0.6863064765930176, "step": 497 }, { "epoch": 0.14, "learning_rate": 4.7505005387652805e-07, "loss": 0.555, "step": 498 }, { "epoch": 0.14, "logps_train/chosen": -72.48267364501953, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -82.56196594238281, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4054940938949585, "rewards_train/margins": 0.5087108016014099, "rewards_train/rejected": -0.9142048954963684, "step": 498 }, { "epoch": 0.14, "logps_train/chosen": -86.4345703125, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -81.76953125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9524410367012024, "rewards_train/margins": 0.2749027609825134, "rewards_train/rejected": -1.2273437976837158, "step": 499 }, { "epoch": 0.14, "learning_rate": 4.747798434244793e-07, "loss": 0.6017, "step": 500 }, { "epoch": 0.14, "logps_train/chosen": -73.94851684570312, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -74.02375793457031, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.4011017084121704, "rewards_train/margins": 0.0686572790145874, "rewards_train/rejected": -0.4697589874267578, "step": 500 }, { "epoch": 0.14, "logps_train/chosen": -96.72221374511719, "logps_train/ref_chosen": -95.5, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -117.32746887207031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.12222132831811905, "rewards_train/margins": 0.6851353868842125, "rewards_train/rejected": -0.8073567152023315, "step": 501 }, { "epoch": 0.14, "learning_rate": 4.745082553390931e-07, "loss": 0.5593, "step": 502 }, { "epoch": 0.14, "logps_train/chosen": -55.377403259277344, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -89.51190185546875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.19125555455684662, "rewards_train/margins": 0.5690164715051651, "rewards_train/rejected": -0.7602720260620117, "step": 502 }, { "epoch": 0.14, "logps_train/chosen": -63.07593536376953, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -80.22622680664062, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.029125303030014038, "rewards_train/margins": 0.7242086231708527, "rewards_train/rejected": -0.6950833201408386, "step": 503 }, { "epoch": 0.14, "learning_rate": 4.742352912848817e-07, "loss": 0.4981, "step": 504 }, { "epoch": 0.14, "logps_train/chosen": -88.90757751464844, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -84.74685668945312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7782579660415649, "rewards_train/margins": 0.027678251266479492, "rewards_train/rejected": -0.8059362173080444, "step": 504 }, { "epoch": 0.14, "logps_train/chosen": -70.9320068359375, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -68.92997741699219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.37430447340011597, "rewards_train/margins": 0.5436931252479553, "rewards_train/rejected": -0.9179975986480713, "step": 505 }, { "epoch": 0.14, "learning_rate": 4.7396095293479043e-07, "loss": 0.6605, "step": 506 }, { "epoch": 0.14, "logps_train/chosen": -80.8028564453125, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -97.88168334960938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6273554563522339, "rewards_train/margins": 0.7297576665878296, "rewards_train/rejected": -1.3571131229400635, "step": 506 }, { "epoch": 0.14, "logps_train/chosen": -91.94205474853516, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -86.92388916015625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.38170573115348816, "rewards_train/margins": 0.22162029147148132, "rewards_train/rejected": -0.6033260226249695, "step": 507 }, { "epoch": 0.14, "learning_rate": 4.7368524197018735e-07, "loss": 0.5651, "step": 508 }, { "epoch": 0.14, "logps_train/chosen": -41.23701477050781, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -60.14656066894531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.026435818523168564, "rewards_train/margins": 0.5856812335550785, "rewards_train/rejected": -0.6121170520782471, "step": 508 }, { "epoch": 0.14, "logps_train/chosen": -138.77102661132812, "logps_train/ref_chosen": -129.0, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -113.13682556152344, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.9345235228538513, "rewards_train/margins": -0.14818471670150757, "rewards_train/rejected": -0.7863388061523438, "step": 509 }, { "epoch": 0.14, "learning_rate": 4.7340816008085305e-07, "loss": 0.6675, "step": 510 }, { "epoch": 0.14, "logps_train/chosen": -65.23442077636719, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -76.99596405029297, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3794972896575928, "rewards_train/margins": -0.01701033115386963, "rewards_train/rejected": -0.36248695850372314, "step": 510 }, { "epoch": 0.14, "logps_train/chosen": -98.24939727783203, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -104.49810028076172, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.4968145787715912, "rewards_train/margins": -0.05169233679771423, "rewards_train/rejected": -0.44512224197387695, "step": 511 }, { "epoch": 0.14, "learning_rate": 4.7312970896497027e-07, "loss": 0.7327, "step": 512 }, { "epoch": 0.14, "logps_train/chosen": -23.39773941040039, "logps_train/ref_chosen": -22.125, "logps_train/ref_rejected": -33.0, "logps_train/rejected": -34.852760314941406, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.12141447514295578, "rewards_train/margins": 0.05683001130819321, "rewards_train/rejected": -0.178244486451149, "step": 512 }, { "epoch": 0.14, "logps_train/chosen": -34.62717819213867, "logps_train/ref_chosen": -32.0, "logps_train/ref_rejected": -19.75, "logps_train/rejected": -22.485401153564453, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.26056942343711853, "rewards_train/margins": 0.011530131101608276, "rewards_train/rejected": -0.2720995545387268, "step": 513 }, { "epoch": 0.14, "learning_rate": 4.728498903291135e-07, "loss": 0.6824, "step": 514 }, { "epoch": 0.14, "logps_train/chosen": -75.23242950439453, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -93.727294921875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5572268962860107, "rewards_train/margins": 0.08581513166427612, "rewards_train/rejected": -0.6430420279502869, "step": 514 }, { "epoch": 0.14, "logps_train/chosen": -31.645565032958984, "logps_train/ref_chosen": -30.25, "logps_train/ref_rejected": -34.5, "logps_train/rejected": -40.616207122802734, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.13799408078193665, "rewards_train/margins": 0.4856139123439789, "rewards_train/rejected": -0.6236079931259155, "step": 515 }, { "epoch": 0.14, "learning_rate": 4.7256870588823847e-07, "loss": 0.6125, "step": 516 }, { "epoch": 0.14, "logps_train/chosen": -81.1720199584961, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -94.9151611328125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.20392099022865295, "rewards_train/margins": 0.5293920934200287, "rewards_train/rejected": -0.7333130836486816, "step": 516 }, { "epoch": 0.14, "logps_train/chosen": -62.583106994628906, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -79.09172058105469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3465920090675354, "rewards_train/margins": 0.5137519240379333, "rewards_train/rejected": -0.8603439331054688, "step": 517 }, { "epoch": 0.14, "learning_rate": 4.722861573656716e-07, "loss": 0.5278, "step": 518 }, { "epoch": 0.14, "logps_train/chosen": -80.66468048095703, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -99.61990356445312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5630502104759216, "rewards_train/margins": 0.6380017399787903, "rewards_train/rejected": -1.201051950454712, "step": 518 }, { "epoch": 0.15, "logps_train/chosen": -76.64894104003906, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -83.43063354492188, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.2889173924922943, "rewards_train/margins": 0.2760217487812042, "rewards_train/rejected": -0.5649391412734985, "step": 519 }, { "epoch": 0.15, "learning_rate": 4.7200224649309974e-07, "loss": 0.5385, "step": 520 }, { "epoch": 0.15, "logps_train/chosen": -42.11444854736328, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -48.564491271972656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.08040061593055725, "rewards_train/margins": 0.3436858654022217, "rewards_train/rejected": -0.26328524947166443, "step": 520 }, { "epoch": 0.15, "logps_train/chosen": -20.484411239624023, "logps_train/ref_chosen": -20.125, "logps_train/ref_rejected": -27.0, "logps_train/rejected": -28.30255889892578, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.036624811589717865, "rewards_train/margins": 0.09734208136796951, "rewards_train/rejected": -0.13396689295768738, "step": 521 }, { "epoch": 0.15, "learning_rate": 4.71716975010559e-07, "loss": 0.6026, "step": 522 }, { "epoch": 0.15, "logps_train/chosen": -107.2323226928711, "logps_train/ref_chosen": -100.5, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -124.77798461914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6540920734405518, "rewards_train/margins": 0.6166746616363525, "rewards_train/rejected": -1.2707667350769043, "step": 522 }, { "epoch": 0.15, "logps_train/chosen": -72.55206298828125, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -104.24154663085938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.24426917731761932, "rewards_train/margins": 0.5662143379449844, "rewards_train/rejected": -0.8104835152626038, "step": 523 }, { "epoch": 0.15, "learning_rate": 4.714303446664246e-07, "loss": 0.4907, "step": 524 }, { "epoch": 0.15, "logps_train/chosen": -110.03675079345703, "logps_train/ref_chosen": -101.5, "logps_train/ref_rejected": -118.0, "logps_train/rejected": -134.18072509765625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8478155136108398, "rewards_train/margins": 0.7593201398849487, "rewards_train/rejected": -1.6071356534957886, "step": 524 }, { "epoch": 0.15, "logps_train/chosen": -58.35194396972656, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -61.030277252197266, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3433973789215088, "rewards_train/margins": 0.3086537718772888, "rewards_train/rejected": -0.6520511507987976, "step": 525 }, { "epoch": 0.15, "learning_rate": 4.7114235721740005e-07, "loss": 0.5154, "step": 526 }, { "epoch": 0.15, "logps_train/chosen": -54.793128967285156, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -68.19047546386719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3890784680843353, "rewards_train/margins": 0.5676640570163727, "rewards_train/rejected": -0.956742525100708, "step": 526 }, { "epoch": 0.15, "logps_train/chosen": -66.0406494140625, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -69.14704132080078, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.24078363180160522, "rewards_train/margins": 0.40204566717147827, "rewards_train/rejected": -0.6428292989730835, "step": 527 }, { "epoch": 0.15, "learning_rate": 4.708530144285062e-07, "loss": 0.5388, "step": 528 }, { "epoch": 0.15, "logps_train/chosen": -52.726234436035156, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -76.1851806640625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.37876343727111816, "rewards_train/margins": 0.5359342694282532, "rewards_train/rejected": -0.9146977066993713, "step": 528 }, { "epoch": 0.15, "logps_train/chosen": -43.520347595214844, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -54.7512321472168, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2442224621772766, "rewards_train/margins": 0.49515873193740845, "rewards_train/rejected": -0.7393811941146851, "step": 529 }, { "epoch": 0.15, "learning_rate": 4.7056231807307045e-07, "loss": 0.5458, "step": 530 }, { "epoch": 0.15, "logps_train/chosen": -82.77078247070312, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -73.08013916015625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.32512497901916504, "rewards_train/margins": 0.46667778491973877, "rewards_train/rejected": -0.7918027639389038, "step": 530 }, { "epoch": 0.15, "logps_train/chosen": -75.02953338623047, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -77.51634979248047, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.06154738366603851, "rewards_train/margins": 0.8650875836610794, "rewards_train/rejected": -0.9266349673271179, "step": 531 }, { "epoch": 0.15, "learning_rate": 4.70270269932716e-07, "loss": 0.499, "step": 532 }, { "epoch": 0.15, "logps_train/chosen": -23.459375381469727, "logps_train/ref_chosen": -20.75, "logps_train/ref_rejected": -32.25, "logps_train/rejected": -36.59151077270508, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.26537108421325684, "rewards_train/margins": 0.15960031747817993, "rewards_train/rejected": -0.42497140169143677, "step": 532 }, { "epoch": 0.15, "logps_train/chosen": -51.84281921386719, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -57.507625579833984, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2215864062309265, "rewards_train/margins": 0.09128543734550476, "rewards_train/rejected": -0.3128718435764313, "step": 533 }, { "epoch": 0.15, "learning_rate": 4.699768717973511e-07, "loss": 0.6438, "step": 534 }, { "epoch": 0.15, "logps_train/chosen": -76.173583984375, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -99.55367279052734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5900144577026367, "rewards_train/margins": 0.49660229682922363, "rewards_train/rejected": -1.0866167545318604, "step": 534 }, { "epoch": 0.15, "logps_train/chosen": -70.67461395263672, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -81.53765869140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2904106378555298, "rewards_train/margins": 0.7003673315048218, "rewards_train/rejected": -0.9907779693603516, "step": 535 }, { "epoch": 0.15, "learning_rate": 4.696821254651574e-07, "loss": 0.5112, "step": 536 }, { "epoch": 0.15, "logps_train/chosen": -76.12255859375, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -80.60302734375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5864747166633606, "rewards_train/margins": 0.7764647603034973, "rewards_train/rejected": -1.362939476966858, "step": 536 }, { "epoch": 0.15, "logps_train/chosen": -104.69821166992188, "logps_train/ref_chosen": -99.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -114.86045837402344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5728488564491272, "rewards_train/margins": 1.0352677702903748, "rewards_train/rejected": -1.608116626739502, "step": 537 }, { "epoch": 0.15, "learning_rate": 4.693860327425799e-07, "loss": 0.4259, "step": 538 }, { "epoch": 0.15, "logps_train/chosen": -38.83859634399414, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -30.375, "logps_train/rejected": -32.0008659362793, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.08053939044475555, "rewards_train/margins": 0.07911746203899384, "rewards_train/rejected": -0.1596568524837494, "step": 538 }, { "epoch": 0.15, "logps_train/chosen": -130.12071228027344, "logps_train/ref_chosen": -124.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -126.61973571777344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5995713472366333, "rewards_train/margins": 0.9575196504592896, "rewards_train/rejected": -1.5570909976959229, "step": 539 }, { "epoch": 0.15, "learning_rate": 4.690885954443151e-07, "loss": 0.5288, "step": 540 }, { "epoch": 0.15, "logps_train/chosen": -89.41333770751953, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -105.00894165039062, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.9659432768821716, "rewards_train/margins": 0.3537004590034485, "rewards_train/rejected": -1.3196437358856201, "step": 540 }, { "epoch": 0.15, "logps_train/chosen": -15.983552932739258, "logps_train/ref_chosen": -16.125, "logps_train/ref_rejected": -23.75, "logps_train/rejected": -24.74352264404297, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.015365377068519592, "rewards_train/margins": 0.11022551357746124, "rewards_train/rejected": -0.09486013650894165, "step": 541 }, { "epoch": 0.15, "learning_rate": 4.687898153933001e-07, "loss": 0.6695, "step": 542 }, { "epoch": 0.15, "logps_train/chosen": -67.36192321777344, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -68.09326171875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.2351670265197754, "rewards_train/margins": 0.49930596351623535, "rewards_train/rejected": -0.7344729900360107, "step": 542 }, { "epoch": 0.15, "logps_train/chosen": -69.2124252319336, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -114.23719024658203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.302492618560791, "rewards_train/margins": 0.47435152530670166, "rewards_train/rejected": -0.7768441438674927, "step": 543 }, { "epoch": 0.15, "learning_rate": 4.6848969442070177e-07, "loss": 0.5516, "step": 544 }, { "epoch": 0.15, "logps_train/chosen": -34.0140495300293, "logps_train/ref_chosen": -34.5, "logps_train/ref_rejected": -30.5, "logps_train/rejected": -34.26715087890625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.04742322117090225, "rewards_train/margins": 0.4264822378754616, "rewards_train/rejected": -0.3790590167045593, "step": 544 }, { "epoch": 0.15, "logps_train/chosen": -57.643898010253906, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -74.52793884277344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.2847025394439697, "rewards_train/margins": 0.4878177046775818, "rewards_train/rejected": -0.7725202441215515, "step": 545 }, { "epoch": 0.15, "learning_rate": 4.6818823436590475e-07, "loss": 0.5354, "step": 546 }, { "epoch": 0.15, "logps_train/chosen": -76.83927917480469, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -94.51193237304688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.28275570273399353, "rewards_train/margins": 0.28855451941490173, "rewards_train/rejected": -0.5713102221488953, "step": 546 }, { "epoch": 0.15, "logps_train/chosen": -111.79127502441406, "logps_train/ref_chosen": -102.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -82.41812133789062, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9564964771270752, "rewards_train/margins": 0.010339975357055664, "rewards_train/rejected": -0.9668364524841309, "step": 547 }, { "epoch": 0.15, "learning_rate": 4.6788543707650116e-07, "loss": 0.6551, "step": 548 }, { "epoch": 0.15, "logps_train/chosen": -69.09648132324219, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -72.76083374023438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4018356502056122, "rewards_train/margins": 0.3637010157108307, "rewards_train/rejected": -0.7655366659164429, "step": 548 }, { "epoch": 0.15, "logps_train/chosen": -64.34158325195312, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -54.25, "logps_train/rejected": -61.416481018066406, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.7240022420883179, "rewards_train/margins": -0.015166401863098145, "rewards_train/rejected": -0.7088358402252197, "step": 549 }, { "epoch": 0.15, "learning_rate": 4.6758130440827864e-07, "loss": 0.6297, "step": 550 }, { "epoch": 0.15, "logps_train/chosen": -54.109127044677734, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -57.748172760009766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0011966973543167114, "rewards_train/margins": 0.6152716428041458, "rewards_train/rejected": -0.6140749454498291, "step": 550 }, { "epoch": 0.15, "logps_train/chosen": -63.19514465332031, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -108.71592712402344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.15935802459716797, "rewards_train/margins": 0.9356725215911865, "rewards_train/rejected": -1.0950305461883545, "step": 551 }, { "epoch": 0.15, "learning_rate": 4.672758382252089e-07, "loss": 0.445, "step": 552 }, { "epoch": 0.15, "logps_train/chosen": -50.334468841552734, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -47.449493408203125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.17289999127388, "rewards_train/margins": 0.23220571875572205, "rewards_train/rejected": -0.40510571002960205, "step": 552 }, { "epoch": 0.15, "logps_train/chosen": -50.62821578979492, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -56.75, "logps_train/rejected": -62.52609634399414, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2872355282306671, "rewards_train/margins": 0.28178027272224426, "rewards_train/rejected": -0.5690158009529114, "step": 553 }, { "epoch": 0.15, "learning_rate": 4.669690403994366e-07, "loss": 0.6167, "step": 554 }, { "epoch": 0.15, "logps_train/chosen": -68.87958526611328, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -87.34165954589844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3912666440010071, "rewards_train/margins": 0.6028479337692261, "rewards_train/rejected": -0.9941145777702332, "step": 554 }, { "epoch": 0.16, "logps_train/chosen": -43.510765075683594, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -53.08168411254883, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.11672595888376236, "rewards_train/margins": 0.6015984639525414, "rewards_train/rejected": -0.7183244228363037, "step": 555 }, { "epoch": 0.16, "learning_rate": 4.666609128112681e-07, "loss": 0.4856, "step": 556 }, { "epoch": 0.16, "logps_train/chosen": -45.0446662902832, "logps_train/ref_chosen": -44.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -74.44892883300781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.10915383696556091, "rewards_train/margins": 0.6821256577968597, "rewards_train/rejected": -0.7912794947624207, "step": 556 }, { "epoch": 0.16, "logps_train/chosen": -35.762298583984375, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -39.75, "logps_train/rejected": -42.06949996948242, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.0168546661734581, "rewards_train/margins": 0.2217358574271202, "rewards_train/rejected": -0.2385905236005783, "step": 557 }, { "epoch": 0.16, "learning_rate": 4.6635145734915914e-07, "loss": 0.5506, "step": 558 }, { "epoch": 0.16, "logps_train/chosen": -61.52577209472656, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -72.86044311523438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3555067777633667, "rewards_train/margins": 0.3422565460205078, "rewards_train/rejected": -0.6977633237838745, "step": 558 }, { "epoch": 0.16, "logps_train/chosen": -60.89529037475586, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -69.43919372558594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2145289033651352, "rewards_train/margins": 0.7424758821725845, "rewards_train/rejected": -0.9570047855377197, "step": 559 }, { "epoch": 0.16, "learning_rate": 4.6604067590970414e-07, "loss": 0.5379, "step": 560 }, { "epoch": 0.16, "logps_train/chosen": -42.53410339355469, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -41.25, "logps_train/rejected": -48.164886474609375, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.034953486174345016, "rewards_train/margins": 0.6631755717098713, "rewards_train/rejected": -0.6981290578842163, "step": 560 }, { "epoch": 0.16, "logps_train/chosen": -56.45689392089844, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -62.107749938964844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.07049411535263062, "rewards_train/margins": 0.7445778250694275, "rewards_train/rejected": -0.8150719404220581, "step": 561 }, { "epoch": 0.16, "learning_rate": 4.657285703976239e-07, "loss": 0.5179, "step": 562 }, { "epoch": 0.16, "logps_train/chosen": -46.0301399230957, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -64.97852325439453, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5315294861793518, "rewards_train/margins": 0.5905417799949646, "rewards_train/rejected": -1.1220712661743164, "step": 562 }, { "epoch": 0.16, "logps_train/chosen": -29.97800064086914, "logps_train/ref_chosen": -28.75, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -48.09043502807617, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.1265111118555069, "rewards_train/margins": 0.3512822836637497, "rewards_train/rejected": -0.4777933955192566, "step": 563 }, { "epoch": 0.16, "learning_rate": 4.6541514272575445e-07, "loss": 0.5393, "step": 564 }, { "epoch": 0.16, "logps_train/chosen": -59.70259094238281, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -66.1999282836914, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.22768065333366394, "rewards_train/margins": 0.1819603443145752, "rewards_train/rejected": -0.40964099764823914, "step": 564 }, { "epoch": 0.16, "logps_train/chosen": -71.78914642333984, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -78.57498168945312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.23555545508861542, "rewards_train/margins": 0.622724249958992, "rewards_train/rejected": -0.8582797050476074, "step": 565 }, { "epoch": 0.16, "learning_rate": 4.6510039481503485e-07, "loss": 0.5683, "step": 566 }, { "epoch": 0.16, "logps_train/chosen": -40.68168640136719, "logps_train/ref_chosen": -38.75, "logps_train/ref_rejected": -38.25, "logps_train/rejected": -43.14057159423828, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.18135224282741547, "rewards_train/margins": 0.31434543430805206, "rewards_train/rejected": -0.49569767713546753, "step": 566 }, { "epoch": 0.16, "logps_train/chosen": -82.29490661621094, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -87.00576782226562, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.9814441204071045, "rewards_train/margins": -0.19434422254562378, "rewards_train/rejected": -0.7870998978614807, "step": 567 }, { "epoch": 0.16, "learning_rate": 4.6478432859449583e-07, "loss": 0.6966, "step": 568 }, { "epoch": 0.16, "logps_train/chosen": -48.82793045043945, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -43.75, "logps_train/rejected": -49.01304626464844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.1437305361032486, "rewards_train/margins": 0.3829648345708847, "rewards_train/rejected": -0.5266953706741333, "step": 568 }, { "epoch": 0.16, "logps_train/chosen": -19.0291748046875, "logps_train/ref_chosen": -19.875, "logps_train/ref_rejected": -23.125, "logps_train/rejected": -24.782997131347656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.07878405600786209, "rewards_train/margins": 0.25057750195264816, "rewards_train/rejected": -0.17179344594478607, "step": 569 }, { "epoch": 0.16, "learning_rate": 4.644669460012478e-07, "loss": 0.5729, "step": 570 }, { "epoch": 0.16, "logps_train/chosen": -72.10989379882812, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -66.34757995605469, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.19704875349998474, "rewards_train/margins": 0.16400322318077087, "rewards_train/rejected": -0.3610519766807556, "step": 570 }, { "epoch": 0.16, "logps_train/chosen": -62.749656677246094, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -90.15699768066406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.12464368343353271, "rewards_train/margins": 0.8118283152580261, "rewards_train/rejected": -0.6871846318244934, "step": 571 }, { "epoch": 0.16, "learning_rate": 4.641482489804689e-07, "loss": 0.5792, "step": 572 }, { "epoch": 0.16, "logps_train/chosen": -87.15548706054688, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -105.77081298828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.08429862558841705, "rewards_train/margins": 0.4150484651327133, "rewards_train/rejected": -0.49934709072113037, "step": 572 }, { "epoch": 0.16, "logps_train/chosen": -70.7585678100586, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -90.46743774414062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.512184739112854, "rewards_train/margins": 0.8380746841430664, "rewards_train/rejected": -1.3502594232559204, "step": 573 }, { "epoch": 0.16, "learning_rate": 4.638282394853932e-07, "loss": 0.4728, "step": 574 }, { "epoch": 0.16, "logps_train/chosen": -68.72225189208984, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -88.58625793457031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6972252130508423, "rewards_train/margins": 0.637474775314331, "rewards_train/rejected": -1.3346999883651733, "step": 574 }, { "epoch": 0.16, "logps_train/chosen": -90.45333099365234, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -98.67056274414062, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3683798611164093, "rewards_train/margins": 0.24164548516273499, "rewards_train/rejected": -0.6100253462791443, "step": 575 }, { "epoch": 0.16, "learning_rate": 4.6350691947729845e-07, "loss": 0.5621, "step": 576 }, { "epoch": 0.16, "logps_train/chosen": -77.0277099609375, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -77.37614440917969, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6768916249275208, "rewards_train/margins": 0.6372849345207214, "rewards_train/rejected": -1.3141765594482422, "step": 576 }, { "epoch": 0.16, "logps_train/chosen": -37.29064178466797, "logps_train/ref_chosen": -33.75, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -58.67890930175781, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.35047563910484314, "rewards_train/margins": 0.5439777672290802, "rewards_train/rejected": -0.8944534063339233, "step": 577 }, { "epoch": 0.16, "learning_rate": 4.631842909254947e-07, "loss": 0.5355, "step": 578 }, { "epoch": 0.16, "logps_train/chosen": -84.76373291015625, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -108.00592041015625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5107486844062805, "rewards_train/margins": 1.342577874660492, "rewards_train/rejected": -1.8533265590667725, "step": 578 }, { "epoch": 0.16, "logps_train/chosen": -66.60935974121094, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -92.59552001953125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2109362781047821, "rewards_train/margins": 1.2681472599506378, "rewards_train/rejected": -1.47908353805542, "step": 579 }, { "epoch": 0.16, "learning_rate": 4.628603558073115e-07, "loss": 0.3661, "step": 580 }, { "epoch": 0.16, "logps_train/chosen": -81.98976135253906, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -95.69758605957031, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.35717883706092834, "rewards_train/margins": 0.37820491194725037, "rewards_train/rejected": -0.7353837490081787, "step": 580 }, { "epoch": 0.16, "logps_train/chosen": -80.45840454101562, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -93.5, "logps_train/rejected": -107.05191040039062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3048250675201416, "rewards_train/margins": 1.0464599132537842, "rewards_train/rejected": -1.3512849807739258, "step": 581 }, { "epoch": 0.16, "learning_rate": 4.6253511610808625e-07, "loss": 0.4926, "step": 582 }, { "epoch": 0.16, "logps_train/chosen": -87.01325988769531, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -77.6513671875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4786696434020996, "rewards_train/margins": 0.3825607895851135, "rewards_train/rejected": -0.8612304329872131, "step": 582 }, { "epoch": 0.16, "logps_train/chosen": -49.4372444152832, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -75.7767105102539, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.023411860689520836, "rewards_train/margins": 0.7585559580475092, "rewards_train/rejected": -0.78196781873703, "step": 583 }, { "epoch": 0.16, "learning_rate": 4.622085738211518e-07, "loss": 0.5155, "step": 584 }, { "epoch": 0.16, "logps_train/chosen": -44.25600051879883, "logps_train/ref_chosen": -38.75, "logps_train/ref_rejected": -44.75, "logps_train/rejected": -52.15533447265625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5583515763282776, "rewards_train/margins": 0.17606604099273682, "rewards_train/rejected": -0.7344176173210144, "step": 584 }, { "epoch": 0.16, "logps_train/chosen": -35.850746154785156, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -63.023162841796875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.03819965571165085, "rewards_train/margins": 0.5672416165471077, "rewards_train/rejected": -0.6054412722587585, "step": 585 }, { "epoch": 0.16, "learning_rate": 4.618807309478243e-07, "loss": 0.5876, "step": 586 }, { "epoch": 0.16, "logps_train/chosen": -73.76558685302734, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -79.83191680908203, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7403284311294556, "rewards_train/margins": 0.5956953763961792, "rewards_train/rejected": -1.3360238075256348, "step": 586 }, { "epoch": 0.16, "logps_train/chosen": -91.15377044677734, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -92.85140991210938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.592720627784729, "rewards_train/margins": 0.6736701726913452, "rewards_train/rejected": -1.2663908004760742, "step": 587 }, { "epoch": 0.16, "learning_rate": 4.6155158949739103e-07, "loss": 0.4957, "step": 588 }, { "epoch": 0.16, "logps_train/chosen": -53.04076385498047, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -53.293479919433594, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.4956779479980469, "rewards_train/margins": 0.08484196662902832, "rewards_train/rejected": -0.5805199146270752, "step": 588 }, { "epoch": 0.16, "logps_train/chosen": -52.326904296875, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -48.143741607666016, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3766355514526367, "rewards_train/margins": 0.007074713706970215, "rewards_train/rejected": -0.38371026515960693, "step": 589 }, { "epoch": 0.16, "learning_rate": 4.6122115148709793e-07, "loss": 0.683, "step": 590 }, { "epoch": 0.16, "logps_train/chosen": -59.204689025878906, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -65.17899322509766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.11109380424022675, "rewards_train/margins": 0.21383674442768097, "rewards_train/rejected": -0.3249305486679077, "step": 590 }, { "epoch": 0.17, "logps_train/chosen": -60.41273498535156, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -82.80255126953125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.24361774325370789, "rewards_train/margins": 1.0311680138111115, "rewards_train/rejected": -1.2747857570648193, "step": 591 }, { "epoch": 0.17, "learning_rate": 4.608894189421374e-07, "loss": 0.5469, "step": 592 }, { "epoch": 0.17, "logps_train/chosen": -43.5977668762207, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -51.89591598510742, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.15645642578601837, "rewards_train/margins": 0.42454157769680023, "rewards_train/rejected": -0.5809980034828186, "step": 592 }, { "epoch": 0.17, "logps_train/chosen": -97.82725524902344, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -89.68708801269531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6387799978256226, "rewards_train/margins": 0.6754366159439087, "rewards_train/rejected": -1.3142166137695312, "step": 593 }, { "epoch": 0.17, "learning_rate": 4.6055639389563573e-07, "loss": 0.5269, "step": 594 }, { "epoch": 0.17, "logps_train/chosen": -78.71722412109375, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -87.13782501220703, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8431581258773804, "rewards_train/margins": 0.5284371376037598, "rewards_train/rejected": -1.3715952634811401, "step": 594 }, { "epoch": 0.17, "logps_train/chosen": -98.85196685791016, "logps_train/ref_chosen": -94.5, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -107.1817626953125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4551188051700592, "rewards_train/margins": 0.703193873167038, "rewards_train/rejected": -1.1583126783370972, "step": 595 }, { "epoch": 0.17, "learning_rate": 4.6022207838864073e-07, "loss": 0.4972, "step": 596 }, { "epoch": 0.17, "logps_train/chosen": -104.48693084716797, "logps_train/ref_chosen": -100.0, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -122.48839569091797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.46881014108657837, "rewards_train/margins": 0.839794933795929, "rewards_train/rejected": -1.3086050748825073, "step": 596 }, { "epoch": 0.17, "logps_train/chosen": -72.1766357421875, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -88.05975341796875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.34539756178855896, "rewards_train/margins": 0.8301087915897369, "rewards_train/rejected": -1.175506353378296, "step": 597 }, { "epoch": 0.17, "learning_rate": 4.598864744701092e-07, "loss": 0.4649, "step": 598 }, { "epoch": 0.17, "logps_train/chosen": -57.29808807373047, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -78.87712097167969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2516838014125824, "rewards_train/margins": 1.1243096888065338, "rewards_train/rejected": -1.3759934902191162, "step": 598 }, { "epoch": 0.17, "logps_train/chosen": -71.5537109375, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -84.06674194335938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.1652102768421173, "rewards_train/margins": 0.9629485309123993, "rewards_train/rejected": -1.1281588077545166, "step": 599 }, { "epoch": 0.17, "learning_rate": 4.595495841968944e-07, "loss": 0.4377, "step": 600 }, { "epoch": 0.17, "logps_train/chosen": -35.55915832519531, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -53.199073791503906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.011466770432889462, "rewards_train/margins": 0.4583270838484168, "rewards_train/rejected": -0.44686031341552734, "step": 600 }, { "epoch": 0.17, "logps_train/chosen": -69.23992919921875, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -76.4552230834961, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.0794619768857956, "rewards_train/margins": 0.830122783780098, "rewards_train/rejected": -0.9095847606658936, "step": 601 }, { "epoch": 0.17, "learning_rate": 4.592114096337333e-07, "loss": 0.487, "step": 602 }, { "epoch": 0.17, "logps_train/chosen": -61.107452392578125, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -54.82007598876953, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6993560791015625, "rewards_train/margins": -0.008034110069274902, "rewards_train/rejected": -0.6913219690322876, "step": 602 }, { "epoch": 0.17, "logps_train/chosen": -48.074554443359375, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -72.86968231201172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3742521107196808, "rewards_train/margins": 0.7513880431652069, "rewards_train/rejected": -1.1256401538848877, "step": 603 }, { "epoch": 0.17, "learning_rate": 4.588719528532341e-07, "loss": 0.6121, "step": 604 }, { "epoch": 0.17, "logps_train/chosen": -55.13451385498047, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -60.32548522949219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3062373399734497, "rewards_train/margins": 0.35835492610931396, "rewards_train/rejected": -0.6645922660827637, "step": 604 }, { "epoch": 0.17, "logps_train/chosen": -79.34104919433594, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -90.52935028076172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3741443455219269, "rewards_train/margins": 0.7194158732891083, "rewards_train/rejected": -1.0935602188110352, "step": 605 }, { "epoch": 0.17, "learning_rate": 4.5853121593586365e-07, "loss": 0.5288, "step": 606 }, { "epoch": 0.17, "logps_train/chosen": -37.593963623046875, "logps_train/ref_chosen": -35.0, "logps_train/ref_rejected": -39.75, "logps_train/rejected": -47.17842102050781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.25549042224884033, "rewards_train/margins": 0.49770355224609375, "rewards_train/rejected": -0.7531939744949341, "step": 606 }, { "epoch": 0.17, "logps_train/chosen": -91.64691162109375, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -120.29931640625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.741253674030304, "rewards_train/margins": 0.7820373177528381, "rewards_train/rejected": -1.523290991783142, "step": 607 }, { "epoch": 0.17, "learning_rate": 4.581892009699342e-07, "loss": 0.5044, "step": 608 }, { "epoch": 0.17, "logps_train/chosen": -69.13575744628906, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -83.65777587890625, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.34111425280570984, "rewards_train/margins": 0.2746632397174835, "rewards_train/rejected": -0.6157774925231934, "step": 608 }, { "epoch": 0.17, "logps_train/chosen": -73.95113372802734, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -81.9327392578125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5802695751190186, "rewards_train/margins": 0.517691969871521, "rewards_train/rejected": -1.0979615449905396, "step": 609 }, { "epoch": 0.17, "learning_rate": 4.578459100515911e-07, "loss": 0.558, "step": 610 }, { "epoch": 0.17, "logps_train/chosen": -103.4654541015625, "logps_train/ref_chosen": -96.5, "logps_train/ref_rejected": -115.5, "logps_train/rejected": -131.02651977539062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6910765171051025, "rewards_train/margins": 0.8644089698791504, "rewards_train/rejected": -1.555485486984253, "step": 610 }, { "epoch": 0.17, "logps_train/chosen": -90.72823333740234, "logps_train/ref_chosen": -90.5, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -118.1752700805664, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.022041872143745422, "rewards_train/margins": 1.3849381655454636, "rewards_train/rejected": -1.406980037689209, "step": 611 }, { "epoch": 0.17, "learning_rate": 4.5750134528479987e-07, "loss": 0.4026, "step": 612 }, { "epoch": 0.17, "logps_train/chosen": -38.14342498779297, "logps_train/ref_chosen": -38.75, "logps_train/ref_rejected": -40.0, "logps_train/rejected": -42.50956726074219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.0489390604197979, "rewards_train/margins": 0.29442719742655754, "rewards_train/rejected": -0.24548813700675964, "step": 612 }, { "epoch": 0.17, "logps_train/chosen": -71.17725372314453, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -94.51168823242188, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.2556159794330597, "rewards_train/margins": 0.49086567759513855, "rewards_train/rejected": -0.7464816570281982, "step": 613 }, { "epoch": 0.17, "learning_rate": 4.57155508781333e-07, "loss": 0.5611, "step": 614 }, { "epoch": 0.17, "logps_train/chosen": -75.19429016113281, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -105.45350646972656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.47997623682022095, "rewards_train/margins": 1.1724061369895935, "rewards_train/rejected": -1.6523823738098145, "step": 614 }, { "epoch": 0.17, "logps_train/chosen": -81.6805648803711, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -78.3815689086914, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.77650386095047, "rewards_train/margins": 0.5130689740180969, "rewards_train/rejected": -1.289572834968567, "step": 615 }, { "epoch": 0.17, "learning_rate": 4.568084026607574e-07, "loss": 0.4385, "step": 616 }, { "epoch": 0.17, "logps_train/chosen": -83.96728515625, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -89.79374694824219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.44086921215057373, "rewards_train/margins": 0.8107714653015137, "rewards_train/rejected": -1.2516406774520874, "step": 616 }, { "epoch": 0.17, "logps_train/chosen": -39.587135314941406, "logps_train/ref_chosen": -36.5, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -49.2750244140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.310862272977829, "rewards_train/margins": 0.37484338879585266, "rewards_train/rejected": -0.6857056617736816, "step": 617 }, { "epoch": 0.17, "learning_rate": 4.5646002905042096e-07, "loss": 0.5272, "step": 618 }, { "epoch": 0.17, "logps_train/chosen": -44.052940368652344, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -68.78353118896484, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2131064385175705, "rewards_train/margins": 0.9093870669603348, "rewards_train/rejected": -1.1224935054779053, "step": 618 }, { "epoch": 0.17, "logps_train/chosen": -102.40580749511719, "logps_train/ref_chosen": -100.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -88.42894744873047, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.22124440968036652, "rewards_train/margins": 0.8700881451368332, "rewards_train/rejected": -1.0913325548171997, "step": 619 }, { "epoch": 0.17, "learning_rate": 4.5611039008544007e-07, "loss": 0.4397, "step": 620 }, { "epoch": 0.17, "logps_train/chosen": -97.11043548583984, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -96.13004302978516, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6344809532165527, "rewards_train/margins": 0.3949296474456787, "rewards_train/rejected": -1.0294106006622314, "step": 620 }, { "epoch": 0.17, "logps_train/chosen": -61.36271286010742, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -63.25, "logps_train/rejected": -75.26327514648438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.13861505687236786, "rewards_train/margins": 1.0523604899644852, "rewards_train/rejected": -1.190975546836853, "step": 621 }, { "epoch": 0.17, "learning_rate": 4.5575948790868603e-07, "loss": 0.4873, "step": 622 }, { "epoch": 0.17, "logps_train/chosen": -93.0059814453125, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -103.60557556152344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3269651532173157, "rewards_train/margins": 1.222850501537323, "rewards_train/rejected": -1.5498156547546387, "step": 622 }, { "epoch": 0.17, "logps_train/chosen": -82.55561828613281, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -94.36223602294922, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3866158127784729, "rewards_train/margins": 0.893748939037323, "rewards_train/rejected": -1.280364751815796, "step": 623 }, { "epoch": 0.17, "learning_rate": 4.5540732467077233e-07, "loss": 0.4044, "step": 624 }, { "epoch": 0.17, "logps_train/chosen": -41.70579528808594, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -58.41779708862305, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.019016893580555916, "rewards_train/margins": 0.7829191368073225, "rewards_train/rejected": -0.8019360303878784, "step": 624 }, { "epoch": 0.17, "logps_train/chosen": -37.41344451904297, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -52.190956115722656, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.18987981975078583, "rewards_train/margins": -0.01346006989479065, "rewards_train/rejected": -0.17641974985599518, "step": 625 }, { "epoch": 0.17, "learning_rate": 4.5505390253004104e-07, "loss": 0.5797, "step": 626 }, { "epoch": 0.17, "logps_train/chosen": -75.43762969970703, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -86.45477294921875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3323371410369873, "rewards_train/margins": 0.812847375869751, "rewards_train/rejected": -1.1451845169067383, "step": 626 }, { "epoch": 0.18, "logps_train/chosen": -85.30070495605469, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -118.30488586425781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9474532604217529, "rewards_train/margins": 1.0707910060882568, "rewards_train/rejected": -2.0182442665100098, "step": 627 }, { "epoch": 0.18, "learning_rate": 4.5469922365254995e-07, "loss": 0.4124, "step": 628 }, { "epoch": 0.18, "logps_train/chosen": -68.51701354980469, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -91.02525329589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3190349042415619, "rewards_train/margins": 1.1977479755878448, "rewards_train/rejected": -1.5167828798294067, "step": 628 }, { "epoch": 0.18, "logps_train/chosen": -64.99640655517578, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -86.93624877929688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5605781674385071, "rewards_train/margins": 1.1076564192771912, "rewards_train/rejected": -1.6682345867156982, "step": 629 }, { "epoch": 0.18, "learning_rate": 4.543432902120591e-07, "loss": 0.4286, "step": 630 }, { "epoch": 0.18, "logps_train/chosen": -70.08255004882812, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -76.27694702148438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8255029320716858, "rewards_train/margins": 0.43936246633529663, "rewards_train/rejected": -1.2648653984069824, "step": 630 }, { "epoch": 0.18, "logps_train/chosen": -32.9892463684082, "logps_train/ref_chosen": -28.375, "logps_train/ref_rejected": -37.75, "logps_train/rejected": -42.423362731933594, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.45908087491989136, "rewards_train/margins": -0.003072798252105713, "rewards_train/rejected": -0.45600807666778564, "step": 631 }, { "epoch": 0.18, "learning_rate": 4.5398610439001754e-07, "loss": 0.6323, "step": 632 }, { "epoch": 0.18, "logps_train/chosen": -70.62435150146484, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -88.1588134765625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2561851739883423, "rewards_train/margins": 1.2421189546585083, "rewards_train/rejected": -1.4983041286468506, "step": 632 }, { "epoch": 0.18, "logps_train/chosen": -37.48558044433594, "logps_train/ref_chosen": -37.25, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -52.9913330078125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.018186796456575394, "rewards_train/margins": 0.5228411667048931, "rewards_train/rejected": -0.5410279631614685, "step": 633 }, { "epoch": 0.18, "learning_rate": 4.5362766837555e-07, "loss": 0.462, "step": 634 }, { "epoch": 0.18, "logps_train/chosen": -59.0457878112793, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -66.14567565917969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3864145874977112, "rewards_train/margins": 0.9375272393226624, "rewards_train/rejected": -1.3239418268203735, "step": 634 }, { "epoch": 0.18, "logps_train/chosen": -50.50392150878906, "logps_train/ref_chosen": -47.25, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -54.16680145263672, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.31679850816726685, "rewards_train/margins": 0.23738187551498413, "rewards_train/rejected": -0.554180383682251, "step": 635 }, { "epoch": 0.18, "learning_rate": 4.5326798436544323e-07, "loss": 0.5731, "step": 636 }, { "epoch": 0.18, "logps_train/chosen": -61.84890365600586, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -76.36966705322266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.02512475848197937, "rewards_train/margins": 0.9423111975193024, "rewards_train/rejected": -0.9674359560012817, "step": 636 }, { "epoch": 0.18, "logps_train/chosen": -74.06765747070312, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -85.84880065917969, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.15209071338176727, "rewards_train/margins": 0.5753799229860306, "rewards_train/rejected": -0.7274706363677979, "step": 637 }, { "epoch": 0.18, "learning_rate": 4.5290705456413274e-07, "loss": 0.4851, "step": 638 }, { "epoch": 0.18, "logps_train/chosen": -71.73297882080078, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -71.759033203125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.34693074226379395, "rewards_train/margins": 0.4447438716888428, "rewards_train/rejected": -0.7916746139526367, "step": 638 }, { "epoch": 0.18, "logps_train/chosen": -90.84843444824219, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -103.9638900756836, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8617967367172241, "rewards_train/margins": 0.9881085157394409, "rewards_train/rejected": -1.849905252456665, "step": 639 }, { "epoch": 0.18, "learning_rate": 4.525448811836895e-07, "loss": 0.5341, "step": 640 }, { "epoch": 0.18, "logps_train/chosen": -96.2398910522461, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -110.59107971191406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.605202317237854, "rewards_train/margins": 0.5812125205993652, "rewards_train/rejected": -1.1864148378372192, "step": 640 }, { "epoch": 0.18, "logps_train/chosen": -40.226585388183594, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -58.57625198364258, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.06034912168979645, "rewards_train/margins": 0.5128963440656662, "rewards_train/rejected": -0.45254722237586975, "step": 641 }, { "epoch": 0.18, "learning_rate": 4.521814664438058e-07, "loss": 0.5262, "step": 642 }, { "epoch": 0.18, "logps_train/chosen": -40.263187408447266, "logps_train/ref_chosen": -36.0, "logps_train/ref_rejected": -24.5, "logps_train/rejected": -30.442285537719727, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.4275878965854645, "rewards_train/margins": 0.1609766185283661, "rewards_train/rejected": -0.5885645151138306, "step": 642 }, { "epoch": 0.18, "logps_train/chosen": -33.801544189453125, "logps_train/ref_chosen": -29.5, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -48.0215950012207, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.43054524064064026, "rewards_train/margins": 0.026692569255828857, "rewards_train/rejected": -0.4572378098964691, "step": 643 }, { "epoch": 0.18, "learning_rate": 4.5181681257178235e-07, "loss": 0.6861, "step": 644 }, { "epoch": 0.18, "logps_train/chosen": -73.14881896972656, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -79.70219421386719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7141008377075195, "rewards_train/margins": 0.7440091371536255, "rewards_train/rejected": -1.458109974861145, "step": 644 }, { "epoch": 0.18, "logps_train/chosen": -63.07398223876953, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -72.7054672241211, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.425659716129303, "rewards_train/margins": 0.616371214389801, "rewards_train/rejected": -1.042030930519104, "step": 645 }, { "epoch": 0.18, "learning_rate": 4.514509218025139e-07, "loss": 0.4893, "step": 646 }, { "epoch": 0.18, "logps_train/chosen": -32.77337646484375, "logps_train/ref_chosen": -32.75, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -51.40776824951172, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.001751527190208435, "rewards_train/margins": 0.24722842872142792, "rewards_train/rejected": -0.24897995591163635, "step": 646 }, { "epoch": 0.18, "logps_train/chosen": -95.13325500488281, "logps_train/ref_chosen": -90.5, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -115.75078582763672, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4730914235115051, "rewards_train/margins": 0.8523772358894348, "rewards_train/rejected": -1.32546865940094, "step": 647 }, { "epoch": 0.18, "learning_rate": 4.510837963784762e-07, "loss": 0.5427, "step": 648 }, { "epoch": 0.18, "logps_train/chosen": -61.911338806152344, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -90.363525390625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.12453193217515945, "rewards_train/margins": 1.2088908478617668, "rewards_train/rejected": -1.3334227800369263, "step": 648 }, { "epoch": 0.18, "logps_train/chosen": -88.54911041259766, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -115.21802520751953, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.36233267188072205, "rewards_train/margins": 1.0243140757083893, "rewards_train/rejected": -1.3866467475891113, "step": 649 }, { "epoch": 0.18, "learning_rate": 4.507154385497117e-07, "loss": 0.4207, "step": 650 }, { "epoch": 0.18, "logps_train/chosen": -81.17900085449219, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -102.59858703613281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.345048725605011, "rewards_train/margins": 1.1343411803245544, "rewards_train/rejected": -1.4793899059295654, "step": 650 }, { "epoch": 0.18, "logps_train/chosen": -72.46735382080078, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -83.43834686279297, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6561101078987122, "rewards_train/margins": 0.20139652490615845, "rewards_train/rejected": -0.8575066328048706, "step": 651 }, { "epoch": 0.18, "learning_rate": 4.5034585057381626e-07, "loss": 0.5609, "step": 652 }, { "epoch": 0.18, "logps_train/chosen": -65.99787902832031, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -60.746192932128906, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.363298624753952, "rewards_train/margins": 0.71584752202034, "rewards_train/rejected": -0.35254889726638794, "step": 652 }, { "epoch": 0.18, "logps_train/chosen": -64.7233657836914, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -82.37864685058594, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3309304118156433, "rewards_train/margins": 0.6815438866615295, "rewards_train/rejected": -1.0124742984771729, "step": 653 }, { "epoch": 0.18, "learning_rate": 4.49975034715925e-07, "loss": 0.5347, "step": 654 }, { "epoch": 0.18, "logps_train/chosen": -35.867515563964844, "logps_train/ref_chosen": -34.75, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -55.71794128417969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.10979853570461273, "rewards_train/margins": 0.4460289031267166, "rewards_train/rejected": -0.5558274388313293, "step": 654 }, { "epoch": 0.18, "logps_train/chosen": -96.11976623535156, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -111.7199935913086, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8938125371932983, "rewards_train/margins": 0.9885386228561401, "rewards_train/rejected": -1.8823511600494385, "step": 655 }, { "epoch": 0.18, "learning_rate": 4.4960299324869857e-07, "loss": 0.4862, "step": 656 }, { "epoch": 0.18, "logps_train/chosen": -72.49343872070312, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -91.55741882324219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7552034258842468, "rewards_train/margins": 1.1499527096748352, "rewards_train/rejected": -1.905156135559082, "step": 656 }, { "epoch": 0.18, "logps_train/chosen": -71.75846099853516, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -95.741455078125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4320959150791168, "rewards_train/margins": 0.6506434381008148, "rewards_train/rejected": -1.0827393531799316, "step": 657 }, { "epoch": 0.18, "learning_rate": 4.4922972845230895e-07, "loss": 0.4493, "step": 658 }, { "epoch": 0.18, "logps_train/chosen": -85.15675354003906, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -89.21485137939453, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.24184703826904297, "rewards_train/margins": 0.3972163796424866, "rewards_train/rejected": -0.6390634179115295, "step": 658 }, { "epoch": 0.18, "logps_train/chosen": -98.7637939453125, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -108.19346618652344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7701294422149658, "rewards_train/margins": 1.2867178916931152, "rewards_train/rejected": -2.056847333908081, "step": 659 }, { "epoch": 0.18, "learning_rate": 4.4885524261442585e-07, "loss": 0.4751, "step": 660 }, { "epoch": 0.18, "logps_train/chosen": -79.21875, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -93.99169158935547, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0007820129394531, "rewards_train/margins": 0.6814925670623779, "rewards_train/rejected": -1.682274580001831, "step": 660 }, { "epoch": 0.18, "logps_train/chosen": -68.07029724121094, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -94.90338897705078, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6750957369804382, "rewards_train/margins": 0.9785241484642029, "rewards_train/rejected": -1.6536198854446411, "step": 661 }, { "epoch": 0.19, "learning_rate": 4.4847953803020246e-07, "loss": 0.4814, "step": 662 }, { "epoch": 0.19, "logps_train/chosen": -57.656044006347656, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -78.78140258789062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.1460731029510498, "rewards_train/margins": 1.0043323040008545, "rewards_train/rejected": -1.1504054069519043, "step": 662 }, { "epoch": 0.19, "logps_train/chosen": -86.06736755371094, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -92.02853393554688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4536122679710388, "rewards_train/margins": 0.43361592292785645, "rewards_train/rejected": -0.8872281908988953, "step": 663 }, { "epoch": 0.19, "learning_rate": 4.481026170022614e-07, "loss": 0.4835, "step": 664 }, { "epoch": 0.19, "logps_train/chosen": -75.21578216552734, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -88.8048095703125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7039028406143188, "rewards_train/margins": 1.113687515258789, "rewards_train/rejected": -1.817590355873108, "step": 664 }, { "epoch": 0.19, "logps_train/chosen": -89.71745300292969, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -92.02498626708984, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.843620240688324, "rewards_train/margins": 0.8002843260765076, "rewards_train/rejected": -1.6439045667648315, "step": 665 }, { "epoch": 0.19, "learning_rate": 4.4772448184068067e-07, "loss": 0.5129, "step": 666 }, { "epoch": 0.19, "logps_train/chosen": -85.15534210205078, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -94.31193542480469, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8545966148376465, "rewards_train/margins": 0.23831546306610107, "rewards_train/rejected": -1.0929120779037476, "step": 666 }, { "epoch": 0.19, "logps_train/chosen": -23.884380340576172, "logps_train/ref_chosen": -23.75, "logps_train/ref_rejected": -18.5, "logps_train/rejected": -23.69770050048828, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.014219170436263084, "rewards_train/margins": 0.49969165958464146, "rewards_train/rejected": -0.5139108300209045, "step": 667 }, { "epoch": 0.19, "learning_rate": 4.4734513486297964e-07, "loss": 0.5766, "step": 668 }, { "epoch": 0.19, "logps_train/chosen": -56.38758850097656, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -69.25550079345703, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2778211534023285, "rewards_train/margins": 0.9207759201526642, "rewards_train/rejected": -1.1985970735549927, "step": 668 }, { "epoch": 0.19, "logps_train/chosen": -59.53071212768555, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -39.25, "logps_train/rejected": -45.45075225830078, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.21596190333366394, "rewards_train/margins": 0.3982537090778351, "rewards_train/rejected": -0.614215612411499, "step": 669 }, { "epoch": 0.19, "learning_rate": 4.4696457839410427e-07, "loss": 0.499, "step": 670 }, { "epoch": 0.19, "logps_train/chosen": -61.48435592651367, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -52.75, "logps_train/rejected": -67.98274230957031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7351545095443726, "rewards_train/margins": 0.7779637575149536, "rewards_train/rejected": -1.5131182670593262, "step": 670 }, { "epoch": 0.19, "logps_train/chosen": -78.70622253417969, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -94.24607849121094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.7309736609458923, "rewards_train/margins": 0.7086732983589172, "rewards_train/rejected": -1.4396469593048096, "step": 671 }, { "epoch": 0.19, "learning_rate": 4.465828147664137e-07, "loss": 0.5492, "step": 672 }, { "epoch": 0.19, "logps_train/chosen": -62.592857360839844, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -67.83057403564453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.34668827056884766, "rewards_train/margins": 0.641251802444458, "rewards_train/rejected": -0.9879400730133057, "step": 672 }, { "epoch": 0.19, "logps_train/chosen": -73.33329010009766, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -77.51690673828125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6296181082725525, "rewards_train/margins": 0.3064476251602173, "rewards_train/rejected": -0.9360657334327698, "step": 673 }, { "epoch": 0.19, "learning_rate": 4.4619984631966524e-07, "loss": 0.5517, "step": 674 }, { "epoch": 0.19, "logps_train/chosen": -50.96469497680664, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -58.390281677246094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.23562970757484436, "rewards_train/margins": 0.494804710149765, "rewards_train/rejected": -0.7304344177246094, "step": 674 }, { "epoch": 0.19, "logps_train/chosen": -81.35869598388672, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -81.34394836425781, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.0551568940281868, "rewards_train/margins": 1.0149314925074577, "rewards_train/rejected": -1.0700883865356445, "step": 675 }, { "epoch": 0.19, "learning_rate": 4.458156754010004e-07, "loss": 0.4865, "step": 676 }, { "epoch": 0.19, "logps_train/chosen": -56.81840515136719, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -73.55111694335938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7370163798332214, "rewards_train/margins": 0.7875295281410217, "rewards_train/rejected": -1.5245459079742432, "step": 676 }, { "epoch": 0.19, "logps_train/chosen": -52.559120178222656, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -51.95066833496094, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.7412638664245605, "rewards_train/margins": 0.03309953212738037, "rewards_train/rejected": -0.7743633985519409, "step": 677 }, { "epoch": 0.19, "learning_rate": 4.4543030436493036e-07, "loss": 0.6319, "step": 678 }, { "epoch": 0.19, "logps_train/chosen": -51.42641830444336, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -68.86459350585938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.34811049699783325, "rewards_train/margins": 1.0535833239555359, "rewards_train/rejected": -1.4016938209533691, "step": 678 }, { "epoch": 0.19, "logps_train/chosen": -98.58399963378906, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -115.09819030761719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1162129640579224, "rewards_train/margins": 0.6939965486526489, "rewards_train/rejected": -1.8102095127105713, "step": 679 }, { "epoch": 0.19, "learning_rate": 4.450437355733217e-07, "loss": 0.4698, "step": 680 }, { "epoch": 0.19, "logps_train/chosen": -54.211544036865234, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -60.974143981933594, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.5453733205795288, "rewards_train/margins": 0.1684473156929016, "rewards_train/rejected": -0.7138206362724304, "step": 680 }, { "epoch": 0.19, "logps_train/chosen": -73.12992095947266, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -94.11784362792969, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6758828163146973, "rewards_train/margins": 0.38355767726898193, "rewards_train/rejected": -1.0594404935836792, "step": 681 }, { "epoch": 0.19, "learning_rate": 4.4465597139538175e-07, "loss": 0.6193, "step": 682 }, { "epoch": 0.19, "logps_train/chosen": -62.34566116333008, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -85.4367904663086, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9302570223808289, "rewards_train/margins": 0.7649847865104675, "rewards_train/rejected": -1.6952418088912964, "step": 682 }, { "epoch": 0.19, "logps_train/chosen": -80.58963775634766, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -67.1247329711914, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.2303017377853394, "rewards_train/margins": 0.2347104549407959, "rewards_train/rejected": -1.4650121927261353, "step": 683 }, { "epoch": 0.19, "learning_rate": 4.442670142076441e-07, "loss": 0.5485, "step": 684 }, { "epoch": 0.19, "logps_train/chosen": -46.678382873535156, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -54.25, "logps_train/rejected": -60.481929779052734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.01731795445084572, "rewards_train/margins": 0.6487140096724033, "rewards_train/rejected": -0.6313960552215576, "step": 684 }, { "epoch": 0.19, "logps_train/chosen": -75.64036560058594, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -90.77500915527344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3628641664981842, "rewards_train/margins": 0.8718629777431488, "rewards_train/rejected": -1.234727144241333, "step": 685 }, { "epoch": 0.19, "learning_rate": 4.4387686639395427e-07, "loss": 0.4704, "step": 686 }, { "epoch": 0.19, "logps_train/chosen": -97.7982177734375, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -110.5, "logps_train/rejected": -132.18392944335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4970096945762634, "rewards_train/margins": 1.6504855751991272, "rewards_train/rejected": -2.1474952697753906, "step": 686 }, { "epoch": 0.19, "logps_train/chosen": -35.54115295410156, "logps_train/ref_chosen": -32.0, "logps_train/ref_rejected": -28.75, "logps_train/rejected": -35.30659484863281, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.34591197967529297, "rewards_train/margins": 0.30901503562927246, "rewards_train/rejected": -0.6549270153045654, "step": 687 }, { "epoch": 0.19, "learning_rate": 4.4348553034545455e-07, "loss": 0.4681, "step": 688 }, { "epoch": 0.19, "logps_train/chosen": -75.43375396728516, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -111.872802734375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.575553297996521, "rewards_train/margins": 1.1656333208084106, "rewards_train/rejected": -1.7411866188049316, "step": 688 }, { "epoch": 0.19, "logps_train/chosen": -54.86640930175781, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -44.75, "logps_train/rejected": -51.671295166015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3280473053455353, "rewards_train/margins": 0.3699416220188141, "rewards_train/rejected": -0.6979889273643494, "step": 689 }, { "epoch": 0.19, "learning_rate": 4.4309300846056997e-07, "loss": 0.4749, "step": 690 }, { "epoch": 0.19, "logps_train/chosen": -65.31968688964844, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -115.38182067871094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.12357078492641449, "rewards_train/margins": 1.2216432243585587, "rewards_train/rejected": -1.3452140092849731, "step": 690 }, { "epoch": 0.19, "logps_train/chosen": -54.80620574951172, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -85.59770965576172, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.27944865822792053, "rewards_train/margins": 0.6799319684505463, "rewards_train/rejected": -0.9593806266784668, "step": 691 }, { "epoch": 0.19, "learning_rate": 4.426993031449934e-07, "loss": 0.4428, "step": 692 }, { "epoch": 0.19, "logps_train/chosen": -81.28584289550781, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -98.84505462646484, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6164754629135132, "rewards_train/margins": 1.0586549043655396, "rewards_train/rejected": -1.6751303672790527, "step": 692 }, { "epoch": 0.19, "logps_train/chosen": -56.008544921875, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -62.47737121582031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.12531700730323792, "rewards_train/margins": 1.0977133214473724, "rewards_train/rejected": -1.2230303287506104, "step": 693 }, { "epoch": 0.19, "learning_rate": 4.423044168116703e-07, "loss": 0.4061, "step": 694 }, { "epoch": 0.19, "logps_train/chosen": -50.23446273803711, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -55.06452941894531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.27032139897346497, "rewards_train/margins": 0.43417826294898987, "rewards_train/rejected": -0.7044996619224548, "step": 694 }, { "epoch": 0.19, "logps_train/chosen": -67.40570068359375, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -74.43222045898438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5548282861709595, "rewards_train/margins": 0.3792140483856201, "rewards_train/rejected": -0.9340423345565796, "step": 695 }, { "epoch": 0.19, "learning_rate": 4.419083518807849e-07, "loss": 0.5375, "step": 696 }, { "epoch": 0.19, "logps_train/chosen": -66.59686279296875, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -87.62117767333984, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.528436005115509, "rewards_train/margins": 0.6325101256370544, "rewards_train/rejected": -1.1609461307525635, "step": 696 }, { "epoch": 0.19, "logps_train/chosen": -96.261962890625, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -88.99104309082031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7295166254043579, "rewards_train/margins": 0.39590609073638916, "rewards_train/rejected": -1.125422716140747, "step": 697 }, { "epoch": 0.2, "learning_rate": 4.415111107797445e-07, "loss": 0.5404, "step": 698 }, { "epoch": 0.2, "logps_train/chosen": -38.09441375732422, "logps_train/ref_chosen": -38.75, "logps_train/ref_rejected": -39.5, "logps_train/rejected": -42.03752899169922, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.0683908611536026, "rewards_train/margins": 0.31149922311306, "rewards_train/rejected": -0.2431083619594574, "step": 698 }, { "epoch": 0.2, "logps_train/chosen": -74.51948547363281, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -99.397216796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6858353614807129, "rewards_train/margins": 1.3628711700439453, "rewards_train/rejected": -2.048706531524658, "step": 699 }, { "epoch": 0.2, "learning_rate": 4.4111269594316504e-07, "loss": 0.4587, "step": 700 }, { "epoch": 0.2, "logps_train/chosen": -68.04469299316406, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -63.25, "logps_train/rejected": -80.16493225097656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1466572284698486, "rewards_train/margins": 0.534679651260376, "rewards_train/rejected": -1.6813368797302246, "step": 700 }, { "epoch": 0.2, "logps_train/chosen": -65.927490234375, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -88.21231079101562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.49196773767471313, "rewards_train/margins": 0.9068029522895813, "rewards_train/rejected": -1.3987706899642944, "step": 701 }, { "epoch": 0.2, "learning_rate": 4.40713109812856e-07, "loss": 0.4802, "step": 702 }, { "epoch": 0.2, "logps_train/chosen": -43.46561050415039, "logps_train/ref_chosen": -41.25, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -64.47077941894531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2250766158103943, "rewards_train/margins": 1.2809858918190002, "rewards_train/rejected": -1.5060625076293945, "step": 702 }, { "epoch": 0.2, "logps_train/chosen": -93.71458435058594, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -76.2461929321289, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5913801193237305, "rewards_train/margins": 0.34808313846588135, "rewards_train/rejected": -0.9394632577896118, "step": 703 }, { "epoch": 0.2, "learning_rate": 4.403123548378055e-07, "loss": 0.496, "step": 704 }, { "epoch": 0.2, "logps_train/chosen": -14.555025100708008, "logps_train/ref_chosen": -13.875, "logps_train/ref_rejected": -21.75, "logps_train/rejected": -24.022647857666016, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.06907662749290466, "rewards_train/margins": 0.1593601405620575, "rewards_train/rejected": -0.22843676805496216, "step": 704 }, { "epoch": 0.2, "logps_train/chosen": -55.99735641479492, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -88.1343002319336, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5546185374259949, "rewards_train/margins": 1.595529854297638, "rewards_train/rejected": -2.150148391723633, "step": 705 }, { "epoch": 0.2, "learning_rate": 4.3991043347416545e-07, "loss": 0.5, "step": 706 }, { "epoch": 0.2, "logps_train/chosen": -15.79543685913086, "logps_train/ref_chosen": -16.375, "logps_train/ref_rejected": -23.375, "logps_train/rejected": -24.89569091796875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.055417317897081375, "rewards_train/margins": 0.21061145141720772, "rewards_train/rejected": -0.15519413352012634, "step": 706 }, { "epoch": 0.2, "logps_train/chosen": -93.42644500732422, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -99.34465026855469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.335613489151001, "rewards_train/margins": 0.7207262516021729, "rewards_train/rejected": -2.056339740753174, "step": 707 }, { "epoch": 0.2, "learning_rate": 4.3950734818523606e-07, "loss": 0.6011, "step": 708 }, { "epoch": 0.2, "logps_train/chosen": -53.259437561035156, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -57.41320037841797, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4588903784751892, "rewards_train/margins": 0.8788536190986633, "rewards_train/rejected": -1.3377439975738525, "step": 708 }, { "epoch": 0.2, "logps_train/chosen": -67.23185729980469, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -80.57467651367188, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.08325980603694916, "rewards_train/margins": 0.32588331401348114, "rewards_train/rejected": -0.24262350797653198, "step": 709 }, { "epoch": 0.2, "learning_rate": 4.3910310144145137e-07, "loss": 0.5841, "step": 710 }, { "epoch": 0.2, "logps_train/chosen": -50.67464828491211, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -71.0008773803711, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.04099975898861885, "rewards_train/margins": 1.0672911517322063, "rewards_train/rejected": -1.1082909107208252, "step": 710 }, { "epoch": 0.2, "logps_train/chosen": -48.37436294555664, "logps_train/ref_chosen": -42.5, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -58.73229217529297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5963231325149536, "rewards_train/margins": 0.5827653408050537, "rewards_train/rejected": -1.1790884733200073, "step": 711 }, { "epoch": 0.2, "learning_rate": 4.386976957203633e-07, "loss": 0.4646, "step": 712 }, { "epoch": 0.2, "logps_train/chosen": -65.23060607910156, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -74.45390319824219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3915172517299652, "rewards_train/margins": 0.3273109495639801, "rewards_train/rejected": -0.7188282012939453, "step": 712 }, { "epoch": 0.2, "logps_train/chosen": -24.60601043701172, "logps_train/ref_chosen": -24.625, "logps_train/ref_rejected": -32.75, "logps_train/rejected": -38.18189239501953, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.0063912393525242805, "rewards_train/margins": 0.5437214495614171, "rewards_train/rejected": -0.5373302102088928, "step": 713 }, { "epoch": 0.2, "learning_rate": 4.3829113350662737e-07, "loss": 0.5398, "step": 714 }, { "epoch": 0.2, "logps_train/chosen": -81.86094665527344, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -108.54100799560547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4165637791156769, "rewards_train/margins": 1.674256056547165, "rewards_train/rejected": -2.090819835662842, "step": 714 }, { "epoch": 0.2, "logps_train/chosen": -75.81840515136719, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -92.84339904785156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.519242525100708, "rewards_train/margins": 1.4959566593170166, "rewards_train/rejected": -2.0151991844177246, "step": 715 }, { "epoch": 0.2, "learning_rate": 4.378834172919869e-07, "loss": 0.3572, "step": 716 }, { "epoch": 0.2, "logps_train/chosen": -28.508033752441406, "logps_train/ref_chosen": -26.875, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -47.858375549316406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1649634838104248, "rewards_train/margins": 0.5304440855979919, "rewards_train/rejected": -0.6954075694084167, "step": 716 }, { "epoch": 0.2, "logps_train/chosen": -85.81369018554688, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -103.82503509521484, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5036348104476929, "rewards_train/margins": 0.19332194328308105, "rewards_train/rejected": -1.696956753730774, "step": 717 }, { "epoch": 0.2, "learning_rate": 4.3747454957525755e-07, "loss": 0.5688, "step": 718 }, { "epoch": 0.2, "logps_train/chosen": -47.84799575805664, "logps_train/ref_chosen": -44.0, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -45.9123649597168, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3778659999370575, "rewards_train/margins": 0.10194471478462219, "rewards_train/rejected": -0.4798107147216797, "step": 718 }, { "epoch": 0.2, "logps_train/chosen": -53.432533264160156, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -75.42215728759766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.431144118309021, "rewards_train/margins": 1.0542360544204712, "rewards_train/rejected": -1.4853801727294922, "step": 719 }, { "epoch": 0.2, "learning_rate": 4.3706453286231273e-07, "loss": 0.5331, "step": 720 }, { "epoch": 0.2, "logps_train/chosen": -43.604515075683594, "logps_train/ref_chosen": -36.5, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -55.57349395751953, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6987329125404358, "rewards_train/margins": 0.7650619149208069, "rewards_train/rejected": -1.4637948274612427, "step": 720 }, { "epoch": 0.2, "logps_train/chosen": -62.73842239379883, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -93.53396606445312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5853657126426697, "rewards_train/margins": 1.5977187752723694, "rewards_train/rejected": -2.183084487915039, "step": 721 }, { "epoch": 0.2, "learning_rate": 4.366533696660677e-07, "loss": 0.4224, "step": 722 }, { "epoch": 0.2, "logps_train/chosen": -36.93931579589844, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -55.6656379699707, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.06766213476657867, "rewards_train/margins": 0.5907962769269943, "rewards_train/rejected": -0.658458411693573, "step": 722 }, { "epoch": 0.2, "logps_train/chosen": -99.7580795288086, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -102.43901824951172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3067646026611328, "rewards_train/margins": 0.8185834884643555, "rewards_train/rejected": -2.1253480911254883, "step": 723 }, { "epoch": 0.2, "learning_rate": 4.362410625064642e-07, "loss": 0.5024, "step": 724 }, { "epoch": 0.2, "logps_train/chosen": -65.61380004882812, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -81.13764190673828, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4079624116420746, "rewards_train/margins": 0.931974321603775, "rewards_train/rejected": -1.3399367332458496, "step": 724 }, { "epoch": 0.2, "logps_train/chosen": -54.373531341552734, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -67.60997772216797, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.2898923456668854, "rewards_train/margins": 0.7055784165859222, "rewards_train/rejected": -0.9954707622528076, "step": 725 }, { "epoch": 0.2, "learning_rate": 4.3582761391045526e-07, "loss": 0.4936, "step": 726 }, { "epoch": 0.2, "logps_train/chosen": -54.74101638793945, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -77.01251220703125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8436573147773743, "rewards_train/margins": 0.6876965165138245, "rewards_train/rejected": -1.5313538312911987, "step": 726 }, { "epoch": 0.2, "logps_train/chosen": -28.0190486907959, "logps_train/ref_chosen": -23.75, "logps_train/ref_rejected": -34.25, "logps_train/rejected": -44.53799819946289, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4283209443092346, "rewards_train/margins": 0.6121000647544861, "rewards_train/rejected": -1.0404210090637207, "step": 727 }, { "epoch": 0.2, "learning_rate": 4.354130264119894e-07, "loss": 0.4942, "step": 728 }, { "epoch": 0.2, "logps_train/chosen": -26.15768051147461, "logps_train/ref_chosen": -23.75, "logps_train/ref_rejected": -29.75, "logps_train/rejected": -37.0157470703125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.23744767904281616, "rewards_train/margins": 0.48912733793258667, "rewards_train/rejected": -0.7265750169754028, "step": 728 }, { "epoch": 0.2, "logps_train/chosen": -74.88507080078125, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -88.21916198730469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8620414733886719, "rewards_train/margins": 0.8075311183929443, "rewards_train/rejected": -1.6695725917816162, "step": 729 }, { "epoch": 0.2, "learning_rate": 4.349973025519953e-07, "loss": 0.4937, "step": 730 }, { "epoch": 0.2, "logps_train/chosen": -73.86646270751953, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -82.05445098876953, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4632086753845215, "rewards_train/margins": 0.384227991104126, "rewards_train/rejected": -0.8474366664886475, "step": 730 }, { "epoch": 0.2, "logps_train/chosen": -85.669189453125, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -99.95613098144531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.26691943407058716, "rewards_train/margins": 1.3876779675483704, "rewards_train/rejected": -1.6545974016189575, "step": 731 }, { "epoch": 0.2, "learning_rate": 4.345804448783664e-07, "loss": 0.4943, "step": 732 }, { "epoch": 0.2, "logps_train/chosen": -104.18721008300781, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -101.2660140991211, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1251177787780762, "rewards_train/margins": 0.44445228576660156, "rewards_train/rejected": -1.5695700645446777, "step": 732 }, { "epoch": 0.2, "logps_train/chosen": -61.69408416748047, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -63.5, "logps_train/rejected": -73.74052429199219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3416737914085388, "rewards_train/margins": 0.692925751209259, "rewards_train/rejected": -1.0345995426177979, "step": 733 }, { "epoch": 0.21, "learning_rate": 4.341624559459447e-07, "loss": 0.4912, "step": 734 }, { "epoch": 0.21, "logps_train/chosen": -19.932621002197266, "logps_train/ref_chosen": -17.0, "logps_train/ref_rejected": -8.375, "logps_train/rejected": -12.725445747375488, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.2985355257987976, "rewards_train/margins": 0.13816922903060913, "rewards_train/rejected": -0.43670475482940674, "step": 734 }, { "epoch": 0.21, "logps_train/chosen": -50.78586959838867, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -65.61875915527344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5696026682853699, "rewards_train/margins": 0.8292847275733948, "rewards_train/rejected": -1.3988873958587646, "step": 735 }, { "epoch": 0.21, "learning_rate": 4.337433383165058e-07, "loss": 0.5719, "step": 736 }, { "epoch": 0.21, "logps_train/chosen": -57.67706298828125, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -49.850616455078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6176084280014038, "rewards_train/margins": 0.34167182445526123, "rewards_train/rejected": -0.959280252456665, "step": 736 }, { "epoch": 0.21, "logps_train/chosen": -137.65756225585938, "logps_train/ref_chosen": -122.0, "logps_train/ref_rejected": -136.0, "logps_train/rejected": -160.66299438476562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.550912618637085, "rewards_train/margins": 0.8966367244720459, "rewards_train/rejected": -2.447549343109131, "step": 737 }, { "epoch": 0.21, "learning_rate": 4.333230945587426e-07, "loss": 0.5527, "step": 738 }, { "epoch": 0.21, "logps_train/chosen": -81.39942169189453, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -89.80335998535156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8987309336662292, "rewards_train/margins": 1.0257458090782166, "rewards_train/rejected": -1.9244767427444458, "step": 738 }, { "epoch": 0.21, "logps_train/chosen": -60.17104721069336, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -85.5159912109375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5997220277786255, "rewards_train/margins": 0.7862527370452881, "rewards_train/rejected": -1.3859747648239136, "step": 739 }, { "epoch": 0.21, "learning_rate": 4.3290172724825015e-07, "loss": 0.4614, "step": 740 }, { "epoch": 0.21, "logps_train/chosen": -84.48946380615234, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -80.91348266601562, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1631065607070923, "rewards_train/margins": 0.3619331121444702, "rewards_train/rejected": -1.5250396728515625, "step": 740 }, { "epoch": 0.21, "logps_train/chosen": -45.46261978149414, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -20.25, "logps_train/rejected": -23.03445053100586, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.12907451391220093, "rewards_train/margins": 0.15093308687210083, "rewards_train/rejected": -0.28000760078430176, "step": 741 }, { "epoch": 0.21, "learning_rate": 4.3247923896750915e-07, "loss": 0.6586, "step": 742 }, { "epoch": 0.21, "logps_train/chosen": -64.82351684570312, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -61.36857986450195, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.16809424757957458, "rewards_train/margins": 0.582044929265976, "rewards_train/rejected": -0.7501391768455505, "step": 742 }, { "epoch": 0.21, "logps_train/chosen": -33.12548065185547, "logps_train/ref_chosen": -31.0, "logps_train/ref_rejected": -36.0, "logps_train/rejected": -42.422828674316406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.21118101477622986, "rewards_train/margins": 0.42172685265541077, "rewards_train/rejected": -0.6329078674316406, "step": 743 }, { "epoch": 0.21, "learning_rate": 4.320556323058709e-07, "loss": 0.5268, "step": 744 }, { "epoch": 0.21, "logps_train/chosen": -22.184978485107422, "logps_train/ref_chosen": -17.625, "logps_train/ref_rejected": -19.0, "logps_train/rejected": -21.502098083496094, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.45785337686538696, "rewards_train/margins": -0.21067100763320923, "rewards_train/rejected": -0.24718236923217773, "step": 744 }, { "epoch": 0.21, "logps_train/chosen": -66.39347076416016, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -94.58641052246094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0261726379394531, "rewards_train/margins": 1.144577980041504, "rewards_train/rejected": -2.170750617980957, "step": 745 }, { "epoch": 0.21, "learning_rate": 4.3163090985954074e-07, "loss": 0.7109, "step": 746 }, { "epoch": 0.21, "logps_train/chosen": -34.51667785644531, "logps_train/ref_chosen": -30.625, "logps_train/ref_rejected": -34.5, "logps_train/rejected": -38.35007858276367, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.39158475399017334, "rewards_train/margins": -0.011191070079803467, "rewards_train/rejected": -0.3803936839103699, "step": 746 }, { "epoch": 0.21, "logps_train/chosen": -28.390628814697266, "logps_train/ref_chosen": -26.75, "logps_train/ref_rejected": -33.75, "logps_train/rejected": -39.71607971191406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.16225619614124298, "rewards_train/margins": 0.44489888846874237, "rewards_train/rejected": -0.6071550846099854, "step": 747 }, { "epoch": 0.21, "learning_rate": 4.312050742315627e-07, "loss": 0.6253, "step": 748 }, { "epoch": 0.21, "logps_train/chosen": -69.1902084350586, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -75.73802185058594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5235127210617065, "rewards_train/margins": 0.9332972764968872, "rewards_train/rejected": -1.4568099975585938, "step": 748 }, { "epoch": 0.21, "logps_train/chosen": -49.8427734375, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -66.03128051757812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.1014648973941803, "rewards_train/margins": 1.4969750344753265, "rewards_train/rejected": -1.5984399318695068, "step": 749 }, { "epoch": 0.21, "learning_rate": 4.307781280318031e-07, "loss": 0.4297, "step": 750 }, { "epoch": 0.21, "logps_train/chosen": -61.00275802612305, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -77.95832824707031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2415599822998047, "rewards_train/margins": 0.9275884628295898, "rewards_train/rejected": -1.1691484451293945, "step": 750 }, { "epoch": 0.21, "logps_train/chosen": -69.67094421386719, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -73.00233459472656, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.7874069809913635, "rewards_train/margins": 0.7291356921195984, "rewards_train/rejected": -1.516542673110962, "step": 751 }, { "epoch": 0.21, "learning_rate": 4.303500738769348e-07, "loss": 0.4868, "step": 752 }, { "epoch": 0.21, "logps_train/chosen": -79.32991027832031, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -102.75628662109375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5596757531166077, "rewards_train/margins": 1.2167338728904724, "rewards_train/rejected": -1.77640962600708, "step": 752 }, { "epoch": 0.21, "logps_train/chosen": -82.9136962890625, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -101.89189147949219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7511347532272339, "rewards_train/margins": 1.1444997787475586, "rewards_train/rejected": -1.8956345319747925, "step": 753 }, { "epoch": 0.21, "learning_rate": 4.299209143904211e-07, "loss": 0.3593, "step": 754 }, { "epoch": 0.21, "logps_train/chosen": -78.71809387207031, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -87.73480224609375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9702469110488892, "rewards_train/margins": 0.9237418174743652, "rewards_train/rejected": -1.8939887285232544, "step": 754 }, { "epoch": 0.21, "logps_train/chosen": -70.54363250732422, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -91.7065658569336, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.7981129884719849, "rewards_train/margins": 0.89559006690979, "rewards_train/rejected": -1.693703055381775, "step": 755 }, { "epoch": 0.21, "learning_rate": 4.294906522024997e-07, "loss": 0.4506, "step": 756 }, { "epoch": 0.21, "logps_train/chosen": -66.64875030517578, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -64.6475830078125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6670231819152832, "rewards_train/margins": 0.498028039932251, "rewards_train/rejected": -1.1650512218475342, "step": 756 }, { "epoch": 0.21, "logps_train/chosen": -56.22367477416992, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -63.79145812988281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8731487989425659, "rewards_train/margins": 1.0929113626480103, "rewards_train/rejected": -1.9660601615905762, "step": 757 }, { "epoch": 0.21, "learning_rate": 4.290592899501666e-07, "loss": 0.4703, "step": 758 }, { "epoch": 0.21, "logps_train/chosen": -75.13713073730469, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -101.6120834350586, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0801197290420532, "rewards_train/margins": 1.2982760667800903, "rewards_train/rejected": -2.3783957958221436, "step": 758 }, { "epoch": 0.21, "logps_train/chosen": -66.06676483154297, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -72.21924591064453, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5070674419403076, "rewards_train/margins": 0.6445446014404297, "rewards_train/rejected": -1.1516120433807373, "step": 759 }, { "epoch": 0.21, "learning_rate": 4.2862683027715975e-07, "loss": 0.4188, "step": 760 }, { "epoch": 0.21, "logps_train/chosen": -47.103973388671875, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -81.80034637451172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2663542628288269, "rewards_train/margins": 0.7855551838874817, "rewards_train/rejected": -1.0519094467163086, "step": 760 }, { "epoch": 0.21, "logps_train/chosen": -57.30543518066406, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -79.66067504882812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.62077796459198, "rewards_train/margins": 0.48337578773498535, "rewards_train/rejected": -1.1041537523269653, "step": 761 }, { "epoch": 0.21, "learning_rate": 4.281932758339431e-07, "loss": 0.4906, "step": 762 }, { "epoch": 0.21, "logps_train/chosen": -59.006019592285156, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -107.13850402832031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9209145307540894, "rewards_train/margins": 1.6273113489151, "rewards_train/rejected": -2.5482258796691895, "step": 762 }, { "epoch": 0.21, "logps_train/chosen": -53.86160659790039, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -61.93323516845703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5422154068946838, "rewards_train/margins": 0.6380223631858826, "rewards_train/rejected": -1.1802377700805664, "step": 763 }, { "epoch": 0.21, "learning_rate": 4.277586292776902e-07, "loss": 0.407, "step": 764 }, { "epoch": 0.21, "logps_train/chosen": -78.84117126464844, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -92.62709045410156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9454452991485596, "rewards_train/margins": 0.49343574047088623, "rewards_train/rejected": -1.4388810396194458, "step": 764 }, { "epoch": 0.21, "logps_train/chosen": -44.09888458251953, "logps_train/ref_chosen": -42.5, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -56.846553802490234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.16955626010894775, "rewards_train/margins": 0.5073843598365784, "rewards_train/rejected": -0.6769406199455261, "step": 765 }, { "epoch": 0.21, "learning_rate": 4.273228932722679e-07, "loss": 0.5323, "step": 766 }, { "epoch": 0.21, "logps_train/chosen": -70.78803253173828, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -87.11528015136719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6672797799110413, "rewards_train/margins": 1.2649632096290588, "rewards_train/rejected": -1.9322429895401, "step": 766 }, { "epoch": 0.21, "logps_train/chosen": -65.51760864257812, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -81.43682098388672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4484404921531677, "rewards_train/margins": 1.420436680316925, "rewards_train/rejected": -1.8688771724700928, "step": 767 }, { "epoch": 0.21, "learning_rate": 4.268860704882202e-07, "loss": 0.3768, "step": 768 }, { "epoch": 0.21, "logps_train/chosen": -49.655696868896484, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -42.75, "logps_train/rejected": -48.817283630371094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1602962613105774, "rewards_train/margins": 0.4538540244102478, "rewards_train/rejected": -0.6141502857208252, "step": 768 }, { "epoch": 0.21, "logps_train/chosen": -74.95917510986328, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -99.34453582763672, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.600800633430481, "rewards_train/margins": 0.9467389583587646, "rewards_train/rejected": -1.5475395917892456, "step": 769 }, { "epoch": 0.22, "learning_rate": 4.2644816360275173e-07, "loss": 0.4975, "step": 770 }, { "epoch": 0.22, "logps_train/chosen": -98.62361145019531, "logps_train/ref_chosen": -95.5, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -102.39410400390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3014233708381653, "rewards_train/margins": 0.9379865527153015, "rewards_train/rejected": -1.2394099235534668, "step": 770 }, { "epoch": 0.22, "logps_train/chosen": -86.83627319335938, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -76.5495834350586, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3202731609344482, "rewards_train/margins": -0.07351791858673096, "rewards_train/rejected": -1.2467552423477173, "step": 771 }, { "epoch": 0.22, "learning_rate": 4.2600917529971124e-07, "loss": 0.6152, "step": 772 }, { "epoch": 0.22, "logps_train/chosen": -72.23640441894531, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -64.30915069580078, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.7181720733642578, "rewards_train/margins": 0.31196165084838867, "rewards_train/rejected": -1.0301337242126465, "step": 772 }, { "epoch": 0.22, "logps_train/chosen": -99.03294372558594, "logps_train/ref_chosen": -94.5, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -115.25489807128906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4572010040283203, "rewards_train/margins": 1.3969022035598755, "rewards_train/rejected": -1.8541032075881958, "step": 773 }, { "epoch": 0.22, "learning_rate": 4.255691082695754e-07, "loss": 0.4459, "step": 774 }, { "epoch": 0.22, "logps_train/chosen": -39.96922302246094, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -36.0, "logps_train/rejected": -40.81270980834961, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.4469220042228699, "rewards_train/margins": 0.03202971816062927, "rewards_train/rejected": -0.47895172238349915, "step": 774 }, { "epoch": 0.22, "logps_train/chosen": -72.17256164550781, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -85.79423522949219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.12487365305423737, "rewards_train/margins": 0.9592375606298447, "rewards_train/rejected": -1.084111213684082, "step": 775 }, { "epoch": 0.22, "learning_rate": 4.2512796520943215e-07, "loss": 0.5324, "step": 776 }, { "epoch": 0.22, "logps_train/chosen": -131.396728515625, "logps_train/ref_chosen": -115.5, "logps_train/ref_rejected": -128.0, "logps_train/rejected": -155.43008422851562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.614282488822937, "rewards_train/margins": 1.1396623849868774, "rewards_train/rejected": -2.7539448738098145, "step": 776 }, { "epoch": 0.22, "logps_train/chosen": -68.80056762695312, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -74.79603576660156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5984159708023071, "rewards_train/margins": 0.9292343854904175, "rewards_train/rejected": -1.5276503562927246, "step": 777 }, { "epoch": 0.22, "learning_rate": 4.246857488229644e-07, "loss": 0.4663, "step": 778 }, { "epoch": 0.22, "logps_train/chosen": -77.71932220458984, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -91.0114974975586, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.9127525091171265, "rewards_train/margins": 0.10480368137359619, "rewards_train/rejected": -1.0175561904907227, "step": 778 }, { "epoch": 0.22, "logps_train/chosen": -55.49123764038086, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -65.74839782714844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2718776762485504, "rewards_train/margins": 0.650716632604599, "rewards_train/rejected": -0.9225943088531494, "step": 779 }, { "epoch": 0.22, "learning_rate": 4.24242461820433e-07, "loss": 0.6503, "step": 780 }, { "epoch": 0.22, "logps_train/chosen": -35.94996643066406, "logps_train/ref_chosen": -27.125, "logps_train/ref_rejected": -36.0, "logps_train/rejected": -50.71809005737305, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8792738914489746, "rewards_train/margins": 0.6047424077987671, "rewards_train/rejected": -1.4840162992477417, "step": 780 }, { "epoch": 0.22, "logps_train/chosen": -45.22307586669922, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -53.12763977050781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.09750300645828247, "rewards_train/margins": 0.6219016909599304, "rewards_train/rejected": -0.7194046974182129, "step": 781 }, { "epoch": 0.22, "learning_rate": 4.237981069186606e-07, "loss": 0.544, "step": 782 }, { "epoch": 0.22, "logps_train/chosen": -81.47496795654297, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -93.6407241821289, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.057995080947876, "rewards_train/margins": 0.8180397748947144, "rewards_train/rejected": -1.8760348558425903, "step": 782 }, { "epoch": 0.22, "logps_train/chosen": -107.39361572265625, "logps_train/ref_chosen": -96.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -114.52546691894531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.122174620628357, "rewards_train/margins": 0.6584969758987427, "rewards_train/rejected": -1.7806715965270996, "step": 783 }, { "epoch": 0.22, "learning_rate": 4.2335268684101456e-07, "loss": 0.5798, "step": 784 }, { "epoch": 0.22, "logps_train/chosen": -55.175079345703125, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -65.32829284667969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6636016964912415, "rewards_train/margins": 0.670985996723175, "rewards_train/rejected": -1.3345876932144165, "step": 784 }, { "epoch": 0.22, "logps_train/chosen": -69.07587432861328, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -69.96524047851562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5310250520706177, "rewards_train/margins": 0.8592489957809448, "rewards_train/rejected": -1.3902740478515625, "step": 785 }, { "epoch": 0.22, "learning_rate": 4.2290620431739083e-07, "loss": 0.4823, "step": 786 }, { "epoch": 0.22, "logps_train/chosen": -81.48248291015625, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -83.1921615600586, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.068756341934204, "rewards_train/margins": -0.012821078300476074, "rewards_train/rejected": -1.055935263633728, "step": 786 }, { "epoch": 0.22, "logps_train/chosen": -52.133052825927734, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -76.95704650878906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5356687307357788, "rewards_train/margins": 1.3787859678268433, "rewards_train/rejected": -1.914454698562622, "step": 787 }, { "epoch": 0.22, "learning_rate": 4.2245866208419667e-07, "loss": 0.5452, "step": 788 }, { "epoch": 0.22, "logps_train/chosen": -36.936492919921875, "logps_train/ref_chosen": -32.0, "logps_train/ref_rejected": -33.75, "logps_train/rejected": -38.59773254394531, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.483004629611969, "rewards_train/margins": 0.009190499782562256, "rewards_train/rejected": -0.49219512939453125, "step": 788 }, { "epoch": 0.22, "logps_train/chosen": -52.72639846801758, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -59.34904479980469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.04763978719711304, "rewards_train/margins": 0.8146083354949951, "rewards_train/rejected": -0.8622481226921082, "step": 789 }, { "epoch": 0.22, "learning_rate": 4.220100628843342e-07, "loss": 0.606, "step": 790 }, { "epoch": 0.22, "logps_train/chosen": -43.210609436035156, "logps_train/ref_chosen": -37.5, "logps_train/ref_rejected": -39.5, "logps_train/rejected": -49.56851577758789, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5629796385765076, "rewards_train/margins": 0.4407469630241394, "rewards_train/rejected": -1.003726601600647, "step": 790 }, { "epoch": 0.22, "logps_train/chosen": -40.289878845214844, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -39.0, "logps_train/rejected": -43.93583679199219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.18123379349708557, "rewards_train/margins": 0.3130333423614502, "rewards_train/rejected": -0.49426713585853577, "step": 791 }, { "epoch": 0.22, "learning_rate": 4.2156040946718343e-07, "loss": 0.5674, "step": 792 }, { "epoch": 0.22, "logps_train/chosen": -79.20447540283203, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -89.39700317382812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7209354043006897, "rewards_train/margins": 0.36651867628097534, "rewards_train/rejected": -1.087454080581665, "step": 792 }, { "epoch": 0.22, "logps_train/chosen": -55.22673034667969, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -34.5, "logps_train/rejected": -44.716453552246094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5199382901191711, "rewards_train/margins": 0.4990217089653015, "rewards_train/rejected": -1.0189599990844727, "step": 793 }, { "epoch": 0.22, "learning_rate": 4.2110970458858544e-07, "loss": 0.5269, "step": 794 }, { "epoch": 0.22, "logps_train/chosen": -34.29668426513672, "logps_train/ref_chosen": -33.0, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -58.57682418823242, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.1391409933567047, "rewards_train/margins": 1.3052602708339691, "rewards_train/rejected": -1.4444012641906738, "step": 794 }, { "epoch": 0.22, "logps_train/chosen": -48.54473114013672, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -50.31939697265625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.35613349080085754, "rewards_train/margins": 0.5680917799472809, "rewards_train/rejected": -0.9242252707481384, "step": 795 }, { "epoch": 0.22, "learning_rate": 4.206579510108256e-07, "loss": 0.5159, "step": 796 }, { "epoch": 0.22, "logps_train/chosen": -42.91671371459961, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -30.0, "logps_train/rejected": -41.97731018066406, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8901088237762451, "rewards_train/margins": 0.3125048875808716, "rewards_train/rejected": -1.2026137113571167, "step": 796 }, { "epoch": 0.22, "logps_train/chosen": -67.01020050048828, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -72.64645385742188, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.6365671157836914, "rewards_train/margins": 0.46772706508636475, "rewards_train/rejected": -1.1042941808700562, "step": 797 }, { "epoch": 0.22, "learning_rate": 4.202051515026166e-07, "loss": 0.5596, "step": 798 }, { "epoch": 0.22, "logps_train/chosen": -53.409149169921875, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -55.79767608642578, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.2479463517665863, "rewards_train/margins": 0.8273290693759918, "rewards_train/rejected": -1.0752754211425781, "step": 798 }, { "epoch": 0.22, "logps_train/chosen": -96.3235855102539, "logps_train/ref_chosen": -90.5, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -124.00653076171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5710307955741882, "rewards_train/margins": 1.655402958393097, "rewards_train/rejected": -2.226433753967285, "step": 799 }, { "epoch": 0.22, "learning_rate": 4.197513088390813e-07, "loss": 0.431, "step": 800 }, { "epoch": 0.22, "logps_train/chosen": -65.17707061767578, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -92.5411376953125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.0185375213623047, "rewards_train/margins": 0.8723433017730713, "rewards_train/rejected": -1.890880823135376, "step": 800 }, { "epoch": 0.22, "logps_train/chosen": -44.5245361328125, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -60.77937316894531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6116332411766052, "rewards_train/margins": 0.7637651562690735, "rewards_train/rejected": -1.3753983974456787, "step": 801 }, { "epoch": 0.22, "learning_rate": 4.1929642580173585e-07, "loss": 0.4697, "step": 802 }, { "epoch": 0.22, "logps_train/chosen": -98.2421875, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -130.2311553955078, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2947750091552734, "rewards_train/margins": 0.6127151250839233, "rewards_train/rejected": -1.9074901342391968, "step": 802 }, { "epoch": 0.22, "logps_train/chosen": -57.04893493652344, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -75.33086395263672, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5850875973701477, "rewards_train/margins": 0.9184271693229675, "rewards_train/rejected": -1.5035147666931152, "step": 803 }, { "epoch": 0.22, "learning_rate": 4.188405051784729e-07, "loss": 0.5162, "step": 804 }, { "epoch": 0.22, "logps_train/chosen": -82.97969055175781, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -120.67314147949219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.1600789725780487, "rewards_train/margins": 1.8509850800037384, "rewards_train/rejected": -2.011064052581787, "step": 804 }, { "epoch": 0.22, "logps_train/chosen": -86.50658416748047, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -86.41155242919922, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8424553871154785, "rewards_train/margins": 0.2572929859161377, "rewards_train/rejected": -1.0997483730316162, "step": 805 }, { "epoch": 0.23, "learning_rate": 4.1838354976354406e-07, "loss": 0.4673, "step": 806 }, { "epoch": 0.23, "logps_train/chosen": -62.000694274902344, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -73.03533935546875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.21764707565307617, "rewards_train/margins": 1.3284653425216675, "rewards_train/rejected": -1.5461124181747437, "step": 806 }, { "epoch": 0.23, "logps_train/chosen": -65.02070617675781, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -87.37649536132812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.46691471338272095, "rewards_train/margins": 1.0633131861686707, "rewards_train/rejected": -1.5302278995513916, "step": 807 }, { "epoch": 0.23, "learning_rate": 4.17925562357543e-07, "loss": 0.4781, "step": 808 }, { "epoch": 0.23, "logps_train/chosen": -42.0662841796875, "logps_train/ref_chosen": -32.75, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -54.25373840332031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9398560523986816, "rewards_train/margins": 0.15233898162841797, "rewards_train/rejected": -1.0921950340270996, "step": 808 }, { "epoch": 0.23, "logps_train/chosen": -88.19157409667969, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -76.43254089355469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2472336292266846, "rewards_train/margins": 0.39992737770080566, "rewards_train/rejected": -1.6471610069274902, "step": 809 }, { "epoch": 0.23, "learning_rate": 4.1746654576738824e-07, "loss": 0.5823, "step": 810 }, { "epoch": 0.23, "logps_train/chosen": -69.62837219238281, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -73.64430236816406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6413524746894836, "rewards_train/margins": 0.9695623517036438, "rewards_train/rejected": -1.6109148263931274, "step": 810 }, { "epoch": 0.23, "logps_train/chosen": -104.60528564453125, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -109.66112518310547, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.5290825366973877, "rewards_train/margins": 0.7444517612457275, "rewards_train/rejected": -2.2735342979431152, "step": 811 }, { "epoch": 0.23, "learning_rate": 4.1700650280630624e-07, "loss": 0.5085, "step": 812 }, { "epoch": 0.23, "logps_train/chosen": -78.03692626953125, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -75.56971740722656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5325984954833984, "rewards_train/margins": 0.5513255596160889, "rewards_train/rejected": -2.0839240550994873, "step": 812 }, { "epoch": 0.23, "logps_train/chosen": -42.48480987548828, "logps_train/ref_chosen": -36.75, "logps_train/ref_rejected": -43.75, "logps_train/rejected": -57.76921463012695, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5846140384674072, "rewards_train/margins": 0.813010573387146, "rewards_train/rejected": -1.3976246118545532, "step": 813 }, { "epoch": 0.23, "learning_rate": 4.1654543629381346e-07, "loss": 0.4606, "step": 814 }, { "epoch": 0.23, "logps_train/chosen": -62.89506912231445, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -75.18867492675781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7123585939407349, "rewards_train/margins": 0.9132469892501831, "rewards_train/rejected": -1.625605583190918, "step": 814 }, { "epoch": 0.23, "logps_train/chosen": -48.81223678588867, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -79.15168762207031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.17887991666793823, "rewards_train/margins": 1.2269136309623718, "rewards_train/rejected": -1.40579354763031, "step": 815 }, { "epoch": 0.23, "learning_rate": 4.1608334905569986e-07, "loss": 0.4137, "step": 816 }, { "epoch": 0.23, "logps_train/chosen": -38.1473503112793, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -29.75, "logps_train/rejected": -32.343116760253906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.19852405786514282, "rewards_train/margins": 0.05922490358352661, "rewards_train/rejected": -0.25774896144866943, "step": 816 }, { "epoch": 0.23, "logps_train/chosen": -91.79652404785156, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -98.53657531738281, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5296523571014404, "rewards_train/margins": 0.8661925792694092, "rewards_train/rejected": -2.3958449363708496, "step": 817 }, { "epoch": 0.23, "learning_rate": 4.156202439240111e-07, "loss": 0.5826, "step": 818 }, { "epoch": 0.23, "logps_train/chosen": -62.88359451293945, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -64.70132446289062, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.14617177844047546, "rewards_train/margins": 0.24075812101364136, "rewards_train/rejected": -0.3869298994541168, "step": 818 }, { "epoch": 0.23, "logps_train/chosen": -66.91609191894531, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -68.0125732421875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5916092395782471, "rewards_train/margins": 0.845194935798645, "rewards_train/rejected": -1.436804175376892, "step": 819 }, { "epoch": 0.23, "learning_rate": 4.1515612373703125e-07, "loss": 0.5709, "step": 820 }, { "epoch": 0.23, "logps_train/chosen": -72.45796203613281, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -93.32817077636719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5786080360412598, "rewards_train/margins": 0.9327239990234375, "rewards_train/rejected": -1.5113320350646973, "step": 820 }, { "epoch": 0.23, "logps_train/chosen": -77.12188720703125, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -94.6893310546875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8117375373840332, "rewards_train/margins": 1.0581116676330566, "rewards_train/rejected": -1.8698492050170898, "step": 821 }, { "epoch": 0.23, "learning_rate": 4.1469099133926566e-07, "loss": 0.4374, "step": 822 }, { "epoch": 0.23, "logps_train/chosen": -97.09497833251953, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -132.5654296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1946545839309692, "rewards_train/margins": 1.7118874788284302, "rewards_train/rejected": -2.9065420627593994, "step": 822 }, { "epoch": 0.23, "logps_train/chosen": -70.23595428466797, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -97.10716247558594, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.47086095809936523, "rewards_train/margins": 0.4336055517196655, "rewards_train/rejected": -0.9044665098190308, "step": 823 }, { "epoch": 0.23, "learning_rate": 4.1422484958142324e-07, "loss": 0.3872, "step": 824 }, { "epoch": 0.23, "logps_train/chosen": -61.643978118896484, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -80.45086669921875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5362118482589722, "rewards_train/margins": 0.7791265249252319, "rewards_train/rejected": -1.315338373184204, "step": 824 }, { "epoch": 0.23, "logps_train/chosen": -71.22943115234375, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -95.36404418945312, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.9088804721832275, "rewards_train/margins": 1.4546713829040527, "rewards_train/rejected": -2.3635518550872803, "step": 825 }, { "epoch": 0.23, "learning_rate": 4.137577013203989e-07, "loss": 0.4482, "step": 826 }, { "epoch": 0.23, "logps_train/chosen": -63.636497497558594, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -54.24509811401367, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5723414421081543, "rewards_train/margins": 0.21090877056121826, "rewards_train/rejected": -0.7832502126693726, "step": 826 }, { "epoch": 0.23, "logps_train/chosen": -95.05361938476562, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -111.03193664550781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4521886110305786, "rewards_train/margins": 1.0213178396224976, "rewards_train/rejected": -2.473506450653076, "step": 827 }, { "epoch": 0.23, "learning_rate": 4.1328954941925656e-07, "loss": 0.5355, "step": 828 }, { "epoch": 0.23, "logps_train/chosen": -63.327484130859375, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -69.67037963867188, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1571626663208008, "rewards_train/margins": 0.6307734251022339, "rewards_train/rejected": -1.7879360914230347, "step": 828 }, { "epoch": 0.23, "logps_train/chosen": -73.36369323730469, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -88.95587921142578, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7222095727920532, "rewards_train/margins": 1.1878312826156616, "rewards_train/rejected": -2.910040855407715, "step": 829 }, { "epoch": 0.23, "learning_rate": 4.1282039674721093e-07, "loss": 0.6553, "step": 830 }, { "epoch": 0.23, "logps_train/chosen": -100.62915802001953, "logps_train/ref_chosen": -90.5, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -134.68023681640625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0336189270019531, "rewards_train/margins": 1.7113580703735352, "rewards_train/rejected": -2.7449769973754883, "step": 830 }, { "epoch": 0.23, "logps_train/chosen": -44.26068115234375, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -61.75473403930664, "rewards_train/accuracies": 0.75, "rewards_train/chosen": 0.1421934962272644, "rewards_train/margins": 0.8658111691474915, "rewards_train/rejected": -0.723617672920227, "step": 831 }, { "epoch": 0.23, "learning_rate": 4.123502461796105e-07, "loss": 0.3498, "step": 832 }, { "epoch": 0.23, "logps_train/chosen": -68.79542541503906, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -99.24661254882812, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.7224140763282776, "rewards_train/margins": 1.008497178554535, "rewards_train/rejected": -1.7309112548828125, "step": 832 }, { "epoch": 0.23, "logps_train/chosen": -53.21661376953125, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -54.25, "logps_train/rejected": -72.57496643066406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7533018589019775, "rewards_train/margins": 1.0877885818481445, "rewards_train/rejected": -1.841090440750122, "step": 833 }, { "epoch": 0.23, "learning_rate": 4.118791005979195e-07, "loss": 0.4874, "step": 834 }, { "epoch": 0.23, "logps_train/chosen": -74.54422760009766, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -84.56739807128906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.37571173906326294, "rewards_train/margins": 1.3034891486167908, "rewards_train/rejected": -1.6792008876800537, "step": 834 }, { "epoch": 0.23, "logps_train/chosen": -63.8644905090332, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -77.23162841796875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6001206636428833, "rewards_train/margins": 1.1230422258377075, "rewards_train/rejected": -1.7231628894805908, "step": 835 }, { "epoch": 0.23, "learning_rate": 4.114069628897006e-07, "loss": 0.4857, "step": 836 }, { "epoch": 0.23, "logps_train/chosen": -63.14500427246094, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -61.190818786621094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8691879510879517, "rewards_train/margins": 0.39442527294158936, "rewards_train/rejected": -1.263613224029541, "step": 836 }, { "epoch": 0.23, "logps_train/chosen": -34.73402786254883, "logps_train/ref_chosen": -30.25, "logps_train/ref_rejected": -31.625, "logps_train/rejected": -39.241756439208984, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.45207691192626953, "rewards_train/margins": 0.30350762605667114, "rewards_train/rejected": -0.7555845379829407, "step": 837 }, { "epoch": 0.23, "learning_rate": 4.109338359485968e-07, "loss": 0.5491, "step": 838 }, { "epoch": 0.23, "logps_train/chosen": -55.93766784667969, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -74.05572509765625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.45892825722694397, "rewards_train/margins": 0.9501595199108124, "rewards_train/rejected": -1.4090877771377563, "step": 838 }, { "epoch": 0.23, "logps_train/chosen": -41.26186752319336, "logps_train/ref_chosen": -36.75, "logps_train/ref_rejected": -37.25, "logps_train/rejected": -48.672325134277344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.45548343658447266, "rewards_train/margins": 0.6961238384246826, "rewards_train/rejected": -1.1516072750091553, "step": 839 }, { "epoch": 0.23, "learning_rate": 4.1045972267431407e-07, "loss": 0.4481, "step": 840 }, { "epoch": 0.23, "logps_train/chosen": -39.77976989746094, "logps_train/ref_chosen": -37.5, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -55.97582244873047, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.23769403994083405, "rewards_train/margins": 0.8598878234624863, "rewards_train/rejected": -1.0975818634033203, "step": 840 }, { "epoch": 0.24, "logps_train/chosen": -62.116844177246094, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -67.9813232421875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.22418440878391266, "rewards_train/margins": 0.8791235536336899, "rewards_train/rejected": -1.1033079624176025, "step": 841 }, { "epoch": 0.24, "learning_rate": 4.0998462597260343e-07, "loss": 0.4588, "step": 842 }, { "epoch": 0.24, "logps_train/chosen": -42.325218200683594, "logps_train/ref_chosen": -34.75, "logps_train/ref_rejected": -39.75, "logps_train/rejected": -51.84090042114258, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.7645533084869385, "rewards_train/margins": 0.44268131256103516, "rewards_train/rejected": -1.2072346210479736, "step": 842 }, { "epoch": 0.24, "logps_train/chosen": -68.06748962402344, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -103.82437133789062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8422955274581909, "rewards_train/margins": 1.2043992280960083, "rewards_train/rejected": -2.046694755554199, "step": 843 }, { "epoch": 0.24, "learning_rate": 4.0950854875524305e-07, "loss": 0.48, "step": 844 }, { "epoch": 0.24, "logps_train/chosen": -72.60628509521484, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -78.16154479980469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9348475933074951, "rewards_train/margins": 0.702791690826416, "rewards_train/rejected": -1.6376392841339111, "step": 844 }, { "epoch": 0.24, "logps_train/chosen": -43.7841796875, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -68.80323791503906, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.4030269980430603, "rewards_train/margins": 0.6636252999305725, "rewards_train/rejected": -1.0666522979736328, "step": 845 }, { "epoch": 0.24, "learning_rate": 4.090314939400206e-07, "loss": 0.5491, "step": 846 }, { "epoch": 0.24, "logps_train/chosen": -62.69173049926758, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -78.07003784179688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6222983598709106, "rewards_train/margins": 1.0536508560180664, "rewards_train/rejected": -1.675949215888977, "step": 846 }, { "epoch": 0.24, "logps_train/chosen": -99.3228759765625, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -129.64483642578125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1174439191818237, "rewards_train/margins": 1.796258807182312, "rewards_train/rejected": -2.9137027263641357, "step": 847 }, { "epoch": 0.24, "learning_rate": 4.0855346445071524e-07, "loss": 0.3345, "step": 848 }, { "epoch": 0.24, "logps_train/chosen": -65.88160705566406, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -71.70245361328125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2537854015827179, "rewards_train/margins": 0.8770063817501068, "rewards_train/rejected": -1.1307917833328247, "step": 848 }, { "epoch": 0.24, "logps_train/chosen": -62.95964050292969, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -73.73855590820312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.23619846999645233, "rewards_train/margins": 0.7333600074052811, "rewards_train/rejected": -0.9695584774017334, "step": 849 }, { "epoch": 0.24, "learning_rate": 4.0807446321707974e-07, "loss": 0.4208, "step": 850 }, { "epoch": 0.24, "logps_train/chosen": -69.36096954345703, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -101.65882873535156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1899546384811401, "rewards_train/margins": 1.8454598188400269, "rewards_train/rejected": -3.035414457321167, "step": 850 }, { "epoch": 0.24, "logps_train/chosen": -73.83930969238281, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -96.34136199951172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.49164584279060364, "rewards_train/margins": 1.4840918481349945, "rewards_train/rejected": -1.9757376909255981, "step": 851 }, { "epoch": 0.24, "learning_rate": 4.075944931748223e-07, "loss": 0.3147, "step": 852 }, { "epoch": 0.24, "logps_train/chosen": -61.73343276977539, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -58.92315673828125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7012729644775391, "rewards_train/margins": 0.31762969493865967, "rewards_train/rejected": -1.0189026594161987, "step": 852 }, { "epoch": 0.24, "logps_train/chosen": -41.882659912109375, "logps_train/ref_chosen": -38.75, "logps_train/ref_rejected": -43.75, "logps_train/rejected": -55.42266845703125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3237150311470032, "rewards_train/margins": 0.8373019099235535, "rewards_train/rejected": -1.1610169410705566, "step": 853 }, { "epoch": 0.24, "learning_rate": 4.071135572655892e-07, "loss": 0.5335, "step": 854 }, { "epoch": 0.24, "logps_train/chosen": -102.44183349609375, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -109.13877868652344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1290467977523804, "rewards_train/margins": 1.2723315954208374, "rewards_train/rejected": -2.4013783931732178, "step": 854 }, { "epoch": 0.24, "logps_train/chosen": -94.62855529785156, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -114.26194763183594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6460587978363037, "rewards_train/margins": 1.2277929782867432, "rewards_train/rejected": -2.873851776123047, "step": 855 }, { "epoch": 0.24, "learning_rate": 4.066316584369458e-07, "loss": 0.4149, "step": 856 }, { "epoch": 0.24, "logps_train/chosen": -37.71443557739258, "logps_train/ref_chosen": -28.75, "logps_train/ref_rejected": -39.5, "logps_train/rejected": -51.58930206298828, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8912678956985474, "rewards_train/margins": 0.30984973907470703, "rewards_train/rejected": -1.2011176347732544, "step": 856 }, { "epoch": 0.24, "logps_train/chosen": -82.29135131835938, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -136.8566131591797, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9064784049987793, "rewards_train/margins": 1.6909008026123047, "rewards_train/rejected": -2.597379207611084, "step": 857 }, { "epoch": 0.24, "learning_rate": 4.061487996423594e-07, "loss": 0.4545, "step": 858 }, { "epoch": 0.24, "logps_train/chosen": -66.29974365234375, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -64.24313354492188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5932061076164246, "rewards_train/margins": 0.669388473033905, "rewards_train/rejected": -1.2625945806503296, "step": 858 }, { "epoch": 0.24, "logps_train/chosen": -75.85748291015625, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -89.54031372070312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9605528116226196, "rewards_train/margins": 1.065353274345398, "rewards_train/rejected": -2.0259060859680176, "step": 859 }, { "epoch": 0.24, "learning_rate": 4.056649838411807e-07, "loss": 0.4745, "step": 860 }, { "epoch": 0.24, "logps_train/chosen": -76.42633056640625, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -56.75, "logps_train/rejected": -81.47369384765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3918523788452148, "rewards_train/margins": 1.0730953216552734, "rewards_train/rejected": -2.4649477005004883, "step": 860 }, { "epoch": 0.24, "logps_train/chosen": -77.15873718261719, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -130.86778259277344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8947802186012268, "rewards_train/margins": 2.401763617992401, "rewards_train/rejected": -3.296543836593628, "step": 861 }, { "epoch": 0.24, "learning_rate": 4.0518021399862554e-07, "loss": 0.384, "step": 862 }, { "epoch": 0.24, "logps_train/chosen": -64.38674926757812, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -65.65277862548828, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9566435813903809, "rewards_train/margins": 0.3637123107910156, "rewards_train/rejected": -1.3203558921813965, "step": 862 }, { "epoch": 0.24, "logps_train/chosen": -57.803504943847656, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -62.15605545043945, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4194130301475525, "rewards_train/margins": 0.9262707829475403, "rewards_train/rejected": -1.3456838130950928, "step": 863 }, { "epoch": 0.24, "learning_rate": 4.0469449308575716e-07, "loss": 0.5045, "step": 864 }, { "epoch": 0.24, "logps_train/chosen": -58.545047760009766, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -87.83868408203125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8935671448707581, "rewards_train/margins": 0.9514340758323669, "rewards_train/rejected": -1.845001220703125, "step": 864 }, { "epoch": 0.24, "logps_train/chosen": -59.745479583740234, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -78.23158264160156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7815794348716736, "rewards_train/margins": 1.1259544491767883, "rewards_train/rejected": -1.907533884048462, "step": 865 }, { "epoch": 0.24, "learning_rate": 4.042078240794674e-07, "loss": 0.4875, "step": 866 }, { "epoch": 0.24, "logps_train/chosen": -87.96759033203125, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -115.6199951171875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9797187447547913, "rewards_train/margins": 1.2900447249412537, "rewards_train/rejected": -2.269763469696045, "step": 866 }, { "epoch": 0.24, "logps_train/chosen": -97.64161682128906, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -92.29362487792969, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.0157241821289062, "rewards_train/margins": 0.2773101329803467, "rewards_train/rejected": -2.293034315109253, "step": 867 }, { "epoch": 0.24, "learning_rate": 4.0372020996245917e-07, "loss": 0.469, "step": 868 }, { "epoch": 0.24, "logps_train/chosen": -49.51487350463867, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -64.07353973388672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.660178542137146, "rewards_train/margins": 0.3977614641189575, "rewards_train/rejected": -1.0579400062561035, "step": 868 }, { "epoch": 0.24, "logps_train/chosen": -100.80717468261719, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -108.25254821777344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.869194507598877, "rewards_train/margins": 0.7628970146179199, "rewards_train/rejected": -2.632091522216797, "step": 869 }, { "epoch": 0.24, "learning_rate": 4.032316537232274e-07, "loss": 0.4791, "step": 870 }, { "epoch": 0.24, "logps_train/chosen": -108.68254852294922, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -133.51974487304688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.027629852294922, "rewards_train/margins": 0.8053998947143555, "rewards_train/rejected": -2.8330297470092773, "step": 870 }, { "epoch": 0.24, "logps_train/chosen": -81.01268768310547, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -80.3427505493164, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5993157625198364, "rewards_train/margins": 0.3802715539932251, "rewards_train/rejected": -0.9795873165130615, "step": 871 }, { "epoch": 0.24, "learning_rate": 4.027421583560413e-07, "loss": 0.5457, "step": 872 }, { "epoch": 0.24, "logps_train/chosen": -72.78117370605469, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -90.53843688964844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6754804849624634, "rewards_train/margins": 0.45180022716522217, "rewards_train/rejected": -1.1272807121276855, "step": 872 }, { "epoch": 0.24, "logps_train/chosen": -76.83738708496094, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -79.88653564453125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6415513753890991, "rewards_train/margins": 0.40628230571746826, "rewards_train/rejected": -1.0478336811065674, "step": 873 }, { "epoch": 0.24, "learning_rate": 4.0225172686092594e-07, "loss": 0.5542, "step": 874 }, { "epoch": 0.24, "logps_train/chosen": -39.06175994873047, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -33.75, "logps_train/rejected": -49.59917449951172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5120354890823364, "rewards_train/margins": 1.0713683366775513, "rewards_train/rejected": -1.5834038257598877, "step": 874 }, { "epoch": 0.24, "logps_train/chosen": -57.961402893066406, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -61.920379638671875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3348119854927063, "rewards_train/margins": 0.7919917702674866, "rewards_train/rejected": -1.1268037557601929, "step": 875 }, { "epoch": 0.24, "learning_rate": 4.0176036224364353e-07, "loss": 0.4595, "step": 876 }, { "epoch": 0.24, "logps_train/chosen": -47.12226867675781, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -40.75, "logps_train/rejected": -48.73451232910156, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.6239946484565735, "rewards_train/margins": 0.17172223329544067, "rewards_train/rejected": -0.7957168817520142, "step": 876 }, { "epoch": 0.25, "logps_train/chosen": -63.09483337402344, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -72.131103515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9297962188720703, "rewards_train/margins": 1.1639783382415771, "rewards_train/rejected": -2.0937745571136475, "step": 877 }, { "epoch": 0.25, "learning_rate": 4.0126806751567527e-07, "loss": 0.5059, "step": 878 }, { "epoch": 0.25, "logps_train/chosen": -83.72850036621094, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -94.05934143066406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2841782569885254, "rewards_train/margins": 1.6248812675476074, "rewards_train/rejected": -2.909059524536133, "step": 878 }, { "epoch": 0.25, "logps_train/chosen": -78.61212158203125, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -83.38127136230469, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.42073363065719604, "rewards_train/margins": 0.3151957392692566, "rewards_train/rejected": -0.7359293699264526, "step": 879 }, { "epoch": 0.25, "learning_rate": 4.007748456942029e-07, "loss": 0.5034, "step": 880 }, { "epoch": 0.25, "logps_train/chosen": -62.1357536315918, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -62.99882507324219, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.5403333902359009, "rewards_train/margins": 0.20095562934875488, "rewards_train/rejected": -1.7412890195846558, "step": 880 }, { "epoch": 0.25, "logps_train/chosen": -84.62681579589844, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -85.95426177978516, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.5182240009307861, "rewards_train/margins": 0.0038379430770874023, "rewards_train/rejected": -1.5220619440078735, "step": 881 }, { "epoch": 0.25, "learning_rate": 4.002806998020901e-07, "loss": 0.7178, "step": 882 }, { "epoch": 0.25, "logps_train/chosen": -80.11326599121094, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -80.46920013427734, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2980449199676514, "rewards_train/margins": 1.2316877841949463, "rewards_train/rejected": -2.5297327041625977, "step": 882 }, { "epoch": 0.25, "logps_train/chosen": -92.61192321777344, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -104.4124755859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2393176555633545, "rewards_train/margins": 1.0152103900909424, "rewards_train/rejected": -2.254528045654297, "step": 883 }, { "epoch": 0.25, "learning_rate": 3.997856328678639e-07, "loss": 0.4796, "step": 884 }, { "epoch": 0.25, "logps_train/chosen": -64.76846313476562, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -81.062255859375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5537751317024231, "rewards_train/margins": 1.4395355582237244, "rewards_train/rejected": -1.9933106899261475, "step": 884 }, { "epoch": 0.25, "logps_train/chosen": -49.63147735595703, "logps_train/ref_chosen": -41.25, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -64.95520782470703, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8460581302642822, "rewards_train/margins": 0.7018063068389893, "rewards_train/rejected": -1.5478644371032715, "step": 885 }, { "epoch": 0.25, "learning_rate": 3.9928964792569654e-07, "loss": 0.4331, "step": 886 }, { "epoch": 0.25, "logps_train/chosen": -52.839752197265625, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -60.27191925048828, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.8090975284576416, "rewards_train/margins": 0.3689000606536865, "rewards_train/rejected": -1.1779975891113281, "step": 886 }, { "epoch": 0.25, "logps_train/chosen": -40.76837158203125, "logps_train/ref_chosen": -31.5, "logps_train/ref_rejected": -33.0, "logps_train/rejected": -42.9201545715332, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.9231261610984802, "rewards_train/margins": 0.07709234952926636, "rewards_train/rejected": -1.0002185106277466, "step": 887 }, { "epoch": 0.25, "learning_rate": 3.9879274801538614e-07, "loss": 0.6346, "step": 888 }, { "epoch": 0.25, "logps_train/chosen": -73.1782455444336, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -75.08341979980469, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9022000432014465, "rewards_train/margins": 0.1959860920906067, "rewards_train/rejected": -1.0981861352920532, "step": 888 }, { "epoch": 0.25, "logps_train/chosen": -76.214599609375, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -87.58216094970703, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.1128662824630737, "rewards_train/margins": 1.0871468782424927, "rewards_train/rejected": -2.2000131607055664, "step": 889 }, { "epoch": 0.25, "learning_rate": 3.982949361823388e-07, "loss": 0.6556, "step": 890 }, { "epoch": 0.25, "logps_train/chosen": -64.17122650146484, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -76.07058715820312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.46184954047203064, "rewards_train/margins": 1.1381775438785553, "rewards_train/rejected": -1.600027084350586, "step": 890 }, { "epoch": 0.25, "logps_train/chosen": -52.07596206665039, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -66.04190826416016, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6796664595603943, "rewards_train/margins": 1.0745243430137634, "rewards_train/rejected": -1.7541908025741577, "step": 891 }, { "epoch": 0.25, "learning_rate": 3.977962154775495e-07, "loss": 0.4402, "step": 892 }, { "epoch": 0.25, "logps_train/chosen": -81.47840118408203, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -98.81383514404297, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8094611167907715, "rewards_train/margins": 1.1202619075775146, "rewards_train/rejected": -1.9297230243682861, "step": 892 }, { "epoch": 0.25, "logps_train/chosen": -124.6190414428711, "logps_train/ref_chosen": -112.0, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -122.74934387207031, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.2790918350219727, "rewards_train/margins": 0.726311206817627, "rewards_train/rejected": -2.0054030418395996, "step": 893 }, { "epoch": 0.25, "learning_rate": 3.9729658895758345e-07, "loss": 0.485, "step": 894 }, { "epoch": 0.25, "logps_train/chosen": -36.10619354248047, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -62.65257263183594, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.20097553730010986, "rewards_train/margins": 1.416821002960205, "rewards_train/rejected": -1.617796540260315, "step": 894 }, { "epoch": 0.25, "logps_train/chosen": -83.04344177246094, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -100.74031829833984, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.4949696063995361, "rewards_train/margins": 0.9103126525878906, "rewards_train/rejected": -2.4052822589874268, "step": 895 }, { "epoch": 0.25, "learning_rate": 3.967960596845576e-07, "loss": 0.4898, "step": 896 }, { "epoch": 0.25, "logps_train/chosen": -84.88798522949219, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -119.5206298828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.436795711517334, "rewards_train/margins": 1.4168298244476318, "rewards_train/rejected": -2.853625535964966, "step": 896 }, { "epoch": 0.25, "logps_train/chosen": -36.703125, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -38.25, "logps_train/rejected": -44.81077194213867, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.1004394143819809, "rewards_train/margins": 0.5486068874597549, "rewards_train/rejected": -0.6490463018417358, "step": 897 }, { "epoch": 0.25, "learning_rate": 3.962946307261216e-07, "loss": 0.3956, "step": 898 }, { "epoch": 0.25, "logps_train/chosen": -98.51810455322266, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -106.60423278808594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1647982597351074, "rewards_train/margins": 1.142890453338623, "rewards_train/rejected": -2.3076887130737305, "step": 898 }, { "epoch": 0.25, "logps_train/chosen": -104.0866470336914, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -106.87762451171875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3336645364761353, "rewards_train/margins": 0.8150357007980347, "rewards_train/rejected": -2.14870023727417, "step": 899 }, { "epoch": 0.25, "learning_rate": 3.9579230515543914e-07, "loss": 0.4522, "step": 900 }, { "epoch": 0.25, "logps_train/chosen": -86.26780700683594, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -123.68040466308594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1877177953720093, "rewards_train/margins": 1.0637205839157104, "rewards_train/rejected": -2.2514383792877197, "step": 900 }, { "epoch": 0.25, "logps_train/chosen": -56.91307067871094, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -60.705528259277344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8772448301315308, "rewards_train/margins": 0.783737301826477, "rewards_train/rejected": -1.6609821319580078, "step": 901 }, { "epoch": 0.25, "learning_rate": 3.952890860511691e-07, "loss": 0.4604, "step": 902 }, { "epoch": 0.25, "logps_train/chosen": -74.55409240722656, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -87.43728637695312, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.4456429481506348, "rewards_train/margins": 0.5340230464935303, "rewards_train/rejected": -1.979665994644165, "step": 902 }, { "epoch": 0.25, "logps_train/chosen": -63.08944320678711, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -81.43690490722656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7562098503112793, "rewards_train/margins": 1.2976367473602295, "rewards_train/rejected": -2.053846597671509, "step": 903 }, { "epoch": 0.25, "learning_rate": 3.9478497649744656e-07, "loss": 0.4786, "step": 904 }, { "epoch": 0.25, "logps_train/chosen": -67.21929931640625, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -90.53805541992188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8326723575592041, "rewards_train/margins": 1.0820711851119995, "rewards_train/rejected": -1.9147435426712036, "step": 904 }, { "epoch": 0.25, "logps_train/chosen": -72.20622253417969, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -80.93544006347656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2575366497039795, "rewards_train/margins": 0.5289757251739502, "rewards_train/rejected": -1.7865123748779297, "step": 905 }, { "epoch": 0.25, "learning_rate": 3.9427997958386403e-07, "loss": 0.5094, "step": 906 }, { "epoch": 0.25, "logps_train/chosen": -61.616329193115234, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -77.89895629882812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6491086483001709, "rewards_train/margins": 1.3169593811035156, "rewards_train/rejected": -1.9660680294036865, "step": 906 }, { "epoch": 0.25, "logps_train/chosen": -74.62712097167969, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -76.43041229248047, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.3951337337493896, "rewards_train/margins": 0.012946128845214844, "rewards_train/rejected": -1.4080798625946045, "step": 907 }, { "epoch": 0.25, "learning_rate": 3.9377409840545254e-07, "loss": 0.6108, "step": 908 }, { "epoch": 0.25, "logps_train/chosen": -75.5404281616211, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -73.61381530761719, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.308144211769104, "rewards_train/margins": 0.5717917084693909, "rewards_train/rejected": -0.8799359202384949, "step": 908 }, { "epoch": 0.25, "logps_train/chosen": -77.3523941040039, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -79.1645736694336, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.293442726135254, "rewards_train/margins": 1.035905361175537, "rewards_train/rejected": -2.329348087310791, "step": 909 }, { "epoch": 0.25, "learning_rate": 3.932673360626627e-07, "loss": 0.4687, "step": 910 }, { "epoch": 0.25, "logps_train/chosen": -89.87672424316406, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -90.80999755859375, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.7661880850791931, "rewards_train/margins": 0.5757489800453186, "rewards_train/rejected": -1.3419370651245117, "step": 910 }, { "epoch": 0.25, "logps_train/chosen": -42.630550384521484, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -54.55341339111328, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4053402245044708, "rewards_train/margins": 0.9484386146068573, "rewards_train/rejected": -1.3537788391113281, "step": 911 }, { "epoch": 0.25, "learning_rate": 3.9275969566134526e-07, "loss": 0.5068, "step": 912 }, { "epoch": 0.25, "logps_train/chosen": -57.77507400512695, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -66.93160247802734, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.840886116027832, "rewards_train/margins": 0.19485235214233398, "rewards_train/rejected": -1.035738468170166, "step": 912 }, { "epoch": 0.26, "logps_train/chosen": -46.246376037597656, "logps_train/ref_chosen": -41.25, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -59.70482635498047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5017860531806946, "rewards_train/margins": 1.3604934811592102, "rewards_train/rejected": -1.8622795343399048, "step": 913 }, { "epoch": 0.26, "learning_rate": 3.9225118031273285e-07, "loss": 0.4865, "step": 914 }, { "epoch": 0.26, "logps_train/chosen": -74.92893981933594, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -98.67294311523438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.033128261566162, "rewards_train/margins": 1.7943224906921387, "rewards_train/rejected": -2.827450752258301, "step": 914 }, { "epoch": 0.26, "logps_train/chosen": -79.56307983398438, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -82.70267486572266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1656832695007324, "rewards_train/margins": 0.8362245559692383, "rewards_train/rejected": -2.0019078254699707, "step": 915 }, { "epoch": 0.26, "learning_rate": 3.917417931334202e-07, "loss": 0.3316, "step": 916 }, { "epoch": 0.26, "logps_train/chosen": -82.14381408691406, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -92.40758514404297, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7503196597099304, "rewards_train/margins": 1.2701264023780823, "rewards_train/rejected": -2.0204460620880127, "step": 916 }, { "epoch": 0.26, "logps_train/chosen": -60.04490280151367, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -68.78225708007812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3076155483722687, "rewards_train/margins": 1.4287887513637543, "rewards_train/rejected": -1.736404299736023, "step": 917 }, { "epoch": 0.26, "learning_rate": 3.912315372453455e-07, "loss": 0.3723, "step": 918 }, { "epoch": 0.26, "logps_train/chosen": -96.38775634765625, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -117.23223876953125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.479400396347046, "rewards_train/margins": 1.7149176597595215, "rewards_train/rejected": -3.1943180561065674, "step": 918 }, { "epoch": 0.26, "logps_train/chosen": -43.03814697265625, "logps_train/ref_chosen": -38.25, "logps_train/ref_rejected": -43.75, "logps_train/rejected": -61.24681854248047, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.48052364587783813, "rewards_train/margins": 1.2604665160179138, "rewards_train/rejected": -1.740990161895752, "step": 919 }, { "epoch": 0.26, "learning_rate": 3.9072041577577086e-07, "loss": 0.4571, "step": 920 }, { "epoch": 0.26, "logps_train/chosen": -88.52217864990234, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -122.61509704589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8412805795669556, "rewards_train/margins": 2.4546045064926147, "rewards_train/rejected": -3.2958850860595703, "step": 920 }, { "epoch": 0.26, "logps_train/chosen": -87.45503997802734, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -82.5659408569336, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8392540216445923, "rewards_train/margins": 0.7376526594161987, "rewards_train/rejected": -1.576906681060791, "step": 921 }, { "epoch": 0.26, "learning_rate": 3.9020843185726375e-07, "loss": 0.3199, "step": 922 }, { "epoch": 0.26, "logps_train/chosen": -59.98433303833008, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -75.5238265991211, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5667929649353027, "rewards_train/margins": 0.9804141521453857, "rewards_train/rejected": -1.5472071170806885, "step": 922 }, { "epoch": 0.26, "logps_train/chosen": -101.81692504882812, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -92.60254669189453, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.4176301956176758, "rewards_train/margins": 0.9957494735717773, "rewards_train/rejected": -2.413379669189453, "step": 923 }, { "epoch": 0.26, "learning_rate": 3.89695588627677e-07, "loss": 0.4387, "step": 924 }, { "epoch": 0.26, "logps_train/chosen": -66.48164367675781, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -59.29444122314453, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6981643438339233, "rewards_train/margins": 0.5075982809066772, "rewards_train/rejected": -1.2057626247406006, "step": 924 }, { "epoch": 0.26, "logps_train/chosen": -78.38574981689453, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -79.59117126464844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0385751724243164, "rewards_train/margins": 0.5092141628265381, "rewards_train/rejected": -1.5477893352508545, "step": 925 }, { "epoch": 0.26, "learning_rate": 3.891818892301304e-07, "loss": 0.549, "step": 926 }, { "epoch": 0.26, "logps_train/chosen": -58.79720687866211, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -70.23634338378906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1250334978103638, "rewards_train/margins": 0.9478198289871216, "rewards_train/rejected": -2.0728533267974854, "step": 926 }, { "epoch": 0.26, "logps_train/chosen": -31.278968811035156, "logps_train/ref_chosen": -31.0, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -41.0980224609375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.022013187408447266, "rewards_train/margins": 0.3496786952018738, "rewards_train/rejected": -0.37169188261032104, "step": 927 }, { "epoch": 0.26, "learning_rate": 3.8866733681299066e-07, "loss": 0.5181, "step": 928 }, { "epoch": 0.26, "logps_train/chosen": -92.73963928222656, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -112.15428161621094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3704484701156616, "rewards_train/margins": 1.499667763710022, "rewards_train/rejected": -2.8701162338256836, "step": 928 }, { "epoch": 0.26, "logps_train/chosen": -69.67546081542969, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -82.57536315917969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.932390034198761, "rewards_train/margins": 1.0333487391471863, "rewards_train/rejected": -1.9657387733459473, "step": 929 }, { "epoch": 0.26, "learning_rate": 3.8815193452985274e-07, "loss": 0.351, "step": 930 }, { "epoch": 0.26, "logps_train/chosen": -52.05970764160156, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -60.10272216796875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.21065837144851685, "rewards_train/margins": 1.4384812712669373, "rewards_train/rejected": -1.649139642715454, "step": 930 }, { "epoch": 0.26, "logps_train/chosen": -33.79679489135742, "logps_train/ref_chosen": -32.25, "logps_train/ref_rejected": -38.25, "logps_train/rejected": -49.92681884765625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.14764821529388428, "rewards_train/margins": 1.0192526578903198, "rewards_train/rejected": -1.166900873184204, "step": 931 }, { "epoch": 0.26, "learning_rate": 3.876356855395202e-07, "loss": 0.4436, "step": 932 }, { "epoch": 0.26, "logps_train/chosen": -57.312294006347656, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -75.57594299316406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1866980493068695, "rewards_train/margins": 1.2412082254886627, "rewards_train/rejected": -1.4279062747955322, "step": 932 }, { "epoch": 0.26, "logps_train/chosen": -82.04449462890625, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -79.35250091552734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2163639068603516, "rewards_train/margins": 0.8449602127075195, "rewards_train/rejected": -2.061324119567871, "step": 933 }, { "epoch": 0.26, "learning_rate": 3.8711859300598584e-07, "loss": 0.4442, "step": 934 }, { "epoch": 0.26, "logps_train/chosen": -37.849609375, "logps_train/ref_chosen": -29.875, "logps_train/ref_rejected": -39.75, "logps_train/rejected": -52.62455749511719, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.8034486770629883, "rewards_train/margins": 0.4742414951324463, "rewards_train/rejected": -1.2776901721954346, "step": 934 }, { "epoch": 0.26, "logps_train/chosen": -77.09475708007812, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -101.5009536743164, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9555695056915283, "rewards_train/margins": 1.3156194686889648, "rewards_train/rejected": -2.271188974380493, "step": 935 }, { "epoch": 0.26, "learning_rate": 3.866006600984125e-07, "loss": 0.5286, "step": 936 }, { "epoch": 0.26, "logps_train/chosen": -66.92366790771484, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -81.85852813720703, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8860437273979187, "rewards_train/margins": 1.6044233441352844, "rewards_train/rejected": -2.490467071533203, "step": 936 }, { "epoch": 0.26, "logps_train/chosen": -58.88975524902344, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -63.51271057128906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.958702027797699, "rewards_train/margins": 0.7064364552497864, "rewards_train/rejected": -1.6651384830474854, "step": 937 }, { "epoch": 0.26, "learning_rate": 3.860818899911134e-07, "loss": 0.4249, "step": 938 }, { "epoch": 0.26, "logps_train/chosen": -75.26493835449219, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -63.5, "logps_train/rejected": -83.61276245117188, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.7279584407806396, "rewards_train/margins": 1.2928874492645264, "rewards_train/rejected": -2.020845890045166, "step": 938 }, { "epoch": 0.26, "logps_train/chosen": -80.58453369140625, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -91.38980102539062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2749571800231934, "rewards_train/margins": 0.7747652530670166, "rewards_train/rejected": -2.04972243309021, "step": 939 }, { "epoch": 0.26, "learning_rate": 3.855622858635329e-07, "loss": 0.5245, "step": 940 }, { "epoch": 0.26, "logps_train/chosen": -68.03158569335938, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -71.84562683105469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.38909628987312317, "rewards_train/margins": 0.714607447385788, "rewards_train/rejected": -1.1037037372589111, "step": 940 }, { "epoch": 0.26, "logps_train/chosen": -74.56521606445312, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -69.20830535888672, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.1147241592407227, "rewards_train/margins": -0.026706457138061523, "rewards_train/rejected": -1.0880177021026611, "step": 941 }, { "epoch": 0.26, "learning_rate": 3.850418509002269e-07, "loss": 0.6317, "step": 942 }, { "epoch": 0.26, "logps_train/chosen": -93.73075866699219, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -92.96345520019531, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.697880744934082, "rewards_train/margins": 0.30159056186676025, "rewards_train/rejected": -1.9994713068008423, "step": 942 }, { "epoch": 0.26, "logps_train/chosen": -60.12846374511719, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -73.21841430664062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.28999507427215576, "rewards_train/margins": 0.5353621244430542, "rewards_train/rejected": -0.82535719871521, "step": 943 }, { "epoch": 0.26, "learning_rate": 3.845205882908432e-07, "loss": 0.6042, "step": 944 }, { "epoch": 0.26, "logps_train/chosen": -76.77639770507812, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -73.18373107910156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9510769248008728, "rewards_train/margins": 0.36924880743026733, "rewards_train/rejected": -1.3203257322311401, "step": 944 }, { "epoch": 0.26, "logps_train/chosen": -59.21940994262695, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -86.09684753417969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8250659704208374, "rewards_train/margins": 1.2748533487319946, "rewards_train/rejected": -2.099919319152832, "step": 945 }, { "epoch": 0.26, "learning_rate": 3.839985012301021e-07, "loss": 0.4803, "step": 946 }, { "epoch": 0.26, "logps_train/chosen": -77.0558853149414, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -75.96937561035156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2387912273406982, "rewards_train/margins": 0.46146631240844727, "rewards_train/rejected": -1.7002575397491455, "step": 946 }, { "epoch": 0.26, "logps_train/chosen": -39.2894287109375, "logps_train/ref_chosen": -34.25, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -53.667877197265625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.4969112277030945, "rewards_train/margins": 0.2210482954978943, "rewards_train/rejected": -0.7179595232009888, "step": 947 }, { "epoch": 0.26, "learning_rate": 3.834755929177772e-07, "loss": 0.6018, "step": 948 }, { "epoch": 0.26, "logps_train/chosen": -82.52078247070312, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -75.34527587890625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6145778894424438, "rewards_train/margins": 1.3945590257644653, "rewards_train/rejected": -2.009136915206909, "step": 948 }, { "epoch": 0.27, "logps_train/chosen": -117.22262573242188, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -127.93438720703125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.590231418609619, "rewards_train/margins": 1.379183292388916, "rewards_train/rejected": -3.969414710998535, "step": 949 }, { "epoch": 0.27, "learning_rate": 3.8295186655867484e-07, "loss": 0.4674, "step": 950 }, { "epoch": 0.27, "logps_train/chosen": -107.68785095214844, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -121.63465881347656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.365269899368286, "rewards_train/margins": 1.8106958866119385, "rewards_train/rejected": -4.175965785980225, "step": 950 }, { "epoch": 0.27, "logps_train/chosen": -76.14000701904297, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -120.9788589477539, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3702507019042969, "rewards_train/margins": 2.1659164428710938, "rewards_train/rejected": -2.5361671447753906, "step": 951 }, { "epoch": 0.27, "learning_rate": 3.8242732536261534e-07, "loss": 0.3037, "step": 952 }, { "epoch": 0.27, "logps_train/chosen": -114.98648834228516, "logps_train/ref_chosen": -101.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -152.13429260253906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4033362865447998, "rewards_train/margins": 2.6171247959136963, "rewards_train/rejected": -4.020461082458496, "step": 952 }, { "epoch": 0.27, "logps_train/chosen": -93.19520568847656, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -125.11563110351562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.676551342010498, "rewards_train/margins": 2.2264187335968018, "rewards_train/rejected": -2.9029700756073, "step": 953 }, { "epoch": 0.27, "learning_rate": 3.819019725444129e-07, "loss": 0.2898, "step": 954 }, { "epoch": 0.27, "logps_train/chosen": -93.14697265625, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -114.73471069335938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6443849802017212, "rewards_train/margins": 1.9978359937667847, "rewards_train/rejected": -2.642220973968506, "step": 954 }, { "epoch": 0.27, "logps_train/chosen": -34.24162673950195, "logps_train/ref_chosen": -28.875, "logps_train/ref_rejected": -42.25, "logps_train/rejected": -52.270198822021484, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5331470966339111, "rewards_train/margins": 0.47202205657958984, "rewards_train/rejected": -1.005169153213501, "step": 955 }, { "epoch": 0.27, "learning_rate": 3.813758113238561e-07, "loss": 0.4821, "step": 956 }, { "epoch": 0.27, "logps_train/chosen": -57.884178161621094, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -73.18476104736328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7455227375030518, "rewards_train/margins": 1.4659221172332764, "rewards_train/rejected": -2.211444854736328, "step": 956 }, { "epoch": 0.27, "logps_train/chosen": -80.27413177490234, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -107.8955307006836, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2862021923065186, "rewards_train/margins": 0.8603823184967041, "rewards_train/rejected": -2.1465845108032227, "step": 957 }, { "epoch": 0.27, "learning_rate": 3.808488449256879e-07, "loss": 0.4038, "step": 958 }, { "epoch": 0.27, "logps_train/chosen": -55.29138946533203, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -76.25001525878906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.17679546773433685, "rewards_train/margins": 0.6376593858003616, "rewards_train/rejected": -0.8144548535346985, "step": 958 }, { "epoch": 0.27, "logps_train/chosen": -99.0465087890625, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -56.75, "logps_train/rejected": -74.69607543945312, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.4119257926940918, "rewards_train/margins": 0.3733067512512207, "rewards_train/rejected": -1.7852325439453125, "step": 959 }, { "epoch": 0.27, "learning_rate": 3.8032107657958626e-07, "loss": 0.6727, "step": 960 }, { "epoch": 0.27, "logps_train/chosen": -28.30646514892578, "logps_train/ref_chosen": -27.375, "logps_train/ref_rejected": -28.375, "logps_train/rejected": -37.51717758178711, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.08845892548561096, "rewards_train/margins": 0.8301533758640289, "rewards_train/rejected": -0.9186123013496399, "step": 960 }, { "epoch": 0.27, "logps_train/chosen": -31.715511322021484, "logps_train/ref_chosen": -28.0, "logps_train/ref_rejected": -28.125, "logps_train/rejected": -35.91783142089844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.37037909030914307, "rewards_train/margins": 0.40538859367370605, "rewards_train/rejected": -0.7757676839828491, "step": 961 }, { "epoch": 0.27, "learning_rate": 3.797925095201438e-07, "loss": 0.5759, "step": 962 }, { "epoch": 0.27, "logps_train/chosen": -66.50740814208984, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -103.23048400878906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7607013583183289, "rewards_train/margins": 2.427582085132599, "rewards_train/rejected": -3.1882834434509277, "step": 962 }, { "epoch": 0.27, "logps_train/chosen": -62.93327331542969, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -76.07221984863281, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7359054088592529, "rewards_train/margins": 1.2978789806365967, "rewards_train/rejected": -2.0337843894958496, "step": 963 }, { "epoch": 0.27, "learning_rate": 3.792631469868487e-07, "loss": 0.3608, "step": 964 }, { "epoch": 0.27, "logps_train/chosen": -69.26075744628906, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -78.39433288574219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.73271644115448, "rewards_train/margins": 0.6305446624755859, "rewards_train/rejected": -1.363261103630066, "step": 964 }, { "epoch": 0.27, "logps_train/chosen": -85.05624389648438, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -91.3147964477539, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2844817042350769, "rewards_train/margins": 1.776245892047882, "rewards_train/rejected": -2.060727596282959, "step": 965 }, { "epoch": 0.27, "learning_rate": 3.787329922240642e-07, "loss": 0.4784, "step": 966 }, { "epoch": 0.27, "logps_train/chosen": -59.374000549316406, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -70.02639770507812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.36415785551071167, "rewards_train/margins": 1.7120173573493958, "rewards_train/rejected": -2.0761752128601074, "step": 966 }, { "epoch": 0.27, "logps_train/chosen": -73.93157958984375, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -78.58828735351562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7268491983413696, "rewards_train/margins": 1.136960506439209, "rewards_train/rejected": -1.8638097047805786, "step": 967 }, { "epoch": 0.27, "learning_rate": 3.782020484810089e-07, "loss": 0.3588, "step": 968 }, { "epoch": 0.27, "logps_train/chosen": -75.43313598632812, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -83.15252685546875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.362356424331665, "rewards_train/margins": 0.17281866073608398, "rewards_train/rejected": -1.535175085067749, "step": 968 }, { "epoch": 0.27, "logps_train/chosen": -99.70472717285156, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -113.51841735839844, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.9423480033874512, "rewards_train/margins": 1.1915254592895508, "rewards_train/rejected": -3.133873462677002, "step": 969 }, { "epoch": 0.27, "learning_rate": 3.776703190117372e-07, "loss": 0.5644, "step": 970 }, { "epoch": 0.27, "logps_train/chosen": -52.91246032714844, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -58.44598388671875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.7568711042404175, "rewards_train/margins": 0.3406568765640259, "rewards_train/rejected": -1.0975279808044434, "step": 970 }, { "epoch": 0.27, "logps_train/chosen": -36.84766387939453, "logps_train/ref_chosen": -35.0, "logps_train/ref_rejected": -27.625, "logps_train/rejected": -32.782142639160156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.17265711724758148, "rewards_train/margins": 0.3489164263010025, "rewards_train/rejected": -0.521573543548584, "step": 971 }, { "epoch": 0.27, "learning_rate": 3.771378070751188e-07, "loss": 0.5983, "step": 972 }, { "epoch": 0.27, "logps_train/chosen": -61.879356384277344, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -71.01873779296875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.14887315034866333, "rewards_train/margins": 0.886203944683075, "rewards_train/rejected": -1.0350770950317383, "step": 972 }, { "epoch": 0.27, "logps_train/chosen": -60.30873107910156, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -45.75, "logps_train/rejected": -57.47590637207031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7766494154930115, "rewards_train/margins": 0.3982852101325989, "rewards_train/rejected": -1.1749346256256104, "step": 973 }, { "epoch": 0.27, "learning_rate": 3.7660451593481906e-07, "loss": 0.4696, "step": 974 }, { "epoch": 0.27, "logps_train/chosen": -84.56649017333984, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -82.98353576660156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.775667428970337, "rewards_train/margins": 0.5963926315307617, "rewards_train/rejected": -2.3720600605010986, "step": 974 }, { "epoch": 0.27, "logps_train/chosen": -57.22075653076172, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -57.888938903808594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.24507342278957367, "rewards_train/margins": 0.9418674856424332, "rewards_train/rejected": -1.1869409084320068, "step": 975 }, { "epoch": 0.27, "learning_rate": 3.7607044885927913e-07, "loss": 0.5744, "step": 976 }, { "epoch": 0.27, "logps_train/chosen": -116.5517578125, "logps_train/ref_chosen": -106.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -149.39083862304688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0426762104034424, "rewards_train/margins": 2.0589070320129395, "rewards_train/rejected": -3.101583242416382, "step": 976 }, { "epoch": 0.27, "logps_train/chosen": -100.63493347167969, "logps_train/ref_chosen": -86.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -96.67007446289062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4766771793365479, "rewards_train/margins": 0.6221663951873779, "rewards_train/rejected": -2.098843574523926, "step": 977 }, { "epoch": 0.27, "learning_rate": 3.7553560912169533e-07, "loss": 0.3579, "step": 978 }, { "epoch": 0.27, "logps_train/chosen": -63.02811050415039, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -115.67050170898438, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8616006374359131, "rewards_train/margins": 2.8093068599700928, "rewards_train/rejected": -3.670907497406006, "step": 978 }, { "epoch": 0.27, "logps_train/chosen": -63.73190689086914, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -85.74191284179688, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.8986788988113403, "rewards_train/margins": 0.5228750705718994, "rewards_train/rejected": -1.4215539693832397, "step": 979 }, { "epoch": 0.27, "learning_rate": 3.75e-07, "loss": 0.4244, "step": 980 }, { "epoch": 0.27, "logps_train/chosen": -44.13560485839844, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -35.25, "logps_train/rejected": -43.51254653930664, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.30501577258110046, "rewards_train/margins": 0.5095200836658478, "rewards_train/rejected": -0.8145358562469482, "step": 980 }, { "epoch": 0.27, "logps_train/chosen": -38.95943069458008, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -45.373722076416016, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.35805249214172363, "rewards_train/margins": 0.15978842973709106, "rewards_train/rejected": -0.5178409218788147, "step": 981 }, { "epoch": 0.27, "learning_rate": 3.744636247768404e-07, "loss": 0.6043, "step": 982 }, { "epoch": 0.27, "logps_train/chosen": -96.59376525878906, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -111.5, "logps_train/rejected": -136.58489990234375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7636739611625671, "rewards_train/margins": 1.7620039582252502, "rewards_train/rejected": -2.5256779193878174, "step": 982 }, { "epoch": 0.27, "logps_train/chosen": -69.85820007324219, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -67.28929901123047, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4448046684265137, "rewards_train/margins": 0.3501410484313965, "rewards_train/rejected": -1.7949457168579102, "step": 983 }, { "epoch": 0.28, "learning_rate": 3.739264867395593e-07, "loss": 0.4394, "step": 984 }, { "epoch": 0.28, "logps_train/chosen": -96.44001007080078, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -117.32788848876953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6221262812614441, "rewards_train/margins": 1.725213348865509, "rewards_train/rejected": -2.347339630126953, "step": 984 }, { "epoch": 0.28, "logps_train/chosen": -60.99164581298828, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -68.77722930908203, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7296335697174072, "rewards_train/margins": 0.820159912109375, "rewards_train/rejected": -1.5497934818267822, "step": 985 }, { "epoch": 0.28, "learning_rate": 3.7338858918017455e-07, "loss": 0.3727, "step": 986 }, { "epoch": 0.28, "logps_train/chosen": -33.59450912475586, "logps_train/ref_chosen": -24.625, "logps_train/ref_rejected": -39.75, "logps_train/rejected": -53.87938690185547, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8946071863174438, "rewards_train/margins": 0.527706503868103, "rewards_train/rejected": -1.4223136901855469, "step": 986 }, { "epoch": 0.28, "logps_train/chosen": -73.20545196533203, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -78.76162719726562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4795297086238861, "rewards_train/margins": 1.1833525002002716, "rewards_train/rejected": -1.6628822088241577, "step": 987 }, { "epoch": 0.28, "learning_rate": 3.728499353953591e-07, "loss": 0.4296, "step": 988 }, { "epoch": 0.28, "logps_train/chosen": -92.98452758789062, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -112.6695556640625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.30782726407051086, "rewards_train/margins": 2.038816660642624, "rewards_train/rejected": -2.3466439247131348, "step": 988 }, { "epoch": 0.28, "logps_train/chosen": -41.62238311767578, "logps_train/ref_chosen": -35.25, "logps_train/ref_rejected": -32.75, "logps_train/rejected": -45.17969512939453, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6384590864181519, "rewards_train/margins": 0.6095397472381592, "rewards_train/rejected": -1.247998833656311, "step": 989 }, { "epoch": 0.28, "learning_rate": 3.7231052868642066e-07, "loss": 0.4056, "step": 990 }, { "epoch": 0.28, "logps_train/chosen": -77.09317016601562, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -92.37982940673828, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6284575462341309, "rewards_train/margins": 0.4040563106536865, "rewards_train/rejected": -2.0325138568878174, "step": 990 }, { "epoch": 0.28, "logps_train/chosen": -76.18235778808594, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -104.20475769042969, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1693096160888672, "rewards_train/margins": 1.309173583984375, "rewards_train/rejected": -2.478483200073242, "step": 991 }, { "epoch": 0.28, "learning_rate": 3.717703723592811e-07, "loss": 0.5202, "step": 992 }, { "epoch": 0.28, "logps_train/chosen": -78.13817596435547, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -107.66416931152344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7498531341552734, "rewards_train/margins": 2.491173505783081, "rewards_train/rejected": -3.2410266399383545, "step": 992 }, { "epoch": 0.28, "logps_train/chosen": -33.91649627685547, "logps_train/ref_chosen": -30.375, "logps_train/ref_rejected": -15.375, "logps_train/rejected": -22.65131378173828, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.34868061542510986, "rewards_train/margins": 0.3814411163330078, "rewards_train/rejected": -0.7301217317581177, "step": 993 }, { "epoch": 0.28, "learning_rate": 3.7122946972445717e-07, "loss": 0.4623, "step": 994 }, { "epoch": 0.28, "logps_train/chosen": -58.274356842041016, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -92.54460906982422, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.961810827255249, "rewards_train/margins": 1.0485093593597412, "rewards_train/rejected": -2.0103201866149902, "step": 994 }, { "epoch": 0.28, "logps_train/chosen": -89.96923828125, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -96.57150268554688, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8988767862319946, "rewards_train/margins": 0.9582729339599609, "rewards_train/rejected": -1.8571497201919556, "step": 995 }, { "epoch": 0.28, "learning_rate": 3.706878240970391e-07, "loss": 0.4345, "step": 996 }, { "epoch": 0.28, "logps_train/chosen": -93.88801574707031, "logps_train/ref_chosen": -86.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -103.30751037597656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8087237477302551, "rewards_train/margins": 0.9267154335975647, "rewards_train/rejected": -1.7354391813278198, "step": 996 }, { "epoch": 0.28, "logps_train/chosen": -59.505218505859375, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -64.93518829345703, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.85823655128479, "rewards_train/margins": 0.6938760280609131, "rewards_train/rejected": -1.5521125793457031, "step": 997 }, { "epoch": 0.28, "learning_rate": 3.7014543879667095e-07, "loss": 0.4147, "step": 998 }, { "epoch": 0.28, "logps_train/chosen": -128.09014892578125, "logps_train/ref_chosen": -108.5, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -121.72225189208984, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.9648754596710205, "rewards_train/margins": 0.9577405452728271, "rewards_train/rejected": -2.9226160049438477, "step": 998 }, { "epoch": 0.28, "logps_train/chosen": -88.49454498291016, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -89.01692199707031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7994546294212341, "rewards_train/margins": 0.8420813679695129, "rewards_train/rejected": -1.641535997390747, "step": 999 }, { "epoch": 0.28, "learning_rate": 3.696023171475301e-07, "loss": 0.5059, "step": 1000 }, { "epoch": 0.28, "logps_train/chosen": -92.29740905761719, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -124.15961456298828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.365678608417511, "rewards_train/margins": 2.425282657146454, "rewards_train/rejected": -2.790961265563965, "step": 1000 }, { "epoch": 0.28, "logps_train/chosen": -39.370155334472656, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -50.7298698425293, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.35166412591934204, "rewards_train/margins": 0.7014012932777405, "rewards_train/rejected": -1.0530654191970825, "step": 1001 }, { "epoch": 0.28, "learning_rate": 3.69058462478307e-07, "loss": 0.3835, "step": 1002 }, { "epoch": 0.28, "logps_train/chosen": -55.21368408203125, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -72.70394897460938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8161681890487671, "rewards_train/margins": 1.2327419519424438, "rewards_train/rejected": -2.048910140991211, "step": 1002 }, { "epoch": 0.28, "logps_train/chosen": -73.49148559570312, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -100.387451171875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5975865721702576, "rewards_train/margins": 1.8755331635475159, "rewards_train/rejected": -2.4731197357177734, "step": 1003 }, { "epoch": 0.28, "learning_rate": 3.685138781221844e-07, "loss": 0.2971, "step": 1004 }, { "epoch": 0.28, "logps_train/chosen": -76.83126068115234, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -78.23001098632812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6876183152198792, "rewards_train/margins": 0.9588205218315125, "rewards_train/rejected": -1.6464388370513916, "step": 1004 }, { "epoch": 0.28, "logps_train/chosen": -92.24978637695312, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -117.96641540527344, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.6948999166488647, "rewards_train/margins": 1.4544755220413208, "rewards_train/rejected": -3.1493754386901855, "step": 1005 }, { "epoch": 0.28, "learning_rate": 3.6796856741681726e-07, "loss": 0.436, "step": 1006 }, { "epoch": 0.28, "logps_train/chosen": -87.13131713867188, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -85.78050994873047, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.376413106918335, "rewards_train/margins": 0.4496849775314331, "rewards_train/rejected": -1.826098084449768, "step": 1006 }, { "epoch": 0.28, "logps_train/chosen": -87.46511840820312, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -96.9076919555664, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0320582389831543, "rewards_train/margins": 1.220038890838623, "rewards_train/rejected": -2.2520971298217773, "step": 1007 }, { "epoch": 0.28, "learning_rate": 3.674225337043123e-07, "loss": 0.6127, "step": 1008 }, { "epoch": 0.28, "logps_train/chosen": -84.76114654541016, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -96.82876586914062, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.4245526790618896, "rewards_train/margins": 0.9680891036987305, "rewards_train/rejected": -2.39264178276062, "step": 1008 }, { "epoch": 0.28, "logps_train/chosen": -61.201988220214844, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -40.0, "logps_train/rejected": -50.20012664794922, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8639487028121948, "rewards_train/margins": 0.16407179832458496, "rewards_train/rejected": -1.0280205011367798, "step": 1009 }, { "epoch": 0.28, "learning_rate": 3.6687578033120736e-07, "loss": 0.5959, "step": 1010 }, { "epoch": 0.28, "logps_train/chosen": -73.82183074951172, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -112.18840026855469, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.4138238430023193, "rewards_train/margins": 0.7862663269042969, "rewards_train/rejected": -2.200090169906616, "step": 1010 }, { "epoch": 0.28, "logps_train/chosen": -86.24446105957031, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -90.42868041992188, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9603840112686157, "rewards_train/margins": 0.57701575756073, "rewards_train/rejected": -1.5373997688293457, "step": 1011 }, { "epoch": 0.28, "learning_rate": 3.6632831064845077e-07, "loss": 0.4987, "step": 1012 }, { "epoch": 0.28, "logps_train/chosen": -33.3088493347168, "logps_train/ref_chosen": -29.75, "logps_train/ref_rejected": -32.0, "logps_train/rejected": -48.71315383911133, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3562754690647125, "rewards_train/margins": 1.307129830121994, "rewards_train/rejected": -1.6634052991867065, "step": 1012 }, { "epoch": 0.28, "logps_train/chosen": -41.54804229736328, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -67.42402648925781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.26105427742004395, "rewards_train/margins": 1.3075196743011475, "rewards_train/rejected": -1.5685739517211914, "step": 1013 }, { "epoch": 0.28, "learning_rate": 3.657801280113813e-07, "loss": 0.4629, "step": 1014 }, { "epoch": 0.28, "logps_train/chosen": -67.31124114990234, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -93.14631652832031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5881553292274475, "rewards_train/margins": 1.8823363184928894, "rewards_train/rejected": -2.470491647720337, "step": 1014 }, { "epoch": 0.28, "logps_train/chosen": -56.12282943725586, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -61.75353240966797, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6810327768325806, "rewards_train/margins": 0.5302577018737793, "rewards_train/rejected": -1.2112904787063599, "step": 1015 }, { "epoch": 0.28, "learning_rate": 3.6523123577970693e-07, "loss": 0.4064, "step": 1016 }, { "epoch": 0.28, "logps_train/chosen": -127.6142807006836, "logps_train/ref_chosen": -104.5, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -140.6337890625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.3192403316497803, "rewards_train/margins": 0.981637716293335, "rewards_train/rejected": -3.3008780479431152, "step": 1016 }, { "epoch": 0.28, "logps_train/chosen": -44.224998474121094, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -61.081878662109375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4836325943470001, "rewards_train/margins": 1.0846136510372162, "rewards_train/rejected": -1.5682462453842163, "step": 1017 }, { "epoch": 0.28, "learning_rate": 3.64681637317485e-07, "loss": 0.4366, "step": 1018 }, { "epoch": 0.28, "logps_train/chosen": -100.96199798583984, "logps_train/ref_chosen": -83.5, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -116.30669403076172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7578212022781372, "rewards_train/margins": 1.118648886680603, "rewards_train/rejected": -2.8764700889587402, "step": 1018 }, { "epoch": 0.28, "logps_train/chosen": -66.99490356445312, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -78.38606262207031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3624570071697235, "rewards_train/margins": 1.403178483247757, "rewards_train/rejected": -1.7656354904174805, "step": 1019 }, { "epoch": 0.29, "learning_rate": 3.6413133599310096e-07, "loss": 0.5204, "step": 1020 }, { "epoch": 0.29, "logps_train/chosen": -83.38843536376953, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -85.05964660644531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5329840183258057, "rewards_train/margins": 0.5751283168792725, "rewards_train/rejected": -2.108112335205078, "step": 1020 }, { "epoch": 0.29, "logps_train/chosen": -53.12415313720703, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -53.799285888671875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7831181883811951, "rewards_train/margins": 0.5073574185371399, "rewards_train/rejected": -1.290475606918335, "step": 1021 }, { "epoch": 0.29, "learning_rate": 3.635803351792479e-07, "loss": 0.5144, "step": 1022 }, { "epoch": 0.29, "logps_train/chosen": -76.83980560302734, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -97.3507308959961, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4844684898853302, "rewards_train/margins": 1.1534366309642792, "rewards_train/rejected": -1.6379051208496094, "step": 1022 }, { "epoch": 0.29, "logps_train/chosen": -33.135948181152344, "logps_train/ref_chosen": -30.375, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -57.80500793457031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2725791931152344, "rewards_train/margins": 0.8645621538162231, "rewards_train/rejected": -1.1371413469314575, "step": 1023 }, { "epoch": 0.29, "learning_rate": 3.630286382529061e-07, "loss": 0.4425, "step": 1024 }, { "epoch": 0.29, "logps_train/chosen": -55.14508056640625, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -42.0, "logps_train/rejected": -52.56376266479492, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.35552334785461426, "rewards_train/margins": 0.6935285329818726, "rewards_train/rejected": -1.0490518808364868, "step": 1024 }, { "epoch": 0.29, "logps_train/chosen": -36.391231536865234, "logps_train/ref_chosen": -31.125, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -59.95484924316406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5218378305435181, "rewards_train/margins": 1.2287250757217407, "rewards_train/rejected": -1.7505629062652588, "step": 1025 }, { "epoch": 0.29, "learning_rate": 3.6247624859532223e-07, "loss": 0.4426, "step": 1026 }, { "epoch": 0.29, "logps_train/chosen": -44.920921325683594, "logps_train/ref_chosen": -41.75, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -58.58968734741211, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3239278793334961, "rewards_train/margins": 0.7397284507751465, "rewards_train/rejected": -1.0636563301086426, "step": 1026 }, { "epoch": 0.29, "logps_train/chosen": -22.355199813842773, "logps_train/ref_chosen": -18.75, "logps_train/ref_rejected": -21.625, "logps_train/rejected": -28.59585952758789, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.36482909321784973, "rewards_train/margins": 0.33109697699546814, "rewards_train/rejected": -0.6959260702133179, "step": 1027 }, { "epoch": 0.29, "learning_rate": 3.619231695919884e-07, "loss": 0.5221, "step": 1028 }, { "epoch": 0.29, "logps_train/chosen": -83.94831085205078, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -72.9941177368164, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.0690497159957886, "rewards_train/margins": 0.6928616762161255, "rewards_train/rejected": -1.761911392211914, "step": 1028 }, { "epoch": 0.29, "logps_train/chosen": -50.01479721069336, "logps_train/ref_chosen": -46.5, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -65.52947998046875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.33976075053215027, "rewards_train/margins": 0.7421911060810089, "rewards_train/rejected": -1.0819518566131592, "step": 1029 }, { "epoch": 0.29, "learning_rate": 3.613694046326217e-07, "loss": 0.5026, "step": 1030 }, { "epoch": 0.29, "logps_train/chosen": -64.20675659179688, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -63.5363655090332, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9573944807052612, "rewards_train/margins": 0.35444533824920654, "rewards_train/rejected": -1.3118398189544678, "step": 1030 }, { "epoch": 0.29, "logps_train/chosen": -64.35011291503906, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -94.18923950195312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.395460307598114, "rewards_train/margins": 1.5453386902809143, "rewards_train/rejected": -1.9407989978790283, "step": 1031 }, { "epoch": 0.29, "learning_rate": 3.608149571111434e-07, "loss": 0.4341, "step": 1032 }, { "epoch": 0.29, "logps_train/chosen": -54.97735595703125, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -30.75, "logps_train/rejected": -44.96813201904297, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6032044887542725, "rewards_train/margins": 0.8182178735733032, "rewards_train/rejected": -1.4214223623275757, "step": 1032 }, { "epoch": 0.29, "logps_train/chosen": -37.03369140625, "logps_train/ref_chosen": -34.75, "logps_train/ref_rejected": -45.75, "logps_train/rejected": -61.21590042114258, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.23774416744709015, "rewards_train/margins": 1.2986898869276047, "rewards_train/rejected": -1.5364340543746948, "step": 1033 }, { "epoch": 0.29, "learning_rate": 3.6025983042565787e-07, "loss": 0.3872, "step": 1034 }, { "epoch": 0.29, "logps_train/chosen": -117.79395294189453, "logps_train/ref_chosen": -97.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -131.65074157714844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.0825207233428955, "rewards_train/margins": 0.6700534820556641, "rewards_train/rejected": -2.7525742053985596, "step": 1034 }, { "epoch": 0.29, "logps_train/chosen": -127.1763916015625, "logps_train/ref_chosen": -111.5, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -129.92922973632812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5638302564620972, "rewards_train/margins": 0.9345625638961792, "rewards_train/rejected": -2.4983928203582764, "step": 1035 }, { "epoch": 0.29, "learning_rate": 3.59704027978432e-07, "loss": 0.5562, "step": 1036 }, { "epoch": 0.29, "logps_train/chosen": -40.47643280029297, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -63.31345748901367, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6408072710037231, "rewards_train/margins": 0.6225696802139282, "rewards_train/rejected": -1.2633769512176514, "step": 1036 }, { "epoch": 0.29, "logps_train/chosen": -59.064483642578125, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -85.43781280517578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4763703942298889, "rewards_train/margins": 1.6791290640830994, "rewards_train/rejected": -2.1554994583129883, "step": 1037 }, { "epoch": 0.29, "learning_rate": 3.591475531758745e-07, "loss": 0.4342, "step": 1038 }, { "epoch": 0.29, "logps_train/chosen": -45.847835540771484, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -40.75, "logps_train/rejected": -51.582645416259766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5097836852073669, "rewards_train/margins": 0.5814889073371887, "rewards_train/rejected": -1.0912725925445557, "step": 1038 }, { "epoch": 0.29, "logps_train/chosen": -40.81827926635742, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -57.85715866088867, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.23973798751831055, "rewards_train/margins": 0.45515745878219604, "rewards_train/rejected": -0.6948954463005066, "step": 1039 }, { "epoch": 0.29, "learning_rate": 3.585904094285145e-07, "loss": 0.5034, "step": 1040 }, { "epoch": 0.29, "logps_train/chosen": -106.05583190917969, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -137.294921875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8173024654388428, "rewards_train/margins": 1.9246892929077148, "rewards_train/rejected": -3.7419917583465576, "step": 1040 }, { "epoch": 0.29, "logps_train/chosen": -84.23391723632812, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -113.50904846191406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9944851994514465, "rewards_train/margins": 1.298607051372528, "rewards_train/rejected": -2.2930922508239746, "step": 1041 }, { "epoch": 0.29, "learning_rate": 3.5803260015098113e-07, "loss": 0.4149, "step": 1042 }, { "epoch": 0.29, "logps_train/chosen": -89.83704376220703, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -101.71491241455078, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6047983169555664, "rewards_train/margins": 1.0354430675506592, "rewards_train/rejected": -1.6402413845062256, "step": 1042 }, { "epoch": 0.29, "logps_train/chosen": -74.22798919677734, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -79.30785369873047, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.019283413887024, "rewards_train/margins": 1.2669705152511597, "rewards_train/rejected": -2.2862539291381836, "step": 1043 }, { "epoch": 0.29, "learning_rate": 3.5747412876198256e-07, "loss": 0.4699, "step": 1044 }, { "epoch": 0.29, "logps_train/chosen": -39.050621032714844, "logps_train/ref_chosen": -29.25, "logps_train/ref_rejected": -26.0, "logps_train/rejected": -37.39555740356445, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.985042929649353, "rewards_train/margins": 0.15246200561523438, "rewards_train/rejected": -1.1375049352645874, "step": 1044 }, { "epoch": 0.29, "logps_train/chosen": -69.05142974853516, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -101.85133361816406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5389320850372314, "rewards_train/margins": 1.8883883953094482, "rewards_train/rejected": -3.4273204803466797, "step": 1045 }, { "epoch": 0.29, "learning_rate": 3.5691499868428463e-07, "loss": 0.491, "step": 1046 }, { "epoch": 0.29, "logps_train/chosen": -45.52775192260742, "logps_train/ref_chosen": -41.25, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -61.117061614990234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4158613085746765, "rewards_train/margins": 1.1839309334754944, "rewards_train/rejected": -1.599792242050171, "step": 1046 }, { "epoch": 0.29, "logps_train/chosen": -73.66189575195312, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -76.9146728515625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7325963973999023, "rewards_train/margins": 0.7260591983795166, "rewards_train/rejected": -1.458655595779419, "step": 1047 }, { "epoch": 0.29, "learning_rate": 3.563552133446904e-07, "loss": 0.4242, "step": 1048 }, { "epoch": 0.29, "logps_train/chosen": -73.01969909667969, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -92.4240493774414, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0511879920959473, "rewards_train/margins": 1.2873108386993408, "rewards_train/rejected": -2.338498830795288, "step": 1048 }, { "epoch": 0.29, "logps_train/chosen": -81.02203369140625, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -83.94853973388672, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.40083646774292, "rewards_train/margins": 0.8600332736968994, "rewards_train/rejected": -2.2608697414398193, "step": 1049 }, { "epoch": 0.29, "learning_rate": 3.557947761740188e-07, "loss": 0.415, "step": 1050 }, { "epoch": 0.29, "logps_train/chosen": -99.14959716796875, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -111.18470764160156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.9208005666732788, "rewards_train/margins": 1.4121047258377075, "rewards_train/rejected": -3.3329052925109863, "step": 1050 }, { "epoch": 0.29, "logps_train/chosen": -46.27050018310547, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -65.02919006347656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5790029764175415, "rewards_train/margins": 0.8339753150939941, "rewards_train/rejected": -1.4129782915115356, "step": 1051 }, { "epoch": 0.29, "learning_rate": 3.5523369060708374e-07, "loss": 0.4856, "step": 1052 }, { "epoch": 0.29, "logps_train/chosen": -53.709083557128906, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -66.9501724243164, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4507913291454315, "rewards_train/margins": 0.4293822944164276, "rewards_train/rejected": -0.8801736235618591, "step": 1052 }, { "epoch": 0.29, "logps_train/chosen": -60.62303924560547, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -89.16844177246094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4779288172721863, "rewards_train/margins": 2.288133919239044, "rewards_train/rejected": -2.7660627365112305, "step": 1053 }, { "epoch": 0.29, "learning_rate": 3.546719600826729e-07, "loss": 0.4181, "step": 1054 }, { "epoch": 0.29, "logps_train/chosen": -62.254615783691406, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -68.5887451171875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.232297658920288, "rewards_train/margins": 0.6871232986450195, "rewards_train/rejected": -1.9194209575653076, "step": 1054 }, { "epoch": 0.29, "logps_train/chosen": -52.8306770324707, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -70.62107849121094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.44537240266799927, "rewards_train/margins": 0.9804078936576843, "rewards_train/rejected": -1.4257802963256836, "step": 1055 }, { "epoch": 0.3, "learning_rate": 3.541095880435271e-07, "loss": 0.4846, "step": 1056 }, { "epoch": 0.3, "logps_train/chosen": -80.06373596191406, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -97.99337005615234, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7899674773216248, "rewards_train/margins": 1.5953076481819153, "rewards_train/rejected": -2.38527512550354, "step": 1056 }, { "epoch": 0.3, "logps_train/chosen": -36.4183464050293, "logps_train/ref_chosen": -31.125, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -55.211761474609375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5305064916610718, "rewards_train/margins": 0.9630330801010132, "rewards_train/rejected": -1.493539571762085, "step": 1057 }, { "epoch": 0.3, "learning_rate": 3.535465779363186e-07, "loss": 0.478, "step": 1058 }, { "epoch": 0.3, "logps_train/chosen": -71.41215515136719, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -92.251953125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4980020821094513, "rewards_train/margins": 1.9035124480724335, "rewards_train/rejected": -2.4015145301818848, "step": 1058 }, { "epoch": 0.3, "logps_train/chosen": -119.8857421875, "logps_train/ref_chosen": -101.5, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -139.56463623046875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8229491710662842, "rewards_train/margins": 0.7442560195922852, "rewards_train/rejected": -2.5672051906585693, "step": 1059 }, { "epoch": 0.3, "learning_rate": 3.529829332116302e-07, "loss": 0.4137, "step": 1060 }, { "epoch": 0.3, "logps_train/chosen": -90.95877075195312, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -83.42403411865234, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.2107205390930176, "rewards_train/margins": 1.610588788986206, "rewards_train/rejected": -2.8213093280792236, "step": 1060 }, { "epoch": 0.3, "logps_train/chosen": -58.11225891113281, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -69.92088317871094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2807567119598389, "rewards_train/margins": 0.35898780822753906, "rewards_train/rejected": -1.639744520187378, "step": 1061 }, { "epoch": 0.3, "learning_rate": 3.524186573239345e-07, "loss": 0.5618, "step": 1062 }, { "epoch": 0.3, "logps_train/chosen": -91.04060363769531, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -110.70835876464844, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.5214428901672363, "rewards_train/margins": 1.2108182907104492, "rewards_train/rejected": -2.7322611808776855, "step": 1062 }, { "epoch": 0.3, "logps_train/chosen": -110.97198486328125, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -116.05752563476562, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.957244634628296, "rewards_train/margins": 0.9339449405670166, "rewards_train/rejected": -2.8911895751953125, "step": 1063 }, { "epoch": 0.3, "learning_rate": 3.51853753731572e-07, "loss": 0.5831, "step": 1064 }, { "epoch": 0.3, "logps_train/chosen": -96.94522094726562, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -131.8717498779297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.007804036140442, "rewards_train/margins": 2.5246838331222534, "rewards_train/rejected": -3.5324878692626953, "step": 1064 }, { "epoch": 0.3, "logps_train/chosen": -37.12311935424805, "logps_train/ref_chosen": -32.25, "logps_train/ref_rejected": -42.25, "logps_train/rejected": -57.761512756347656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4888744354248047, "rewards_train/margins": 1.0573943853378296, "rewards_train/rejected": -1.5462688207626343, "step": 1065 }, { "epoch": 0.3, "learning_rate": 3.512882258967306e-07, "loss": 0.3271, "step": 1066 }, { "epoch": 0.3, "logps_train/chosen": -87.00187683105469, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -98.45104217529297, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.48651570081710815, "rewards_train/margins": 1.7613230347633362, "rewards_train/rejected": -2.2478387355804443, "step": 1066 }, { "epoch": 0.3, "logps_train/chosen": -79.45100402832031, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -90.55461120605469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.145100474357605, "rewards_train/margins": 0.9455169439315796, "rewards_train/rejected": -2.0906174182891846, "step": 1067 }, { "epoch": 0.3, "learning_rate": 3.507220772854238e-07, "loss": 0.4016, "step": 1068 }, { "epoch": 0.3, "logps_train/chosen": -58.23585510253906, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -91.26455688476562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6845228672027588, "rewards_train/margins": 1.9044318199157715, "rewards_train/rejected": -2.5889546871185303, "step": 1068 }, { "epoch": 0.3, "logps_train/chosen": -54.60529327392578, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -74.64722442626953, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.41863471269607544, "rewards_train/margins": 1.590228259563446, "rewards_train/rejected": -2.0088629722595215, "step": 1069 }, { "epoch": 0.3, "learning_rate": 3.501553113674699e-07, "loss": 0.4185, "step": 1070 }, { "epoch": 0.3, "logps_train/chosen": -53.26490783691406, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -67.52047729492188, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5455335378646851, "rewards_train/margins": 1.1348342895507812, "rewards_train/rejected": -1.6803678274154663, "step": 1070 }, { "epoch": 0.3, "logps_train/chosen": -88.54481506347656, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -113.20479583740234, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.270106315612793, "rewards_train/margins": 1.7519359588623047, "rewards_train/rejected": -3.0220422744750977, "step": 1071 }, { "epoch": 0.3, "learning_rate": 3.495879316164705e-07, "loss": 0.3781, "step": 1072 }, { "epoch": 0.3, "logps_train/chosen": -79.0526351928711, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -102.6068115234375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3712785243988037, "rewards_train/margins": 1.157273530960083, "rewards_train/rejected": -2.5285520553588867, "step": 1072 }, { "epoch": 0.3, "logps_train/chosen": -111.29562377929688, "logps_train/ref_chosen": -92.5, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -142.13092041015625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8853240013122559, "rewards_train/margins": 1.5762062072753906, "rewards_train/rejected": -3.4615302085876465, "step": 1073 }, { "epoch": 0.3, "learning_rate": 3.490199415097892e-07, "loss": 0.374, "step": 1074 }, { "epoch": 0.3, "logps_train/chosen": -61.015018463134766, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -65.94660949707031, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.20950940251350403, "rewards_train/margins": 0.4718700349330902, "rewards_train/rejected": -0.6813794374465942, "step": 1074 }, { "epoch": 0.3, "logps_train/chosen": -91.4825439453125, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -90.24166107177734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9312621355056763, "rewards_train/margins": 0.6930994987487793, "rewards_train/rejected": -1.6243616342544556, "step": 1075 }, { "epoch": 0.3, "learning_rate": 3.4845134452853054e-07, "loss": 0.5903, "step": 1076 }, { "epoch": 0.3, "logps_train/chosen": -51.69927978515625, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -82.89276885986328, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5726622343063354, "rewards_train/margins": 1.9298962354660034, "rewards_train/rejected": -2.502558469772339, "step": 1076 }, { "epoch": 0.3, "logps_train/chosen": -52.848289489746094, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -66.4503402709961, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0582666397094727, "rewards_train/margins": 0.9441893100738525, "rewards_train/rejected": -2.002455949783325, "step": 1077 }, { "epoch": 0.3, "learning_rate": 3.4788214415751823e-07, "loss": 0.4098, "step": 1078 }, { "epoch": 0.3, "logps_train/chosen": -45.74258804321289, "logps_train/ref_chosen": -40.25, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -58.0853271484375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5560947060585022, "rewards_train/margins": 0.8438445925712585, "rewards_train/rejected": -1.3999392986297607, "step": 1078 }, { "epoch": 0.3, "logps_train/chosen": -22.18498992919922, "logps_train/ref_chosen": -17.125, "logps_train/ref_rejected": -33.75, "logps_train/rejected": -47.595497131347656, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.5052910447120667, "rewards_train/margins": 0.8816022276878357, "rewards_train/rejected": -1.3868932723999023, "step": 1079 }, { "epoch": 0.3, "learning_rate": 3.4731234388527424e-07, "loss": 0.647, "step": 1080 }, { "epoch": 0.3, "logps_train/chosen": -48.87290573120117, "logps_train/ref_chosen": -46.5, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -70.92533111572266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2337748259305954, "rewards_train/margins": 1.3786800652742386, "rewards_train/rejected": -1.612454891204834, "step": 1080 }, { "epoch": 0.3, "logps_train/chosen": -66.94660186767578, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -98.40412902832031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5646798014640808, "rewards_train/margins": 1.3968260884284973, "rewards_train/rejected": -1.9615058898925781, "step": 1081 }, { "epoch": 0.3, "learning_rate": 3.4674194720399715e-07, "loss": 0.4132, "step": 1082 }, { "epoch": 0.3, "logps_train/chosen": -45.16423797607422, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -42.25, "logps_train/rejected": -52.1833610534668, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.36388489603996277, "rewards_train/margins": 0.6267167627811432, "rewards_train/rejected": -0.990601658821106, "step": 1082 }, { "epoch": 0.3, "logps_train/chosen": -36.05809783935547, "logps_train/ref_chosen": -32.75, "logps_train/ref_rejected": -36.0, "logps_train/rejected": -50.100059509277344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3359364867210388, "rewards_train/margins": 1.0795872807502747, "rewards_train/rejected": -1.4155237674713135, "step": 1083 }, { "epoch": 0.3, "learning_rate": 3.4617095760954086e-07, "loss": 0.52, "step": 1084 }, { "epoch": 0.3, "logps_train/chosen": -94.58909606933594, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -102.871337890625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9440658688545227, "rewards_train/margins": 1.8233415484428406, "rewards_train/rejected": -2.7674074172973633, "step": 1084 }, { "epoch": 0.3, "logps_train/chosen": -67.44469451904297, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -96.22750091552734, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1706414222717285, "rewards_train/margins": 1.7423431873321533, "rewards_train/rejected": -2.912984609603882, "step": 1085 }, { "epoch": 0.3, "learning_rate": 3.45599378601393e-07, "loss": 0.3278, "step": 1086 }, { "epoch": 0.3, "logps_train/chosen": -87.40137481689453, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -99.51847839355469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4643561840057373, "rewards_train/margins": 0.538663387298584, "rewards_train/rejected": -1.0030195713043213, "step": 1086 }, { "epoch": 0.3, "logps_train/chosen": -61.284934997558594, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -39.5, "logps_train/rejected": -57.48148727416992, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.3559348583221436, "rewards_train/margins": 0.4422140121459961, "rewards_train/rejected": -1.7981488704681396, "step": 1087 }, { "epoch": 0.3, "learning_rate": 3.4502721368265367e-07, "loss": 0.6218, "step": 1088 }, { "epoch": 0.3, "logps_train/chosen": -73.07564544677734, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -105.82978057861328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7022911310195923, "rewards_train/margins": 2.396312117576599, "rewards_train/rejected": -3.0986032485961914, "step": 1088 }, { "epoch": 0.3, "logps_train/chosen": -75.86065673828125, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -64.31005096435547, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.2260072231292725, "rewards_train/margins": 0.46945154666900635, "rewards_train/rejected": -1.6954587697982788, "step": 1089 }, { "epoch": 0.3, "learning_rate": 3.444544663600141e-07, "loss": 0.5199, "step": 1090 }, { "epoch": 0.3, "logps_train/chosen": -15.774518013000488, "logps_train/ref_chosen": -12.4375, "logps_train/ref_rejected": -11.25, "logps_train/rejected": -17.932443618774414, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.33580145239830017, "rewards_train/margins": 0.3344937264919281, "rewards_train/rejected": -0.6702951788902283, "step": 1090 }, { "epoch": 0.3, "logps_train/chosen": -60.13362121582031, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -63.003387451171875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9002760648727417, "rewards_train/margins": 0.30728912353515625, "rewards_train/rejected": -1.207565188407898, "step": 1091 }, { "epoch": 0.31, "learning_rate": 3.438811401437346e-07, "loss": 0.5702, "step": 1092 }, { "epoch": 0.31, "logps_train/chosen": -58.83032989501953, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -88.28053283691406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6442146897315979, "rewards_train/margins": 1.63247150182724, "rewards_train/rejected": -2.276686191558838, "step": 1092 }, { "epoch": 0.31, "logps_train/chosen": -54.11398696899414, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -79.3961181640625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8170624375343323, "rewards_train/margins": 1.267861783504486, "rewards_train/rejected": -2.0849242210388184, "step": 1093 }, { "epoch": 0.31, "learning_rate": 3.4330723854762364e-07, "loss": 0.3817, "step": 1094 }, { "epoch": 0.31, "logps_train/chosen": -36.65195083618164, "logps_train/ref_chosen": -30.375, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -48.813690185546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6269139647483826, "rewards_train/margins": 0.6720332503318787, "rewards_train/rejected": -1.2989472150802612, "step": 1094 }, { "epoch": 0.31, "logps_train/chosen": -116.93565368652344, "logps_train/ref_chosen": -96.5, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -140.33056640625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.0654401779174805, "rewards_train/margins": 2.1191797256469727, "rewards_train/rejected": -4.184619903564453, "step": 1095 }, { "epoch": 0.31, "learning_rate": 3.4273276508901615e-07, "loss": 0.3646, "step": 1096 }, { "epoch": 0.31, "logps_train/chosen": -44.57806396484375, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -64.02806091308594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.40770870447158813, "rewards_train/margins": 1.463066279888153, "rewards_train/rejected": -1.8707749843597412, "step": 1096 }, { "epoch": 0.31, "logps_train/chosen": -49.8616943359375, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -55.13411331176758, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2953982651233673, "rewards_train/margins": 0.9715287387371063, "rewards_train/rejected": -1.2669270038604736, "step": 1097 }, { "epoch": 0.31, "learning_rate": 3.4215772328875177e-07, "loss": 0.3748, "step": 1098 }, { "epoch": 0.31, "logps_train/chosen": -78.97357177734375, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -92.58221435546875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0360286235809326, "rewards_train/margins": 1.2143800258636475, "rewards_train/rejected": -2.25040864944458, "step": 1098 }, { "epoch": 0.31, "logps_train/chosen": -55.08749008178711, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -69.11585998535156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5506435036659241, "rewards_train/margins": 1.0691455006599426, "rewards_train/rejected": -1.6197890043258667, "step": 1099 }, { "epoch": 0.31, "learning_rate": 3.4158211667115335e-07, "loss": 0.4407, "step": 1100 }, { "epoch": 0.31, "logps_train/chosen": -120.96487426757812, "logps_train/ref_chosen": -103.5, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -122.01763916015625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.7710965871810913, "rewards_train/margins": 0.7810579538345337, "rewards_train/rejected": -2.552154541015625, "step": 1100 }, { "epoch": 0.31, "logps_train/chosen": -61.33692169189453, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -65.42922973632812, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.817481279373169, "rewards_train/margins": 0.36567628383636475, "rewards_train/rejected": -1.1831575632095337, "step": 1101 }, { "epoch": 0.31, "learning_rate": 3.4100594876400543e-07, "loss": 0.569, "step": 1102 }, { "epoch": 0.31, "logps_train/chosen": -45.08638381958008, "logps_train/ref_chosen": -33.5, "logps_train/ref_rejected": -34.0, "logps_train/rejected": -46.0760383605957, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1685993671417236, "rewards_train/margins": 0.03919970989227295, "rewards_train/rejected": -1.2077990770339966, "step": 1102 }, { "epoch": 0.31, "logps_train/chosen": -121.77224731445312, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -131.49594116210938, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -3.256521463394165, "rewards_train/margins": 0.7868220806121826, "rewards_train/rejected": -4.043343544006348, "step": 1103 }, { "epoch": 0.31, "learning_rate": 3.404292230985327e-07, "loss": 0.6479, "step": 1104 }, { "epoch": 0.31, "logps_train/chosen": -39.853607177734375, "logps_train/ref_chosen": -38.25, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -56.08747863769531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.16036087274551392, "rewards_train/margins": 1.1894025206565857, "rewards_train/rejected": -1.3497633934020996, "step": 1104 }, { "epoch": 0.31, "logps_train/chosen": -68.73233032226562, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -77.93067932128906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8608310222625732, "rewards_train/margins": 1.051377534866333, "rewards_train/rejected": -1.9122085571289062, "step": 1105 }, { "epoch": 0.31, "learning_rate": 3.3985194320937815e-07, "loss": 0.3638, "step": 1106 }, { "epoch": 0.31, "logps_train/chosen": -73.71665954589844, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -83.02525329589844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.192368507385254, "rewards_train/margins": 0.8316411972045898, "rewards_train/rejected": -2.0240097045898438, "step": 1106 }, { "epoch": 0.31, "logps_train/chosen": -69.09898376464844, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -78.67167663574219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1285510063171387, "rewards_train/margins": 0.9606866836547852, "rewards_train/rejected": -2.089237689971924, "step": 1107 }, { "epoch": 0.31, "learning_rate": 3.3927411263458166e-07, "loss": 0.4816, "step": 1108 }, { "epoch": 0.31, "logps_train/chosen": -68.6715316772461, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -103.76207733154297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7993796467781067, "rewards_train/margins": 1.4291722178459167, "rewards_train/rejected": -2.2285518646240234, "step": 1108 }, { "epoch": 0.31, "logps_train/chosen": -79.95457458496094, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -68.17952728271484, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.091942548751831, "rewards_train/margins": 0.08147883415222168, "rewards_train/rejected": -1.1734213829040527, "step": 1109 }, { "epoch": 0.31, "learning_rate": 3.386957349155578e-07, "loss": 0.4955, "step": 1110 }, { "epoch": 0.31, "logps_train/chosen": -57.168636322021484, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -70.13655853271484, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0338555574417114, "rewards_train/margins": 1.1188629865646362, "rewards_train/rejected": -2.1527185440063477, "step": 1110 }, { "epoch": 0.31, "logps_train/chosen": -56.17914581298828, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -73.0317611694336, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1761176586151123, "rewards_train/margins": 1.321882963180542, "rewards_train/rejected": -2.4980006217956543, "step": 1111 }, { "epoch": 0.31, "learning_rate": 3.381168135970749e-07, "loss": 0.4141, "step": 1112 }, { "epoch": 0.31, "logps_train/chosen": -41.319061279296875, "logps_train/ref_chosen": -37.0, "logps_train/ref_rejected": -28.5, "logps_train/rejected": -36.65044021606445, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4227263927459717, "rewards_train/margins": 0.3862628936767578, "rewards_train/rejected": -0.8089892864227295, "step": 1112 }, { "epoch": 0.31, "logps_train/chosen": -88.0911865234375, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -92.42652130126953, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.0791380405426025, "rewards_train/margins": 0.5197646617889404, "rewards_train/rejected": -2.598902702331543, "step": 1113 }, { "epoch": 0.31, "learning_rate": 3.375373522272326e-07, "loss": 0.5766, "step": 1114 }, { "epoch": 0.31, "logps_train/chosen": -74.1852035522461, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -80.33769226074219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6786766052246094, "rewards_train/margins": 0.35704636573791504, "rewards_train/rejected": -2.0357229709625244, "step": 1114 }, { "epoch": 0.31, "logps_train/chosen": -63.75776672363281, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -92.407958984375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7430132031440735, "rewards_train/margins": 2.0325481295585632, "rewards_train/rejected": -2.7755613327026367, "step": 1115 }, { "epoch": 0.31, "learning_rate": 3.3695735435744055e-07, "loss": 0.5116, "step": 1116 }, { "epoch": 0.31, "logps_train/chosen": -36.19145202636719, "logps_train/ref_chosen": -31.375, "logps_train/ref_rejected": -31.875, "logps_train/rejected": -43.52620315551758, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.48542898893356323, "rewards_train/margins": 0.6796916127204895, "rewards_train/rejected": -1.1651206016540527, "step": 1116 }, { "epoch": 0.31, "logps_train/chosen": -56.11375427246094, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -63.73368835449219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6451154947280884, "rewards_train/margins": 0.7055970430374146, "rewards_train/rejected": -1.350712537765503, "step": 1117 }, { "epoch": 0.31, "learning_rate": 3.3637682354239656e-07, "loss": 0.5013, "step": 1118 }, { "epoch": 0.31, "logps_train/chosen": -89.7490005493164, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -87.31269836425781, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.10009503364563, "rewards_train/margins": 0.40578365325927734, "rewards_train/rejected": -2.5058786869049072, "step": 1118 }, { "epoch": 0.31, "logps_train/chosen": -65.16108703613281, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -85.17172241210938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.46166563034057617, "rewards_train/margins": 1.0532604455947876, "rewards_train/rejected": -1.5149260759353638, "step": 1119 }, { "epoch": 0.31, "learning_rate": 3.357957633400645e-07, "loss": 0.5347, "step": 1120 }, { "epoch": 0.31, "logps_train/chosen": -45.67375564575195, "logps_train/ref_chosen": -37.75, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -72.63983154296875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7869070172309875, "rewards_train/margins": 1.2682868838310242, "rewards_train/rejected": -2.0551939010620117, "step": 1120 }, { "epoch": 0.31, "logps_train/chosen": -69.65164184570312, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -82.64480590820312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7079370021820068, "rewards_train/margins": 1.0424809455871582, "rewards_train/rejected": -2.750417947769165, "step": 1121 }, { "epoch": 0.31, "learning_rate": 3.3521417731165323e-07, "loss": 0.4252, "step": 1122 }, { "epoch": 0.31, "logps_train/chosen": -43.70858383178711, "logps_train/ref_chosen": -35.25, "logps_train/ref_rejected": -46.25, "logps_train/rejected": -57.502525329589844, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.8567959070205688, "rewards_train/margins": 0.27470648288726807, "rewards_train/rejected": -1.131502389907837, "step": 1122 }, { "epoch": 0.31, "logps_train/chosen": -52.49120330810547, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -70.92811584472656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5112298727035522, "rewards_train/margins": 0.816542387008667, "rewards_train/rejected": -1.3277722597122192, "step": 1123 }, { "epoch": 0.31, "learning_rate": 3.346320690215939e-07, "loss": 0.5869, "step": 1124 }, { "epoch": 0.31, "logps_train/chosen": -85.27842712402344, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -90.1741943359375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5864367485046387, "rewards_train/margins": 0.9448494911193848, "rewards_train/rejected": -1.5312862396240234, "step": 1124 }, { "epoch": 0.31, "logps_train/chosen": -88.07566833496094, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -105.86996459960938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7584952116012573, "rewards_train/margins": 1.5497905015945435, "rewards_train/rejected": -3.308285713195801, "step": 1125 }, { "epoch": 0.31, "learning_rate": 3.3404944203751847e-07, "loss": 0.4124, "step": 1126 }, { "epoch": 0.31, "logps_train/chosen": -73.59733581542969, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -103.78475189208984, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8066086769104004, "rewards_train/margins": 1.584366798400879, "rewards_train/rejected": -2.3909754753112793, "step": 1126 }, { "epoch": 0.31, "logps_train/chosen": -44.5773811340332, "logps_train/ref_chosen": -38.75, "logps_train/ref_rejected": -20.875, "logps_train/rejected": -29.997364044189453, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.5758774280548096, "rewards_train/margins": 0.330987811088562, "rewards_train/rejected": -0.9068652391433716, "step": 1127 }, { "epoch": 0.32, "learning_rate": 3.334662999302382e-07, "loss": 0.4612, "step": 1128 }, { "epoch": 0.32, "logps_train/chosen": -69.8879165649414, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -88.37731170654297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7446507215499878, "rewards_train/margins": 1.3508929014205933, "rewards_train/rejected": -2.095543622970581, "step": 1128 }, { "epoch": 0.32, "logps_train/chosen": -105.51724243164062, "logps_train/ref_chosen": -99.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -122.52033996582031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6696932911872864, "rewards_train/margins": 1.9749180674552917, "rewards_train/rejected": -2.644611358642578, "step": 1129 }, { "epoch": 0.32, "learning_rate": 3.3288264627372115e-07, "loss": 0.3273, "step": 1130 }, { "epoch": 0.32, "logps_train/chosen": -86.48792266845703, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -88.79074096679688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.600745439529419, "rewards_train/margins": 0.9377037286758423, "rewards_train/rejected": -1.5384491682052612, "step": 1130 }, { "epoch": 0.32, "logps_train/chosen": -47.698570251464844, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -46.25, "logps_train/rejected": -61.304847717285156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.7073571085929871, "rewards_train/margins": 0.8093579411506653, "rewards_train/rejected": -1.5167150497436523, "step": 1131 }, { "epoch": 0.32, "learning_rate": 3.322984846450708e-07, "loss": 0.5058, "step": 1132 }, { "epoch": 0.32, "logps_train/chosen": -69.10832214355469, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -92.10992431640625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4506756067276, "rewards_train/margins": 1.310707449913025, "rewards_train/rejected": -2.761383056640625, "step": 1132 }, { "epoch": 0.32, "logps_train/chosen": -95.66616821289062, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -110.3429183959961, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0639801025390625, "rewards_train/margins": 2.1142570972442627, "rewards_train/rejected": -3.178237199783325, "step": 1133 }, { "epoch": 0.32, "learning_rate": 3.3171381862450366e-07, "loss": 0.3162, "step": 1134 }, { "epoch": 0.32, "logps_train/chosen": -74.5470962524414, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -76.4083251953125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.7293190360069275, "rewards_train/margins": 0.8237200379371643, "rewards_train/rejected": -1.5530390739440918, "step": 1134 }, { "epoch": 0.32, "logps_train/chosen": -57.64684295654297, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -81.79562377929688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.486656665802002, "rewards_train/margins": 0.7013044357299805, "rewards_train/rejected": -2.1879611015319824, "step": 1135 }, { "epoch": 0.32, "learning_rate": 3.311286517953278e-07, "loss": 0.5588, "step": 1136 }, { "epoch": 0.32, "logps_train/chosen": -62.12207794189453, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -74.61312103271484, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.278662919998169, "rewards_train/margins": 0.4844069480895996, "rewards_train/rejected": -1.7630698680877686, "step": 1136 }, { "epoch": 0.32, "logps_train/chosen": -60.08687210083008, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -64.04116821289062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.39862850308418274, "rewards_train/margins": 1.336737722158432, "rewards_train/rejected": -1.7353662252426147, "step": 1137 }, { "epoch": 0.32, "learning_rate": 3.305429877439205e-07, "loss": 0.4804, "step": 1138 }, { "epoch": 0.32, "logps_train/chosen": -46.473594665527344, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -60.557891845703125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.447359561920166, "rewards_train/margins": 0.47600793838500977, "rewards_train/rejected": -0.9233675003051758, "step": 1138 }, { "epoch": 0.32, "logps_train/chosen": -72.01480865478516, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -89.46124267578125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5598795413970947, "rewards_train/margins": 0.8055808544158936, "rewards_train/rejected": -1.3654603958129883, "step": 1139 }, { "epoch": 0.32, "learning_rate": 3.2995683005970636e-07, "loss": 0.4624, "step": 1140 }, { "epoch": 0.32, "logps_train/chosen": -76.2835922241211, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -91.91067504882812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.303091049194336, "rewards_train/margins": 0.8092658519744873, "rewards_train/rejected": -3.1123569011688232, "step": 1140 }, { "epoch": 0.32, "logps_train/chosen": -53.02901840209961, "logps_train/ref_chosen": -41.75, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -81.51502990722656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1332728862762451, "rewards_train/margins": 1.7668633460998535, "rewards_train/rejected": -2.9001362323760986, "step": 1141 }, { "epoch": 0.32, "learning_rate": 3.2937018233513564e-07, "loss": 0.4394, "step": 1142 }, { "epoch": 0.32, "logps_train/chosen": -78.84823608398438, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -81.2984619140625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.217635989189148, "rewards_train/margins": 0.7903352975845337, "rewards_train/rejected": -2.0079712867736816, "step": 1142 }, { "epoch": 0.32, "logps_train/chosen": -74.09119415283203, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -89.66231536865234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.596228837966919, "rewards_train/margins": 0.7967607975006104, "rewards_train/rejected": -1.3929896354675293, "step": 1143 }, { "epoch": 0.32, "learning_rate": 3.287830481656616e-07, "loss": 0.4635, "step": 1144 }, { "epoch": 0.32, "logps_train/chosen": -69.08924865722656, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -97.29635620117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5905653238296509, "rewards_train/margins": 1.4461008310317993, "rewards_train/rejected": -2.03666615486145, "step": 1144 }, { "epoch": 0.32, "logps_train/chosen": -48.96424865722656, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -59.011512756347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.09945185482501984, "rewards_train/margins": 1.0329498499631882, "rewards_train/rejected": -1.132401704788208, "step": 1145 }, { "epoch": 0.32, "learning_rate": 3.281954311497192e-07, "loss": 0.3515, "step": 1146 }, { "epoch": 0.32, "logps_train/chosen": -54.37567138671875, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -31.625, "logps_train/rejected": -41.38093566894531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9230404496192932, "rewards_train/margins": 0.051674067974090576, "rewards_train/rejected": -0.9747145175933838, "step": 1146 }, { "epoch": 0.32, "logps_train/chosen": -32.55580139160156, "logps_train/ref_chosen": -30.125, "logps_train/ref_rejected": -27.75, "logps_train/rejected": -35.703125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.23829485476016998, "rewards_train/margins": 0.5552596896886826, "rewards_train/rejected": -0.7935545444488525, "step": 1147 }, { "epoch": 0.32, "learning_rate": 3.276073348887024e-07, "loss": 0.6186, "step": 1148 }, { "epoch": 0.32, "logps_train/chosen": -47.1552734375, "logps_train/ref_chosen": -35.25, "logps_train/ref_rejected": -37.75, "logps_train/rejected": -50.57854461669922, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1946287155151367, "rewards_train/margins": 0.08939743041992188, "rewards_train/rejected": -1.2840261459350586, "step": 1148 }, { "epoch": 0.32, "logps_train/chosen": -100.0604019165039, "logps_train/ref_chosen": -86.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -114.34955596923828, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4126805067062378, "rewards_train/margins": 1.0964940786361694, "rewards_train/rejected": -2.5091745853424072, "step": 1149 }, { "epoch": 0.32, "learning_rate": 3.2701876298694244e-07, "loss": 0.6195, "step": 1150 }, { "epoch": 0.32, "logps_train/chosen": -58.04315948486328, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -77.4443588256836, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.484394371509552, "rewards_train/margins": 2.067854344844818, "rewards_train/rejected": -2.55224871635437, "step": 1150 }, { "epoch": 0.32, "logps_train/chosen": -87.42581939697266, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -109.06840515136719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.020316243171692, "rewards_train/margins": 1.6830090284347534, "rewards_train/rejected": -2.7033252716064453, "step": 1151 }, { "epoch": 0.32, "learning_rate": 3.2642971905168566e-07, "loss": 0.303, "step": 1152 }, { "epoch": 0.32, "logps_train/chosen": -41.9207763671875, "logps_train/ref_chosen": -35.0, "logps_train/ref_rejected": -30.5, "logps_train/rejected": -42.09564208984375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6830934882164001, "rewards_train/margins": 0.47998613119125366, "rewards_train/rejected": -1.1630796194076538, "step": 1152 }, { "epoch": 0.32, "logps_train/chosen": -71.35394287109375, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -76.39705657958984, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6361757516860962, "rewards_train/margins": 0.7066670656204224, "rewards_train/rejected": -2.3428428173065186, "step": 1153 }, { "epoch": 0.32, "learning_rate": 3.2584020669307144e-07, "loss": 0.5285, "step": 1154 }, { "epoch": 0.32, "logps_train/chosen": -71.41012573242188, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -80.9860610961914, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4444795846939087, "rewards_train/margins": 1.4273685216903687, "rewards_train/rejected": -2.8718481063842773, "step": 1154 }, { "epoch": 0.32, "logps_train/chosen": -82.23304748535156, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -105.46199035644531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3001964092254639, "rewards_train/margins": 2.2279725074768066, "rewards_train/rejected": -3.5281689167022705, "step": 1155 }, { "epoch": 0.32, "learning_rate": 3.252502295241101e-07, "loss": 0.3612, "step": 1156 }, { "epoch": 0.32, "logps_train/chosen": -94.26513671875, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -103.13916015625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5120606422424316, "rewards_train/margins": 1.803417682647705, "rewards_train/rejected": -3.3154783248901367, "step": 1156 }, { "epoch": 0.32, "logps_train/chosen": -42.25543975830078, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -30.375, "logps_train/rejected": -43.421539306640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8304266929626465, "rewards_train/margins": 0.471493124961853, "rewards_train/rejected": -1.3019198179244995, "step": 1157 }, { "epoch": 0.32, "learning_rate": 3.2465979116066053e-07, "loss": 0.4138, "step": 1158 }, { "epoch": 0.32, "logps_train/chosen": -36.60765838623047, "logps_train/ref_chosen": -32.5, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -61.4089241027832, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.41408610343933105, "rewards_train/margins": 0.7150387763977051, "rewards_train/rejected": -1.1291248798370361, "step": 1158 }, { "epoch": 0.32, "logps_train/chosen": -88.83758544921875, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -126.98834991455078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3618834018707275, "rewards_train/margins": 2.334998846054077, "rewards_train/rejected": -3.6968822479248047, "step": 1159 }, { "epoch": 0.32, "learning_rate": 3.240688952214085e-07, "loss": 0.3957, "step": 1160 }, { "epoch": 0.32, "logps_train/chosen": -64.8724365234375, "logps_train/ref_chosen": -56.75, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -83.40242767333984, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8106813430786133, "rewards_train/margins": 1.5135459899902344, "rewards_train/rejected": -2.3242273330688477, "step": 1160 }, { "epoch": 0.32, "logps_train/chosen": -89.62482452392578, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -88.18783569335938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8790841102600098, "rewards_train/margins": 0.5377459526062012, "rewards_train/rejected": -1.416830062866211, "step": 1161 }, { "epoch": 0.32, "learning_rate": 3.2347754532784365e-07, "loss": 0.487, "step": 1162 }, { "epoch": 0.32, "logps_train/chosen": -67.27902221679688, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -81.1677017211914, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7918180227279663, "rewards_train/margins": 2.1025396585464478, "rewards_train/rejected": -2.894357681274414, "step": 1162 }, { "epoch": 0.33, "logps_train/chosen": -77.49712371826172, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -92.51890563964844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8801814913749695, "rewards_train/margins": 0.7646780610084534, "rewards_train/rejected": -1.6448595523834229, "step": 1163 }, { "epoch": 0.33, "learning_rate": 3.228857451042384e-07, "loss": 0.4235, "step": 1164 }, { "epoch": 0.33, "logps_train/chosen": -68.49908447265625, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -91.06550598144531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9470763206481934, "rewards_train/margins": 0.8711934089660645, "rewards_train/rejected": -1.8182697296142578, "step": 1164 }, { "epoch": 0.33, "logps_train/chosen": -62.16388702392578, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -77.70111846923828, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.4075016975402832, "rewards_train/margins": 1.6556766033172607, "rewards_train/rejected": -2.063178300857544, "step": 1165 }, { "epoch": 0.33, "learning_rate": 3.2229349817762476e-07, "loss": 0.5344, "step": 1166 }, { "epoch": 0.33, "logps_train/chosen": -82.88621520996094, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -99.08432006835938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7245585322380066, "rewards_train/margins": 1.861997902393341, "rewards_train/rejected": -2.5865564346313477, "step": 1166 }, { "epoch": 0.33, "logps_train/chosen": -84.58604431152344, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -100.53765869140625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.134873628616333, "rewards_train/margins": 0.7903773784637451, "rewards_train/rejected": -2.925251007080078, "step": 1167 }, { "epoch": 0.33, "learning_rate": 3.2170080817777257e-07, "loss": 0.3993, "step": 1168 }, { "epoch": 0.33, "logps_train/chosen": -78.18352508544922, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -98.81004333496094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6421806812286377, "rewards_train/margins": 1.1499561071395874, "rewards_train/rejected": -1.792136788368225, "step": 1168 }, { "epoch": 0.33, "logps_train/chosen": -66.99746704101562, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -92.34423828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8132960796356201, "rewards_train/margins": 2.4062840938568115, "rewards_train/rejected": -3.2195801734924316, "step": 1169 }, { "epoch": 0.33, "learning_rate": 3.2110767873716736e-07, "loss": 0.3673, "step": 1170 }, { "epoch": 0.33, "logps_train/chosen": -101.83489990234375, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -129.85865783691406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4677672386169434, "rewards_train/margins": 1.4091145992279053, "rewards_train/rejected": -2.8768818378448486, "step": 1170 }, { "epoch": 0.33, "logps_train/chosen": -53.57844924926758, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -84.144287109375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5094074010848999, "rewards_train/margins": 1.4049242734909058, "rewards_train/rejected": -1.9143316745758057, "step": 1171 }, { "epoch": 0.33, "learning_rate": 3.205141134909878e-07, "loss": 0.3803, "step": 1172 }, { "epoch": 0.33, "logps_train/chosen": -70.63436889648438, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -75.62820434570312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1985933780670166, "rewards_train/margins": 0.633758544921875, "rewards_train/rejected": -1.8323519229888916, "step": 1172 }, { "epoch": 0.33, "logps_train/chosen": -82.42655944824219, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -104.75534057617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2934373617172241, "rewards_train/margins": 1.3875659704208374, "rewards_train/rejected": -2.6810033321380615, "step": 1173 }, { "epoch": 0.33, "learning_rate": 3.1992011607708346e-07, "loss": 0.4462, "step": 1174 }, { "epoch": 0.33, "logps_train/chosen": -71.51427459716797, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -40.0, "logps_train/rejected": -51.11572265625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5781850814819336, "rewards_train/margins": -0.47012877464294434, "rewards_train/rejected": -1.1080563068389893, "step": 1174 }, { "epoch": 0.33, "logps_train/chosen": -104.39380645751953, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -127.56053161621094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2862557172775269, "rewards_train/margins": 2.9838610887527466, "rewards_train/rejected": -4.270116806030273, "step": 1175 }, { "epoch": 0.33, "learning_rate": 3.193256901359526e-07, "loss": 0.847, "step": 1176 }, { "epoch": 0.33, "logps_train/chosen": -75.4478988647461, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -91.39605712890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.531899333000183, "rewards_train/margins": 1.1452065706253052, "rewards_train/rejected": -2.6771059036254883, "step": 1176 }, { "epoch": 0.33, "logps_train/chosen": -74.81266784667969, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -87.51103973388672, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.4264819622039795, "rewards_train/margins": 0.781848669052124, "rewards_train/rejected": -2.2083306312561035, "step": 1177 }, { "epoch": 0.33, "learning_rate": 3.187308393107201e-07, "loss": 0.4431, "step": 1178 }, { "epoch": 0.33, "logps_train/chosen": -82.31769561767578, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -110.784423828125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0415349006652832, "rewards_train/margins": 1.4191339015960693, "rewards_train/rejected": -2.4606688022613525, "step": 1178 }, { "epoch": 0.33, "logps_train/chosen": -62.3345832824707, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -90.6983642578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.65494304895401, "rewards_train/margins": 1.8934091925621033, "rewards_train/rejected": -2.5483522415161133, "step": 1179 }, { "epoch": 0.33, "learning_rate": 3.181355672471144e-07, "loss": 0.3626, "step": 1180 }, { "epoch": 0.33, "logps_train/chosen": -80.82015228271484, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -87.15404510498047, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1870933771133423, "rewards_train/margins": 0.5337800979614258, "rewards_train/rejected": -1.720873475074768, "step": 1180 }, { "epoch": 0.33, "logps_train/chosen": -69.87452697753906, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -89.21676635742188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5815937519073486, "rewards_train/margins": 0.7410590648651123, "rewards_train/rejected": -2.322652816772461, "step": 1181 }, { "epoch": 0.33, "learning_rate": 3.1753987759344616e-07, "loss": 0.5324, "step": 1182 }, { "epoch": 0.33, "logps_train/chosen": -92.84492492675781, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -119.51774597167969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9559764862060547, "rewards_train/margins": 2.623922348022461, "rewards_train/rejected": -4.579898834228516, "step": 1182 }, { "epoch": 0.33, "logps_train/chosen": -62.7845458984375, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -90.22932434082031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3030641078948975, "rewards_train/margins": 1.1229934692382812, "rewards_train/rejected": -2.4260575771331787, "step": 1183 }, { "epoch": 0.33, "learning_rate": 3.169437740005849e-07, "loss": 0.3255, "step": 1184 }, { "epoch": 0.33, "logps_train/chosen": -112.349365234375, "logps_train/ref_chosen": -100.5, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -142.35031127929688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2036864757537842, "rewards_train/margins": 1.8297817707061768, "rewards_train/rejected": -3.033468246459961, "step": 1184 }, { "epoch": 0.33, "logps_train/chosen": -75.7343521118164, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -102.13626098632812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.205039381980896, "rewards_train/margins": 0.9179619550704956, "rewards_train/rejected": -2.1230013370513916, "step": 1185 }, { "epoch": 0.33, "learning_rate": 3.1634726012193734e-07, "loss": 0.3849, "step": 1186 }, { "epoch": 0.33, "logps_train/chosen": -58.36687088012695, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -67.93152618408203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6109057068824768, "rewards_train/margins": 1.3588091731071472, "rewards_train/rejected": -1.969714879989624, "step": 1186 }, { "epoch": 0.33, "logps_train/chosen": -64.16983032226562, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -78.98739624023438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0888584852218628, "rewards_train/margins": 1.0196467638015747, "rewards_train/rejected": -2.1085052490234375, "step": 1187 }, { "epoch": 0.33, "learning_rate": 3.1575033961342477e-07, "loss": 0.421, "step": 1188 }, { "epoch": 0.33, "logps_train/chosen": -89.76327514648438, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -105.0319595336914, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.5091396570205688, "rewards_train/margins": 1.062806248664856, "rewards_train/rejected": -2.571945905685425, "step": 1188 }, { "epoch": 0.33, "logps_train/chosen": -98.1567153930664, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -91.50418853759766, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.6533666849136353, "rewards_train/margins": 0.43455207347869873, "rewards_train/rejected": -2.087918758392334, "step": 1189 }, { "epoch": 0.33, "learning_rate": 3.151530161334607e-07, "loss": 0.5724, "step": 1190 }, { "epoch": 0.33, "logps_train/chosen": -109.42497253417969, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -121.7960205078125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9292153120040894, "rewards_train/margins": 1.0121055841445923, "rewards_train/rejected": -2.9413208961486816, "step": 1190 }, { "epoch": 0.33, "logps_train/chosen": -92.81179809570312, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -98.54808044433594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8897742629051208, "rewards_train/margins": 1.693745195865631, "rewards_train/rejected": -2.583519458770752, "step": 1191 }, { "epoch": 0.33, "learning_rate": 3.14555293342928e-07, "loss": 0.3727, "step": 1192 }, { "epoch": 0.33, "logps_train/chosen": -69.55101776123047, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -70.6552963256836, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.644848108291626, "rewards_train/margins": 0.5960235595703125, "rewards_train/rejected": -2.2408716678619385, "step": 1192 }, { "epoch": 0.33, "logps_train/chosen": -50.99018096923828, "logps_train/ref_chosen": -38.75, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -67.72573852539062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2188911437988281, "rewards_train/margins": 0.9634487628936768, "rewards_train/rejected": -2.182339906692505, "step": 1193 }, { "epoch": 0.33, "learning_rate": 3.1395717490515736e-07, "loss": 0.5125, "step": 1194 }, { "epoch": 0.33, "logps_train/chosen": -90.19172668457031, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -117.96939086914062, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.4976885318756104, "rewards_train/margins": 0.965266227722168, "rewards_train/rejected": -3.4629547595977783, "step": 1194 }, { "epoch": 0.33, "logps_train/chosen": -107.63124084472656, "logps_train/ref_chosen": -97.0, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -132.37387084960938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.058436632156372, "rewards_train/margins": 2.381587028503418, "rewards_train/rejected": -3.44002366065979, "step": 1195 }, { "epoch": 0.33, "learning_rate": 3.133586644859039e-07, "loss": 0.4764, "step": 1196 }, { "epoch": 0.33, "logps_train/chosen": -45.805442810058594, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -54.42270278930664, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5825948119163513, "rewards_train/margins": 0.3073318600654602, "rewards_train/rejected": -0.8899266719818115, "step": 1196 }, { "epoch": 0.33, "logps_train/chosen": -86.712158203125, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -88.41496276855469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7137949466705322, "rewards_train/margins": 1.6230137348175049, "rewards_train/rejected": -2.336808681488037, "step": 1197 }, { "epoch": 0.33, "learning_rate": 3.127597657533255e-07, "loss": 0.4552, "step": 1198 }, { "epoch": 0.33, "logps_train/chosen": -85.20030212402344, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -91.2405776977539, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6713238954544067, "rewards_train/margins": 0.6649652719497681, "rewards_train/rejected": -2.336289167404175, "step": 1198 }, { "epoch": 0.34, "logps_train/chosen": -52.18776321411133, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -41.75, "logps_train/rejected": -63.601783752441406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0654374361038208, "rewards_train/margins": 1.1307576894760132, "rewards_train/rejected": -2.196195125579834, "step": 1199 }, { "epoch": 0.34, "learning_rate": 3.1216048237795945e-07, "loss": 0.4597, "step": 1200 }, { "epoch": 0.34, "logps_train/chosen": -71.60494232177734, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -89.45936584472656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1646933555603027, "rewards_train/margins": 1.4083914756774902, "rewards_train/rejected": -2.573084831237793, "step": 1200 }, { "epoch": 0.34, "logps_train/chosen": -91.63750457763672, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -94.1178970336914, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.5677721500396729, "rewards_train/margins": 1.1195242404937744, "rewards_train/rejected": -2.6872963905334473, "step": 1201 }, { "epoch": 0.34, "learning_rate": 3.1156081803270095e-07, "loss": 0.4558, "step": 1202 }, { "epoch": 0.34, "logps_train/chosen": -25.83116912841797, "logps_train/ref_chosen": -21.375, "logps_train/ref_rejected": -35.0, "logps_train/rejected": -43.94371795654297, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4452263116836548, "rewards_train/margins": 0.4569580554962158, "rewards_train/rejected": -0.9021843671798706, "step": 1202 }, { "epoch": 0.34, "logps_train/chosen": -56.673805236816406, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -60.42707443237305, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5927220582962036, "rewards_train/margins": 1.0473486185073853, "rewards_train/rejected": -1.6400706768035889, "step": 1203 }, { "epoch": 0.34, "learning_rate": 3.109607763927798e-07, "loss": 0.4603, "step": 1204 }, { "epoch": 0.34, "logps_train/chosen": -59.55609893798828, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -75.75169372558594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.47221139073371887, "rewards_train/margins": 0.9004187285900116, "rewards_train/rejected": -1.3726301193237305, "step": 1204 }, { "epoch": 0.34, "logps_train/chosen": -71.41329193115234, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -93.61686706542969, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9730676412582397, "rewards_train/margins": 0.9899858236312866, "rewards_train/rejected": -1.9630534648895264, "step": 1205 }, { "epoch": 0.34, "learning_rate": 3.103603611357381e-07, "loss": 0.4422, "step": 1206 }, { "epoch": 0.34, "logps_train/chosen": -97.32916259765625, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -110.93840789794922, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.1220767498016357, "rewards_train/margins": 1.1252796649932861, "rewards_train/rejected": -3.247356414794922, "step": 1206 }, { "epoch": 0.34, "logps_train/chosen": -73.59026336669922, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -92.25763702392578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0117605924606323, "rewards_train/margins": 1.4169319868087769, "rewards_train/rejected": -2.428692579269409, "step": 1207 }, { "epoch": 0.34, "learning_rate": 3.097595759414081e-07, "loss": 0.377, "step": 1208 }, { "epoch": 0.34, "logps_train/chosen": -73.49578857421875, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -89.04098510742188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2222349643707275, "rewards_train/margins": 1.3064734935760498, "rewards_train/rejected": -2.5287084579467773, "step": 1208 }, { "epoch": 0.34, "logps_train/chosen": -35.669700622558594, "logps_train/ref_chosen": -32.25, "logps_train/ref_rejected": -30.25, "logps_train/rejected": -43.356666564941406, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.35395729541778564, "rewards_train/margins": 0.9569779634475708, "rewards_train/rejected": -1.3109352588653564, "step": 1209 }, { "epoch": 0.34, "learning_rate": 3.09158424491889e-07, "loss": 0.44, "step": 1210 }, { "epoch": 0.34, "logps_train/chosen": -60.83733367919922, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -101.0129623413086, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6280696392059326, "rewards_train/margins": 2.691976308822632, "rewards_train/rejected": -3.3200459480285645, "step": 1210 }, { "epoch": 0.34, "logps_train/chosen": -58.323280334472656, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -72.33810424804688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": 0.005464881658554077, "rewards_train/margins": 2.2822435796260834, "rewards_train/rejected": -2.2767786979675293, "step": 1211 }, { "epoch": 0.34, "learning_rate": 3.08556910471525e-07, "loss": 0.3123, "step": 1212 }, { "epoch": 0.34, "logps_train/chosen": -75.33900451660156, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -118.93092346191406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.710463285446167, "rewards_train/margins": 1.4701282978057861, "rewards_train/rejected": -2.180591583251953, "step": 1212 }, { "epoch": 0.34, "logps_train/chosen": -86.78569793701172, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -108.21052551269531, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -2.197808027267456, "rewards_train/margins": 1.1191422939300537, "rewards_train/rejected": -3.3169503211975098, "step": 1213 }, { "epoch": 0.34, "learning_rate": 3.0795503756688205e-07, "loss": 0.5301, "step": 1214 }, { "epoch": 0.34, "logps_train/chosen": -80.62371063232422, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -126.99827575683594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.921746015548706, "rewards_train/margins": 2.1335508823394775, "rewards_train/rejected": -4.055296897888184, "step": 1214 }, { "epoch": 0.34, "logps_train/chosen": -64.48846435546875, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -103.56227111816406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.049627661705017, "rewards_train/margins": 2.3608967065811157, "rewards_train/rejected": -3.410524368286133, "step": 1215 }, { "epoch": 0.34, "learning_rate": 3.0735280946672604e-07, "loss": 0.3228, "step": 1216 }, { "epoch": 0.34, "logps_train/chosen": -73.5542984008789, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -61.83903884887695, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2909764051437378, "rewards_train/margins": -0.19144737720489502, "rewards_train/rejected": -1.0995290279388428, "step": 1216 }, { "epoch": 0.34, "logps_train/chosen": -47.58641815185547, "logps_train/ref_chosen": -36.75, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -84.713134765625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0947747230529785, "rewards_train/margins": 1.342163324356079, "rewards_train/rejected": -2.4369380474090576, "step": 1217 }, { "epoch": 0.34, "learning_rate": 3.067502298619996e-07, "loss": 0.7511, "step": 1218 }, { "epoch": 0.34, "logps_train/chosen": -108.0360107421875, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -113.9399185180664, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.850402355194092, "rewards_train/margins": 0.8303086757659912, "rewards_train/rejected": -3.680711030960083, "step": 1218 }, { "epoch": 0.34, "logps_train/chosen": -94.44078063964844, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -108.80403900146484, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.619859218597412, "rewards_train/margins": 0.8124978542327881, "rewards_train/rejected": -2.4323570728302, "step": 1219 }, { "epoch": 0.34, "learning_rate": 3.061473024457995e-07, "loss": 0.4899, "step": 1220 }, { "epoch": 0.34, "logps_train/chosen": -83.36814880371094, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -106.5103988647461, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0624502897262573, "rewards_train/margins": 1.9005528688430786, "rewards_train/rejected": -2.963003158569336, "step": 1220 }, { "epoch": 0.34, "logps_train/chosen": -75.4331283569336, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -80.57693481445312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5780785083770752, "rewards_train/margins": 0.5919194221496582, "rewards_train/rejected": -2.1699979305267334, "step": 1221 }, { "epoch": 0.34, "learning_rate": 3.0554403091335454e-07, "loss": 0.416, "step": 1222 }, { "epoch": 0.34, "logps_train/chosen": -110.41349792480469, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -126.67073822021484, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8757256269454956, "rewards_train/margins": 1.1765047311782837, "rewards_train/rejected": -3.0522303581237793, "step": 1222 }, { "epoch": 0.34, "logps_train/chosen": -25.623764038085938, "logps_train/ref_chosen": -19.25, "logps_train/ref_rejected": -19.375, "logps_train/rejected": -29.09286117553711, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6393295526504517, "rewards_train/margins": 0.3276714086532593, "rewards_train/rejected": -0.9670009613037109, "step": 1223 }, { "epoch": 0.34, "learning_rate": 3.049404189620023e-07, "loss": 0.4956, "step": 1224 }, { "epoch": 0.34, "logps_train/chosen": -79.60652160644531, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -85.83671569824219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3190501928329468, "rewards_train/margins": 0.8620820045471191, "rewards_train/rejected": -1.181132197380066, "step": 1224 }, { "epoch": 0.34, "logps_train/chosen": -84.14520263671875, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -106.1170425415039, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.40280088782310486, "rewards_train/margins": 2.290543705224991, "rewards_train/rejected": -2.6933445930480957, "step": 1225 }, { "epoch": 0.34, "learning_rate": 3.043364702911667e-07, "loss": 0.3709, "step": 1226 }, { "epoch": 0.34, "logps_train/chosen": -129.52308654785156, "logps_train/ref_chosen": -103.0, "logps_train/ref_rejected": -117.0, "logps_train/rejected": -174.3406524658203, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.6671528816223145, "rewards_train/margins": 3.0770678520202637, "rewards_train/rejected": -5.744220733642578, "step": 1226 }, { "epoch": 0.34, "logps_train/chosen": -105.80215454101562, "logps_train/ref_chosen": -100.5, "logps_train/ref_rejected": -124.0, "logps_train/rejected": -161.20736694335938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5141997933387756, "rewards_train/margins": 3.201849400997162, "rewards_train/rejected": -3.7160491943359375, "step": 1227 }, { "epoch": 0.34, "learning_rate": 3.037321886023356e-07, "loss": 0.1914, "step": 1228 }, { "epoch": 0.34, "logps_train/chosen": -86.39804077148438, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -109.82169342041016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3898032903671265, "rewards_train/margins": 0.9011553525924683, "rewards_train/rejected": -2.2909586429595947, "step": 1228 }, { "epoch": 0.34, "logps_train/chosen": -44.62421417236328, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -57.36525344848633, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8378124237060547, "rewards_train/margins": 0.5795726776123047, "rewards_train/rejected": -1.4173851013183594, "step": 1229 }, { "epoch": 0.34, "learning_rate": 3.0312757759903746e-07, "loss": 0.4381, "step": 1230 }, { "epoch": 0.34, "logps_train/chosen": -85.21878051757812, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -99.37905883789062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9085963368415833, "rewards_train/margins": 1.5668098330497742, "rewards_train/rejected": -2.4754061698913574, "step": 1230 }, { "epoch": 0.34, "logps_train/chosen": -103.96012878417969, "logps_train/ref_chosen": -83.5, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -122.29784393310547, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.054802179336548, "rewards_train/margins": 1.137481927871704, "rewards_train/rejected": -3.192284107208252, "step": 1231 }, { "epoch": 0.34, "learning_rate": 3.0252264098681946e-07, "loss": 0.3885, "step": 1232 }, { "epoch": 0.34, "logps_train/chosen": -63.1392936706543, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -94.64093017578125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0686168670654297, "rewards_train/margins": 1.7798504829406738, "rewards_train/rejected": -2.8484673500061035, "step": 1232 }, { "epoch": 0.34, "logps_train/chosen": -71.816650390625, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -73.51524353027344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2732181251049042, "rewards_train/margins": 0.277133971452713, "rewards_train/rejected": -0.5503520965576172, "step": 1233 }, { "epoch": 0.34, "learning_rate": 3.0191738247322415e-07, "loss": 0.4933, "step": 1234 }, { "epoch": 0.34, "logps_train/chosen": -37.84785079956055, "logps_train/ref_chosen": -32.5, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -54.35480880737305, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5383008718490601, "rewards_train/margins": 0.6491333246231079, "rewards_train/rejected": -1.187434196472168, "step": 1234 }, { "epoch": 0.35, "logps_train/chosen": -64.3022232055664, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -71.05715942382812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.359714388847351, "rewards_train/margins": 0.20303237438201904, "rewards_train/rejected": -1.5627467632293701, "step": 1235 }, { "epoch": 0.35, "learning_rate": 3.0131180576776694e-07, "loss": 0.5652, "step": 1236 }, { "epoch": 0.35, "logps_train/chosen": -36.36655807495117, "logps_train/ref_chosen": -30.5, "logps_train/ref_rejected": -37.75, "logps_train/rejected": -48.76054000854492, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5925151109695435, "rewards_train/margins": 0.5085386037826538, "rewards_train/rejected": -1.1010537147521973, "step": 1236 }, { "epoch": 0.35, "logps_train/chosen": -99.78321075439453, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -120.41878509521484, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8110358715057373, "rewards_train/margins": 1.3636550903320312, "rewards_train/rejected": -3.1746909618377686, "step": 1237 }, { "epoch": 0.35, "learning_rate": 3.0070591458191354e-07, "loss": 0.502, "step": 1238 }, { "epoch": 0.35, "logps_train/chosen": -22.466035842895508, "logps_train/ref_chosen": -19.5, "logps_train/ref_rejected": -30.25, "logps_train/rejected": -40.89089584350586, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3012911081314087, "rewards_train/margins": 0.7671929597854614, "rewards_train/rejected": -1.0684840679168701, "step": 1238 }, { "epoch": 0.35, "logps_train/chosen": -60.26660919189453, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -68.46043395996094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3268800675868988, "rewards_train/margins": 0.9654524624347687, "rewards_train/rejected": -1.2923325300216675, "step": 1239 }, { "epoch": 0.35, "learning_rate": 3.0009971262905686e-07, "loss": 0.4518, "step": 1240 }, { "epoch": 0.35, "logps_train/chosen": -83.158935546875, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -112.37612915039062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1037836074829102, "rewards_train/margins": 1.5873448848724365, "rewards_train/rejected": -2.6911284923553467, "step": 1240 }, { "epoch": 0.35, "logps_train/chosen": -47.118743896484375, "logps_train/ref_chosen": -42.5, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -60.689964294433594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.46206945180892944, "rewards_train/margins": 0.8864193558692932, "rewards_train/rejected": -1.3484888076782227, "step": 1241 }, { "epoch": 0.35, "learning_rate": 2.9949320362449454e-07, "loss": 0.3206, "step": 1242 }, { "epoch": 0.35, "logps_train/chosen": -60.50547790527344, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -45.75, "logps_train/rejected": -60.14662551879883, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5290637016296387, "rewards_train/margins": 0.906301736831665, "rewards_train/rejected": -1.4353654384613037, "step": 1242 }, { "epoch": 0.35, "logps_train/chosen": -71.1565933227539, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -85.62747192382812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2133152484893799, "rewards_train/margins": 1.2104673385620117, "rewards_train/rejected": -2.4237825870513916, "step": 1243 }, { "epoch": 0.35, "learning_rate": 2.988863912854061e-07, "loss": 0.4171, "step": 1244 }, { "epoch": 0.35, "logps_train/chosen": -59.45911407470703, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -68.69921112060547, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7736943364143372, "rewards_train/margins": 1.1976673007011414, "rewards_train/rejected": -1.9713616371154785, "step": 1244 }, { "epoch": 0.35, "logps_train/chosen": -68.25922393798828, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -72.01715850830078, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1981877088546753, "rewards_train/margins": 0.9541140794754028, "rewards_train/rejected": -2.152301788330078, "step": 1245 }, { "epoch": 0.35, "learning_rate": 2.982792793308301e-07, "loss": 0.4688, "step": 1246 }, { "epoch": 0.35, "logps_train/chosen": -46.17108917236328, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -81.40778350830078, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5603708028793335, "rewards_train/margins": 2.0001343488693237, "rewards_train/rejected": -2.5605051517486572, "step": 1246 }, { "epoch": 0.35, "logps_train/chosen": -41.5633544921875, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -26.75, "logps_train/rejected": -40.036712646484375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3092648684978485, "rewards_train/margins": 1.0149140655994415, "rewards_train/rejected": -1.32417893409729, "step": 1247 }, { "epoch": 0.35, "learning_rate": 2.976718714816414e-07, "loss": 0.4138, "step": 1248 }, { "epoch": 0.35, "logps_train/chosen": -47.901123046875, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -65.83989715576172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.36174318194389343, "rewards_train/margins": 1.0100879967212677, "rewards_train/rejected": -1.3718311786651611, "step": 1248 }, { "epoch": 0.35, "logps_train/chosen": -44.63545608520508, "logps_train/ref_chosen": -37.25, "logps_train/ref_rejected": -44.75, "logps_train/rejected": -57.441184997558594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7454789876937866, "rewards_train/margins": 0.5191473960876465, "rewards_train/rejected": -1.264626383781433, "step": 1249 }, { "epoch": 0.35, "learning_rate": 2.9706417146052835e-07, "loss": 0.4548, "step": 1250 }, { "epoch": 0.35, "logps_train/chosen": -53.336307525634766, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -58.01958465576172, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.036365270614624, "rewards_train/margins": 0.21832752227783203, "rewards_train/rejected": -1.254692792892456, "step": 1250 }, { "epoch": 0.35, "logps_train/chosen": -85.30303192138672, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -90.73812866210938, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.7932426929473877, "rewards_train/margins": 0.6220254898071289, "rewards_train/rejected": -2.4152681827545166, "step": 1251 }, { "epoch": 0.35, "learning_rate": 2.9645618299196994e-07, "loss": 0.626, "step": 1252 }, { "epoch": 0.35, "logps_train/chosen": -62.463233947753906, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -37.0, "logps_train/rejected": -52.76800537109375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1841163635253906, "rewards_train/margins": 0.384188175201416, "rewards_train/rejected": -1.5683045387268066, "step": 1252 }, { "epoch": 0.35, "logps_train/chosen": -116.64669799804688, "logps_train/ref_chosen": -107.5, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -141.56182861328125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.920920193195343, "rewards_train/margins": 2.556355655193329, "rewards_train/rejected": -3.477275848388672, "step": 1253 }, { "epoch": 0.35, "learning_rate": 2.95847909802213e-07, "loss": 0.5828, "step": 1254 }, { "epoch": 0.35, "logps_train/chosen": -94.45301818847656, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -98.36775207519531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9395405054092407, "rewards_train/margins": 0.41823089122772217, "rewards_train/rejected": -2.357771396636963, "step": 1254 }, { "epoch": 0.35, "logps_train/chosen": -101.71021270751953, "logps_train/ref_chosen": -83.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -100.24636840820312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8341076374053955, "rewards_train/margins": 0.9092800617218018, "rewards_train/rejected": -2.7433876991271973, "step": 1255 }, { "epoch": 0.35, "learning_rate": 2.952393556192495e-07, "loss": 0.5338, "step": 1256 }, { "epoch": 0.35, "logps_train/chosen": -49.79737091064453, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -65.52215576171875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8700078129768372, "rewards_train/margins": 0.5651177763938904, "rewards_train/rejected": -1.4351255893707275, "step": 1256 }, { "epoch": 0.35, "logps_train/chosen": -56.505470275878906, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -73.62117004394531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1196873188018799, "rewards_train/margins": 0.22035980224609375, "rewards_train/rejected": -1.3400471210479736, "step": 1257 }, { "epoch": 0.35, "learning_rate": 2.946305241727933e-07, "loss": 0.6657, "step": 1258 }, { "epoch": 0.35, "logps_train/chosen": -52.27532958984375, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -40.75, "logps_train/rejected": -54.501914978027344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8747987151145935, "rewards_train/margins": 0.5121118426322937, "rewards_train/rejected": -1.3869105577468872, "step": 1258 }, { "epoch": 0.35, "logps_train/chosen": -76.8271255493164, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -123.54518127441406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5729472637176514, "rewards_train/margins": 2.4991490840911865, "rewards_train/rejected": -4.072096347808838, "step": 1259 }, { "epoch": 0.35, "learning_rate": 2.9402141919425784e-07, "loss": 0.5543, "step": 1260 }, { "epoch": 0.35, "logps_train/chosen": -50.75778579711914, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -72.40275573730469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6005833148956299, "rewards_train/margins": 1.2896926403045654, "rewards_train/rejected": -1.8902759552001953, "step": 1260 }, { "epoch": 0.35, "logps_train/chosen": -49.02370834350586, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -55.26200866699219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8599880337715149, "rewards_train/margins": 0.6197280287742615, "rewards_train/rejected": -1.4797160625457764, "step": 1261 }, { "epoch": 0.35, "learning_rate": 2.934120444167326e-07, "loss": 0.4021, "step": 1262 }, { "epoch": 0.35, "logps_train/chosen": -67.32768249511719, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -65.77238464355469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.48794418573379517, "rewards_train/margins": 0.9049198031425476, "rewards_train/rejected": -1.3928639888763428, "step": 1262 }, { "epoch": 0.35, "logps_train/chosen": -41.7213134765625, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -69.01918029785156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6291625499725342, "rewards_train/margins": 1.0112313032150269, "rewards_train/rejected": -1.640393853187561, "step": 1263 }, { "epoch": 0.35, "learning_rate": 2.928024035749611e-07, "loss": 0.4033, "step": 1264 }, { "epoch": 0.35, "logps_train/chosen": -63.10002136230469, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -77.05767059326172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8517013788223267, "rewards_train/margins": 1.2935189008712769, "rewards_train/rejected": -2.1452202796936035, "step": 1264 }, { "epoch": 0.35, "logps_train/chosen": -64.24785614013672, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -87.11654663085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6533015370368958, "rewards_train/margins": 1.6905805468559265, "rewards_train/rejected": -2.3438820838928223, "step": 1265 }, { "epoch": 0.35, "learning_rate": 2.9219250040531716e-07, "loss": 0.3183, "step": 1266 }, { "epoch": 0.35, "logps_train/chosen": -65.76615905761719, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -89.967041015625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6911664605140686, "rewards_train/margins": 1.2617883086204529, "rewards_train/rejected": -1.9529547691345215, "step": 1266 }, { "epoch": 0.35, "logps_train/chosen": -94.05767822265625, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -99.5, "logps_train/rejected": -140.08242797851562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8002994060516357, "rewards_train/margins": 2.2704427242279053, "rewards_train/rejected": -4.070742130279541, "step": 1267 }, { "epoch": 0.35, "learning_rate": 2.915823386457825e-07, "loss": 0.4391, "step": 1268 }, { "epoch": 0.35, "logps_train/chosen": -69.37425231933594, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -90.4566421508789, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7557848691940308, "rewards_train/margins": 1.8482776880264282, "rewards_train/rejected": -2.604062557220459, "step": 1268 }, { "epoch": 0.35, "logps_train/chosen": -73.65614318847656, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -88.40058135986328, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0031148195266724, "rewards_train/margins": 2.053739905357361, "rewards_train/rejected": -3.056854724884033, "step": 1269 }, { "epoch": 0.35, "learning_rate": 2.9097192203592373e-07, "loss": 0.4572, "step": 1270 }, { "epoch": 0.35, "logps_train/chosen": -111.06425476074219, "logps_train/ref_chosen": -90.5, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -131.41131591796875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.059159517288208, "rewards_train/margins": 1.825331211090088, "rewards_train/rejected": -3.884490728378296, "step": 1270 }, { "epoch": 0.36, "logps_train/chosen": -44.51241683959961, "logps_train/ref_chosen": -37.25, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -63.68146514892578, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7299526929855347, "rewards_train/margins": 1.1296004056930542, "rewards_train/rejected": -1.8595530986785889, "step": 1271 }, { "epoch": 0.36, "learning_rate": 2.9036125431686916e-07, "loss": 0.3528, "step": 1272 }, { "epoch": 0.36, "logps_train/chosen": -63.27284240722656, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -66.57807922363281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8819718360900879, "rewards_train/margins": 0.9352107048034668, "rewards_train/rejected": -1.8171825408935547, "step": 1272 }, { "epoch": 0.36, "logps_train/chosen": -80.39818572998047, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -106.00479125976562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8234123587608337, "rewards_train/margins": 2.020621597766876, "rewards_train/rejected": -2.84403395652771, "step": 1273 }, { "epoch": 0.36, "learning_rate": 2.897503392312864e-07, "loss": 0.3524, "step": 1274 }, { "epoch": 0.36, "logps_train/chosen": -53.39342498779297, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -54.14075469970703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4791374206542969, "rewards_train/margins": 0.9202895164489746, "rewards_train/rejected": -1.3994269371032715, "step": 1274 }, { "epoch": 0.36, "logps_train/chosen": -111.9495849609375, "logps_train/ref_chosen": -92.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -122.84001159667969, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.9674193859100342, "rewards_train/margins": 1.512382984161377, "rewards_train/rejected": -3.479802370071411, "step": 1275 }, { "epoch": 0.36, "learning_rate": 2.8913918052335884e-07, "loss": 0.4267, "step": 1276 }, { "epoch": 0.36, "logps_train/chosen": -73.24745178222656, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -93.18992614746094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1024799346923828, "rewards_train/margins": 1.9378023147583008, "rewards_train/rejected": -3.0402822494506836, "step": 1276 }, { "epoch": 0.36, "logps_train/chosen": -95.64163970947266, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -115.248046875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9829139709472656, "rewards_train/margins": 0.716890811920166, "rewards_train/rejected": -2.6998047828674316, "step": 1277 }, { "epoch": 0.36, "learning_rate": 2.8852778193876333e-07, "loss": 0.418, "step": 1278 }, { "epoch": 0.36, "logps_train/chosen": -68.90501403808594, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -93.73394775390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9147204160690308, "rewards_train/margins": 0.9772288799285889, "rewards_train/rejected": -1.8919492959976196, "step": 1278 }, { "epoch": 0.36, "logps_train/chosen": -51.51862716674805, "logps_train/ref_chosen": -43.25, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -77.333984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8381906747817993, "rewards_train/margins": 1.2866140604019165, "rewards_train/rejected": -2.124804735183716, "step": 1279 }, { "epoch": 0.36, "learning_rate": 2.879161472246465e-07, "loss": 0.3712, "step": 1280 }, { "epoch": 0.36, "logps_train/chosen": -97.43931579589844, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -134.86773681640625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7779157161712646, "rewards_train/margins": 2.234835386276245, "rewards_train/rejected": -4.01275110244751, "step": 1280 }, { "epoch": 0.36, "logps_train/chosen": -68.90665435791016, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -86.76878356933594, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -1.2789463996887207, "rewards_train/margins": 0.5182435512542725, "rewards_train/rejected": -1.7971899509429932, "step": 1281 }, { "epoch": 0.36, "learning_rate": 2.8730428012960245e-07, "loss": 0.5033, "step": 1282 }, { "epoch": 0.36, "logps_train/chosen": -115.7667236328125, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -121.2967300415039, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.5526981353759766, "rewards_train/margins": 1.2461156845092773, "rewards_train/rejected": -3.798813819885254, "step": 1282 }, { "epoch": 0.36, "logps_train/chosen": -64.69181823730469, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -98.20594787597656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7613693475723267, "rewards_train/margins": 2.1143044233322144, "rewards_train/rejected": -2.875673770904541, "step": 1283 }, { "epoch": 0.36, "learning_rate": 2.8669218440364933e-07, "loss": 0.3413, "step": 1284 }, { "epoch": 0.36, "logps_train/chosen": -73.53129577636719, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -93.92050170898438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6960985064506531, "rewards_train/margins": 0.9115764498710632, "rewards_train/rejected": -1.6076749563217163, "step": 1284 }, { "epoch": 0.36, "logps_train/chosen": -112.06536865234375, "logps_train/ref_chosen": -99.0, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -128.71908569335938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2846617698669434, "rewards_train/margins": 2.4806065559387207, "rewards_train/rejected": -3.765268325805664, "step": 1285 }, { "epoch": 0.36, "learning_rate": 2.8607986379820664e-07, "loss": 0.446, "step": 1286 }, { "epoch": 0.36, "logps_train/chosen": -88.01637268066406, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -104.08353424072266, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6192153692245483, "rewards_train/margins": 0.16101324558258057, "rewards_train/rejected": -1.780228614807129, "step": 1286 }, { "epoch": 0.36, "logps_train/chosen": -39.651729583740234, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -35.25, "logps_train/rejected": -50.218223571777344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5624388456344604, "rewards_train/margins": 0.934578537940979, "rewards_train/rejected": -1.4970173835754395, "step": 1287 }, { "epoch": 0.36, "learning_rate": 2.854673220660721e-07, "loss": 0.5327, "step": 1288 }, { "epoch": 0.36, "logps_train/chosen": -114.07559204101562, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -91.5, "logps_train/rejected": -132.51544189453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0622470378875732, "rewards_train/margins": 2.03265643119812, "rewards_train/rejected": -4.094903469085693, "step": 1288 }, { "epoch": 0.36, "logps_train/chosen": -88.67436218261719, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -90.67655944824219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4979051351547241, "rewards_train/margins": 0.8566652536392212, "rewards_train/rejected": -2.3545703887939453, "step": 1289 }, { "epoch": 0.36, "learning_rate": 2.848545629613986e-07, "loss": 0.3232, "step": 1290 }, { "epoch": 0.36, "logps_train/chosen": -49.19293975830078, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -44.5, "logps_train/rejected": -60.60626220703125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.687457799911499, "rewards_train/margins": 0.9311761856079102, "rewards_train/rejected": -1.6186339855194092, "step": 1290 }, { "epoch": 0.36, "logps_train/chosen": -72.29499816894531, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -95.83135223388672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.78213632106781, "rewards_train/margins": 1.9825423955917358, "rewards_train/rejected": -3.764678716659546, "step": 1291 }, { "epoch": 0.36, "learning_rate": 2.842415902396713e-07, "loss": 0.5225, "step": 1292 }, { "epoch": 0.36, "logps_train/chosen": -56.585472106933594, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -68.98127746582031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.199172019958496, "rewards_train/margins": 0.8024711608886719, "rewards_train/rejected": -2.001643180847168, "step": 1292 }, { "epoch": 0.36, "logps_train/chosen": -111.89640808105469, "logps_train/ref_chosen": -95.0, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -127.68263244628906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6958913803100586, "rewards_train/margins": 1.5458097457885742, "rewards_train/rejected": -3.241701126098633, "step": 1293 }, { "epoch": 0.36, "learning_rate": 2.8362840765768476e-07, "loss": 0.4456, "step": 1294 }, { "epoch": 0.36, "logps_train/chosen": -25.271020889282227, "logps_train/ref_chosen": -21.25, "logps_train/ref_rejected": -22.5, "logps_train/rejected": -29.85843276977539, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.3960718512535095, "rewards_train/margins": 0.33901458978652954, "rewards_train/rejected": -0.7350864410400391, "step": 1294 }, { "epoch": 0.36, "logps_train/chosen": -102.75201416015625, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -158.03952026367188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7902405261993408, "rewards_train/margins": 3.8672268390655518, "rewards_train/rejected": -5.657467365264893, "step": 1295 }, { "epoch": 0.36, "learning_rate": 2.830150189735193e-07, "loss": 0.3755, "step": 1296 }, { "epoch": 0.36, "logps_train/chosen": -39.12625503540039, "logps_train/ref_chosen": -36.0, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -53.60280227661133, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.30244502425193787, "rewards_train/margins": 0.9511698782444, "rewards_train/rejected": -1.253614902496338, "step": 1296 }, { "epoch": 0.36, "logps_train/chosen": -106.22415161132812, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -112.7197494506836, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6665563583374023, "rewards_train/margins": 1.257957935333252, "rewards_train/rejected": -2.9245142936706543, "step": 1297 }, { "epoch": 0.36, "learning_rate": 2.8240142794651895e-07, "loss": 0.4655, "step": 1298 }, { "epoch": 0.36, "logps_train/chosen": -105.55613708496094, "logps_train/ref_chosen": -86.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -112.70767974853516, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.9675285816192627, "rewards_train/margins": 0.6024830341339111, "rewards_train/rejected": -2.570011615753174, "step": 1298 }, { "epoch": 0.36, "logps_train/chosen": -96.76789855957031, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -139.1729736328125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8603839874267578, "rewards_train/margins": 2.5506625175476074, "rewards_train/rejected": -3.4110465049743652, "step": 1299 }, { "epoch": 0.36, "learning_rate": 2.8178763833726734e-07, "loss": 0.4321, "step": 1300 }, { "epoch": 0.36, "logps_train/chosen": -69.91500854492188, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -105.3660659790039, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6147429943084717, "rewards_train/margins": 1.8849496841430664, "rewards_train/rejected": -2.499692678451538, "step": 1300 }, { "epoch": 0.36, "logps_train/chosen": -148.17092895507812, "logps_train/ref_chosen": -109.5, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -158.2682647705078, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -3.8811564445495605, "rewards_train/margins": 1.742936134338379, "rewards_train/rejected": -5.6240925788879395, "step": 1301 }, { "epoch": 0.36, "learning_rate": 2.811736539075656e-07, "loss": 0.4972, "step": 1302 }, { "epoch": 0.36, "logps_train/chosen": -59.03349685668945, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -89.21664428710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8964160084724426, "rewards_train/margins": 1.5689980387687683, "rewards_train/rejected": -2.465414047241211, "step": 1302 }, { "epoch": 0.36, "logps_train/chosen": -84.39813232421875, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -86.77507019042969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2852232456207275, "rewards_train/margins": 0.9193837642669678, "rewards_train/rejected": -2.2046070098876953, "step": 1303 }, { "epoch": 0.36, "learning_rate": 2.8055947842040863e-07, "loss": 0.4232, "step": 1304 }, { "epoch": 0.36, "logps_train/chosen": -72.73558807373047, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -104.67257690429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1188716888427734, "rewards_train/margins": 2.3550262451171875, "rewards_train/rejected": -3.473897933959961, "step": 1304 }, { "epoch": 0.36, "logps_train/chosen": -46.97467803955078, "logps_train/ref_chosen": -40.25, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -93.93573760986328, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6736396551132202, "rewards_train/margins": 1.2531369924545288, "rewards_train/rejected": -1.926776647567749, "step": 1305 }, { "epoch": 0.37, "learning_rate": 2.799451156399623e-07, "loss": 0.3026, "step": 1306 }, { "epoch": 0.37, "logps_train/chosen": -85.8043441772461, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -94.05465698242188, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.0390287637710571, "rewards_train/margins": 1.0871402025222778, "rewards_train/rejected": -2.126168966293335, "step": 1306 }, { "epoch": 0.37, "logps_train/chosen": -37.19535827636719, "logps_train/ref_chosen": -32.0, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -61.08477020263672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5227588415145874, "rewards_train/margins": 1.153588891029358, "rewards_train/rejected": -1.6763477325439453, "step": 1307 }, { "epoch": 0.37, "learning_rate": 2.7933056933154055e-07, "loss": 0.4183, "step": 1308 }, { "epoch": 0.37, "logps_train/chosen": -50.97650909423828, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -39.5, "logps_train/rejected": -56.72451400756836, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.683588445186615, "rewards_train/margins": 1.0373005270957947, "rewards_train/rejected": -1.7208889722824097, "step": 1308 }, { "epoch": 0.37, "logps_train/chosen": -104.0382080078125, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -107.07575225830078, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.7108514308929443, "rewards_train/margins": 1.9105427265167236, "rewards_train/rejected": -3.621394157409668, "step": 1309 }, { "epoch": 0.37, "learning_rate": 2.7871584326158183e-07, "loss": 0.3898, "step": 1310 }, { "epoch": 0.37, "logps_train/chosen": -26.640560150146484, "logps_train/ref_chosen": -19.75, "logps_train/ref_rejected": -14.1875, "logps_train/rejected": -24.182024002075195, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6896420121192932, "rewards_train/margins": 0.3077598214149475, "rewards_train/rejected": -0.9974018335342407, "step": 1310 }, { "epoch": 0.37, "logps_train/chosen": -95.34265899658203, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -100.6622314453125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6484259366989136, "rewards_train/margins": 1.381079077720642, "rewards_train/rejected": -3.0295050144195557, "step": 1311 }, { "epoch": 0.37, "learning_rate": 2.7810094119762656e-07, "loss": 0.5341, "step": 1312 }, { "epoch": 0.37, "logps_train/chosen": -61.50445556640625, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -81.31883239746094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9082581996917725, "rewards_train/margins": 1.175969123840332, "rewards_train/rejected": -2.0842273235321045, "step": 1312 }, { "epoch": 0.37, "logps_train/chosen": -92.20587921142578, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -112.29820251464844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8151195049285889, "rewards_train/margins": 2.1342315673828125, "rewards_train/rejected": -3.9493510723114014, "step": 1313 }, { "epoch": 0.37, "learning_rate": 2.774858669082937e-07, "loss": 0.3724, "step": 1314 }, { "epoch": 0.37, "logps_train/chosen": -96.3318099975586, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -94.733154296875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.773806095123291, "rewards_train/margins": 0.4018535614013672, "rewards_train/rejected": -2.175659656524658, "step": 1314 }, { "epoch": 0.37, "logps_train/chosen": -51.42466735839844, "logps_train/ref_chosen": -43.75, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -81.90716552734375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7602400779724121, "rewards_train/margins": 1.5957107543945312, "rewards_train/rejected": -2.3559508323669434, "step": 1315 }, { "epoch": 0.37, "learning_rate": 2.7687062416325777e-07, "loss": 0.441, "step": 1316 }, { "epoch": 0.37, "logps_train/chosen": -95.26358795166016, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -120.08601379394531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.937295913696289, "rewards_train/margins": 1.0259923934936523, "rewards_train/rejected": -2.9632883071899414, "step": 1316 }, { "epoch": 0.37, "logps_train/chosen": -102.5262451171875, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -98.22103881835938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.0816283226013184, "rewards_train/margins": 0.20883560180664062, "rewards_train/rejected": -2.290463924407959, "step": 1317 }, { "epoch": 0.37, "learning_rate": 2.7625521673322584e-07, "loss": 0.6372, "step": 1318 }, { "epoch": 0.37, "logps_train/chosen": -85.05008697509766, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -89.3863296508789, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.3417274951934814, "rewards_train/margins": 0.4308891296386719, "rewards_train/rejected": -2.7726166248321533, "step": 1318 }, { "epoch": 0.37, "logps_train/chosen": -50.07927322387695, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -29.75, "logps_train/rejected": -53.705081939697266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0952810049057007, "rewards_train/margins": 1.2998853921890259, "rewards_train/rejected": -2.3951663970947266, "step": 1319 }, { "epoch": 0.37, "learning_rate": 2.756396483899139e-07, "loss": 0.5068, "step": 1320 }, { "epoch": 0.37, "logps_train/chosen": -43.69392013549805, "logps_train/ref_chosen": -36.5, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -69.0543212890625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7180249094963074, "rewards_train/margins": 0.6708055138587952, "rewards_train/rejected": -1.3888304233551025, "step": 1320 }, { "epoch": 0.37, "logps_train/chosen": -72.91769409179688, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -63.5, "logps_train/rejected": -79.10346984863281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9517297744750977, "rewards_train/margins": 0.6179918050765991, "rewards_train/rejected": -1.5697215795516968, "step": 1321 }, { "epoch": 0.37, "learning_rate": 2.7502392290602463e-07, "loss": 0.5277, "step": 1322 }, { "epoch": 0.37, "logps_train/chosen": -94.58790588378906, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -104.88409423828125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8958998918533325, "rewards_train/margins": 2.1021775007247925, "rewards_train/rejected": -3.998077392578125, "step": 1322 }, { "epoch": 0.37, "logps_train/chosen": -47.807395935058594, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -39.25, "logps_train/rejected": -59.41307830810547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5037863850593567, "rewards_train/margins": 1.51603764295578, "rewards_train/rejected": -2.0198240280151367, "step": 1323 }, { "epoch": 0.37, "learning_rate": 2.7440804405522346e-07, "loss": 0.355, "step": 1324 }, { "epoch": 0.37, "logps_train/chosen": -73.08329772949219, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -95.97520446777344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.7374316453933716, "rewards_train/margins": 1.0569642782211304, "rewards_train/rejected": -2.794395923614502, "step": 1324 }, { "epoch": 0.37, "logps_train/chosen": -85.72633361816406, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -82.99726867675781, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8273212909698486, "rewards_train/margins": 0.2659609317779541, "rewards_train/rejected": -2.0932822227478027, "step": 1325 }, { "epoch": 0.37, "learning_rate": 2.73792015612116e-07, "loss": 0.5358, "step": 1326 }, { "epoch": 0.37, "logps_train/chosen": -43.96147918701172, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -67.97431945800781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.42471253871917725, "rewards_train/margins": 1.359047770500183, "rewards_train/rejected": -1.7837603092193604, "step": 1326 }, { "epoch": 0.37, "logps_train/chosen": -58.512733459472656, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -75.8866195678711, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.48896825313568115, "rewards_train/margins": 1.309849739074707, "rewards_train/rejected": -1.7988179922103882, "step": 1327 }, { "epoch": 0.37, "learning_rate": 2.7317584135222453e-07, "loss": 0.4733, "step": 1328 }, { "epoch": 0.37, "logps_train/chosen": -81.32911682128906, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -76.09212493896484, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8594746589660645, "rewards_train/margins": 1.02180814743042, "rewards_train/rejected": -1.8812828063964844, "step": 1328 }, { "epoch": 0.37, "logps_train/chosen": -130.31271362304688, "logps_train/ref_chosen": -105.5, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -140.99732971191406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.4781460762023926, "rewards_train/margins": 1.3262743949890137, "rewards_train/rejected": -3.8044204711914062, "step": 1329 }, { "epoch": 0.37, "learning_rate": 2.7255952505196523e-07, "loss": 0.4357, "step": 1330 }, { "epoch": 0.37, "logps_train/chosen": -87.80403137207031, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -105.68490600585938, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.3147778511047363, "rewards_train/margins": 0.8310563564300537, "rewards_train/rejected": -2.14583420753479, "step": 1330 }, { "epoch": 0.37, "logps_train/chosen": -36.144046783447266, "logps_train/ref_chosen": -27.25, "logps_train/ref_rejected": -38.75, "logps_train/rejected": -54.56989288330078, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.8839359283447266, "rewards_train/margins": 0.697174072265625, "rewards_train/rejected": -1.5811100006103516, "step": 1331 }, { "epoch": 0.37, "learning_rate": 2.719430704886244e-07, "loss": 0.5688, "step": 1332 }, { "epoch": 0.37, "logps_train/chosen": -57.416873931884766, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -81.05432891845703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8594609498977661, "rewards_train/margins": 1.9582279920578003, "rewards_train/rejected": -2.8176889419555664, "step": 1332 }, { "epoch": 0.37, "logps_train/chosen": -86.974365234375, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -114.34844207763672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9052491784095764, "rewards_train/margins": 1.794048011302948, "rewards_train/rejected": -2.6992971897125244, "step": 1333 }, { "epoch": 0.37, "learning_rate": 2.7132648144033614e-07, "loss": 0.3223, "step": 1334 }, { "epoch": 0.37, "logps_train/chosen": -46.227272033691406, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -67.61605072021484, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7809305787086487, "rewards_train/margins": 1.0939556956291199, "rewards_train/rejected": -1.8748862743377686, "step": 1334 }, { "epoch": 0.37, "logps_train/chosen": -49.509666442871094, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -66.00403594970703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.34100592136383057, "rewards_train/margins": 1.7270735502243042, "rewards_train/rejected": -2.0680794715881348, "step": 1335 }, { "epoch": 0.37, "learning_rate": 2.707097616860587e-07, "loss": 0.3386, "step": 1336 }, { "epoch": 0.37, "logps_train/chosen": -106.86275482177734, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -132.5000762939453, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -2.2087366580963135, "rewards_train/margins": 1.1859004497528076, "rewards_train/rejected": -3.394637107849121, "step": 1336 }, { "epoch": 0.37, "logps_train/chosen": -74.65409088134766, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -82.0049819946289, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5857215523719788, "rewards_train/margins": 0.8608705401420593, "rewards_train/rejected": -1.446592092514038, "step": 1337 }, { "epoch": 0.37, "learning_rate": 2.7009291500555113e-07, "loss": 0.5754, "step": 1338 }, { "epoch": 0.37, "logps_train/chosen": -69.99712371826172, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -76.87053680419922, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4236384630203247, "rewards_train/margins": 0.9444705247879028, "rewards_train/rejected": -2.3681089878082275, "step": 1338 }, { "epoch": 0.37, "logps_train/chosen": -60.1782112121582, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -82.09654998779297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4322744607925415, "rewards_train/margins": 1.5351929664611816, "rewards_train/rejected": -1.9674674272537231, "step": 1339 }, { "epoch": 0.37, "learning_rate": 2.694759451793508e-07, "loss": 0.3491, "step": 1340 }, { "epoch": 0.37, "logps_train/chosen": -67.44230651855469, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -82.63480377197266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5241507291793823, "rewards_train/margins": 0.9656966924667358, "rewards_train/rejected": -2.489847421646118, "step": 1340 }, { "epoch": 0.37, "logps_train/chosen": -43.637535095214844, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -63.39043426513672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7274252772331238, "rewards_train/margins": 0.7655245661735535, "rewards_train/rejected": -1.4929498434066772, "step": 1341 }, { "epoch": 0.38, "learning_rate": 2.6885885598874946e-07, "loss": 0.3929, "step": 1342 }, { "epoch": 0.38, "logps_train/chosen": -71.57133483886719, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -108.21395111083984, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7282274961471558, "rewards_train/margins": 2.0763708353042603, "rewards_train/rejected": -2.804598331451416, "step": 1342 }, { "epoch": 0.38, "logps_train/chosen": -57.34663391113281, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -85.48606872558594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6021436452865601, "rewards_train/margins": 2.0251747369766235, "rewards_train/rejected": -2.6273183822631836, "step": 1343 }, { "epoch": 0.38, "learning_rate": 2.682416512157707e-07, "loss": 0.3162, "step": 1344 }, { "epoch": 0.38, "logps_train/chosen": -79.02452087402344, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -98.95014190673828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1665629148483276, "rewards_train/margins": 1.4204438924789429, "rewards_train/rejected": -2.5870068073272705, "step": 1344 }, { "epoch": 0.38, "logps_train/chosen": -57.89361572265625, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -84.1726303100586, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.48418545722961426, "rewards_train/margins": 2.1544644832611084, "rewards_train/rejected": -2.6386499404907227, "step": 1345 }, { "epoch": 0.38, "learning_rate": 2.6762433464314625e-07, "loss": 0.3745, "step": 1346 }, { "epoch": 0.38, "logps_train/chosen": -60.114990234375, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -91.95317077636719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3827880322933197, "rewards_train/margins": 1.4492479860782623, "rewards_train/rejected": -1.832036018371582, "step": 1346 }, { "epoch": 0.38, "logps_train/chosen": -88.88194274902344, "logps_train/ref_chosen": -84.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -94.5876235961914, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4225689172744751, "rewards_train/margins": 1.2158806324005127, "rewards_train/rejected": -1.6384495496749878, "step": 1347 }, { "epoch": 0.38, "learning_rate": 2.6700691005429314e-07, "loss": 0.3202, "step": 1348 }, { "epoch": 0.38, "logps_train/chosen": -87.83692932128906, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -99.7630844116211, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.4221696853637695, "rewards_train/margins": 1.8639039993286133, "rewards_train/rejected": -4.286073684692383, "step": 1348 }, { "epoch": 0.38, "logps_train/chosen": -48.34173583984375, "logps_train/ref_chosen": -41.25, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -76.79225158691406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7172788977622986, "rewards_train/margins": 2.2607746720314026, "rewards_train/rejected": -2.978053569793701, "step": 1349 }, { "epoch": 0.38, "learning_rate": 2.663893812332905e-07, "loss": 0.3444, "step": 1350 }, { "epoch": 0.38, "logps_train/chosen": -60.93762969970703, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -38.5, "logps_train/rejected": -58.11137390136719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0298957824707031, "rewards_train/margins": 0.9316320419311523, "rewards_train/rejected": -1.9615278244018555, "step": 1350 }, { "epoch": 0.38, "logps_train/chosen": -110.66854858398438, "logps_train/ref_chosen": -92.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -111.44385528564453, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8504478931427002, "rewards_train/margins": 0.7445235252380371, "rewards_train/rejected": -2.5949714183807373, "step": 1351 }, { "epoch": 0.38, "learning_rate": 2.6577175196485616e-07, "loss": 0.5223, "step": 1352 }, { "epoch": 0.38, "logps_train/chosen": -51.827369689941406, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -70.49395751953125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7101783752441406, "rewards_train/margins": 1.4089927673339844, "rewards_train/rejected": -2.119171142578125, "step": 1352 }, { "epoch": 0.38, "logps_train/chosen": -67.51397705078125, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -92.01215362548828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9197568893432617, "rewards_train/margins": 1.8904430866241455, "rewards_train/rejected": -2.8101999759674072, "step": 1353 }, { "epoch": 0.38, "learning_rate": 2.651540260343237e-07, "loss": 0.3153, "step": 1354 }, { "epoch": 0.38, "logps_train/chosen": -82.34182739257812, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -91.17100524902344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.287113070487976, "rewards_train/margins": 1.5663150548934937, "rewards_train/rejected": -2.8534281253814697, "step": 1354 }, { "epoch": 0.38, "logps_train/chosen": -86.42059326171875, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -113.45274353027344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.267059326171875, "rewards_train/margins": 2.017570972442627, "rewards_train/rejected": -3.284630298614502, "step": 1355 }, { "epoch": 0.38, "learning_rate": 2.6453620722761895e-07, "loss": 0.3597, "step": 1356 }, { "epoch": 0.38, "logps_train/chosen": -59.536041259765625, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -79.4385986328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6543853282928467, "rewards_train/margins": 1.4230682849884033, "rewards_train/rejected": -2.07745361328125, "step": 1356 }, { "epoch": 0.38, "logps_train/chosen": -60.514652252197266, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -68.70114135742188, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1543948650360107, "rewards_train/margins": 0.48681366443634033, "rewards_train/rejected": -1.641208529472351, "step": 1357 }, { "epoch": 0.38, "learning_rate": 2.639182993312371e-07, "loss": 0.4524, "step": 1358 }, { "epoch": 0.38, "logps_train/chosen": -77.8732681274414, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -103.27001190185547, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7883033752441406, "rewards_train/margins": 1.3624773025512695, "rewards_train/rejected": -2.15078067779541, "step": 1358 }, { "epoch": 0.38, "logps_train/chosen": -80.97784423828125, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -83.80522155761719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9926083087921143, "rewards_train/margins": 0.5324447154998779, "rewards_train/rejected": -2.525053024291992, "step": 1359 }, { "epoch": 0.38, "learning_rate": 2.6330030613221923e-07, "loss": 0.4443, "step": 1360 }, { "epoch": 0.38, "logps_train/chosen": -46.133155822753906, "logps_train/ref_chosen": -40.25, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -63.55250930786133, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5797218680381775, "rewards_train/margins": 1.1144937872886658, "rewards_train/rejected": -1.6942156553268433, "step": 1360 }, { "epoch": 0.38, "logps_train/chosen": -121.89741516113281, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -140.67169189453125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.856929302215576, "rewards_train/margins": 0.9571146965026855, "rewards_train/rejected": -3.8140439987182617, "step": 1361 }, { "epoch": 0.38, "learning_rate": 2.626822314181293e-07, "loss": 0.5153, "step": 1362 }, { "epoch": 0.38, "logps_train/chosen": -68.66605377197266, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -92.5390396118164, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3077186346054077, "rewards_train/margins": 1.0979429483413696, "rewards_train/rejected": -2.4056615829467773, "step": 1362 }, { "epoch": 0.38, "logps_train/chosen": -94.73461151123047, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -149.11090087890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.228539228439331, "rewards_train/margins": 2.855987787246704, "rewards_train/rejected": -5.084527015686035, "step": 1363 }, { "epoch": 0.38, "learning_rate": 2.6206407897703093e-07, "loss": 0.3182, "step": 1364 }, { "epoch": 0.38, "logps_train/chosen": -82.84297943115234, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -119.45513153076172, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9452354907989502, "rewards_train/margins": 2.7491061687469482, "rewards_train/rejected": -3.6943416595458984, "step": 1364 }, { "epoch": 0.38, "logps_train/chosen": -87.6287841796875, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -76.39540100097656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.0286750793457031, "rewards_train/margins": 1.2200205326080322, "rewards_train/rejected": -2.2486956119537354, "step": 1365 }, { "epoch": 0.38, "learning_rate": 2.6144585259746394e-07, "loss": 0.3367, "step": 1366 }, { "epoch": 0.38, "logps_train/chosen": -76.80714416503906, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -121.25357055664062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0427266359329224, "rewards_train/margins": 2.338098883628845, "rewards_train/rejected": -3.3808255195617676, "step": 1366 }, { "epoch": 0.38, "logps_train/chosen": -77.67304992675781, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -113.48094177246094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0262898206710815, "rewards_train/margins": 1.9893819093704224, "rewards_train/rejected": -3.015671730041504, "step": 1367 }, { "epoch": 0.38, "learning_rate": 2.6082755606842154e-07, "loss": 0.3611, "step": 1368 }, { "epoch": 0.38, "logps_train/chosen": -77.24444580078125, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -89.52877807617188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.33577299118042, "rewards_train/margins": 1.7199368476867676, "rewards_train/rejected": -3.0557098388671875, "step": 1368 }, { "epoch": 0.38, "logps_train/chosen": -60.85888671875, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -78.12150573730469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.511572241783142, "rewards_train/margins": 1.453703761100769, "rewards_train/rejected": -2.965276002883911, "step": 1369 }, { "epoch": 0.38, "learning_rate": 2.602091931793267e-07, "loss": 0.3925, "step": 1370 }, { "epoch": 0.38, "logps_train/chosen": -49.612762451171875, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -48.86111068725586, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5028778314590454, "rewards_train/margins": 0.6347957849502563, "rewards_train/rejected": -1.1376736164093018, "step": 1370 }, { "epoch": 0.38, "logps_train/chosen": -97.61985778808594, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -96.43189239501953, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.229172945022583, "rewards_train/margins": 1.477590799331665, "rewards_train/rejected": -3.706763744354248, "step": 1371 }, { "epoch": 0.38, "learning_rate": 2.595907677200091e-07, "loss": 0.3864, "step": 1372 }, { "epoch": 0.38, "logps_train/chosen": -94.76100158691406, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -114.26131439208984, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8886008262634277, "rewards_train/margins": 1.695343255996704, "rewards_train/rejected": -3.583944082260132, "step": 1372 }, { "epoch": 0.38, "logps_train/chosen": -75.87973022460938, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -90.60918426513672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3162927627563477, "rewards_train/margins": 1.9688441753387451, "rewards_train/rejected": -3.2851369380950928, "step": 1373 }, { "epoch": 0.38, "learning_rate": 2.5897228348068195e-07, "loss": 0.2367, "step": 1374 }, { "epoch": 0.38, "logps_train/chosen": -73.16169738769531, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -104.44308471679688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.189802646636963, "rewards_train/margins": 2.0533342361450195, "rewards_train/rejected": -3.2431368827819824, "step": 1374 }, { "epoch": 0.38, "logps_train/chosen": -54.54771423339844, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -71.60335540771484, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4755721688270569, "rewards_train/margins": 1.3104471564292908, "rewards_train/rejected": -1.7860193252563477, "step": 1375 }, { "epoch": 0.38, "learning_rate": 2.583537442519186e-07, "loss": 0.3255, "step": 1376 }, { "epoch": 0.38, "logps_train/chosen": -128.61160278320312, "logps_train/ref_chosen": -98.0, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -120.68549346923828, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -3.0693626403808594, "rewards_train/margins": -0.3984694480895996, "rewards_train/rejected": -2.6708931922912598, "step": 1376 }, { "epoch": 0.38, "logps_train/chosen": -58.03902816772461, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -67.445556640625, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.8437464237213135, "rewards_train/margins": 0.6888947486877441, "rewards_train/rejected": -1.5326411724090576, "step": 1377 }, { "epoch": 0.39, "learning_rate": 2.577351538246298e-07, "loss": 0.8633, "step": 1378 }, { "epoch": 0.39, "logps_train/chosen": -56.045284271240234, "logps_train/ref_chosen": -41.25, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -87.34400177001953, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4777708053588867, "rewards_train/margins": 1.1011600494384766, "rewards_train/rejected": -2.5789308547973633, "step": 1378 }, { "epoch": 0.39, "logps_train/chosen": -89.68913269042969, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -122.016357421875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.0821945667266846, "rewards_train/margins": 1.4905357360839844, "rewards_train/rejected": -3.572730302810669, "step": 1379 }, { "epoch": 0.39, "learning_rate": 2.5711651599003945e-07, "loss": 0.3479, "step": 1380 }, { "epoch": 0.39, "logps_train/chosen": -63.23186492919922, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -75.99614715576172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.078655481338501, "rewards_train/margins": 0.7104123830795288, "rewards_train/rejected": -1.7890678644180298, "step": 1380 }, { "epoch": 0.39, "logps_train/chosen": -65.56197357177734, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -63.32878494262695, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.5444786548614502, "rewards_train/margins": 0.27199387550354004, "rewards_train/rejected": -1.8164725303649902, "step": 1381 }, { "epoch": 0.39, "learning_rate": 2.564978345396627e-07, "loss": 0.568, "step": 1382 }, { "epoch": 0.39, "logps_train/chosen": -43.072750091552734, "logps_train/ref_chosen": -35.25, "logps_train/ref_rejected": -46.25, "logps_train/rejected": -67.01962280273438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7892085313796997, "rewards_train/margins": 1.2805274724960327, "rewards_train/rejected": -2.0697360038757324, "step": 1382 }, { "epoch": 0.39, "logps_train/chosen": -108.58927917480469, "logps_train/ref_chosen": -87.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -108.79452514648438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.1761159896850586, "rewards_train/margins": 1.159390926361084, "rewards_train/rejected": -3.3355069160461426, "step": 1383 }, { "epoch": 0.39, "learning_rate": 2.5587911326528145e-07, "loss": 0.4475, "step": 1384 }, { "epoch": 0.39, "logps_train/chosen": -96.11656188964844, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -133.13128662109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9585314989089966, "rewards_train/margins": 3.8233460187911987, "rewards_train/rejected": -4.781877517700195, "step": 1384 }, { "epoch": 0.39, "logps_train/chosen": -138.98739624023438, "logps_train/ref_chosen": -115.0, "logps_train/ref_rejected": -190.0, "logps_train/rejected": -235.3164520263672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.378427505493164, "rewards_train/margins": 2.180171012878418, "rewards_train/rejected": -4.558598518371582, "step": 1385 }, { "epoch": 0.39, "learning_rate": 2.552603559589219e-07, "loss": 0.2649, "step": 1386 }, { "epoch": 0.39, "logps_train/chosen": -77.43486022949219, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -106.66267395019531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9927046298980713, "rewards_train/margins": 1.9477818012237549, "rewards_train/rejected": -2.940486431121826, "step": 1386 }, { "epoch": 0.39, "logps_train/chosen": -103.97190856933594, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -118.98243713378906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7370350360870361, "rewards_train/margins": 1.3971459865570068, "rewards_train/rejected": -3.134181022644043, "step": 1387 }, { "epoch": 0.39, "learning_rate": 2.5464156641283123e-07, "loss": 0.2919, "step": 1388 }, { "epoch": 0.39, "logps_train/chosen": -90.06669616699219, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -113.79029083251953, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.7799122333526611, "rewards_train/margins": 1.795210599899292, "rewards_train/rejected": -3.575122833251953, "step": 1388 }, { "epoch": 0.39, "logps_train/chosen": -79.31021118164062, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -111.15463256835938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.244302749633789, "rewards_train/margins": 1.87565279006958, "rewards_train/rejected": -3.119955539703369, "step": 1389 }, { "epoch": 0.39, "learning_rate": 2.5402274841945385e-07, "loss": 0.3535, "step": 1390 }, { "epoch": 0.39, "logps_train/chosen": -45.272926330566406, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -57.52449035644531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4745580554008484, "rewards_train/margins": 0.5942969918251038, "rewards_train/rejected": -1.0688550472259521, "step": 1390 }, { "epoch": 0.39, "logps_train/chosen": -49.201202392578125, "logps_train/ref_chosen": -41.75, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -64.12103271484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7560582160949707, "rewards_train/margins": 0.9017479419708252, "rewards_train/rejected": -1.657806158065796, "step": 1391 }, { "epoch": 0.39, "learning_rate": 2.534039057714089e-07, "loss": 0.4316, "step": 1392 }, { "epoch": 0.39, "logps_train/chosen": -110.06392669677734, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -144.35324096679688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.13139271736145, "rewards_train/margins": 1.9023687839508057, "rewards_train/rejected": -4.033761501312256, "step": 1392 }, { "epoch": 0.39, "logps_train/chosen": -132.66116333007812, "logps_train/ref_chosen": -100.5, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -147.11500549316406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -3.239943742752075, "rewards_train/margins": 1.8199937343597412, "rewards_train/rejected": -5.059937477111816, "step": 1393 }, { "epoch": 0.39, "learning_rate": 2.5278504226146636e-07, "loss": 0.2451, "step": 1394 }, { "epoch": 0.39, "logps_train/chosen": -56.96629333496094, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -79.7743148803711, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.16718590259552, "rewards_train/margins": 1.4344638586044312, "rewards_train/rejected": -2.601649761199951, "step": 1394 }, { "epoch": 0.39, "logps_train/chosen": -67.73016357421875, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -69.60502624511719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3655941486358643, "rewards_train/margins": 1.069908857345581, "rewards_train/rejected": -2.4355030059814453, "step": 1395 }, { "epoch": 0.39, "learning_rate": 2.5216616168252423e-07, "loss": 0.4427, "step": 1396 }, { "epoch": 0.39, "logps_train/chosen": -62.675872802734375, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -84.93183898925781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7977144718170166, "rewards_train/margins": 1.8155863285064697, "rewards_train/rejected": -2.6133008003234863, "step": 1396 }, { "epoch": 0.39, "logps_train/chosen": -68.78553009033203, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -85.96036529541016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5394909381866455, "rewards_train/margins": 1.480764627456665, "rewards_train/rejected": -3.0202555656433105, "step": 1397 }, { "epoch": 0.39, "learning_rate": 2.51547267827585e-07, "loss": 0.3628, "step": 1398 }, { "epoch": 0.39, "logps_train/chosen": -90.35094451904297, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -108.65039825439453, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9983751773834229, "rewards_train/margins": 1.5662739276885986, "rewards_train/rejected": -3.5646491050720215, "step": 1398 }, { "epoch": 0.39, "logps_train/chosen": -83.69400024414062, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -91.17359161376953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0006496906280518, "rewards_train/margins": 0.5372178554534912, "rewards_train/rejected": -2.537867546081543, "step": 1399 }, { "epoch": 0.39, "learning_rate": 2.509283644897325e-07, "loss": 0.4923, "step": 1400 }, { "epoch": 0.39, "logps_train/chosen": -110.51560974121094, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -152.65670776367188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.276365280151367, "rewards_train/margins": 2.3584470748901367, "rewards_train/rejected": -4.634812355041504, "step": 1400 }, { "epoch": 0.39, "logps_train/chosen": -53.87019729614258, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -60.21210479736328, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.48213687539100647, "rewards_train/margins": 0.6132923066616058, "rewards_train/rejected": -1.0954291820526123, "step": 1401 }, { "epoch": 0.39, "learning_rate": 2.5030945546210894e-07, "loss": 0.361, "step": 1402 }, { "epoch": 0.39, "logps_train/chosen": -69.6895980834961, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -127.36253356933594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9013574719429016, "rewards_train/margins": 3.776277720928192, "rewards_train/rejected": -4.677635192871094, "step": 1402 }, { "epoch": 0.39, "logps_train/chosen": -113.97630310058594, "logps_train/ref_chosen": -96.0, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -152.47695922851562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8050529956817627, "rewards_train/margins": 2.5016276836395264, "rewards_train/rejected": -4.306680679321289, "step": 1403 }, { "epoch": 0.39, "learning_rate": 2.4969054453789114e-07, "loss": 0.2161, "step": 1404 }, { "epoch": 0.39, "logps_train/chosen": -43.923423767089844, "logps_train/ref_chosen": -34.5, "logps_train/ref_rejected": -32.5, "logps_train/rejected": -51.70403289794922, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9523031115531921, "rewards_train/margins": 0.9770846962928772, "rewards_train/rejected": -1.9293878078460693, "step": 1404 }, { "epoch": 0.39, "logps_train/chosen": -73.5328140258789, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -83.11129760742188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8794531226158142, "rewards_train/margins": 0.9228882193565369, "rewards_train/rejected": -1.802341341972351, "step": 1405 }, { "epoch": 0.39, "learning_rate": 2.490716355102675e-07, "loss": 0.4436, "step": 1406 }, { "epoch": 0.39, "logps_train/chosen": -88.65668487548828, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -98.30132293701172, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8327586650848389, "rewards_train/margins": 1.605576753616333, "rewards_train/rejected": -3.438335418701172, "step": 1406 }, { "epoch": 0.39, "logps_train/chosen": -84.43414306640625, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -117.94034576416016, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7590389251708984, "rewards_train/margins": 1.3990578651428223, "rewards_train/rejected": -3.1580967903137207, "step": 1407 }, { "epoch": 0.39, "learning_rate": 2.4845273217241503e-07, "loss": 0.4724, "step": 1408 }, { "epoch": 0.39, "logps_train/chosen": -93.84420776367188, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -112.18768310546875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9602023363113403, "rewards_train/margins": 0.7858120203018188, "rewards_train/rejected": -2.746014356613159, "step": 1408 }, { "epoch": 0.39, "logps_train/chosen": -34.074806213378906, "logps_train/ref_chosen": -28.625, "logps_train/ref_rejected": -34.0, "logps_train/rejected": -46.127540588378906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5441993474960327, "rewards_train/margins": 0.6728512048721313, "rewards_train/rejected": -1.217050552368164, "step": 1409 }, { "epoch": 0.39, "learning_rate": 2.478338383174758e-07, "loss": 0.4502, "step": 1410 }, { "epoch": 0.39, "logps_train/chosen": -69.9249038696289, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -101.27232360839844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9871188998222351, "rewards_train/margins": 2.603394567966461, "rewards_train/rejected": -3.5905134677886963, "step": 1410 }, { "epoch": 0.39, "logps_train/chosen": -64.23922729492188, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -70.24002838134766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7852510213851929, "rewards_train/margins": 1.0569158792495728, "rewards_train/rejected": -1.8421669006347656, "step": 1411 }, { "epoch": 0.39, "learning_rate": 2.472149577385336e-07, "loss": 0.3653, "step": 1412 }, { "epoch": 0.39, "logps_train/chosen": -84.017333984375, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -91.52421569824219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.217358350753784, "rewards_train/margins": 0.26475071907043457, "rewards_train/rejected": -2.4821090698242188, "step": 1412 }, { "epoch": 0.39, "logps_train/chosen": -57.075660705566406, "logps_train/ref_chosen": -36.75, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -67.00348663330078, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.0278780460357666, "rewards_train/margins": 0.3607523441314697, "rewards_train/rejected": -2.3886303901672363, "step": 1413 }, { "epoch": 0.4, "learning_rate": 2.4659609422859113e-07, "loss": 0.6514, "step": 1414 }, { "epoch": 0.4, "logps_train/chosen": -60.59565734863281, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -75.19113159179688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0376907587051392, "rewards_train/margins": 1.0380626916885376, "rewards_train/rejected": -2.0757534503936768, "step": 1414 }, { "epoch": 0.4, "logps_train/chosen": -93.30435180664062, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -110.95439147949219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9898099899291992, "rewards_train/margins": 1.185317039489746, "rewards_train/rejected": -3.1751270294189453, "step": 1415 }, { "epoch": 0.4, "learning_rate": 2.459772515805462e-07, "loss": 0.4499, "step": 1416 }, { "epoch": 0.4, "logps_train/chosen": -90.56442260742188, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -103.7119140625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6349579095840454, "rewards_train/margins": 0.743850588798523, "rewards_train/rejected": -2.3788084983825684, "step": 1416 }, { "epoch": 0.4, "logps_train/chosen": -69.59407043457031, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -98.663818359375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1266918182373047, "rewards_train/margins": 1.8064875602722168, "rewards_train/rejected": -2.9331793785095215, "step": 1417 }, { "epoch": 0.4, "learning_rate": 2.453584335871688e-07, "loss": 0.3702, "step": 1418 }, { "epoch": 0.4, "logps_train/chosen": -107.56165313720703, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -127.704345703125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.343665361404419, "rewards_train/margins": 1.3588006496429443, "rewards_train/rejected": -2.7024660110473633, "step": 1418 }, { "epoch": 0.4, "logps_train/chosen": -90.00579833984375, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -127.27154541015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1014583110809326, "rewards_train/margins": 2.1901495456695557, "rewards_train/rejected": -3.2916078567504883, "step": 1419 }, { "epoch": 0.4, "learning_rate": 2.447396440410781e-07, "loss": 0.3124, "step": 1420 }, { "epoch": 0.4, "logps_train/chosen": -80.2513427734375, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -99.35195922851562, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.0134150981903076, "rewards_train/margins": 2.0362343788146973, "rewards_train/rejected": -3.049649477005005, "step": 1420 }, { "epoch": 0.4, "logps_train/chosen": -56.434326171875, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -56.75, "logps_train/rejected": -73.35064697265625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6969487071037292, "rewards_train/margins": 0.965459406375885, "rewards_train/rejected": -1.6624081134796143, "step": 1421 }, { "epoch": 0.4, "learning_rate": 2.441208867347186e-07, "loss": 0.4488, "step": 1422 }, { "epoch": 0.4, "logps_train/chosen": -69.16523742675781, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -73.96730041503906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9069539308547974, "rewards_train/margins": 1.4776662588119507, "rewards_train/rejected": -2.384620189666748, "step": 1422 }, { "epoch": 0.4, "logps_train/chosen": -19.333995819091797, "logps_train/ref_chosen": -13.25, "logps_train/ref_rejected": -19.875, "logps_train/rejected": -30.237205505371094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6063489317893982, "rewards_train/margins": 0.4295298457145691, "rewards_train/rejected": -1.0358787775039673, "step": 1423 }, { "epoch": 0.4, "learning_rate": 2.4350216546033736e-07, "loss": 0.4241, "step": 1424 }, { "epoch": 0.4, "logps_train/chosen": -85.90435028076172, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -110.66046905517578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.60361909866333, "rewards_train/margins": 1.4823493957519531, "rewards_train/rejected": -3.085968494415283, "step": 1424 }, { "epoch": 0.4, "logps_train/chosen": -49.999610900878906, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -64.6334228515625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9207618236541748, "rewards_train/margins": 1.0402367115020752, "rewards_train/rejected": -1.96099853515625, "step": 1425 }, { "epoch": 0.4, "learning_rate": 2.428834840099605e-07, "loss": 0.4679, "step": 1426 }, { "epoch": 0.4, "logps_train/chosen": -84.17804718017578, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -94.75330352783203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9131168127059937, "rewards_train/margins": 2.0071359872817993, "rewards_train/rejected": -2.920252799987793, "step": 1426 }, { "epoch": 0.4, "logps_train/chosen": -54.0204963684082, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -41.25, "logps_train/rejected": -58.89433288574219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9514635801315308, "rewards_train/margins": 0.8146296739578247, "rewards_train/rejected": -1.7660932540893555, "step": 1427 }, { "epoch": 0.4, "learning_rate": 2.422648461753703e-07, "loss": 0.3872, "step": 1428 }, { "epoch": 0.4, "logps_train/chosen": -62.32866668701172, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -74.67293548583984, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0082577466964722, "rewards_train/margins": 1.2414578199386597, "rewards_train/rejected": -2.249715566635132, "step": 1428 }, { "epoch": 0.4, "logps_train/chosen": -70.18084716796875, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -98.62975311279297, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4383976459503174, "rewards_train/margins": 1.7653977870941162, "rewards_train/rejected": -3.2037954330444336, "step": 1429 }, { "epoch": 0.4, "learning_rate": 2.416462557480814e-07, "loss": 0.3173, "step": 1430 }, { "epoch": 0.4, "logps_train/chosen": -85.343017578125, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -91.13835144042969, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1393800973892212, "rewards_train/margins": 0.9334388971328735, "rewards_train/rejected": -2.0728189945220947, "step": 1430 }, { "epoch": 0.4, "logps_train/chosen": -72.94872283935547, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -97.23341369628906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2464350461959839, "rewards_train/margins": 1.0312029123306274, "rewards_train/rejected": -2.2776379585266113, "step": 1431 }, { "epoch": 0.4, "learning_rate": 2.4102771651931813e-07, "loss": 0.4667, "step": 1432 }, { "epoch": 0.4, "logps_train/chosen": -107.73786163330078, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -141.23602294921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.001227855682373, "rewards_train/margins": 2.3296990394592285, "rewards_train/rejected": -4.330926895141602, "step": 1432 }, { "epoch": 0.4, "logps_train/chosen": -54.949005126953125, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -85.45057678222656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4242461919784546, "rewards_train/margins": 1.7579203844070435, "rewards_train/rejected": -3.182166576385498, "step": 1433 }, { "epoch": 0.4, "learning_rate": 2.40409232279991e-07, "loss": 0.3107, "step": 1434 }, { "epoch": 0.4, "logps_train/chosen": -102.56112670898438, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -128.83555603027344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.332674503326416, "rewards_train/margins": 1.5313501358032227, "rewards_train/rejected": -2.8640246391296387, "step": 1434 }, { "epoch": 0.4, "logps_train/chosen": -60.32343292236328, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -85.97602081298828, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.524237871170044, "rewards_train/margins": 1.1214110851287842, "rewards_train/rejected": -2.645648956298828, "step": 1435 }, { "epoch": 0.4, "learning_rate": 2.397908068206733e-07, "loss": 0.3252, "step": 1436 }, { "epoch": 0.4, "logps_train/chosen": -88.18455505371094, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -92.77865600585938, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.1090805530548096, "rewards_train/margins": 0.35159754753112793, "rewards_train/rejected": -2.4606781005859375, "step": 1436 }, { "epoch": 0.4, "logps_train/chosen": -68.40955352783203, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -114.69082641601562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6092169284820557, "rewards_train/margins": 2.2852563858032227, "rewards_train/rejected": -3.8944733142852783, "step": 1437 }, { "epoch": 0.4, "learning_rate": 2.391724439315785e-07, "loss": 0.4977, "step": 1438 }, { "epoch": 0.4, "logps_train/chosen": -67.13638305664062, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -102.11262512207031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.28824782371521, "rewards_train/margins": 1.2427418231964111, "rewards_train/rejected": -2.530989646911621, "step": 1438 }, { "epoch": 0.4, "logps_train/chosen": -61.952911376953125, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -35.5, "logps_train/rejected": -52.560672760009766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.185037612915039, "rewards_train/margins": 0.51741623878479, "rewards_train/rejected": -1.702453851699829, "step": 1439 }, { "epoch": 0.4, "learning_rate": 2.385541474025361e-07, "loss": 0.5124, "step": 1440 }, { "epoch": 0.4, "logps_train/chosen": -58.70077896118164, "logps_train/ref_chosen": -46.5, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -67.01563262939453, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.221445083618164, "rewards_train/margins": 0.7805085182189941, "rewards_train/rejected": -2.001953601837158, "step": 1440 }, { "epoch": 0.4, "logps_train/chosen": -56.96097946166992, "logps_train/ref_chosen": -41.75, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -85.03782653808594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.528519868850708, "rewards_train/margins": 1.402606725692749, "rewards_train/rejected": -2.931126594543457, "step": 1441 }, { "epoch": 0.4, "learning_rate": 2.3793592102296915e-07, "loss": 0.4275, "step": 1442 }, { "epoch": 0.4, "logps_train/chosen": -36.02882385253906, "logps_train/ref_chosen": -30.75, "logps_train/ref_rejected": -37.75, "logps_train/rejected": -49.92866897583008, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.526124894618988, "rewards_train/margins": 0.6936948895454407, "rewards_train/rejected": -1.2198197841644287, "step": 1442 }, { "epoch": 0.4, "logps_train/chosen": -54.827972412109375, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -41.5, "logps_train/rejected": -64.4866943359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8992036581039429, "rewards_train/margins": 1.4025903940200806, "rewards_train/rejected": -2.3017940521240234, "step": 1443 }, { "epoch": 0.4, "learning_rate": 2.3731776858187078e-07, "loss": 0.3823, "step": 1444 }, { "epoch": 0.4, "logps_train/chosen": -133.40647888183594, "logps_train/ref_chosen": -108.5, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -153.29942321777344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.5126941204071045, "rewards_train/margins": 1.604919195175171, "rewards_train/rejected": -4.117613315582275, "step": 1444 }, { "epoch": 0.4, "logps_train/chosen": -59.57331466674805, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -72.4448471069336, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9545971751213074, "rewards_train/margins": 0.8010204434394836, "rewards_train/rejected": -1.755617618560791, "step": 1445 }, { "epoch": 0.4, "learning_rate": 2.3669969386778085e-07, "loss": 0.4354, "step": 1446 }, { "epoch": 0.4, "logps_train/chosen": -89.79341888427734, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -119.29327392578125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.681685447692871, "rewards_train/margins": 1.515317440032959, "rewards_train/rejected": -3.19700288772583, "step": 1446 }, { "epoch": 0.4, "logps_train/chosen": -28.20357894897461, "logps_train/ref_chosen": -19.5, "logps_train/ref_rejected": -21.25, "logps_train/rejected": -37.50383758544922, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8654752969741821, "rewards_train/margins": 0.7658655643463135, "rewards_train/rejected": -1.6313408613204956, "step": 1447 }, { "epoch": 0.4, "learning_rate": 2.3608170066876298e-07, "loss": 0.4243, "step": 1448 }, { "epoch": 0.4, "logps_train/chosen": -54.646671295166016, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -58.266937255859375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8347843289375305, "rewards_train/margins": 1.413784682750702, "rewards_train/rejected": -2.2485690116882324, "step": 1448 }, { "epoch": 0.4, "logps_train/chosen": -87.08670043945312, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -111.1590576171875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8739047050476074, "rewards_train/margins": 1.529500961303711, "rewards_train/rejected": -3.4034056663513184, "step": 1449 }, { "epoch": 0.41, "learning_rate": 2.3546379277238103e-07, "loss": 0.376, "step": 1450 }, { "epoch": 0.41, "logps_train/chosen": -53.08202362060547, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -78.91864776611328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7962394952774048, "rewards_train/margins": 0.821308970451355, "rewards_train/rejected": -1.6175484657287598, "step": 1450 }, { "epoch": 0.41, "logps_train/chosen": -61.199378967285156, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -92.09947204589844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6254065036773682, "rewards_train/margins": 1.6849310398101807, "rewards_train/rejected": -2.310337543487549, "step": 1451 }, { "epoch": 0.41, "learning_rate": 2.348459739656763e-07, "loss": 0.4027, "step": 1452 }, { "epoch": 0.41, "logps_train/chosen": -49.50818634033203, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -36.5, "logps_train/rejected": -53.81367492675781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.24110230803489685, "rewards_train/margins": 1.484796792268753, "rewards_train/rejected": -1.72589910030365, "step": 1452 }, { "epoch": 0.41, "logps_train/chosen": -32.541934967041016, "logps_train/ref_chosen": -31.75, "logps_train/ref_rejected": -25.0, "logps_train/rejected": -34.23435974121094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.08534583449363708, "rewards_train/margins": 0.8364298045635223, "rewards_train/rejected": -0.9217756390571594, "step": 1453 }, { "epoch": 0.41, "learning_rate": 2.3422824803514382e-07, "loss": 0.3802, "step": 1454 }, { "epoch": 0.41, "logps_train/chosen": -55.98424530029297, "logps_train/ref_chosen": -41.75, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -72.5770263671875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.4207881689071655, "rewards_train/margins": 0.8268557786941528, "rewards_train/rejected": -2.2476439476013184, "step": 1454 }, { "epoch": 0.41, "logps_train/chosen": -131.9940948486328, "logps_train/ref_chosen": -95.0, "logps_train/ref_rejected": -118.5, "logps_train/rejected": -163.99038696289062, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -3.7119102478027344, "rewards_train/margins": 0.823847770690918, "rewards_train/rejected": -4.535758018493652, "step": 1455 }, { "epoch": 0.41, "learning_rate": 2.3361061876670945e-07, "loss": 0.6821, "step": 1456 }, { "epoch": 0.41, "logps_train/chosen": -90.28365325927734, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -77.54593658447266, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9314900636672974, "rewards_train/margins": 0.515583872795105, "rewards_train/rejected": -2.4470739364624023, "step": 1456 }, { "epoch": 0.41, "logps_train/chosen": -52.99207305908203, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -67.475341796875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7374885082244873, "rewards_train/margins": 1.3489134311676025, "rewards_train/rejected": -2.08640193939209, "step": 1457 }, { "epoch": 0.41, "learning_rate": 2.3299308994570687e-07, "loss": 0.4125, "step": 1458 }, { "epoch": 0.41, "logps_train/chosen": -50.42257308959961, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -43.75, "logps_train/rejected": -58.40668869018555, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.3668668866157532, "rewards_train/margins": 1.0919663310050964, "rewards_train/rejected": -1.4588332176208496, "step": 1458 }, { "epoch": 0.41, "logps_train/chosen": -54.10042190551758, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -77.4073257446289, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.4252277910709381, "rewards_train/margins": 1.105690211057663, "rewards_train/rejected": -1.530918002128601, "step": 1459 }, { "epoch": 0.41, "learning_rate": 2.3237566535685375e-07, "loss": 0.4745, "step": 1460 }, { "epoch": 0.41, "logps_train/chosen": -98.09123229980469, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -108.29418182373047, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8040449619293213, "rewards_train/margins": 1.8378732204437256, "rewards_train/rejected": -3.641918182373047, "step": 1460 }, { "epoch": 0.41, "logps_train/chosen": -79.92696380615234, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -93.69712829589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1676963567733765, "rewards_train/margins": 1.066860318183899, "rewards_train/rejected": -2.2345566749572754, "step": 1461 }, { "epoch": 0.41, "learning_rate": 2.3175834878422931e-07, "loss": 0.357, "step": 1462 }, { "epoch": 0.41, "logps_train/chosen": -60.13316345214844, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -44.75, "logps_train/rejected": -69.96977996826172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1663434505462646, "rewards_train/margins": 1.3459665775299072, "rewards_train/rejected": -2.512310028076172, "step": 1462 }, { "epoch": 0.41, "logps_train/chosen": -63.537193298339844, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -77.98126220703125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7668053507804871, "rewards_train/margins": 0.44968026876449585, "rewards_train/rejected": -1.216485619544983, "step": 1463 }, { "epoch": 0.41, "learning_rate": 2.3114114401125054e-07, "loss": 0.5119, "step": 1464 }, { "epoch": 0.41, "logps_train/chosen": -51.874088287353516, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -35.0, "logps_train/rejected": -54.085289001464844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.2372134923934937, "rewards_train/margins": 0.6703388690948486, "rewards_train/rejected": -1.9075523614883423, "step": 1464 }, { "epoch": 0.41, "logps_train/chosen": -69.12730407714844, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -73.90693664550781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1927599906921387, "rewards_train/margins": 1.1556973457336426, "rewards_train/rejected": -2.3484573364257812, "step": 1465 }, { "epoch": 0.41, "learning_rate": 2.3052405482064919e-07, "loss": 0.5114, "step": 1466 }, { "epoch": 0.41, "logps_train/chosen": -44.410728454589844, "logps_train/ref_chosen": -41.75, "logps_train/ref_rejected": -38.25, "logps_train/rejected": -51.31831359863281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.25765013694763184, "rewards_train/margins": 1.0583362579345703, "rewards_train/rejected": -1.3159863948822021, "step": 1466 }, { "epoch": 0.41, "logps_train/chosen": -95.55567932128906, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -132.2721405029297, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.4610371589660645, "rewards_train/margins": 1.3130521774291992, "rewards_train/rejected": -2.7740893363952637, "step": 1467 }, { "epoch": 0.41, "learning_rate": 2.2990708499444885e-07, "loss": 0.5435, "step": 1468 }, { "epoch": 0.41, "logps_train/chosen": -50.43743133544922, "logps_train/ref_chosen": -41.25, "logps_train/ref_rejected": -29.75, "logps_train/rejected": -45.369049072265625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.919084906578064, "rewards_train/margins": 0.6369608640670776, "rewards_train/rejected": -1.5560457706451416, "step": 1468 }, { "epoch": 0.41, "logps_train/chosen": -61.037750244140625, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -62.6798095703125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.742446780204773, "rewards_train/margins": 0.9290498495101929, "rewards_train/rejected": -1.6714966297149658, "step": 1469 }, { "epoch": 0.41, "learning_rate": 2.2929023831394133e-07, "loss": 0.4351, "step": 1470 }, { "epoch": 0.41, "logps_train/chosen": -148.80490112304688, "logps_train/ref_chosen": -123.5, "logps_train/ref_rejected": -116.0, "logps_train/rejected": -158.92843627929688, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.543478012084961, "rewards_train/margins": 1.7431154251098633, "rewards_train/rejected": -4.286593437194824, "step": 1470 }, { "epoch": 0.41, "logps_train/chosen": -83.69702911376953, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -116.85832214355469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0747811794281006, "rewards_train/margins": 2.4360508918762207, "rewards_train/rejected": -3.5108320713043213, "step": 1471 }, { "epoch": 0.41, "learning_rate": 2.2867351855966384e-07, "loss": 0.3182, "step": 1472 }, { "epoch": 0.41, "logps_train/chosen": -38.83192443847656, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -45.75, "logps_train/rejected": -55.64658737182617, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.32225489616394043, "rewards_train/margins": 0.6670131087303162, "rewards_train/rejected": -0.9892680048942566, "step": 1472 }, { "epoch": 0.41, "logps_train/chosen": -58.36408615112305, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -78.29402923583984, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8770335912704468, "rewards_train/margins": 1.590455174446106, "rewards_train/rejected": -2.4674887657165527, "step": 1473 }, { "epoch": 0.41, "learning_rate": 2.2805692951137557e-07, "loss": 0.4299, "step": 1474 }, { "epoch": 0.41, "logps_train/chosen": -61.07868957519531, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -61.661033630371094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8830645680427551, "rewards_train/margins": 0.9844058156013489, "rewards_train/rejected": -1.867470383644104, "step": 1474 }, { "epoch": 0.41, "logps_train/chosen": -120.68931579589844, "logps_train/ref_chosen": -94.5, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -131.37936401367188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.6204943656921387, "rewards_train/margins": 0.9018170833587646, "rewards_train/rejected": -3.5223114490509033, "step": 1475 }, { "epoch": 0.41, "learning_rate": 2.274404749480348e-07, "loss": 0.3896, "step": 1476 }, { "epoch": 0.41, "logps_train/chosen": -80.03251647949219, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -87.73963928222656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6452434062957764, "rewards_train/margins": 0.6898534297943115, "rewards_train/rejected": -2.335096836090088, "step": 1476 }, { "epoch": 0.41, "logps_train/chosen": -86.63934326171875, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -94.40544128417969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8170599937438965, "rewards_train/margins": 1.0135233402252197, "rewards_train/rejected": -2.830583333969116, "step": 1477 }, { "epoch": 0.41, "learning_rate": 2.2682415864777547e-07, "loss": 0.4884, "step": 1478 }, { "epoch": 0.41, "logps_train/chosen": -74.99855041503906, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -95.32831573486328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.817237913608551, "rewards_train/margins": 1.817546784877777, "rewards_train/rejected": -2.634784698486328, "step": 1478 }, { "epoch": 0.41, "logps_train/chosen": -67.54692840576172, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -90.11866760253906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7957087755203247, "rewards_train/margins": 1.9872523546218872, "rewards_train/rejected": -2.782961130142212, "step": 1479 }, { "epoch": 0.41, "learning_rate": 2.26207984387884e-07, "loss": 0.2718, "step": 1480 }, { "epoch": 0.41, "logps_train/chosen": -86.44613647460938, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -113.71174621582031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1243016719818115, "rewards_train/margins": 2.810936212539673, "rewards_train/rejected": -3.9352378845214844, "step": 1480 }, { "epoch": 0.41, "logps_train/chosen": -61.34617614746094, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -71.50247955322266, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0271226167678833, "rewards_train/margins": 1.1606501340866089, "rewards_train/rejected": -2.187772750854492, "step": 1481 }, { "epoch": 0.41, "learning_rate": 2.2559195594477657e-07, "loss": 0.3558, "step": 1482 }, { "epoch": 0.41, "logps_train/chosen": -62.66151428222656, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -109.41070556640625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.189589023590088, "rewards_train/margins": 2.595231533050537, "rewards_train/rejected": -3.784820556640625, "step": 1482 }, { "epoch": 0.41, "logps_train/chosen": -43.43029022216797, "logps_train/ref_chosen": -37.75, "logps_train/ref_rejected": -39.25, "logps_train/rejected": -60.65336608886719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5588005185127258, "rewards_train/margins": 1.5702082514762878, "rewards_train/rejected": -2.1290087699890137, "step": 1483 }, { "epoch": 0.41, "learning_rate": 2.249760770939754e-07, "loss": 0.3329, "step": 1484 }, { "epoch": 0.41, "logps_train/chosen": -63.7703742980957, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -79.15364074707031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5767444372177124, "rewards_train/margins": 1.5843230485916138, "rewards_train/rejected": -2.161067485809326, "step": 1484 }, { "epoch": 0.42, "logps_train/chosen": -54.031463623046875, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -39.25, "logps_train/rejected": -65.74440002441406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.241231918334961, "rewards_train/margins": 1.3985400199890137, "rewards_train/rejected": -2.6397719383239746, "step": 1485 }, { "epoch": 0.42, "learning_rate": 2.2436035161008616e-07, "loss": 0.3665, "step": 1486 }, { "epoch": 0.42, "logps_train/chosen": -78.16743469238281, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -93.56329345703125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.2112748622894287, "rewards_train/margins": 1.2382426261901855, "rewards_train/rejected": -3.4495174884796143, "step": 1486 }, { "epoch": 0.42, "logps_train/chosen": -90.05267333984375, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -123.19520568847656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7634705305099487, "rewards_train/margins": 1.7564400434494019, "rewards_train/rejected": -3.5199105739593506, "step": 1487 }, { "epoch": 0.42, "learning_rate": 2.237447832667742e-07, "loss": 0.3788, "step": 1488 }, { "epoch": 0.42, "logps_train/chosen": -48.116065979003906, "logps_train/ref_chosen": -36.5, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -90.1641616821289, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.165952205657959, "rewards_train/margins": 1.8254642486572266, "rewards_train/rejected": -2.9914164543151855, "step": 1488 }, { "epoch": 0.42, "logps_train/chosen": -39.31227111816406, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -60.464569091796875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.38650065660476685, "rewards_train/margins": 1.913423240184784, "rewards_train/rejected": -2.299923896789551, "step": 1489 }, { "epoch": 0.42, "learning_rate": 2.2312937583674218e-07, "loss": 0.4177, "step": 1490 }, { "epoch": 0.42, "logps_train/chosen": -43.495296478271484, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -54.47026062011719, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.7647639513015747, "rewards_train/margins": 0.5806996822357178, "rewards_train/rejected": -1.3454636335372925, "step": 1490 }, { "epoch": 0.42, "logps_train/chosen": -34.33486557006836, "logps_train/ref_chosen": -28.875, "logps_train/ref_rejected": -32.25, "logps_train/rejected": -41.52058410644531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.543252170085907, "rewards_train/margins": 0.39259546995162964, "rewards_train/rejected": -0.9358476400375366, "step": 1491 }, { "epoch": 0.42, "learning_rate": 2.225141330917063e-07, "loss": 0.5265, "step": 1492 }, { "epoch": 0.42, "logps_train/chosen": -60.08317565917969, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -62.24585723876953, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1428883075714111, "rewards_train/margins": 0.38697075843811035, "rewards_train/rejected": -1.5298590660095215, "step": 1492 }, { "epoch": 0.42, "logps_train/chosen": -33.77716064453125, "logps_train/ref_chosen": -27.125, "logps_train/ref_rejected": -35.0, "logps_train/rejected": -50.05274200439453, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6660950779914856, "rewards_train/margins": 0.8389837145805359, "rewards_train/rejected": -1.5050787925720215, "step": 1493 }, { "epoch": 0.42, "learning_rate": 2.2189905880237342e-07, "loss": 0.5073, "step": 1494 }, { "epoch": 0.42, "logps_train/chosen": -60.791893005371094, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -83.19351196289062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.502333641052246, "rewards_train/margins": 1.4160408973693848, "rewards_train/rejected": -2.918374538421631, "step": 1494 }, { "epoch": 0.42, "logps_train/chosen": -60.85224914550781, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -56.961856842041016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2508498430252075, "rewards_train/margins": 0.4314686059951782, "rewards_train/rejected": -1.6823184490203857, "step": 1495 }, { "epoch": 0.42, "learning_rate": 2.2128415673841822e-07, "loss": 0.5841, "step": 1496 }, { "epoch": 0.42, "logps_train/chosen": -78.28062438964844, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -103.302490234375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.1964218616485596, "rewards_train/margins": 1.0543348789215088, "rewards_train/rejected": -3.2507567405700684, "step": 1496 }, { "epoch": 0.42, "logps_train/chosen": -48.63485336303711, "logps_train/ref_chosen": -41.75, "logps_train/ref_rejected": -52.75, "logps_train/rejected": -80.08786010742188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6955165266990662, "rewards_train/margins": 2.0489140152931213, "rewards_train/rejected": -2.7444305419921875, "step": 1497 }, { "epoch": 0.42, "learning_rate": 2.2066943066845948e-07, "loss": 0.4681, "step": 1498 }, { "epoch": 0.42, "logps_train/chosen": -89.67605590820312, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -102.9144515991211, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.70627760887146, "rewards_train/margins": 1.2285270690917969, "rewards_train/rejected": -2.934804677963257, "step": 1498 }, { "epoch": 0.42, "logps_train/chosen": -71.27426147460938, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -98.13237762451172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.699691891670227, "rewards_train/margins": 0.8092492818832397, "rewards_train/rejected": -2.508941173553467, "step": 1499 }, { "epoch": 0.42, "learning_rate": 2.2005488436003768e-07, "loss": 0.4944, "step": 1500 }, { "epoch": 0.42, "logps_train/chosen": -44.52510070800781, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -50.1291389465332, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8290092349052429, "rewards_train/margins": 0.6119661927223206, "rewards_train/rejected": -1.4409754276275635, "step": 1500 }, { "epoch": 0.42, "logps_train/chosen": -53.74895477294922, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -62.9795036315918, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.3704276084899902, "rewards_train/margins": 0.5708823204040527, "rewards_train/rejected": -1.941309928894043, "step": 1501 }, { "epoch": 0.42, "learning_rate": 2.1944052157959142e-07, "loss": 0.552, "step": 1502 }, { "epoch": 0.42, "logps_train/chosen": -74.89353942871094, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -93.01203918457031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.33759605884552, "rewards_train/margins": 1.4645837545394897, "rewards_train/rejected": -2.8021798133850098, "step": 1502 }, { "epoch": 0.42, "logps_train/chosen": -37.61762237548828, "logps_train/ref_chosen": -31.375, "logps_train/ref_rejected": -44.75, "logps_train/rejected": -70.26980590820312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6247933506965637, "rewards_train/margins": 1.9302577376365662, "rewards_train/rejected": -2.55505108833313, "step": 1503 }, { "epoch": 0.42, "learning_rate": 2.188263460924344e-07, "loss": 0.3061, "step": 1504 }, { "epoch": 0.42, "logps_train/chosen": -86.88961791992188, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -106.73487854003906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.139157295227051, "rewards_train/margins": 1.1720263957977295, "rewards_train/rejected": -3.3111836910247803, "step": 1504 }, { "epoch": 0.42, "logps_train/chosen": -88.45167541503906, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -123.12904357910156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8604015707969666, "rewards_train/margins": 1.408558189868927, "rewards_train/rejected": -2.2689597606658936, "step": 1505 }, { "epoch": 0.42, "learning_rate": 2.1821236166273267e-07, "loss": 0.4179, "step": 1506 }, { "epoch": 0.42, "logps_train/chosen": -42.475379943847656, "logps_train/ref_chosen": -32.5, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -55.643890380859375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9967566132545471, "rewards_train/margins": 0.8287650942802429, "rewards_train/rejected": -1.82552170753479, "step": 1506 }, { "epoch": 0.42, "logps_train/chosen": -69.06477355957031, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -84.2336654663086, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7586259841918945, "rewards_train/margins": 1.5608341693878174, "rewards_train/rejected": -2.319460153579712, "step": 1507 }, { "epoch": 0.42, "learning_rate": 2.1759857205348108e-07, "loss": 0.3528, "step": 1508 }, { "epoch": 0.42, "logps_train/chosen": -109.0347900390625, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -130.70358276367188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.129260540008545, "rewards_train/margins": 1.237973690032959, "rewards_train/rejected": -3.367234230041504, "step": 1508 }, { "epoch": 0.42, "logps_train/chosen": -87.37171173095703, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -97.84712982177734, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.0922493934631348, "rewards_train/margins": 1.633967399597168, "rewards_train/rejected": -3.7262167930603027, "step": 1509 }, { "epoch": 0.42, "learning_rate": 2.169849810264807e-07, "loss": 0.4274, "step": 1510 }, { "epoch": 0.42, "logps_train/chosen": -128.63365173339844, "logps_train/ref_chosen": -110.5, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -131.63816833496094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8315294981002808, "rewards_train/margins": 1.2322875261306763, "rewards_train/rejected": -3.063817024230957, "step": 1510 }, { "epoch": 0.42, "logps_train/chosen": -46.5494384765625, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -80.72059631347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6025997996330261, "rewards_train/margins": 1.8684833645820618, "rewards_train/rejected": -2.471083164215088, "step": 1511 }, { "epoch": 0.42, "learning_rate": 2.163715923423153e-07, "loss": 0.3519, "step": 1512 }, { "epoch": 0.42, "logps_train/chosen": -39.29479217529297, "logps_train/ref_chosen": -33.75, "logps_train/ref_rejected": -33.0, "logps_train/rejected": -41.62604904174805, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5459343791007996, "rewards_train/margins": 0.31251996755599976, "rewards_train/rejected": -0.8584543466567993, "step": 1512 }, { "epoch": 0.42, "logps_train/chosen": -68.48253631591797, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -79.24789428710938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6947380304336548, "rewards_train/margins": 0.9049538373947144, "rewards_train/rejected": -2.599691867828369, "step": 1513 }, { "epoch": 0.42, "learning_rate": 2.1575840976032866e-07, "loss": 0.5562, "step": 1514 }, { "epoch": 0.42, "logps_train/chosen": -129.51622009277344, "logps_train/ref_chosen": -107.5, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -159.49038696289062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.2031846046447754, "rewards_train/margins": 3.1849169731140137, "rewards_train/rejected": -5.388101577758789, "step": 1514 }, { "epoch": 0.42, "logps_train/chosen": -58.58176040649414, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -80.47071838378906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8816139698028564, "rewards_train/margins": 1.0724897384643555, "rewards_train/rejected": -1.954103708267212, "step": 1515 }, { "epoch": 0.42, "learning_rate": 2.1514543703860144e-07, "loss": 0.4387, "step": 1516 }, { "epoch": 0.42, "logps_train/chosen": -112.3211669921875, "logps_train/ref_chosen": -98.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -117.17977905273438, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.417858362197876, "rewards_train/margins": 1.5387914180755615, "rewards_train/rejected": -2.9566497802734375, "step": 1516 }, { "epoch": 0.42, "logps_train/chosen": -76.02619934082031, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -118.46748352050781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1244951486587524, "rewards_train/margins": 1.9710818529129028, "rewards_train/rejected": -3.0955770015716553, "step": 1517 }, { "epoch": 0.42, "learning_rate": 2.145326779339279e-07, "loss": 0.2708, "step": 1518 }, { "epoch": 0.42, "logps_train/chosen": -67.40851593017578, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -108.60820007324219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9724920988082886, "rewards_train/margins": 2.044577717781067, "rewards_train/rejected": -3.0170698165893555, "step": 1518 }, { "epoch": 0.42, "logps_train/chosen": -45.17582702636719, "logps_train/ref_chosen": -31.0, "logps_train/ref_rejected": -34.75, "logps_train/rejected": -52.9072265625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.4156420230865479, "rewards_train/margins": 0.3946244716644287, "rewards_train/rejected": -1.8102664947509766, "step": 1519 }, { "epoch": 0.42, "learning_rate": 2.1392013620179336e-07, "loss": 0.4075, "step": 1520 }, { "epoch": 0.42, "logps_train/chosen": -79.74624633789062, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -100.21214294433594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7996245622634888, "rewards_train/margins": 1.8547929525375366, "rewards_train/rejected": -2.6544175148010254, "step": 1520 }, { "epoch": 0.43, "logps_train/chosen": -117.86734008789062, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -99.5, "logps_train/rejected": -138.39466857910156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.948812961578369, "rewards_train/margins": 0.9601542949676514, "rewards_train/rejected": -3.9089672565460205, "step": 1521 }, { "epoch": 0.43, "learning_rate": 2.1330781559635065e-07, "loss": 0.3667, "step": 1522 }, { "epoch": 0.43, "logps_train/chosen": -80.56924438476562, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -91.5592041015625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6569247245788574, "rewards_train/margins": 1.0097377300262451, "rewards_train/rejected": -1.6666624546051025, "step": 1522 }, { "epoch": 0.43, "logps_train/chosen": -64.69086456298828, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -94.71272277832031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8429144620895386, "rewards_train/margins": 2.170544743537903, "rewards_train/rejected": -3.0134592056274414, "step": 1523 }, { "epoch": 0.43, "learning_rate": 2.1269571987039756e-07, "loss": 0.3577, "step": 1524 }, { "epoch": 0.43, "logps_train/chosen": -82.77047729492188, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -105.81295013427734, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2954072952270508, "rewards_train/margins": 1.6944327354431152, "rewards_train/rejected": -2.989840030670166, "step": 1524 }, { "epoch": 0.43, "logps_train/chosen": -52.450042724609375, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -45.75, "logps_train/rejected": -66.30683898925781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0827972888946533, "rewards_train/margins": 0.9615585803985596, "rewards_train/rejected": -2.044355869293213, "step": 1525 }, { "epoch": 0.43, "learning_rate": 2.120838527753535e-07, "loss": 0.355, "step": 1526 }, { "epoch": 0.43, "logps_train/chosen": -61.018638610839844, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -98.53857421875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8206138014793396, "rewards_train/margins": 1.8656659722328186, "rewards_train/rejected": -2.686279773712158, "step": 1526 }, { "epoch": 0.43, "logps_train/chosen": -36.256561279296875, "logps_train/ref_chosen": -29.875, "logps_train/ref_rejected": -40.75, "logps_train/rejected": -60.390262603759766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.63527512550354, "rewards_train/margins": 1.3281654119491577, "rewards_train/rejected": -1.9634405374526978, "step": 1527 }, { "epoch": 0.43, "learning_rate": 2.1147221806123667e-07, "loss": 0.4442, "step": 1528 }, { "epoch": 0.43, "logps_train/chosen": -68.06429290771484, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -96.5125961303711, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.286897897720337, "rewards_train/margins": 1.5995185375213623, "rewards_train/rejected": -2.886416435241699, "step": 1528 }, { "epoch": 0.43, "logps_train/chosen": -74.76358032226562, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -54.25, "logps_train/rejected": -72.84532928466797, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8099517226219177, "rewards_train/margins": 1.0472373366355896, "rewards_train/rejected": -1.8571890592575073, "step": 1529 }, { "epoch": 0.43, "learning_rate": 2.1086081947664114e-07, "loss": 0.3596, "step": 1530 }, { "epoch": 0.43, "logps_train/chosen": -70.0419921875, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -97.6251449584961, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3801755905151367, "rewards_train/margins": 1.469740867614746, "rewards_train/rejected": -2.849916458129883, "step": 1530 }, { "epoch": 0.43, "logps_train/chosen": -120.617919921875, "logps_train/ref_chosen": -92.0, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -125.33355712890625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.850855588912964, "rewards_train/margins": 0.5739061832427979, "rewards_train/rejected": -3.4247617721557617, "step": 1531 }, { "epoch": 0.43, "learning_rate": 2.102496607687136e-07, "loss": 0.4903, "step": 1532 }, { "epoch": 0.43, "logps_train/chosen": -79.79019165039062, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -82.10564422607422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3633946180343628, "rewards_train/margins": 1.6943382024765015, "rewards_train/rejected": -3.0577328205108643, "step": 1532 }, { "epoch": 0.43, "logps_train/chosen": -69.94976043701172, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -107.3936767578125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9145069718360901, "rewards_train/margins": 1.904352605342865, "rewards_train/rejected": -2.818859577178955, "step": 1533 }, { "epoch": 0.43, "learning_rate": 2.0963874568313087e-07, "loss": 0.3106, "step": 1534 }, { "epoch": 0.43, "logps_train/chosen": -82.80169677734375, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -109.240234375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0413024425506592, "rewards_train/margins": 2.1533265113830566, "rewards_train/rejected": -3.194628953933716, "step": 1534 }, { "epoch": 0.43, "logps_train/chosen": -83.65385437011719, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -89.20149230957031, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.33724844455719, "rewards_train/margins": 0.7130402326583862, "rewards_train/rejected": -2.050288677215576, "step": 1535 }, { "epoch": 0.43, "learning_rate": 2.0902807796407628e-07, "loss": 0.4139, "step": 1536 }, { "epoch": 0.43, "logps_train/chosen": -93.53363037109375, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -126.63748168945312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.31449556350708, "rewards_train/margins": 2.2656588554382324, "rewards_train/rejected": -4.5801544189453125, "step": 1536 }, { "epoch": 0.43, "logps_train/chosen": -62.9566764831543, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -78.16910552978516, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6032848358154297, "rewards_train/margins": 1.0370631217956543, "rewards_train/rejected": -1.640347957611084, "step": 1537 }, { "epoch": 0.43, "learning_rate": 2.0841766135421747e-07, "loss": 0.3557, "step": 1538 }, { "epoch": 0.43, "logps_train/chosen": -62.165611267089844, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -71.07669830322266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.111288070678711, "rewards_train/margins": 1.066694736480713, "rewards_train/rejected": -2.177982807159424, "step": 1538 }, { "epoch": 0.43, "logps_train/chosen": -175.44473266601562, "logps_train/ref_chosen": -130.0, "logps_train/ref_rejected": -132.0, "logps_train/rejected": -195.6644287109375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -4.542909145355225, "rewards_train/margins": 1.7797837257385254, "rewards_train/rejected": -6.32269287109375, "step": 1539 }, { "epoch": 0.43, "learning_rate": 2.0780749959468287e-07, "loss": 0.5776, "step": 1540 }, { "epoch": 0.43, "logps_train/chosen": -62.14244079589844, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -86.73136901855469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7204940915107727, "rewards_train/margins": 1.8340887427330017, "rewards_train/rejected": -2.5545828342437744, "step": 1540 }, { "epoch": 0.43, "logps_train/chosen": -38.095069885253906, "logps_train/ref_chosen": -33.0, "logps_train/ref_rejected": -31.0, "logps_train/rejected": -48.69858169555664, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5007179379463196, "rewards_train/margins": 1.2690913081169128, "rewards_train/rejected": -1.7698092460632324, "step": 1541 }, { "epoch": 0.43, "learning_rate": 2.071975964250389e-07, "loss": 0.3767, "step": 1542 }, { "epoch": 0.43, "logps_train/chosen": -123.3168716430664, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -157.4512481689453, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -3.049264907836914, "rewards_train/margins": 2.0337510108947754, "rewards_train/rejected": -5.0830159187316895, "step": 1542 }, { "epoch": 0.43, "logps_train/chosen": -51.47052764892578, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -62.78307342529297, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0050606727600098, "rewards_train/margins": 1.019536018371582, "rewards_train/rejected": -2.024596691131592, "step": 1543 }, { "epoch": 0.43, "learning_rate": 2.065879555832674e-07, "loss": 0.3898, "step": 1544 }, { "epoch": 0.43, "logps_train/chosen": -117.35748291015625, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -91.5, "logps_train/rejected": -132.3830108642578, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.7892637252807617, "rewards_train/margins": 1.3103656768798828, "rewards_train/rejected": -4.0996294021606445, "step": 1544 }, { "epoch": 0.43, "logps_train/chosen": -71.69520568847656, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -65.30738067626953, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -0.546473503112793, "rewards_train/margins": 0.48553431034088135, "rewards_train/rejected": -1.0320078134536743, "step": 1545 }, { "epoch": 0.43, "learning_rate": 2.0597858080574221e-07, "loss": 0.6783, "step": 1546 }, { "epoch": 0.43, "logps_train/chosen": -94.24891662597656, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -98.80707550048828, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.5139542818069458, "rewards_train/margins": 0.9077686071395874, "rewards_train/rejected": -2.421722888946533, "step": 1546 }, { "epoch": 0.43, "logps_train/chosen": -78.47566986083984, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -100.68916320800781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3827235698699951, "rewards_train/margins": 2.2131459712982178, "rewards_train/rejected": -3.595869541168213, "step": 1547 }, { "epoch": 0.43, "learning_rate": 2.0536947582720668e-07, "loss": 0.4052, "step": 1548 }, { "epoch": 0.43, "logps_train/chosen": -88.38264465332031, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -120.88613891601562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9195142388343811, "rewards_train/margins": 2.117536962032318, "rewards_train/rejected": -3.037051200866699, "step": 1548 }, { "epoch": 0.43, "logps_train/chosen": -74.20623779296875, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -79.50244903564453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9571474194526672, "rewards_train/margins": 1.3395819067955017, "rewards_train/rejected": -2.296729326248169, "step": 1549 }, { "epoch": 0.43, "learning_rate": 2.047606443807505e-07, "loss": 0.3008, "step": 1550 }, { "epoch": 0.43, "logps_train/chosen": -65.69995880126953, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -88.05136108398438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.623120903968811, "rewards_train/margins": 1.9284995794296265, "rewards_train/rejected": -2.5516204833984375, "step": 1550 }, { "epoch": 0.43, "logps_train/chosen": -67.62134552001953, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -92.56690979003906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0144782066345215, "rewards_train/margins": 1.018775463104248, "rewards_train/rejected": -2.0332536697387695, "step": 1551 }, { "epoch": 0.43, "learning_rate": 2.0415209019778693e-07, "loss": 0.3619, "step": 1552 }, { "epoch": 0.43, "logps_train/chosen": -43.14768981933594, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -58.05607604980469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3968001902103424, "rewards_train/margins": 1.0730652511119843, "rewards_train/rejected": -1.4698654413223267, "step": 1552 }, { "epoch": 0.43, "logps_train/chosen": -82.86640930175781, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -89.47486877441406, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.404609203338623, "rewards_train/margins": 1.6585025787353516, "rewards_train/rejected": -3.0631117820739746, "step": 1553 }, { "epoch": 0.43, "learning_rate": 2.0354381700803002e-07, "loss": 0.5328, "step": 1554 }, { "epoch": 0.43, "logps_train/chosen": -75.02198791503906, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -94.145263671875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2661638259887695, "rewards_train/margins": 1.4509997367858887, "rewards_train/rejected": -2.717163562774658, "step": 1554 }, { "epoch": 0.43, "logps_train/chosen": -95.974609375, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -129.45941162109375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.234570264816284, "rewards_train/margins": 1.7551214694976807, "rewards_train/rejected": -3.989691734313965, "step": 1555 }, { "epoch": 0.43, "learning_rate": 2.029358285394716e-07, "loss": 0.4075, "step": 1556 }, { "epoch": 0.43, "logps_train/chosen": -92.38813018798828, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -132.50436401367188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9878367185592651, "rewards_train/margins": 1.9760764837265015, "rewards_train/rejected": -3.9639132022857666, "step": 1556 }, { "epoch": 0.44, "logps_train/chosen": -91.25912475585938, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -106.64566040039062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.3690762519836426, "rewards_train/margins": 0.9525213241577148, "rewards_train/rejected": -3.3215975761413574, "step": 1557 }, { "epoch": 0.44, "learning_rate": 2.0232812851835857e-07, "loss": 0.3707, "step": 1558 }, { "epoch": 0.44, "logps_train/chosen": -81.57861328125, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -88.1158676147461, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0727052688598633, "rewards_train/margins": 1.0502095222473145, "rewards_train/rejected": -2.1229147911071777, "step": 1558 }, { "epoch": 0.44, "logps_train/chosen": -85.18222045898438, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -98.25654602050781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.636190414428711, "rewards_train/margins": 0.7580184936523438, "rewards_train/rejected": -2.3942089080810547, "step": 1559 }, { "epoch": 0.44, "learning_rate": 2.0172072066916985e-07, "loss": 0.4155, "step": 1560 }, { "epoch": 0.44, "logps_train/chosen": -91.49235534667969, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -131.5889129638672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9978681802749634, "rewards_train/margins": 2.384656310081482, "rewards_train/rejected": -4.382524490356445, "step": 1560 }, { "epoch": 0.44, "logps_train/chosen": -37.26866149902344, "logps_train/ref_chosen": -32.0, "logps_train/ref_rejected": -32.25, "logps_train/rejected": -40.53761672973633, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.5304304361343384, "rewards_train/margins": 0.2887610197067261, "rewards_train/rejected": -0.8191914558410645, "step": 1561 }, { "epoch": 0.44, "learning_rate": 2.0111360871459388e-07, "loss": 0.39, "step": 1562 }, { "epoch": 0.44, "logps_train/chosen": -131.1982879638672, "logps_train/ref_chosen": -100.5, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -137.15306091308594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -3.0684123039245605, "rewards_train/margins": 0.8005070686340332, "rewards_train/rejected": -3.8689193725585938, "step": 1562 }, { "epoch": 0.44, "logps_train/chosen": -93.68301391601562, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -107.22701263427734, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8948639631271362, "rewards_train/margins": 1.4883843660354614, "rewards_train/rejected": -3.3832483291625977, "step": 1563 }, { "epoch": 0.44, "learning_rate": 2.0050679637550544e-07, "loss": 0.4393, "step": 1564 }, { "epoch": 0.44, "logps_train/chosen": -92.06768798828125, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -135.05026245117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2126283645629883, "rewards_train/margins": 2.6457176208496094, "rewards_train/rejected": -4.858345985412598, "step": 1564 }, { "epoch": 0.44, "logps_train/chosen": -85.63681030273438, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -105.09564208984375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8238368034362793, "rewards_train/margins": 1.8904154300689697, "rewards_train/rejected": -3.714252233505249, "step": 1565 }, { "epoch": 0.44, "learning_rate": 1.9990028737094312e-07, "loss": 0.313, "step": 1566 }, { "epoch": 0.44, "logps_train/chosen": -33.341217041015625, "logps_train/ref_chosen": -25.125, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -63.069393157958984, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8253324627876282, "rewards_train/margins": 1.378872811794281, "rewards_train/rejected": -2.204205274581909, "step": 1566 }, { "epoch": 0.44, "logps_train/chosen": -29.861522674560547, "logps_train/ref_chosen": -26.0, "logps_train/ref_rejected": -33.0, "logps_train/rejected": -41.419776916503906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.38209956884384155, "rewards_train/margins": 0.4716942310333252, "rewards_train/rejected": -0.8537937998771667, "step": 1567 }, { "epoch": 0.44, "learning_rate": 1.9929408541808646e-07, "loss": 0.4046, "step": 1568 }, { "epoch": 0.44, "logps_train/chosen": -57.043373107910156, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -59.03089904785156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0348060131072998, "rewards_train/margins": 0.48078417778015137, "rewards_train/rejected": -1.5155901908874512, "step": 1568 }, { "epoch": 0.44, "logps_train/chosen": -72.8663330078125, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -118.95845794677734, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1471807956695557, "rewards_train/margins": 1.9658522605895996, "rewards_train/rejected": -3.1130330562591553, "step": 1569 }, { "epoch": 0.44, "learning_rate": 1.9868819423223298e-07, "loss": 0.4329, "step": 1570 }, { "epoch": 0.44, "logps_train/chosen": -77.43839263916016, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -96.0757827758789, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.661221981048584, "rewards_train/margins": 1.6092469692230225, "rewards_train/rejected": -2.2704689502716064, "step": 1570 }, { "epoch": 0.44, "logps_train/chosen": -51.52234649658203, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -52.100154876708984, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6694222092628479, "rewards_train/margins": 0.43668705224990845, "rewards_train/rejected": -1.1061092615127563, "step": 1571 }, { "epoch": 0.44, "learning_rate": 1.9808261752677583e-07, "loss": 0.4694, "step": 1572 }, { "epoch": 0.44, "logps_train/chosen": -70.57655334472656, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -79.20134735107422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5283582210540771, "rewards_train/margins": 0.6954877376556396, "rewards_train/rejected": -2.223845958709717, "step": 1572 }, { "epoch": 0.44, "logps_train/chosen": -71.13848876953125, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -56.75, "logps_train/rejected": -87.99737548828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.511504888534546, "rewards_train/margins": 1.6058104038238525, "rewards_train/rejected": -3.1173152923583984, "step": 1573 }, { "epoch": 0.44, "learning_rate": 1.9747735901318046e-07, "loss": 0.4525, "step": 1574 }, { "epoch": 0.44, "logps_train/chosen": -94.76878356933594, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -122.83468627929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.165159225463867, "rewards_train/margins": 1.7991690635681152, "rewards_train/rejected": -3.9643282890319824, "step": 1574 }, { "epoch": 0.44, "logps_train/chosen": -46.97719192504883, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -34.75, "logps_train/rejected": -49.47740936279297, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8016255497932434, "rewards_train/margins": 0.6672093272209167, "rewards_train/rejected": -1.4688348770141602, "step": 1575 }, { "epoch": 0.44, "learning_rate": 1.9687242240096246e-07, "loss": 0.3539, "step": 1576 }, { "epoch": 0.44, "logps_train/chosen": -113.40764617919922, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -149.0386962890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2259206771850586, "rewards_train/margins": 2.448261260986328, "rewards_train/rejected": -4.674181938171387, "step": 1576 }, { "epoch": 0.44, "logps_train/chosen": -76.69839477539062, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -70.870361328125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8221832513809204, "rewards_train/margins": 0.1711028814315796, "rewards_train/rejected": -1.9932861328125, "step": 1577 }, { "epoch": 0.44, "learning_rate": 1.9626781139766442e-07, "loss": 0.5414, "step": 1578 }, { "epoch": 0.44, "logps_train/chosen": -97.37671661376953, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -128.31997680664062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5454843044281006, "rewards_train/margins": 2.8747947216033936, "rewards_train/rejected": -4.420279026031494, "step": 1578 }, { "epoch": 0.44, "logps_train/chosen": -86.0423583984375, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -97.94073486328125, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -2.609997510910034, "rewards_train/margins": 0.583294153213501, "rewards_train/rejected": -3.193291664123535, "step": 1579 }, { "epoch": 0.44, "learning_rate": 1.956635297088332e-07, "loss": 0.4043, "step": 1580 }, { "epoch": 0.44, "logps_train/chosen": -91.88868713378906, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -114.54054260253906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.056056499481201, "rewards_train/margins": 2.033935546875, "rewards_train/rejected": -4.089992046356201, "step": 1580 }, { "epoch": 0.44, "logps_train/chosen": -75.75950622558594, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -82.37225341796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3267312049865723, "rewards_train/margins": 1.5382280349731445, "rewards_train/rejected": -2.864959239959717, "step": 1581 }, { "epoch": 0.44, "learning_rate": 1.9505958103799768e-07, "loss": 0.3429, "step": 1582 }, { "epoch": 0.44, "logps_train/chosen": -97.96509552001953, "logps_train/ref_chosen": -84.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -105.13853454589844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3652596473693848, "rewards_train/margins": 2.265194892883301, "rewards_train/rejected": -3.6304545402526855, "step": 1582 }, { "epoch": 0.44, "logps_train/chosen": -82.00286102294922, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -88.98050689697266, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7584893107414246, "rewards_train/margins": 0.9598739743232727, "rewards_train/rejected": -1.7183632850646973, "step": 1583 }, { "epoch": 0.44, "learning_rate": 1.9445596908664538e-07, "loss": 0.4702, "step": 1584 }, { "epoch": 0.44, "logps_train/chosen": -55.20734405517578, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -56.561405181884766, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.5101875066757202, "rewards_train/margins": 0.38892197608947754, "rewards_train/rejected": -1.8991094827651978, "step": 1584 }, { "epoch": 0.44, "logps_train/chosen": -61.61973571777344, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -71.2435302734375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8783797025680542, "rewards_train/margins": 0.592848539352417, "rewards_train/rejected": -1.4712282419204712, "step": 1585 }, { "epoch": 0.44, "learning_rate": 1.9385269755420044e-07, "loss": 0.5605, "step": 1586 }, { "epoch": 0.44, "logps_train/chosen": -58.24223709106445, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -76.9065933227539, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9369189739227295, "rewards_train/margins": 1.8613574504852295, "rewards_train/rejected": -2.798276424407959, "step": 1586 }, { "epoch": 0.44, "logps_train/chosen": -74.80534362792969, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -40.0, "logps_train/rejected": -60.732025146484375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8530930280685425, "rewards_train/margins": 1.2278248071670532, "rewards_train/rejected": -2.0809178352355957, "step": 1587 }, { "epoch": 0.44, "learning_rate": 1.9324977013800043e-07, "loss": 0.2951, "step": 1588 }, { "epoch": 0.44, "logps_train/chosen": -71.7240219116211, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -63.767738342285156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.393227219581604, "rewards_train/margins": 0.153273344039917, "rewards_train/rejected": -1.546500563621521, "step": 1588 }, { "epoch": 0.44, "logps_train/chosen": -78.40270233154297, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -94.98567199707031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1555049419403076, "rewards_train/margins": 1.9446251392364502, "rewards_train/rejected": -3.100130081176758, "step": 1589 }, { "epoch": 0.44, "learning_rate": 1.926471905332739e-07, "loss": 0.5442, "step": 1590 }, { "epoch": 0.44, "logps_train/chosen": -81.09420776367188, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -97.06202697753906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8138642311096191, "rewards_train/margins": 1.286088466644287, "rewards_train/rejected": -2.0999526977539062, "step": 1590 }, { "epoch": 0.44, "logps_train/chosen": -63.191864013671875, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -93.6617431640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2117648124694824, "rewards_train/margins": 2.119253635406494, "rewards_train/rejected": -3.3310184478759766, "step": 1591 }, { "epoch": 0.44, "learning_rate": 1.920449624331179e-07, "loss": 0.3014, "step": 1592 }, { "epoch": 0.44, "logps_train/chosen": -63.863407135009766, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -111.71652221679688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9037234783172607, "rewards_train/margins": 1.6468348503112793, "rewards_train/rejected": -2.55055832862854, "step": 1592 }, { "epoch": 0.45, "logps_train/chosen": -47.715370178222656, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -72.30594635009766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5560826063156128, "rewards_train/margins": 1.3600589036941528, "rewards_train/rejected": -1.9161415100097656, "step": 1593 }, { "epoch": 0.45, "learning_rate": 1.9144308952847498e-07, "loss": 0.3609, "step": 1594 }, { "epoch": 0.45, "logps_train/chosen": -72.75309753417969, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -96.64938354492188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3952317237854004, "rewards_train/margins": 1.6150193214416504, "rewards_train/rejected": -3.010251045227051, "step": 1594 }, { "epoch": 0.45, "logps_train/chosen": -26.114727020263672, "logps_train/ref_chosen": -21.125, "logps_train/ref_rejected": -32.75, "logps_train/rejected": -44.55803680419922, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4950665235519409, "rewards_train/margins": 0.6759716272354126, "rewards_train/rejected": -1.1710381507873535, "step": 1595 }, { "epoch": 0.45, "learning_rate": 1.9084157550811095e-07, "loss": 0.4269, "step": 1596 }, { "epoch": 0.45, "logps_train/chosen": -57.08452606201172, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -60.630889892578125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0283747911453247, "rewards_train/margins": 1.4489721059799194, "rewards_train/rejected": -2.477346897125244, "step": 1596 }, { "epoch": 0.45, "logps_train/chosen": -35.66205978393555, "logps_train/ref_chosen": -29.875, "logps_train/ref_rejected": -29.875, "logps_train/rejected": -44.3210563659668, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.581831157207489, "rewards_train/margins": 0.8682431578636169, "rewards_train/rejected": -1.450074315071106, "step": 1597 }, { "epoch": 0.45, "learning_rate": 1.9024042405859185e-07, "loss": 0.4342, "step": 1598 }, { "epoch": 0.45, "logps_train/chosen": -88.10818481445312, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -120.07489013671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7823021411895752, "rewards_train/margins": 2.396085023880005, "rewards_train/rejected": -4.17838716506958, "step": 1598 }, { "epoch": 0.45, "logps_train/chosen": -70.81685638427734, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -126.18684387207031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6357871294021606, "rewards_train/margins": 3.0518428087234497, "rewards_train/rejected": -3.6876299381256104, "step": 1599 }, { "epoch": 0.45, "learning_rate": 1.8963963886426195e-07, "loss": 0.185, "step": 1600 }, { "epoch": 0.45, "logps_train/chosen": -81.81231689453125, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -108.00234985351562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1275206804275513, "rewards_train/margins": 2.3703707456588745, "rewards_train/rejected": -3.497891426086426, "step": 1600 }, { "epoch": 0.45, "logps_train/chosen": -109.75550842285156, "logps_train/ref_chosen": -92.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -117.73065185546875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.7650036811828613, "rewards_train/margins": 0.33384275436401367, "rewards_train/rejected": -2.098846435546875, "step": 1601 }, { "epoch": 0.45, "learning_rate": 1.890392236072203e-07, "loss": 0.5113, "step": 1602 }, { "epoch": 0.45, "logps_train/chosen": -92.44400024414062, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -103.2388916015625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -3.005727767944336, "rewards_train/margins": 0.7662084102630615, "rewards_train/rejected": -3.7719361782073975, "step": 1602 }, { "epoch": 0.45, "logps_train/chosen": -95.73787689208984, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -107.28487396240234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.738631248474121, "rewards_train/margins": 0.9564576148986816, "rewards_train/rejected": -3.6950888633728027, "step": 1603 }, { "epoch": 0.45, "learning_rate": 1.884391819672991e-07, "loss": 0.7016, "step": 1604 }, { "epoch": 0.45, "logps_train/chosen": -36.527244567871094, "logps_train/ref_chosen": -26.625, "logps_train/ref_rejected": -23.375, "logps_train/rejected": -34.958683013916016, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9954978227615356, "rewards_train/margins": 0.15857350826263428, "rewards_train/rejected": -1.15407133102417, "step": 1604 }, { "epoch": 0.45, "logps_train/chosen": -55.56608200073242, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -46.25, "logps_train/rejected": -62.14733123779297, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5710611939430237, "rewards_train/margins": 1.0249220728874207, "rewards_train/rejected": -1.5959832668304443, "step": 1605 }, { "epoch": 0.45, "learning_rate": 1.8783951762204052e-07, "loss": 0.5639, "step": 1606 }, { "epoch": 0.45, "logps_train/chosen": -129.42323303222656, "logps_train/ref_chosen": -103.0, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -142.60104370117188, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -2.6395885944366455, "rewards_train/margins": 0.8181729316711426, "rewards_train/rejected": -3.457761526107788, "step": 1606 }, { "epoch": 0.45, "logps_train/chosen": -71.31681060791016, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -83.39795684814453, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.2392005920410156, "rewards_train/margins": 1.2903413772583008, "rewards_train/rejected": -2.5295419692993164, "step": 1607 }, { "epoch": 0.45, "learning_rate": 1.8724023424667458e-07, "loss": 0.5695, "step": 1608 }, { "epoch": 0.45, "logps_train/chosen": -68.46073913574219, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -81.42782592773438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.986014723777771, "rewards_train/margins": 1.2684859037399292, "rewards_train/rejected": -2.2545006275177, "step": 1608 }, { "epoch": 0.45, "logps_train/chosen": -96.95525360107422, "logps_train/ref_chosen": -83.5, "logps_train/ref_rejected": -110.5, "logps_train/rejected": -159.29974365234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.351286768913269, "rewards_train/margins": 3.5308371782302856, "rewards_train/rejected": -4.882123947143555, "step": 1609 }, { "epoch": 0.45, "learning_rate": 1.8664133551409612e-07, "loss": 0.2235, "step": 1610 }, { "epoch": 0.45, "logps_train/chosen": -69.56402587890625, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -72.66041564941406, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5195867419242859, "rewards_train/margins": 1.407002031803131, "rewards_train/rejected": -1.926588773727417, "step": 1610 }, { "epoch": 0.45, "logps_train/chosen": -56.912479400634766, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -71.60416412353516, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.887536883354187, "rewards_train/margins": 1.1965123414993286, "rewards_train/rejected": -2.0840492248535156, "step": 1611 }, { "epoch": 0.45, "learning_rate": 1.860428250948427e-07, "loss": 0.4269, "step": 1612 }, { "epoch": 0.45, "logps_train/chosen": -70.08004760742188, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -85.83873748779297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8978487253189087, "rewards_train/margins": 2.6473532915115356, "rewards_train/rejected": -3.5452020168304443, "step": 1612 }, { "epoch": 0.45, "logps_train/chosen": -71.24488067626953, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -112.40217590332031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.221753716468811, "rewards_train/margins": 1.962995171546936, "rewards_train/rejected": -3.184748888015747, "step": 1613 }, { "epoch": 0.45, "learning_rate": 1.8544470665707207e-07, "loss": 0.2423, "step": 1614 }, { "epoch": 0.45, "logps_train/chosen": -68.96395111083984, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -85.09397888183594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5140705108642578, "rewards_train/margins": 1.222280502319336, "rewards_train/rejected": -2.7363510131835938, "step": 1614 }, { "epoch": 0.45, "logps_train/chosen": -89.24520874023438, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -105.38399505615234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6350679397583008, "rewards_train/margins": 1.7178335189819336, "rewards_train/rejected": -3.3529014587402344, "step": 1615 }, { "epoch": 0.45, "learning_rate": 1.848469838665394e-07, "loss": 0.3986, "step": 1616 }, { "epoch": 0.45, "logps_train/chosen": -95.86478424072266, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -128.3060302734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.703665852546692, "rewards_train/margins": 2.2550617456436157, "rewards_train/rejected": -3.9587275981903076, "step": 1616 }, { "epoch": 0.45, "logps_train/chosen": -51.01324462890625, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -46.25, "logps_train/rejected": -67.15058898925781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9450746178627014, "rewards_train/margins": 1.134828507900238, "rewards_train/rejected": -2.0799031257629395, "step": 1617 }, { "epoch": 0.45, "learning_rate": 1.8424966038657523e-07, "loss": 0.2926, "step": 1618 }, { "epoch": 0.45, "logps_train/chosen": -65.51244354248047, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -74.79045104980469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1387443542480469, "rewards_train/margins": 1.0923757553100586, "rewards_train/rejected": -2.2311201095581055, "step": 1618 }, { "epoch": 0.45, "logps_train/chosen": -77.77388000488281, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -112.19791412353516, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.4059033393859863, "rewards_train/margins": 1.275606632232666, "rewards_train/rejected": -3.6815099716186523, "step": 1619 }, { "epoch": 0.45, "learning_rate": 1.836527398780627e-07, "loss": 0.3525, "step": 1620 }, { "epoch": 0.45, "logps_train/chosen": -77.53721618652344, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -116.349853515625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.230870008468628, "rewards_train/margins": 2.0150530338287354, "rewards_train/rejected": -3.2459230422973633, "step": 1620 }, { "epoch": 0.45, "logps_train/chosen": -68.93390655517578, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -93.30572509765625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.704816460609436, "rewards_train/margins": 2.114622950553894, "rewards_train/rejected": -2.81943941116333, "step": 1621 }, { "epoch": 0.45, "learning_rate": 1.8305622599941517e-07, "loss": 0.3649, "step": 1622 }, { "epoch": 0.45, "logps_train/chosen": -33.22088623046875, "logps_train/ref_chosen": -25.625, "logps_train/ref_rejected": -29.0, "logps_train/rejected": -42.651756286621094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7562317848205566, "rewards_train/margins": 0.6143757104873657, "rewards_train/rejected": -1.3706074953079224, "step": 1622 }, { "epoch": 0.45, "logps_train/chosen": -35.414363861083984, "logps_train/ref_chosen": -30.125, "logps_train/ref_rejected": -41.5, "logps_train/rejected": -51.605384826660156, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5338191986083984, "rewards_train/margins": 0.48638737201690674, "rewards_train/rejected": -1.0202065706253052, "step": 1623 }, { "epoch": 0.45, "learning_rate": 1.8246012240655395e-07, "loss": 0.5101, "step": 1624 }, { "epoch": 0.45, "logps_train/chosen": -64.7273941040039, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -80.7210693359375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.942855954170227, "rewards_train/margins": 1.150150179862976, "rewards_train/rejected": -2.093006134033203, "step": 1624 }, { "epoch": 0.45, "logps_train/chosen": -93.31907653808594, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -99.22573852539062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.049095630645752, "rewards_train/margins": 0.7547280788421631, "rewards_train/rejected": -2.803823709487915, "step": 1625 }, { "epoch": 0.45, "learning_rate": 1.818644327528856e-07, "loss": 0.4991, "step": 1626 }, { "epoch": 0.45, "logps_train/chosen": -129.418212890625, "logps_train/ref_chosen": -107.0, "logps_train/ref_rejected": -134.0, "logps_train/rejected": -190.81375122070312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2543210983276367, "rewards_train/margins": 3.434868335723877, "rewards_train/rejected": -5.689189434051514, "step": 1626 }, { "epoch": 0.45, "logps_train/chosen": -67.3095703125, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -83.19931030273438, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3274415731430054, "rewards_train/margins": 1.3395591974258423, "rewards_train/rejected": -2.6670007705688477, "step": 1627 }, { "epoch": 0.46, "learning_rate": 1.8126916068927997e-07, "loss": 0.2331, "step": 1628 }, { "epoch": 0.46, "logps_train/chosen": -68.84822082519531, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -94.31878662109375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7074776887893677, "rewards_train/margins": 1.3550652265548706, "rewards_train/rejected": -3.0625429153442383, "step": 1628 }, { "epoch": 0.46, "logps_train/chosen": -47.78813171386719, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -79.74202728271484, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8522503972053528, "rewards_train/margins": 2.2678512930870056, "rewards_train/rejected": -3.1201016902923584, "step": 1629 }, { "epoch": 0.46, "learning_rate": 1.8067430986404735e-07, "loss": 0.33, "step": 1630 }, { "epoch": 0.46, "logps_train/chosen": -73.79712677001953, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -93.31788635253906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.421118974685669, "rewards_train/margins": 1.9993417263031006, "rewards_train/rejected": -3.4204607009887695, "step": 1630 }, { "epoch": 0.46, "logps_train/chosen": -78.96018981933594, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -112.003173828125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.5717514753341675, "rewards_train/margins": 0.7545429468154907, "rewards_train/rejected": -2.326294422149658, "step": 1631 }, { "epoch": 0.46, "learning_rate": 1.8007988392291662e-07, "loss": 0.6382, "step": 1632 }, { "epoch": 0.46, "logps_train/chosen": -65.02816009521484, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -54.25, "logps_train/rejected": -87.3437271118164, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7250816226005554, "rewards_train/margins": 2.588588058948517, "rewards_train/rejected": -3.3136696815490723, "step": 1632 }, { "epoch": 0.46, "logps_train/chosen": -63.71031951904297, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -70.89663696289062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6694695949554443, "rewards_train/margins": 0.4854285717010498, "rewards_train/rejected": -2.154898166656494, "step": 1633 }, { "epoch": 0.46, "learning_rate": 1.7948588650901225e-07, "loss": 0.3873, "step": 1634 }, { "epoch": 0.46, "logps_train/chosen": -113.53593444824219, "logps_train/ref_chosen": -84.5, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -134.50201416015625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.909928321838379, "rewards_train/margins": 1.2794208526611328, "rewards_train/rejected": -4.189349174499512, "step": 1634 }, { "epoch": 0.46, "logps_train/chosen": -76.97315979003906, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -100.32398986816406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1965343952178955, "rewards_train/margins": 1.6050055027008057, "rewards_train/rejected": -2.801539897918701, "step": 1635 }, { "epoch": 0.46, "learning_rate": 1.7889232126283267e-07, "loss": 0.3359, "step": 1636 }, { "epoch": 0.46, "logps_train/chosen": -79.50006103515625, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -84.783447265625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5851625204086304, "rewards_train/margins": 1.392400860786438, "rewards_train/rejected": -2.9775633811950684, "step": 1636 }, { "epoch": 0.46, "logps_train/chosen": -69.0367431640625, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -70.38298797607422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6337518692016602, "rewards_train/margins": 0.7023983001708984, "rewards_train/rejected": -2.3361501693725586, "step": 1637 }, { "epoch": 0.46, "learning_rate": 1.782991918222275e-07, "loss": 0.508, "step": 1638 }, { "epoch": 0.46, "logps_train/chosen": -55.38524627685547, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -74.1991195678711, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0447261333465576, "rewards_train/margins": 1.190420150756836, "rewards_train/rejected": -2.2351462841033936, "step": 1638 }, { "epoch": 0.46, "logps_train/chosen": -47.96641159057617, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -77.42793273925781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8376566171646118, "rewards_train/margins": 1.2662690877914429, "rewards_train/rejected": -2.1039257049560547, "step": 1639 }, { "epoch": 0.46, "learning_rate": 1.7770650182237532e-07, "loss": 0.3921, "step": 1640 }, { "epoch": 0.46, "logps_train/chosen": -86.46134948730469, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -96.79161834716797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1559011936187744, "rewards_train/margins": 1.272479772567749, "rewards_train/rejected": -2.4283809661865234, "step": 1640 }, { "epoch": 0.46, "logps_train/chosen": -73.64527893066406, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -96.40797424316406, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.004371166229248, "rewards_train/margins": 2.070801019668579, "rewards_train/rejected": -3.075172185897827, "step": 1641 }, { "epoch": 0.46, "learning_rate": 1.7711425489576164e-07, "loss": 0.3354, "step": 1642 }, { "epoch": 0.46, "logps_train/chosen": -33.769466400146484, "logps_train/ref_chosen": -27.625, "logps_train/ref_rejected": -31.125, "logps_train/rejected": -46.60411071777344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6140562295913696, "rewards_train/margins": 0.9340499639511108, "rewards_train/rejected": -1.5481061935424805, "step": 1642 }, { "epoch": 0.46, "logps_train/chosen": -62.69888687133789, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -99.6690673828125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0552401542663574, "rewards_train/margins": 1.9081511497497559, "rewards_train/rejected": -2.9633913040161133, "step": 1643 }, { "epoch": 0.46, "learning_rate": 1.7652245467215633e-07, "loss": 0.3381, "step": 1644 }, { "epoch": 0.46, "logps_train/chosen": -77.92562866210938, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -88.2549819946289, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.3132660388946533, "rewards_train/margins": 1.3321540355682373, "rewards_train/rejected": -2.6454200744628906, "step": 1644 }, { "epoch": 0.46, "logps_train/chosen": -54.02538299560547, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -83.53839111328125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2481436729431152, "rewards_train/margins": 1.958625078201294, "rewards_train/rejected": -3.206768751144409, "step": 1645 }, { "epoch": 0.46, "learning_rate": 1.7593110477859152e-07, "loss": 0.4441, "step": 1646 }, { "epoch": 0.46, "logps_train/chosen": -92.95575714111328, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -100.82368469238281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.848998546600342, "rewards_train/margins": 0.4821047782897949, "rewards_train/rejected": -3.3311033248901367, "step": 1646 }, { "epoch": 0.46, "logps_train/chosen": -66.4138412475586, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -34.25, "logps_train/rejected": -60.00886535644531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8527121543884277, "rewards_train/margins": 0.7304012775421143, "rewards_train/rejected": -2.583113431930542, "step": 1647 }, { "epoch": 0.46, "learning_rate": 1.7534020883933942e-07, "loss": 0.5099, "step": 1648 }, { "epoch": 0.46, "logps_train/chosen": -117.94528198242188, "logps_train/ref_chosen": -94.5, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -151.95156860351562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.3450403213500977, "rewards_train/margins": 2.9756298065185547, "rewards_train/rejected": -5.320670127868652, "step": 1648 }, { "epoch": 0.46, "logps_train/chosen": -80.22037506103516, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -120.06130981445312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2251629829406738, "rewards_train/margins": 2.590343952178955, "rewards_train/rejected": -3.815506935119629, "step": 1649 }, { "epoch": 0.46, "learning_rate": 1.747497704758899e-07, "loss": 0.2625, "step": 1650 }, { "epoch": 0.46, "logps_train/chosen": -61.823944091796875, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -73.3372573852539, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8113001585006714, "rewards_train/margins": 1.7501598596572876, "rewards_train/rejected": -2.561460018157959, "step": 1650 }, { "epoch": 0.46, "logps_train/chosen": -55.210697174072266, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -86.23442077636719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5924079418182373, "rewards_train/margins": 2.030057430267334, "rewards_train/rejected": -3.6224653720855713, "step": 1651 }, { "epoch": 0.46, "learning_rate": 1.7415979330692857e-07, "loss": 0.3315, "step": 1652 }, { "epoch": 0.46, "logps_train/chosen": -77.4622802734375, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -105.72571563720703, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.144274353981018, "rewards_train/margins": 2.211891531944275, "rewards_train/rejected": -3.356165885925293, "step": 1652 }, { "epoch": 0.46, "logps_train/chosen": -137.295654296875, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -180.1324005126953, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -4.613160133361816, "rewards_train/margins": 1.7364087104797363, "rewards_train/rejected": -6.349568843841553, "step": 1653 }, { "epoch": 0.46, "learning_rate": 1.7357028094831437e-07, "loss": 0.4237, "step": 1654 }, { "epoch": 0.46, "logps_train/chosen": -48.06719970703125, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -44.5, "logps_train/rejected": -68.30729675292969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.49802833795547485, "rewards_train/margins": 1.8776230216026306, "rewards_train/rejected": -2.3756513595581055, "step": 1654 }, { "epoch": 0.46, "logps_train/chosen": -51.079383850097656, "logps_train/ref_chosen": -34.75, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -81.62925720214844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6306922435760498, "rewards_train/margins": 0.698249340057373, "rewards_train/rejected": -2.328941583633423, "step": 1655 }, { "epoch": 0.46, "learning_rate": 1.7298123701305761e-07, "loss": 0.432, "step": 1656 }, { "epoch": 0.46, "logps_train/chosen": -89.21125030517578, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -110.03629302978516, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3027660846710205, "rewards_train/margins": 1.8664886951446533, "rewards_train/rejected": -3.169254779815674, "step": 1656 }, { "epoch": 0.46, "logps_train/chosen": -27.07877540588379, "logps_train/ref_chosen": -21.875, "logps_train/ref_rejected": -23.625, "logps_train/rejected": -31.229503631591797, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.5249674320220947, "rewards_train/margins": 0.23509228229522705, "rewards_train/rejected": -0.7600597143173218, "step": 1657 }, { "epoch": 0.46, "learning_rate": 1.723926651112976e-07, "loss": 0.4783, "step": 1658 }, { "epoch": 0.46, "logps_train/chosen": -90.55311584472656, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -117.06472778320312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.685780644416809, "rewards_train/margins": 2.1785048246383667, "rewards_train/rejected": -3.864285469055176, "step": 1658 }, { "epoch": 0.46, "logps_train/chosen": -131.74801635742188, "logps_train/ref_chosen": -106.0, "logps_train/ref_rejected": -114.5, "logps_train/rejected": -150.03887939453125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.5576157569885254, "rewards_train/margins": 1.001741886138916, "rewards_train/rejected": -3.5593576431274414, "step": 1659 }, { "epoch": 0.46, "learning_rate": 1.718045688502808e-07, "loss": 0.3645, "step": 1660 }, { "epoch": 0.46, "logps_train/chosen": -123.75086975097656, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -115.00794982910156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -4.132118225097656, "rewards_train/margins": 0.23215341567993164, "rewards_train/rejected": -4.364271640777588, "step": 1660 }, { "epoch": 0.46, "logps_train/chosen": -84.78007507324219, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -117.05474090576172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7177529335021973, "rewards_train/margins": 1.9377210140228271, "rewards_train/rejected": -3.6554739475250244, "step": 1661 }, { "epoch": 0.46, "learning_rate": 1.7121695183433838e-07, "loss": 0.5338, "step": 1662 }, { "epoch": 0.46, "logps_train/chosen": -68.51127624511719, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -90.26943969726562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9937058091163635, "rewards_train/margins": 2.0795273184776306, "rewards_train/rejected": -3.073233127593994, "step": 1662 }, { "epoch": 0.46, "logps_train/chosen": -32.415794372558594, "logps_train/ref_chosen": -25.625, "logps_train/ref_rejected": -21.5, "logps_train/rejected": -30.881206512451172, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6844506859779358, "rewards_train/margins": 0.25552552938461304, "rewards_train/rejected": -0.9399762153625488, "step": 1663 }, { "epoch": 0.47, "learning_rate": 1.7062981766486436e-07, "loss": 0.3761, "step": 1664 }, { "epoch": 0.47, "logps_train/chosen": -70.52616119384766, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -43.75, "logps_train/rejected": -70.73539733886719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3366740942001343, "rewards_train/margins": 1.369947075843811, "rewards_train/rejected": -2.7066211700439453, "step": 1664 }, { "epoch": 0.47, "logps_train/chosen": -94.04827880859375, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -90.67620849609375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.702630877494812, "rewards_train/margins": 1.6329580545425415, "rewards_train/rejected": -3.3355889320373535, "step": 1665 }, { "epoch": 0.47, "learning_rate": 1.7004316994029364e-07, "loss": 0.6045, "step": 1666 }, { "epoch": 0.47, "logps_train/chosen": -84.98683166503906, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -80.54086303710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.287354975938797, "rewards_train/margins": 1.6485677659511566, "rewards_train/rejected": -1.9359227418899536, "step": 1666 }, { "epoch": 0.47, "logps_train/chosen": -48.174278259277344, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -36.25, "logps_train/rejected": -51.08442687988281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8951619863510132, "rewards_train/margins": 0.5955073833465576, "rewards_train/rejected": -1.4906693696975708, "step": 1667 }, { "epoch": 0.47, "learning_rate": 1.6945701225607952e-07, "loss": 0.3778, "step": 1668 }, { "epoch": 0.47, "logps_train/chosen": -92.9300308227539, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -125.04141235351562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6633155345916748, "rewards_train/margins": 2.776664972305298, "rewards_train/rejected": -4.439980506896973, "step": 1668 }, { "epoch": 0.47, "logps_train/chosen": -52.9864616394043, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -81.70050048828125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6720833778381348, "rewards_train/margins": 2.5932788848876953, "rewards_train/rejected": -3.26536226272583, "step": 1669 }, { "epoch": 0.47, "learning_rate": 1.6887134820467222e-07, "loss": 0.2394, "step": 1670 }, { "epoch": 0.47, "logps_train/chosen": -92.07159423828125, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -93.31512451171875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.925518751144409, "rewards_train/margins": 0.351306676864624, "rewards_train/rejected": -3.276825428009033, "step": 1670 }, { "epoch": 0.47, "logps_train/chosen": -84.47746276855469, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -92.91738891601562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.983292818069458, "rewards_train/margins": 0.7264158725738525, "rewards_train/rejected": -2.7097086906433105, "step": 1671 }, { "epoch": 0.47, "learning_rate": 1.6828618137549635e-07, "loss": 0.5321, "step": 1672 }, { "epoch": 0.47, "logps_train/chosen": -55.901512145996094, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -84.44178771972656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5995261669158936, "rewards_train/margins": 1.7020747661590576, "rewards_train/rejected": -3.301600933074951, "step": 1672 }, { "epoch": 0.47, "logps_train/chosen": -70.01547241210938, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -98.51997375488281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2319185733795166, "rewards_train/margins": 1.4917585849761963, "rewards_train/rejected": -2.723677158355713, "step": 1673 }, { "epoch": 0.47, "learning_rate": 1.6770151535492921e-07, "loss": 0.3731, "step": 1674 }, { "epoch": 0.47, "logps_train/chosen": -42.512962341308594, "logps_train/ref_chosen": -36.5, "logps_train/ref_rejected": -35.5, "logps_train/rejected": -47.57307434082031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6077415943145752, "rewards_train/margins": 0.5992730855941772, "rewards_train/rejected": -1.2070146799087524, "step": 1674 }, { "epoch": 0.47, "logps_train/chosen": -79.2337646484375, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -91.4706802368164, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.477673888206482, "rewards_train/margins": 1.107772946357727, "rewards_train/rejected": -2.585446834564209, "step": 1675 }, { "epoch": 0.47, "learning_rate": 1.6711735372627888e-07, "loss": 0.4862, "step": 1676 }, { "epoch": 0.47, "logps_train/chosen": -67.922607421875, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -84.69071197509766, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.776440143585205, "rewards_train/margins": 0.7301309108734131, "rewards_train/rejected": -2.506571054458618, "step": 1676 }, { "epoch": 0.47, "logps_train/chosen": -99.23464965820312, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -99.31503295898438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6422152519226074, "rewards_train/margins": 0.9697575569152832, "rewards_train/rejected": -2.6119728088378906, "step": 1677 }, { "epoch": 0.47, "learning_rate": 1.6653370006976182e-07, "loss": 0.4877, "step": 1678 }, { "epoch": 0.47, "logps_train/chosen": -59.814300537109375, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -97.75444030761719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9835784435272217, "rewards_train/margins": 2.1797568798065186, "rewards_train/rejected": -3.1633353233337402, "step": 1678 }, { "epoch": 0.47, "logps_train/chosen": -107.05012512207031, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -100.28025817871094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -3.135676860809326, "rewards_train/margins": 0.923403263092041, "rewards_train/rejected": -4.059080123901367, "step": 1679 }, { "epoch": 0.47, "learning_rate": 1.6595055796248154e-07, "loss": 0.4452, "step": 1680 }, { "epoch": 0.47, "logps_train/chosen": -58.643882751464844, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -71.38655090332031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7219080328941345, "rewards_train/margins": 0.7577629685401917, "rewards_train/rejected": -1.4796710014343262, "step": 1680 }, { "epoch": 0.47, "logps_train/chosen": -70.05195617675781, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -88.61813354492188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5166213512420654, "rewards_train/margins": 1.2826917171478271, "rewards_train/rejected": -2.7993130683898926, "step": 1681 }, { "epoch": 0.47, "learning_rate": 1.6536793097840613e-07, "loss": 0.4482, "step": 1682 }, { "epoch": 0.47, "logps_train/chosen": -71.62968444824219, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -64.81156921386719, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.5358200073242188, "rewards_train/margins": 0.4359617233276367, "rewards_train/rejected": -1.9717817306518555, "step": 1682 }, { "epoch": 0.47, "logps_train/chosen": -64.4502182006836, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -85.279541015625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.465919852256775, "rewards_train/margins": 0.6679915189743042, "rewards_train/rejected": -2.133911371231079, "step": 1683 }, { "epoch": 0.47, "learning_rate": 1.6478582268834674e-07, "loss": 0.5673, "step": 1684 }, { "epoch": 0.47, "logps_train/chosen": -77.01126861572266, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -83.2972412109375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1683144569396973, "rewards_train/margins": 1.007601022720337, "rewards_train/rejected": -2.175915479660034, "step": 1684 }, { "epoch": 0.47, "logps_train/chosen": -91.12889099121094, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -125.72279357910156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7468738555908203, "rewards_train/margins": 2.4564595222473145, "rewards_train/rejected": -4.203333377838135, "step": 1685 }, { "epoch": 0.47, "learning_rate": 1.6420423665993543e-07, "loss": 0.3966, "step": 1686 }, { "epoch": 0.47, "logps_train/chosen": -69.03770446777344, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -108.81758880615234, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1006451845169067, "rewards_train/margins": 1.8715428113937378, "rewards_train/rejected": -2.9721879959106445, "step": 1686 }, { "epoch": 0.47, "logps_train/chosen": -87.07865142822266, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -99.63607788085938, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.3953654766082764, "rewards_train/margins": 1.3096485137939453, "rewards_train/rejected": -2.7050139904022217, "step": 1687 }, { "epoch": 0.47, "learning_rate": 1.6362317645760345e-07, "loss": 0.5543, "step": 1688 }, { "epoch": 0.47, "logps_train/chosen": -74.01127624511719, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -69.46797180175781, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9556194543838501, "rewards_train/margins": 0.04820847511291504, "rewards_train/rejected": -1.0038279294967651, "step": 1688 }, { "epoch": 0.47, "logps_train/chosen": -28.50318717956543, "logps_train/ref_chosen": -23.5, "logps_train/ref_rejected": -34.5, "logps_train/rejected": -45.10893249511719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4987562298774719, "rewards_train/margins": 0.5574496388435364, "rewards_train/rejected": -1.0562058687210083, "step": 1689 }, { "epoch": 0.47, "learning_rate": 1.6304264564255945e-07, "loss": 0.5874, "step": 1690 }, { "epoch": 0.47, "logps_train/chosen": -80.19734191894531, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -99.33476257324219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7146562337875366, "rewards_train/margins": 2.0879613161087036, "rewards_train/rejected": -2.8026175498962402, "step": 1690 }, { "epoch": 0.47, "logps_train/chosen": -41.62714385986328, "logps_train/ref_chosen": -37.0, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -53.11134719848633, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.45900335907936096, "rewards_train/margins": 1.2771314084529877, "rewards_train/rejected": -1.7361347675323486, "step": 1691 }, { "epoch": 0.47, "learning_rate": 1.624626477727674e-07, "loss": 0.3138, "step": 1692 }, { "epoch": 0.47, "logps_train/chosen": -45.346580505371094, "logps_train/ref_chosen": -41.75, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -66.0799789428711, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.3510642945766449, "rewards_train/margins": 1.1681641638278961, "rewards_train/rejected": -1.519228458404541, "step": 1692 }, { "epoch": 0.47, "logps_train/chosen": -68.06881713867188, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -83.01776123046875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1756322383880615, "rewards_train/margins": 2.289034843444824, "rewards_train/rejected": -3.4646670818328857, "step": 1693 }, { "epoch": 0.47, "learning_rate": 1.618831864029251e-07, "loss": 0.3492, "step": 1694 }, { "epoch": 0.47, "logps_train/chosen": -96.76583862304688, "logps_train/ref_chosen": -83.5, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -105.95556640625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3433805704116821, "rewards_train/margins": 0.977958083152771, "rewards_train/rejected": -2.321338653564453, "step": 1694 }, { "epoch": 0.47, "logps_train/chosen": -43.882144927978516, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -39.5, "logps_train/rejected": -52.5103759765625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5408511161804199, "rewards_train/margins": 0.7512019872665405, "rewards_train/rejected": -1.2920531034469604, "step": 1695 }, { "epoch": 0.47, "learning_rate": 1.613042650844422e-07, "loss": 0.4764, "step": 1696 }, { "epoch": 0.47, "logps_train/chosen": -86.63471221923828, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -115.38475036621094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3977975845336914, "rewards_train/margins": 1.965238332748413, "rewards_train/rejected": -3.3630359172821045, "step": 1696 }, { "epoch": 0.47, "logps_train/chosen": -84.23179626464844, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -113.83992004394531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0578969717025757, "rewards_train/margins": 2.1768757104873657, "rewards_train/rejected": -3.2347726821899414, "step": 1697 }, { "epoch": 0.47, "learning_rate": 1.6072588736541837e-07, "loss": 0.2471, "step": 1698 }, { "epoch": 0.47, "logps_train/chosen": -62.672122955322266, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -122.68158721923828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.400708794593811, "rewards_train/margins": 2.427606225013733, "rewards_train/rejected": -3.828315019607544, "step": 1698 }, { "epoch": 0.47, "logps_train/chosen": -54.730247497558594, "logps_train/ref_chosen": -44.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -107.04042053222656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0784939527511597, "rewards_train/margins": 2.445860505104065, "rewards_train/rejected": -3.5243544578552246, "step": 1699 }, { "epoch": 0.48, "learning_rate": 1.6014805679062183e-07, "loss": 0.2489, "step": 1700 }, { "epoch": 0.48, "logps_train/chosen": -80.585693359375, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -104.49479675292969, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.811694622039795, "rewards_train/margins": 2.1026291847229004, "rewards_train/rejected": -3.9143238067626953, "step": 1700 }, { "epoch": 0.48, "logps_train/chosen": -93.85848999023438, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -108.67376708984375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.591318368911743, "rewards_train/margins": 0.9119961261749268, "rewards_train/rejected": -3.50331449508667, "step": 1701 }, { "epoch": 0.48, "learning_rate": 1.5957077690146728e-07, "loss": 0.5183, "step": 1702 }, { "epoch": 0.48, "logps_train/chosen": -62.26563262939453, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -73.73411560058594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8724616169929504, "rewards_train/margins": 1.190403163433075, "rewards_train/rejected": -2.0628647804260254, "step": 1702 }, { "epoch": 0.48, "logps_train/chosen": -59.47084426879883, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -38.5, "logps_train/rejected": -61.20113754272461, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8697408437728882, "rewards_train/margins": 1.4027162790298462, "rewards_train/rejected": -2.2724571228027344, "step": 1703 }, { "epoch": 0.48, "learning_rate": 1.589940512359946e-07, "loss": 0.3901, "step": 1704 }, { "epoch": 0.48, "logps_train/chosen": -85.05970001220703, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -119.24519348144531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.244251012802124, "rewards_train/margins": 2.076362371444702, "rewards_train/rejected": -3.320613384246826, "step": 1704 }, { "epoch": 0.48, "logps_train/chosen": -54.460426330566406, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -74.13459777832031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.25561290979385376, "rewards_train/margins": 1.2282573580741882, "rewards_train/rejected": -1.483870267868042, "step": 1705 }, { "epoch": 0.48, "learning_rate": 1.584178833288467e-07, "loss": 0.4027, "step": 1706 }, { "epoch": 0.48, "logps_train/chosen": -58.76069641113281, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -82.43553924560547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.3251905143260956, "rewards_train/margins": 1.0753945410251617, "rewards_train/rejected": -1.4005850553512573, "step": 1706 }, { "epoch": 0.48, "logps_train/chosen": -104.62451171875, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -137.52508544921875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.912841558456421, "rewards_train/margins": 2.640448808670044, "rewards_train/rejected": -4.553290367126465, "step": 1707 }, { "epoch": 0.48, "learning_rate": 1.5784227671124826e-07, "loss": 0.2927, "step": 1708 }, { "epoch": 0.48, "logps_train/chosen": -55.8347053527832, "logps_train/ref_chosen": -46.5, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -85.9326171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.945970356464386, "rewards_train/margins": 1.4238542914390564, "rewards_train/rejected": -2.3698246479034424, "step": 1708 }, { "epoch": 0.48, "logps_train/chosen": -54.66715621948242, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -86.22138977050781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9868572354316711, "rewards_train/margins": 1.8746129870414734, "rewards_train/rejected": -2.8614702224731445, "step": 1709 }, { "epoch": 0.48, "learning_rate": 1.5726723491098383e-07, "loss": 0.304, "step": 1710 }, { "epoch": 0.48, "logps_train/chosen": -90.70524597167969, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -92.46198272705078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8190110325813293, "rewards_train/margins": 2.3880273699760437, "rewards_train/rejected": -3.207038402557373, "step": 1710 }, { "epoch": 0.48, "logps_train/chosen": -86.20796203613281, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -116.68403625488281, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.6706008911132812, "rewards_train/margins": 2.073974609375, "rewards_train/rejected": -3.7445755004882812, "step": 1711 }, { "epoch": 0.48, "learning_rate": 1.566927614523763e-07, "loss": 0.4273, "step": 1712 }, { "epoch": 0.48, "logps_train/chosen": -61.82952880859375, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -77.03433990478516, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -2.1032655239105225, "rewards_train/margins": 0.28376221656799316, "rewards_train/rejected": -2.3870277404785156, "step": 1712 }, { "epoch": 0.48, "logps_train/chosen": -78.32061767578125, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -102.40614318847656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6319637298583984, "rewards_train/margins": 1.013533592224121, "rewards_train/rejected": -2.6454973220825195, "step": 1713 }, { "epoch": 0.48, "learning_rate": 1.5611885985626544e-07, "loss": 0.6185, "step": 1714 }, { "epoch": 0.48, "logps_train/chosen": -138.0715789794922, "logps_train/ref_chosen": -116.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -113.78358459472656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.2212202548980713, "rewards_train/margins": 1.4633877277374268, "rewards_train/rejected": -3.684607982635498, "step": 1714 }, { "epoch": 0.48, "logps_train/chosen": -105.45602416992188, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -155.59356689453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.344820976257324, "rewards_train/margins": 2.350083351135254, "rewards_train/rejected": -4.694904327392578, "step": 1715 }, { "epoch": 0.48, "learning_rate": 1.5554553363998592e-07, "loss": 0.4026, "step": 1716 }, { "epoch": 0.48, "logps_train/chosen": -39.249446868896484, "logps_train/ref_chosen": -33.5, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -60.99778747558594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5843197107315063, "rewards_train/margins": 1.1607716083526611, "rewards_train/rejected": -1.7450913190841675, "step": 1716 }, { "epoch": 0.48, "logps_train/chosen": -65.62832641601562, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -56.75, "logps_train/rejected": -77.27700805664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0398834943771362, "rewards_train/margins": 1.0017582178115845, "rewards_train/rejected": -2.0416417121887207, "step": 1717 }, { "epoch": 0.48, "learning_rate": 1.549727863173463e-07, "loss": 0.3592, "step": 1718 }, { "epoch": 0.48, "logps_train/chosen": -122.73854064941406, "logps_train/ref_chosen": -105.5, "logps_train/ref_rejected": -116.5, "logps_train/rejected": -166.14834594726562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7176034450531006, "rewards_train/margins": 3.262075662612915, "rewards_train/rejected": -4.979679107666016, "step": 1718 }, { "epoch": 0.48, "logps_train/chosen": -55.00486755371094, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -38.5, "logps_train/rejected": -60.80463409423828, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6984851360321045, "rewards_train/margins": 0.5397908687591553, "rewards_train/rejected": -2.2382760047912598, "step": 1719 }, { "epoch": 0.48, "learning_rate": 1.5440062139860702e-07, "loss": 0.4756, "step": 1720 }, { "epoch": 0.48, "logps_train/chosen": -62.37964630126953, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -77.89312744140625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6694098711013794, "rewards_train/margins": 2.1906055212020874, "rewards_train/rejected": -2.860015392303467, "step": 1720 }, { "epoch": 0.48, "logps_train/chosen": -49.41643524169922, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -75.60308837890625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3776785135269165, "rewards_train/margins": 1.2463024854660034, "rewards_train/rejected": -2.62398099899292, "step": 1721 }, { "epoch": 0.48, "learning_rate": 1.5382904239045917e-07, "loss": 0.3565, "step": 1722 }, { "epoch": 0.48, "logps_train/chosen": -86.35062408447266, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -94.67170715332031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7866249084472656, "rewards_train/margins": 0.8211703300476074, "rewards_train/rejected": -2.607795238494873, "step": 1722 }, { "epoch": 0.48, "logps_train/chosen": -84.44341278076172, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -99.17850494384766, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9177792072296143, "rewards_train/margins": 2.126633405685425, "rewards_train/rejected": -4.044412612915039, "step": 1723 }, { "epoch": 0.48, "learning_rate": 1.5325805279600285e-07, "loss": 0.3435, "step": 1724 }, { "epoch": 0.48, "logps_train/chosen": -60.262733459472656, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -38.0, "logps_train/rejected": -59.72871017456055, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3323768377304077, "rewards_train/margins": 0.8286288976669312, "rewards_train/rejected": -2.161005735397339, "step": 1724 }, { "epoch": 0.48, "logps_train/chosen": -43.0749626159668, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -70.24840545654297, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7276864051818848, "rewards_train/margins": 0.7864855527877808, "rewards_train/rejected": -1.5141719579696655, "step": 1725 }, { "epoch": 0.48, "learning_rate": 1.5268765611472574e-07, "loss": 0.4338, "step": 1726 }, { "epoch": 0.48, "logps_train/chosen": -63.47959518432617, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -64.1396255493164, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9475688934326172, "rewards_train/margins": 0.8548210859298706, "rewards_train/rejected": -1.8023899793624878, "step": 1726 }, { "epoch": 0.48, "logps_train/chosen": -57.35747528076172, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -76.74878692626953, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5443410873413086, "rewards_train/margins": 1.814131259918213, "rewards_train/rejected": -3.3584723472595215, "step": 1727 }, { "epoch": 0.48, "learning_rate": 1.5211785584248178e-07, "loss": 0.3767, "step": 1728 }, { "epoch": 0.48, "logps_train/chosen": -96.9595947265625, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -142.14691162109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3178340196609497, "rewards_train/margins": 2.7702945470809937, "rewards_train/rejected": -4.088128566741943, "step": 1728 }, { "epoch": 0.48, "logps_train/chosen": -59.227317810058594, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -76.9884033203125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7125757932662964, "rewards_train/margins": 0.8030610084533691, "rewards_train/rejected": -1.5156368017196655, "step": 1729 }, { "epoch": 0.48, "learning_rate": 1.5154865547146947e-07, "loss": 0.2882, "step": 1730 }, { "epoch": 0.48, "logps_train/chosen": -65.23429870605469, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -77.76101684570312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.2140541970729828, "rewards_train/margins": 1.3245470821857452, "rewards_train/rejected": -1.538601279258728, "step": 1730 }, { "epoch": 0.48, "logps_train/chosen": -75.65512084960938, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -88.51319885253906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.544906735420227, "rewards_train/margins": 1.561881422996521, "rewards_train/rejected": -2.106788158416748, "step": 1731 }, { "epoch": 0.48, "learning_rate": 1.5098005849021078e-07, "loss": 0.3378, "step": 1732 }, { "epoch": 0.48, "logps_train/chosen": -70.87867736816406, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -87.4938735961914, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0714613199234009, "rewards_train/margins": 0.35556256771087646, "rewards_train/rejected": -1.4270238876342773, "step": 1732 }, { "epoch": 0.48, "logps_train/chosen": -65.34355926513672, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -71.9080810546875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.462090015411377, "rewards_train/margins": 0.5146552324295044, "rewards_train/rejected": -1.9767452478408813, "step": 1733 }, { "epoch": 0.48, "learning_rate": 1.5041206838352955e-07, "loss": 0.5774, "step": 1734 }, { "epoch": 0.48, "logps_train/chosen": -73.20880126953125, "logps_train/ref_chosen": -58.0, "logps_train/ref_rejected": -52.75, "logps_train/rejected": -85.79881286621094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5255677700042725, "rewards_train/margins": 1.7681806087493896, "rewards_train/rejected": -3.293748378753662, "step": 1734 }, { "epoch": 0.48, "logps_train/chosen": -61.277870178222656, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -93.12185668945312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2486857175827026, "rewards_train/margins": 2.0986567735671997, "rewards_train/rejected": -3.3473424911499023, "step": 1735 }, { "epoch": 0.49, "learning_rate": 1.4984468863253007e-07, "loss": 0.2993, "step": 1736 }, { "epoch": 0.49, "logps_train/chosen": -100.27379608154297, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -94.43202209472656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9312859773635864, "rewards_train/margins": 0.6416035890579224, "rewards_train/rejected": -2.572889566421509, "step": 1736 }, { "epoch": 0.49, "logps_train/chosen": -73.89569091796875, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -78.6570816040039, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1137696504592896, "rewards_train/margins": 1.4405921697616577, "rewards_train/rejected": -2.5543618202209473, "step": 1737 }, { "epoch": 0.49, "learning_rate": 1.4927792271457624e-07, "loss": 0.4012, "step": 1738 }, { "epoch": 0.49, "logps_train/chosen": -48.92772674560547, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -44.75, "logps_train/rejected": -59.254119873046875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9099600315093994, "rewards_train/margins": 0.5400614738464355, "rewards_train/rejected": -1.450021505355835, "step": 1738 }, { "epoch": 0.49, "logps_train/chosen": -46.54792022705078, "logps_train/ref_chosen": -35.0, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -65.42706298828125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1446361541748047, "rewards_train/margins": 1.0652580261230469, "rewards_train/rejected": -2.2098941802978516, "step": 1739 }, { "epoch": 0.49, "learning_rate": 1.487117741032694e-07, "loss": 0.507, "step": 1740 }, { "epoch": 0.49, "logps_train/chosen": -52.60486602783203, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -72.66886138916016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4809945225715637, "rewards_train/margins": 1.3015164732933044, "rewards_train/rejected": -1.7825109958648682, "step": 1740 }, { "epoch": 0.49, "logps_train/chosen": -60.44313049316406, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -100.72547912597656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3650164604187012, "rewards_train/margins": 1.9962029457092285, "rewards_train/rejected": -3.3612194061279297, "step": 1741 }, { "epoch": 0.49, "learning_rate": 1.4814624626842797e-07, "loss": 0.3217, "step": 1742 }, { "epoch": 0.49, "logps_train/chosen": -117.7610092163086, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -148.991943359375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.43860125541687, "rewards_train/margins": 2.3387176990509033, "rewards_train/rejected": -4.777318954467773, "step": 1742 }, { "epoch": 0.49, "logps_train/chosen": -68.24562072753906, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -101.6647720336914, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5278820991516113, "rewards_train/margins": 1.294844627380371, "rewards_train/rejected": -2.8227267265319824, "step": 1743 }, { "epoch": 0.49, "learning_rate": 1.475813426760655e-07, "loss": 0.4032, "step": 1744 }, { "epoch": 0.49, "logps_train/chosen": -37.63656234741211, "logps_train/ref_chosen": -32.25, "logps_train/ref_rejected": -39.75, "logps_train/rejected": -58.51951599121094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5460291504859924, "rewards_train/margins": 1.319887101650238, "rewards_train/rejected": -1.8659162521362305, "step": 1744 }, { "epoch": 0.49, "logps_train/chosen": -102.67803192138672, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -125.20155334472656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2474911212921143, "rewards_train/margins": 1.614070177078247, "rewards_train/rejected": -2.8615612983703613, "step": 1745 }, { "epoch": 0.49, "learning_rate": 1.4701706678836977e-07, "loss": 0.3559, "step": 1746 }, { "epoch": 0.49, "logps_train/chosen": -48.163970947265625, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -79.82466888427734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.2507719397544861, "rewards_train/margins": 1.78032785654068, "rewards_train/rejected": -2.031099796295166, "step": 1746 }, { "epoch": 0.49, "logps_train/chosen": -69.75738525390625, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -104.90107727050781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2784242630004883, "rewards_train/margins": 2.199769973754883, "rewards_train/rejected": -3.478194236755371, "step": 1747 }, { "epoch": 0.49, "learning_rate": 1.4645342206368144e-07, "loss": 0.3322, "step": 1748 }, { "epoch": 0.49, "logps_train/chosen": -106.47711944580078, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -127.27220153808594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.2395095825195312, "rewards_train/margins": 2.168569564819336, "rewards_train/rejected": -4.408079147338867, "step": 1748 }, { "epoch": 0.49, "logps_train/chosen": -91.26492309570312, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -126.6604995727539, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.0190703868865967, "rewards_train/margins": 1.5723700523376465, "rewards_train/rejected": -3.591440439224243, "step": 1749 }, { "epoch": 0.49, "learning_rate": 1.4589041195647283e-07, "loss": 0.3291, "step": 1750 }, { "epoch": 0.49, "logps_train/chosen": -101.08950805664062, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -71.98365783691406, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.9179353713989258, "rewards_train/margins": 0.2476177215576172, "rewards_train/rejected": -2.165553092956543, "step": 1750 }, { "epoch": 0.49, "logps_train/chosen": -31.443187713623047, "logps_train/ref_chosen": -19.875, "logps_train/ref_rejected": -20.125, "logps_train/rejected": -36.207881927490234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1613110303878784, "rewards_train/margins": 0.4471726417541504, "rewards_train/rejected": -1.6084836721420288, "step": 1751 }, { "epoch": 0.49, "learning_rate": 1.4532803991732697e-07, "loss": 0.6374, "step": 1752 }, { "epoch": 0.49, "logps_train/chosen": -93.20042419433594, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -134.81411743164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0637922286987305, "rewards_train/margins": 2.7484796047210693, "rewards_train/rejected": -3.8122718334198, "step": 1752 }, { "epoch": 0.49, "logps_train/chosen": -57.33367156982422, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -63.25, "logps_train/rejected": -86.43705749511719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3732109069824219, "rewards_train/margins": 0.9560410976409912, "rewards_train/rejected": -2.329252004623413, "step": 1753 }, { "epoch": 0.49, "learning_rate": 1.447663093929163e-07, "loss": 0.316, "step": 1754 }, { "epoch": 0.49, "logps_train/chosen": -42.070743560791016, "logps_train/ref_chosen": -37.75, "logps_train/ref_rejected": -38.5, "logps_train/rejected": -58.13180160522461, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.42699646949768066, "rewards_train/margins": 1.5381369590759277, "rewards_train/rejected": -1.9651334285736084, "step": 1754 }, { "epoch": 0.49, "logps_train/chosen": -93.35173797607422, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -105.97796630859375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6605641841888428, "rewards_train/margins": 2.121607780456543, "rewards_train/rejected": -2.7821719646453857, "step": 1755 }, { "epoch": 0.49, "learning_rate": 1.4420522382598116e-07, "loss": 0.3327, "step": 1756 }, { "epoch": 0.49, "logps_train/chosen": -72.68170166015625, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -79.17109680175781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8264704942703247, "rewards_train/margins": 1.0676897764205933, "rewards_train/rejected": -2.894160270690918, "step": 1756 }, { "epoch": 0.49, "logps_train/chosen": -108.734375, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -122.84239196777344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.72695255279541, "rewards_train/margins": 1.0113883018493652, "rewards_train/rejected": -3.7383408546447754, "step": 1757 }, { "epoch": 0.49, "learning_rate": 1.4364478665530956e-07, "loss": 0.5524, "step": 1758 }, { "epoch": 0.49, "logps_train/chosen": -59.95671081542969, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -97.79368591308594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3226239681243896, "rewards_train/margins": 2.2794010639190674, "rewards_train/rejected": -3.602025032043457, "step": 1758 }, { "epoch": 0.49, "logps_train/chosen": -111.15128326416016, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -154.73086547851562, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -3.0514564514160156, "rewards_train/margins": 1.722020149230957, "rewards_train/rejected": -4.773476600646973, "step": 1759 }, { "epoch": 0.49, "learning_rate": 1.4308500131571538e-07, "loss": 0.4535, "step": 1760 }, { "epoch": 0.49, "logps_train/chosen": -65.82362365722656, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -86.1658935546875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8222059011459351, "rewards_train/margins": 1.7561031579971313, "rewards_train/rejected": -2.5783090591430664, "step": 1760 }, { "epoch": 0.49, "logps_train/chosen": -55.25353240966797, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -39.25, "logps_train/rejected": -59.853878021240234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3292596340179443, "rewards_train/margins": 0.7237551212310791, "rewards_train/rejected": -2.0530147552490234, "step": 1761 }, { "epoch": 0.49, "learning_rate": 1.4252587123801745e-07, "loss": 0.4007, "step": 1762 }, { "epoch": 0.49, "logps_train/chosen": -94.54315185546875, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -107.18714904785156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.1226749420166016, "rewards_train/margins": 1.3268015384674072, "rewards_train/rejected": -3.449476480484009, "step": 1762 }, { "epoch": 0.49, "logps_train/chosen": -45.40351104736328, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -87.63957214355469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.579804003238678, "rewards_train/margins": 1.9727277159690857, "rewards_train/rejected": -2.5525317192077637, "step": 1763 }, { "epoch": 0.49, "learning_rate": 1.419673998490188e-07, "loss": 0.4272, "step": 1764 }, { "epoch": 0.49, "logps_train/chosen": -70.68592834472656, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -68.03724670410156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.372938871383667, "rewards_train/margins": 0.5843008756637573, "rewards_train/rejected": -1.9572397470474243, "step": 1764 }, { "epoch": 0.49, "logps_train/chosen": -122.95270538330078, "logps_train/ref_chosen": -99.0, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -137.17544555664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.398590564727783, "rewards_train/margins": 1.3312108516693115, "rewards_train/rejected": -3.7298014163970947, "step": 1765 }, { "epoch": 0.49, "learning_rate": 1.4140959057148544e-07, "loss": 0.5153, "step": 1766 }, { "epoch": 0.49, "logps_train/chosen": -61.5564079284668, "logps_train/ref_chosen": -47.25, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -60.4739990234375, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.424195408821106, "rewards_train/margins": 0.5708611011505127, "rewards_train/rejected": -1.9950565099716187, "step": 1766 }, { "epoch": 0.49, "logps_train/chosen": -89.47825622558594, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -84.82290649414062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6915267705917358, "rewards_train/margins": -0.07530021667480469, "rewards_train/rejected": -1.6162265539169312, "step": 1767 }, { "epoch": 0.49, "learning_rate": 1.408524468241255e-07, "loss": 0.7928, "step": 1768 }, { "epoch": 0.49, "logps_train/chosen": -74.18833923339844, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -94.327880859375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5604358911514282, "rewards_train/margins": 1.4539932012557983, "rewards_train/rejected": -3.0144290924072266, "step": 1768 }, { "epoch": 0.49, "logps_train/chosen": -82.75665283203125, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -97.08348083496094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9408992528915405, "rewards_train/margins": 0.7682303190231323, "rewards_train/rejected": -2.709129571914673, "step": 1769 }, { "epoch": 0.49, "learning_rate": 1.4029597202156793e-07, "loss": 0.5645, "step": 1770 }, { "epoch": 0.49, "logps_train/chosen": -39.584197998046875, "logps_train/ref_chosen": -29.0, "logps_train/ref_rejected": -28.75, "logps_train/rejected": -47.16441345214844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.0589078664779663, "rewards_train/margins": 0.7766741514205933, "rewards_train/rejected": -1.8355820178985596, "step": 1770 }, { "epoch": 0.49, "logps_train/chosen": -69.99488067626953, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -43.75, "logps_train/rejected": -73.59834289550781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.058838963508606, "rewards_train/margins": 1.936932921409607, "rewards_train/rejected": -2.995771884918213, "step": 1771 }, { "epoch": 0.5, "learning_rate": 1.3974016957434206e-07, "loss": 0.3904, "step": 1772 }, { "epoch": 0.5, "logps_train/chosen": -69.9470443725586, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -80.33355712890625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.2562280893325806, "rewards_train/margins": 1.5761502981185913, "rewards_train/rejected": -2.832378387451172, "step": 1772 }, { "epoch": 0.5, "logps_train/chosen": -91.52737426757812, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -134.87603759765625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.305472493171692, "rewards_train/margins": 2.5784205198287964, "rewards_train/rejected": -3.8838930130004883, "step": 1773 }, { "epoch": 0.5, "learning_rate": 1.3918504288885658e-07, "loss": 0.4017, "step": 1774 }, { "epoch": 0.5, "logps_train/chosen": -103.61128234863281, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -120.56626892089844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.3197214603424072, "rewards_train/margins": 1.8423736095428467, "rewards_train/rejected": -4.162095069885254, "step": 1774 }, { "epoch": 0.5, "logps_train/chosen": -82.91297912597656, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -77.99188995361328, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8811415433883667, "rewards_train/margins": 0.7254692316055298, "rewards_train/rejected": -2.6066107749938965, "step": 1775 }, { "epoch": 0.5, "learning_rate": 1.386305953673782e-07, "loss": 0.5205, "step": 1776 }, { "epoch": 0.5, "logps_train/chosen": -39.34204864501953, "logps_train/ref_chosen": -23.625, "logps_train/ref_rejected": -21.5, "logps_train/rejected": -43.0682373046875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5776619911193848, "rewards_train/margins": 0.5769162178039551, "rewards_train/rejected": -2.15457820892334, "step": 1776 }, { "epoch": 0.5, "logps_train/chosen": -106.58378601074219, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -115.2689208984375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7001752853393555, "rewards_train/margins": 1.4103107452392578, "rewards_train/rejected": -3.1104860305786133, "step": 1777 }, { "epoch": 0.5, "learning_rate": 1.3807683040801153e-07, "loss": 0.5109, "step": 1778 }, { "epoch": 0.5, "logps_train/chosen": -70.5435791015625, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -86.02911376953125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7059206366539001, "rewards_train/margins": 1.1188653111457825, "rewards_train/rejected": -1.8247859477996826, "step": 1778 }, { "epoch": 0.5, "logps_train/chosen": -57.89008712768555, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -75.26858520507812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7229932546615601, "rewards_train/margins": 0.9643144607543945, "rewards_train/rejected": -1.6873077154159546, "step": 1779 }, { "epoch": 0.5, "learning_rate": 1.375237514046777e-07, "loss": 0.3594, "step": 1780 }, { "epoch": 0.5, "logps_train/chosen": -46.415016174316406, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -79.39259338378906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5808570384979248, "rewards_train/margins": 2.629349946975708, "rewards_train/rejected": -3.210206985473633, "step": 1780 }, { "epoch": 0.5, "logps_train/chosen": -37.79524230957031, "logps_train/ref_chosen": -23.875, "logps_train/ref_rejected": -28.875, "logps_train/rejected": -44.77808380126953, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.3961255550384521, "rewards_train/margins": 0.20023739337921143, "rewards_train/rejected": -1.5963629484176636, "step": 1781 }, { "epoch": 0.5, "learning_rate": 1.3697136174709386e-07, "loss": 0.4521, "step": 1782 }, { "epoch": 0.5, "logps_train/chosen": -70.57560729980469, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -91.58863830566406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.71346914768219, "rewards_train/margins": 1.2479337453842163, "rewards_train/rejected": -2.9614028930664062, "step": 1782 }, { "epoch": 0.5, "logps_train/chosen": -57.05122375488281, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -63.5496711730957, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9015088081359863, "rewards_train/margins": 0.7023842334747314, "rewards_train/rejected": -1.6038930416107178, "step": 1783 }, { "epoch": 0.5, "learning_rate": 1.3641966482075205e-07, "loss": 0.4846, "step": 1784 }, { "epoch": 0.5, "logps_train/chosen": -90.04568481445312, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -114.89344787597656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.9240999221801758, "rewards_train/margins": 0.944542407989502, "rewards_train/rejected": -2.8686423301696777, "step": 1784 }, { "epoch": 0.5, "logps_train/chosen": -62.505821228027344, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -55.99300003051758, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.7806605100631714, "rewards_train/margins": 0.21375662088394165, "rewards_train/rejected": -0.994417130947113, "step": 1785 }, { "epoch": 0.5, "learning_rate": 1.35868664006899e-07, "loss": 0.6153, "step": 1786 }, { "epoch": 0.5, "logps_train/chosen": -69.90780639648438, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -36.75, "logps_train/rejected": -54.83738708496094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6353120803833008, "rewards_train/margins": 0.17498886585235596, "rewards_train/rejected": -1.8103009462356567, "step": 1786 }, { "epoch": 0.5, "logps_train/chosen": -84.24098205566406, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -103.84087371826172, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.306910753250122, "rewards_train/margins": 1.5486609935760498, "rewards_train/rejected": -2.855571746826172, "step": 1787 }, { "epoch": 0.5, "learning_rate": 1.3531836268251494e-07, "loss": 0.7109, "step": 1788 }, { "epoch": 0.5, "logps_train/chosen": -83.69889831542969, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -105.857177734375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.070427179336548, "rewards_train/margins": 1.9996659755706787, "rewards_train/rejected": -4.070093154907227, "step": 1788 }, { "epoch": 0.5, "logps_train/chosen": -59.895565032958984, "logps_train/ref_chosen": -47.25, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -80.91615295410156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2710018157958984, "rewards_train/margins": 1.7010817527770996, "rewards_train/rejected": -2.972083568572998, "step": 1789 }, { "epoch": 0.5, "learning_rate": 1.34768764220293e-07, "loss": 0.3358, "step": 1790 }, { "epoch": 0.5, "logps_train/chosen": -119.17463684082031, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -95.608642578125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.7974438667297363, "rewards_train/margins": -0.2254467010498047, "rewards_train/rejected": -2.5719971656799316, "step": 1790 }, { "epoch": 0.5, "logps_train/chosen": -65.99052429199219, "logps_train/ref_chosen": -46.5, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -95.28819274902344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.9435832500457764, "rewards_train/margins": 1.1352365016937256, "rewards_train/rejected": -3.078819751739502, "step": 1791 }, { "epoch": 0.5, "learning_rate": 1.3421987198861866e-07, "loss": 0.8264, "step": 1792 }, { "epoch": 0.5, "logps_train/chosen": -31.896549224853516, "logps_train/ref_chosen": -26.25, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -56.73073959350586, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5677797794342041, "rewards_train/margins": 1.3667197227478027, "rewards_train/rejected": -1.9344995021820068, "step": 1792 }, { "epoch": 0.5, "logps_train/chosen": -77.688720703125, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -86.49745178222656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2786376476287842, "rewards_train/margins": 1.1798484325408936, "rewards_train/rejected": -2.4584860801696777, "step": 1793 }, { "epoch": 0.5, "learning_rate": 1.336716893515492e-07, "loss": 0.4138, "step": 1794 }, { "epoch": 0.5, "logps_train/chosen": -82.18788146972656, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -99.10455322265625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.650233507156372, "rewards_train/margins": 1.616276741027832, "rewards_train/rejected": -3.266510248184204, "step": 1794 }, { "epoch": 0.5, "logps_train/chosen": -83.833251953125, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -111.43186950683594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8843994140625, "rewards_train/margins": 1.9326152801513672, "rewards_train/rejected": -3.817014694213867, "step": 1795 }, { "epoch": 0.5, "learning_rate": 1.3312421966879273e-07, "loss": 0.5196, "step": 1796 }, { "epoch": 0.5, "logps_train/chosen": -96.25450134277344, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -114.88604736328125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6770128011703491, "rewards_train/margins": 2.2950881719589233, "rewards_train/rejected": -3.9721009731292725, "step": 1796 }, { "epoch": 0.5, "logps_train/chosen": -86.13140106201172, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -105.61611938476562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2568906545639038, "rewards_train/margins": 1.9767910242080688, "rewards_train/rejected": -3.2336816787719727, "step": 1797 }, { "epoch": 0.5, "learning_rate": 1.3257746629568772e-07, "loss": 0.3998, "step": 1798 }, { "epoch": 0.5, "logps_train/chosen": -53.450172424316406, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -85.01348114013672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9946267604827881, "rewards_train/margins": 2.2108232975006104, "rewards_train/rejected": -3.2054500579833984, "step": 1798 }, { "epoch": 0.5, "logps_train/chosen": -28.21187973022461, "logps_train/ref_chosen": -25.125, "logps_train/ref_rejected": -22.5, "logps_train/rejected": -30.372562408447266, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.30788224935531616, "rewards_train/margins": 0.4855995178222656, "rewards_train/rejected": -0.7934817671775818, "step": 1799 }, { "epoch": 0.5, "learning_rate": 1.3203143258318272e-07, "loss": 0.4383, "step": 1800 }, { "epoch": 0.5, "logps_train/chosen": -41.677310943603516, "logps_train/ref_chosen": -32.25, "logps_train/ref_rejected": -19.125, "logps_train/rejected": -35.0609130859375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9311587810516357, "rewards_train/margins": 0.6671199798583984, "rewards_train/rejected": -1.5982787609100342, "step": 1800 }, { "epoch": 0.5, "logps_train/chosen": -53.61812973022461, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -89.99407196044922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9002898335456848, "rewards_train/margins": 1.2363728880882263, "rewards_train/rejected": -2.136662721633911, "step": 1801 }, { "epoch": 0.5, "learning_rate": 1.314861218778156e-07, "loss": 0.4323, "step": 1802 }, { "epoch": 0.5, "logps_train/chosen": -71.07485961914062, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -87.70455932617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.233511209487915, "rewards_train/margins": 1.4119446277618408, "rewards_train/rejected": -2.645455837249756, "step": 1802 }, { "epoch": 0.5, "logps_train/chosen": -46.52236557006836, "logps_train/ref_chosen": -36.0, "logps_train/ref_rejected": -35.5, "logps_train/rejected": -55.869659423828125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0539944171905518, "rewards_train/margins": 0.9950804710388184, "rewards_train/rejected": -2.04907488822937, "step": 1803 }, { "epoch": 0.5, "learning_rate": 1.3094153752169307e-07, "loss": 0.3371, "step": 1804 }, { "epoch": 0.5, "logps_train/chosen": -70.13768768310547, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -73.90690612792969, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.3694329261779785, "rewards_train/margins": 1.1970391273498535, "rewards_train/rejected": -2.566472053527832, "step": 1804 }, { "epoch": 0.5, "logps_train/chosen": -53.03296661376953, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -105.7511215209961, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3784918785095215, "rewards_train/margins": 3.680800437927246, "rewards_train/rejected": -5.059292316436768, "step": 1805 }, { "epoch": 0.5, "learning_rate": 1.3039768285246988e-07, "loss": 0.4646, "step": 1806 }, { "epoch": 0.5, "logps_train/chosen": -101.19830322265625, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -137.64564514160156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.6630680561065674, "rewards_train/margins": 1.894465684890747, "rewards_train/rejected": -4.5575337409973145, "step": 1806 }, { "epoch": 0.51, "logps_train/chosen": -35.44614791870117, "logps_train/ref_chosen": -29.5, "logps_train/ref_rejected": -35.75, "logps_train/rejected": -49.23463439941406, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.5943704843521118, "rewards_train/margins": 0.7475991249084473, "rewards_train/rejected": -1.341969609260559, "step": 1807 }, { "epoch": 0.51, "learning_rate": 1.2985456120332906e-07, "loss": 0.3964, "step": 1808 }, { "epoch": 0.51, "logps_train/chosen": -89.47793579101562, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -92.598876953125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2364659309387207, "rewards_train/margins": 0.88826584815979, "rewards_train/rejected": -2.1247317790985107, "step": 1808 }, { "epoch": 0.51, "logps_train/chosen": -56.61750030517578, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -77.45428466796875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.775275707244873, "rewards_train/margins": 0.792320728302002, "rewards_train/rejected": -1.567596435546875, "step": 1809 }, { "epoch": 0.51, "learning_rate": 1.29312175902961e-07, "loss": 0.4759, "step": 1810 }, { "epoch": 0.51, "logps_train/chosen": -69.15110778808594, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -87.00193786621094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2891346216201782, "rewards_train/margins": 1.273363709449768, "rewards_train/rejected": -2.5624983310699463, "step": 1810 }, { "epoch": 0.51, "logps_train/chosen": -117.8868408203125, "logps_train/ref_chosen": -99.5, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -121.09489440917969, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8168091773986816, "rewards_train/margins": 1.848149061203003, "rewards_train/rejected": -3.6649582386016846, "step": 1811 }, { "epoch": 0.51, "learning_rate": 1.2877053027554289e-07, "loss": 0.408, "step": 1812 }, { "epoch": 0.51, "logps_train/chosen": -43.83164978027344, "logps_train/ref_chosen": -33.75, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -74.97264099121094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9997665286064148, "rewards_train/margins": 1.9084346890449524, "rewards_train/rejected": -2.908201217651367, "step": 1812 }, { "epoch": 0.51, "logps_train/chosen": -93.790283203125, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -114.37774658203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8887935876846313, "rewards_train/margins": 1.6864818334579468, "rewards_train/rejected": -3.575275421142578, "step": 1813 }, { "epoch": 0.51, "learning_rate": 1.2822962764071888e-07, "loss": 0.3362, "step": 1814 }, { "epoch": 0.51, "logps_train/chosen": -86.02261352539062, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -119.37617492675781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1738920211791992, "rewards_train/margins": 3.034770965576172, "rewards_train/rejected": -4.208662986755371, "step": 1814 }, { "epoch": 0.51, "logps_train/chosen": -51.77228546142578, "logps_train/ref_chosen": -43.25, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -67.97145080566406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8561345338821411, "rewards_train/margins": 1.2830032110214233, "rewards_train/rejected": -2.1391377449035645, "step": 1815 }, { "epoch": 0.51, "learning_rate": 1.2768947131357937e-07, "loss": 0.33, "step": 1816 }, { "epoch": 0.51, "logps_train/chosen": -54.13633728027344, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -62.22792053222656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9581648111343384, "rewards_train/margins": 0.7034945487976074, "rewards_train/rejected": -1.6616593599319458, "step": 1816 }, { "epoch": 0.51, "logps_train/chosen": -89.08345031738281, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -105.21546173095703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5163532495498657, "rewards_train/margins": 2.492693781852722, "rewards_train/rejected": -4.009047031402588, "step": 1817 }, { "epoch": 0.51, "learning_rate": 1.2715006460464093e-07, "loss": 0.3507, "step": 1818 }, { "epoch": 0.51, "logps_train/chosen": -34.79269790649414, "logps_train/ref_chosen": -26.25, "logps_train/ref_rejected": -26.75, "logps_train/rejected": -40.77827453613281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8591037392616272, "rewards_train/margins": 0.5477034449577332, "rewards_train/rejected": -1.4068071842193604, "step": 1818 }, { "epoch": 0.51, "logps_train/chosen": -131.22537231445312, "logps_train/ref_chosen": -103.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -149.39248657226562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.822537422180176, "rewards_train/margins": 2.0487422943115234, "rewards_train/rejected": -4.871279716491699, "step": 1819 }, { "epoch": 0.51, "learning_rate": 1.2661141081982545e-07, "loss": 0.4243, "step": 1820 }, { "epoch": 0.51, "logps_train/chosen": -88.078857421875, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -113.62788391113281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7383538484573364, "rewards_train/margins": 1.958809733390808, "rewards_train/rejected": -2.6971635818481445, "step": 1820 }, { "epoch": 0.51, "logps_train/chosen": -84.630859375, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -91.62677764892578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7440423965454102, "rewards_train/margins": 1.5096511840820312, "rewards_train/rejected": -3.2536935806274414, "step": 1821 }, { "epoch": 0.51, "learning_rate": 1.2607351326044074e-07, "loss": 0.2819, "step": 1822 }, { "epoch": 0.51, "logps_train/chosen": -63.17647933959961, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -95.75277709960938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2062222957611084, "rewards_train/margins": 2.6799933910369873, "rewards_train/rejected": -3.8862156867980957, "step": 1822 }, { "epoch": 0.51, "logps_train/chosen": -95.8468017578125, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -117.04344177246094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.633227825164795, "rewards_train/margins": 1.5704450607299805, "rewards_train/rejected": -3.2036728858947754, "step": 1823 }, { "epoch": 0.51, "learning_rate": 1.2553637522315967e-07, "loss": 0.2748, "step": 1824 }, { "epoch": 0.51, "logps_train/chosen": -21.561628341674805, "logps_train/ref_chosen": -15.875, "logps_train/ref_rejected": -11.0625, "logps_train/rejected": -25.239707946777344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5680767297744751, "rewards_train/margins": 0.8473002910614014, "rewards_train/rejected": -1.4153770208358765, "step": 1824 }, { "epoch": 0.51, "logps_train/chosen": -93.94432830810547, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -119.57408905029297, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.6206047534942627, "rewards_train/margins": 1.6477415561676025, "rewards_train/rejected": -4.268346309661865, "step": 1825 }, { "epoch": 0.51, "learning_rate": 1.2500000000000005e-07, "loss": 0.4641, "step": 1826 }, { "epoch": 0.51, "logps_train/chosen": -96.84722137451172, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -125.43040466308594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3909720182418823, "rewards_train/margins": 2.451971173286438, "rewards_train/rejected": -3.8429431915283203, "step": 1826 }, { "epoch": 0.51, "logps_train/chosen": -83.45915222167969, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -95.38729858398438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.786729097366333, "rewards_train/margins": 0.764153242111206, "rewards_train/rejected": -2.550882339477539, "step": 1827 }, { "epoch": 0.51, "learning_rate": 1.2446439087830462e-07, "loss": 0.3792, "step": 1828 }, { "epoch": 0.51, "logps_train/chosen": -60.217140197753906, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -69.36685180664062, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.0772801637649536, "rewards_train/margins": 0.9910463094711304, "rewards_train/rejected": -2.068326473236084, "step": 1828 }, { "epoch": 0.51, "logps_train/chosen": -96.72798156738281, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -131.86734008789062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1935014724731445, "rewards_train/margins": 2.4166698455810547, "rewards_train/rejected": -3.610171318054199, "step": 1829 }, { "epoch": 0.51, "learning_rate": 1.2392955114072098e-07, "loss": 0.371, "step": 1830 }, { "epoch": 0.51, "logps_train/chosen": -84.11962127685547, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -118.79898071289062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4166494607925415, "rewards_train/margins": 2.806998133659363, "rewards_train/rejected": -4.223647594451904, "step": 1830 }, { "epoch": 0.51, "logps_train/chosen": -55.4085807800293, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -52.75, "logps_train/rejected": -75.881591796875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2488659620285034, "rewards_train/margins": 1.0535508394241333, "rewards_train/rejected": -2.3024168014526367, "step": 1831 }, { "epoch": 0.51, "learning_rate": 1.23395484065181e-07, "loss": 0.3201, "step": 1832 }, { "epoch": 0.51, "logps_train/chosen": -60.6721076965332, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -78.96751403808594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2451891899108887, "rewards_train/margins": 1.912353754043579, "rewards_train/rejected": -3.1575429439544678, "step": 1832 }, { "epoch": 0.51, "logps_train/chosen": -67.5907974243164, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -79.24272918701172, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8152318000793457, "rewards_train/margins": 0.4012284278869629, "rewards_train/rejected": -2.2164602279663086, "step": 1833 }, { "epoch": 0.51, "learning_rate": 1.228621929248813e-07, "loss": 0.418, "step": 1834 }, { "epoch": 0.51, "logps_train/chosen": -50.921104431152344, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -65.22193908691406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.523750901222229, "rewards_train/margins": 1.4542043209075928, "rewards_train/rejected": -1.9779552221298218, "step": 1834 }, { "epoch": 0.51, "logps_train/chosen": -101.90826416015625, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -126.49099731445312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.65059232711792, "rewards_train/margins": 3.072726249694824, "rewards_train/rejected": -4.723318576812744, "step": 1835 }, { "epoch": 0.51, "learning_rate": 1.2232968098826284e-07, "loss": 0.3201, "step": 1836 }, { "epoch": 0.51, "logps_train/chosen": -51.65633773803711, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -38.25, "logps_train/rejected": -49.79901123046875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.0818445682525635, "rewards_train/margins": 0.0666114091873169, "rewards_train/rejected": -1.1484559774398804, "step": 1836 }, { "epoch": 0.51, "logps_train/chosen": -114.28877258300781, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -164.07217407226562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -3.4196486473083496, "rewards_train/margins": 2.074678897857666, "rewards_train/rejected": -5.494327545166016, "step": 1837 }, { "epoch": 0.51, "learning_rate": 1.217979515189912e-07, "loss": 0.5245, "step": 1838 }, { "epoch": 0.51, "logps_train/chosen": -106.10956573486328, "logps_train/ref_chosen": -86.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -118.07239532470703, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.0093939304351807, "rewards_train/margins": 1.4376893043518066, "rewards_train/rejected": -3.4470832347869873, "step": 1838 }, { "epoch": 0.51, "logps_train/chosen": -70.26261901855469, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -84.50745391845703, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7466719150543213, "rewards_train/margins": 0.9300498962402344, "rewards_train/rejected": -2.6767218112945557, "step": 1839 }, { "epoch": 0.51, "learning_rate": 1.212670077759359e-07, "loss": 0.4998, "step": 1840 }, { "epoch": 0.51, "logps_train/chosen": -88.94002532958984, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -107.8089599609375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5893150568008423, "rewards_train/margins": 1.4259554147720337, "rewards_train/rejected": -3.015270471572876, "step": 1840 }, { "epoch": 0.51, "logps_train/chosen": -74.72373962402344, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -113.5050277709961, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5709335803985596, "rewards_train/margins": 2.22956919670105, "rewards_train/rejected": -3.8005027770996094, "step": 1841 }, { "epoch": 0.51, "learning_rate": 1.2073685301315133e-07, "loss": 0.2643, "step": 1842 }, { "epoch": 0.51, "logps_train/chosen": -88.02828979492188, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -132.9336395263672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.359664797782898, "rewards_train/margins": 2.0989338159561157, "rewards_train/rejected": -3.4585986137390137, "step": 1842 }, { "epoch": 0.52, "logps_train/chosen": -79.47792053222656, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -85.26771545410156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3286516666412354, "rewards_train/margins": 0.9178466796875, "rewards_train/rejected": -2.2464983463287354, "step": 1843 }, { "epoch": 0.52, "learning_rate": 1.2020749047985625e-07, "loss": 0.4137, "step": 1844 }, { "epoch": 0.52, "logps_train/chosen": -43.18343734741211, "logps_train/ref_chosen": -34.75, "logps_train/ref_rejected": -41.75, "logps_train/rejected": -56.77774429321289, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.855648398399353, "rewards_train/margins": 0.6510323286056519, "rewards_train/rejected": -1.5066807270050049, "step": 1844 }, { "epoch": 0.52, "logps_train/chosen": -77.59200286865234, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -100.8201675415039, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.642793893814087, "rewards_train/margins": 1.9681293964385986, "rewards_train/rejected": -3.6109232902526855, "step": 1845 }, { "epoch": 0.52, "learning_rate": 1.196789234204138e-07, "loss": 0.4572, "step": 1846 }, { "epoch": 0.52, "logps_train/chosen": -68.59378814697266, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -101.05061340332031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7761759757995605, "rewards_train/margins": 1.465604305267334, "rewards_train/rejected": -2.2417802810668945, "step": 1846 }, { "epoch": 0.52, "logps_train/chosen": -112.95681762695312, "logps_train/ref_chosen": -94.5, "logps_train/ref_rejected": -127.5, "logps_train/rejected": -179.93466186523438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8581812381744385, "rewards_train/margins": 3.404034376144409, "rewards_train/rejected": -5.262215614318848, "step": 1847 }, { "epoch": 0.52, "learning_rate": 1.1915115507431207e-07, "loss": 0.2125, "step": 1848 }, { "epoch": 0.52, "logps_train/chosen": -78.44413757324219, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -90.45282745361328, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6540809869766235, "rewards_train/margins": 1.2925690412521362, "rewards_train/rejected": -2.9466500282287598, "step": 1848 }, { "epoch": 0.52, "logps_train/chosen": -87.24072265625, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -134.02291870117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4100093841552734, "rewards_train/margins": 2.7375946044921875, "rewards_train/rejected": -4.147603988647461, "step": 1849 }, { "epoch": 0.52, "learning_rate": 1.1862418867614385e-07, "loss": 0.4177, "step": 1850 }, { "epoch": 0.52, "logps_train/chosen": -32.44071960449219, "logps_train/ref_chosen": -25.25, "logps_train/ref_rejected": -28.375, "logps_train/rejected": -43.17241287231445, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.715556263923645, "rewards_train/margins": 0.7645754814147949, "rewards_train/rejected": -1.48013174533844, "step": 1850 }, { "epoch": 0.52, "logps_train/chosen": -138.3934326171875, "logps_train/ref_chosen": -107.0, "logps_train/ref_rejected": -127.5, "logps_train/rejected": -199.50967407226562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -3.13934326171875, "rewards_train/margins": 4.055373191833496, "rewards_train/rejected": -7.194716453552246, "step": 1851 }, { "epoch": 0.52, "learning_rate": 1.1809802745558708e-07, "loss": 0.2895, "step": 1852 }, { "epoch": 0.52, "logps_train/chosen": -67.9940414428711, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -73.91470336914062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.490175724029541, "rewards_train/margins": 1.0944101810455322, "rewards_train/rejected": -2.5845859050750732, "step": 1852 }, { "epoch": 0.52, "logps_train/chosen": -88.08970642089844, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -126.63687133789062, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.615220546722412, "rewards_train/margins": 1.051591396331787, "rewards_train/rejected": -3.666811943054199, "step": 1853 }, { "epoch": 0.52, "learning_rate": 1.1757267463738465e-07, "loss": 0.8369, "step": 1854 }, { "epoch": 0.52, "logps_train/chosen": -85.12269592285156, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -108.0712890625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9829728603363037, "rewards_train/margins": 1.3514995574951172, "rewards_train/rejected": -3.334472417831421, "step": 1854 }, { "epoch": 0.52, "logps_train/chosen": -65.36869049072266, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -88.8803482055664, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6564977169036865, "rewards_train/margins": 1.757904052734375, "rewards_train/rejected": -2.4144017696380615, "step": 1855 }, { "epoch": 0.52, "learning_rate": 1.1704813344132514e-07, "loss": 0.4072, "step": 1856 }, { "epoch": 0.52, "logps_train/chosen": -92.02995300292969, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -105.53173065185547, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.514713764190674, "rewards_train/margins": 0.7595534324645996, "rewards_train/rejected": -3.2742671966552734, "step": 1856 }, { "epoch": 0.52, "logps_train/chosen": -79.29387664794922, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -108.56607055664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4967706203460693, "rewards_train/margins": 1.8145246505737305, "rewards_train/rejected": -3.3112952709198, "step": 1857 }, { "epoch": 0.52, "learning_rate": 1.1652440708222284e-07, "loss": 0.3761, "step": 1858 }, { "epoch": 0.52, "logps_train/chosen": -108.27483367919922, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -126.79065704345703, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -3.0460376739501953, "rewards_train/margins": 1.5861525535583496, "rewards_train/rejected": -4.632190227508545, "step": 1858 }, { "epoch": 0.52, "logps_train/chosen": -107.4876708984375, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -123.75631713867188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.5261106491088867, "rewards_train/margins": 1.0970797538757324, "rewards_train/rejected": -3.623190402984619, "step": 1859 }, { "epoch": 0.52, "learning_rate": 1.1600149876989784e-07, "loss": 0.3844, "step": 1860 }, { "epoch": 0.52, "logps_train/chosen": -57.655517578125, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -79.86770629882812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9548091888427734, "rewards_train/margins": 1.0356730222702026, "rewards_train/rejected": -1.990482211112976, "step": 1860 }, { "epoch": 0.52, "logps_train/chosen": -66.32403564453125, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -92.36734008789062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4661928415298462, "rewards_train/margins": 1.7510100603103638, "rewards_train/rejected": -3.21720290184021, "step": 1861 }, { "epoch": 0.52, "learning_rate": 1.1547941170915685e-07, "loss": 0.3558, "step": 1862 }, { "epoch": 0.52, "logps_train/chosen": -42.74238586425781, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -50.778236389160156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.33380889892578125, "rewards_train/margins": 0.9889365434646606, "rewards_train/rejected": -1.322745442390442, "step": 1862 }, { "epoch": 0.52, "logps_train/chosen": -107.78318786621094, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -152.5138702392578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6712876558303833, "rewards_train/margins": 3.313693404197693, "rewards_train/rejected": -4.984981060028076, "step": 1863 }, { "epoch": 0.52, "learning_rate": 1.1495814909977311e-07, "loss": 0.3325, "step": 1864 }, { "epoch": 0.52, "logps_train/chosen": -66.98416137695312, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -87.318115234375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2640411853790283, "rewards_train/margins": 1.536618947982788, "rewards_train/rejected": -2.8006601333618164, "step": 1864 }, { "epoch": 0.52, "logps_train/chosen": -78.55347442626953, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -84.07122802734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9069098830223083, "rewards_train/margins": 1.95372873544693, "rewards_train/rejected": -2.8606386184692383, "step": 1865 }, { "epoch": 0.52, "learning_rate": 1.1443771413646711e-07, "loss": 0.3175, "step": 1866 }, { "epoch": 0.52, "logps_train/chosen": -71.20903015136719, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -97.3011474609375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8240283727645874, "rewards_train/margins": 2.2349923849105835, "rewards_train/rejected": -3.059020757675171, "step": 1866 }, { "epoch": 0.52, "logps_train/chosen": -49.83819580078125, "logps_train/ref_chosen": -38.25, "logps_train/ref_rejected": -40.75, "logps_train/rejected": -61.071800231933594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1485167741775513, "rewards_train/margins": 0.8909875154495239, "rewards_train/rejected": -2.039504289627075, "step": 1867 }, { "epoch": 0.52, "learning_rate": 1.139181100088866e-07, "loss": 0.4216, "step": 1868 }, { "epoch": 0.52, "logps_train/chosen": -102.03263854980469, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -134.36888122558594, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.270841360092163, "rewards_train/margins": 1.7730779647827148, "rewards_train/rejected": -3.043919324874878, "step": 1868 }, { "epoch": 0.52, "logps_train/chosen": -87.11683654785156, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -103.55712890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1390278339385986, "rewards_train/margins": 2.2823097705841064, "rewards_train/rejected": -3.421337604522705, "step": 1869 }, { "epoch": 0.52, "learning_rate": 1.1339933990158749e-07, "loss": 0.2687, "step": 1870 }, { "epoch": 0.52, "logps_train/chosen": -62.227073669433594, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -83.20600891113281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6632344722747803, "rewards_train/margins": 1.267132043838501, "rewards_train/rejected": -2.9303665161132812, "step": 1870 }, { "epoch": 0.52, "logps_train/chosen": -22.738014221191406, "logps_train/ref_chosen": -16.0, "logps_train/ref_rejected": -22.375, "logps_train/rejected": -36.29288101196289, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6761450171470642, "rewards_train/margins": 0.7185729146003723, "rewards_train/rejected": -1.3947179317474365, "step": 1871 }, { "epoch": 0.52, "learning_rate": 1.1288140699401421e-07, "loss": 0.4092, "step": 1872 }, { "epoch": 0.52, "logps_train/chosen": -57.997798919677734, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -73.33271789550781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6044674515724182, "rewards_train/margins": 1.2905232310295105, "rewards_train/rejected": -1.8949906826019287, "step": 1872 }, { "epoch": 0.52, "logps_train/chosen": -54.176692962646484, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -86.22315979003906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8534112572669983, "rewards_train/margins": 2.4538163542747498, "rewards_train/rejected": -3.307227611541748, "step": 1873 }, { "epoch": 0.52, "learning_rate": 1.1236431446047984e-07, "loss": 0.3006, "step": 1874 }, { "epoch": 0.52, "logps_train/chosen": -81.40237426757812, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -106.32112121582031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4635772705078125, "rewards_train/margins": 2.0947067737579346, "rewards_train/rejected": -3.558284044265747, "step": 1874 }, { "epoch": 0.52, "logps_train/chosen": -94.24456787109375, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -120.14347839355469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.484222173690796, "rewards_train/margins": 1.7141101360321045, "rewards_train/rejected": -3.1983323097229004, "step": 1875 }, { "epoch": 0.52, "learning_rate": 1.1184806547014725e-07, "loss": 0.4348, "step": 1876 }, { "epoch": 0.52, "logps_train/chosen": -77.82167053222656, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -101.10649108886719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.754042148590088, "rewards_train/margins": 1.5484039783477783, "rewards_train/rejected": -3.302446126937866, "step": 1876 }, { "epoch": 0.52, "logps_train/chosen": -110.63151550292969, "logps_train/ref_chosen": -86.0, "logps_train/ref_rejected": -82.5, "logps_train/rejected": -128.6043243408203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.447136163711548, "rewards_train/margins": 2.15899920463562, "rewards_train/rejected": -4.606135368347168, "step": 1877 }, { "epoch": 0.52, "learning_rate": 1.1133266318700931e-07, "loss": 0.3177, "step": 1878 }, { "epoch": 0.52, "logps_train/chosen": -72.16413116455078, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -106.85801696777344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9018137454986572, "rewards_train/margins": 2.35762095451355, "rewards_train/rejected": -4.259434700012207, "step": 1878 }, { "epoch": 0.53, "logps_train/chosen": -96.20481872558594, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -99.30570983886719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.954075813293457, "rewards_train/margins": 1.7632145881652832, "rewards_train/rejected": -3.7172904014587402, "step": 1879 }, { "epoch": 0.53, "learning_rate": 1.1081811076986963e-07, "loss": 0.3477, "step": 1880 }, { "epoch": 0.53, "logps_train/chosen": -103.07200622558594, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -133.30667114257812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.6540753841400146, "rewards_train/margins": 2.924247980117798, "rewards_train/rejected": -5.5783233642578125, "step": 1880 }, { "epoch": 0.53, "logps_train/chosen": -75.77108764648438, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -73.75123596191406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3454680442810059, "rewards_train/margins": 1.103874683380127, "rewards_train/rejected": -2.449342727661133, "step": 1881 }, { "epoch": 0.53, "learning_rate": 1.1030441137232294e-07, "loss": 0.2516, "step": 1882 }, { "epoch": 0.53, "logps_train/chosen": -61.58794403076172, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -61.39958953857422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5322317481040955, "rewards_train/margins": 0.9374150633811951, "rewards_train/rejected": -1.4696468114852905, "step": 1882 }, { "epoch": 0.53, "logps_train/chosen": -72.37861633300781, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -91.42536163330078, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.844501793384552, "rewards_train/margins": 1.1734248995780945, "rewards_train/rejected": -2.0179266929626465, "step": 1883 }, { "epoch": 0.53, "learning_rate": 1.0979156814273621e-07, "loss": 0.3806, "step": 1884 }, { "epoch": 0.53, "logps_train/chosen": -65.94540405273438, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -95.26437377929688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1222748756408691, "rewards_train/margins": 1.2508411407470703, "rewards_train/rejected": -2.3731160163879395, "step": 1884 }, { "epoch": 0.53, "logps_train/chosen": -69.76664733886719, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -105.36920166015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9269583225250244, "rewards_train/margins": 1.6904304027557373, "rewards_train/rejected": -3.6173887252807617, "step": 1885 }, { "epoch": 0.53, "learning_rate": 1.0927958422422911e-07, "loss": 0.3645, "step": 1886 }, { "epoch": 0.53, "logps_train/chosen": -31.979169845581055, "logps_train/ref_chosen": -21.375, "logps_train/ref_rejected": -26.5, "logps_train/rejected": -47.55209732055664, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0623213052749634, "rewards_train/margins": 1.0368338823318481, "rewards_train/rejected": -2.0991551876068115, "step": 1886 }, { "epoch": 0.53, "logps_train/chosen": -80.36085510253906, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -94.09173583984375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7400413751602173, "rewards_train/margins": 1.7382735013961792, "rewards_train/rejected": -3.4783148765563965, "step": 1887 }, { "epoch": 0.53, "learning_rate": 1.0876846275465453e-07, "loss": 0.4227, "step": 1888 }, { "epoch": 0.53, "logps_train/chosen": -104.34622955322266, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -97.5, "logps_train/rejected": -139.8173828125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -3.422123432159424, "rewards_train/margins": 0.8174281120300293, "rewards_train/rejected": -4.239551544189453, "step": 1888 }, { "epoch": 0.53, "logps_train/chosen": -118.78239440917969, "logps_train/ref_chosen": -107.0, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -154.7715606689453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1876142024993896, "rewards_train/margins": 3.4661037921905518, "rewards_train/rejected": -4.653717994689941, "step": 1889 }, { "epoch": 0.53, "learning_rate": 1.0825820686657978e-07, "loss": 0.2776, "step": 1890 }, { "epoch": 0.53, "logps_train/chosen": -92.30839538574219, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -105.17121887207031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8191206455230713, "rewards_train/margins": 1.3874547481536865, "rewards_train/rejected": -3.206575393676758, "step": 1890 }, { "epoch": 0.53, "logps_train/chosen": -89.18570709228516, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -102.78724670410156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.908805251121521, "rewards_train/margins": 1.424216628074646, "rewards_train/rejected": -3.333021879196167, "step": 1891 }, { "epoch": 0.53, "learning_rate": 1.0774881968726721e-07, "loss": 0.4718, "step": 1892 }, { "epoch": 0.53, "logps_train/chosen": -70.77232360839844, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -100.51925659179688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.485436201095581, "rewards_train/margins": 1.9508650302886963, "rewards_train/rejected": -3.4363012313842773, "step": 1892 }, { "epoch": 0.53, "logps_train/chosen": -89.73664855957031, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -104.7386474609375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.4354815483093262, "rewards_train/margins": 1.6090869903564453, "rewards_train/rejected": -3.0445685386657715, "step": 1893 }, { "epoch": 0.53, "learning_rate": 1.0724030433865475e-07, "loss": 0.3676, "step": 1894 }, { "epoch": 0.53, "logps_train/chosen": -107.09449768066406, "logps_train/ref_chosen": -83.5, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -117.95587158203125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.3559341430664062, "rewards_train/margins": 0.7964887619018555, "rewards_train/rejected": -3.1524229049682617, "step": 1894 }, { "epoch": 0.53, "logps_train/chosen": -67.088623046875, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -95.45256805419922, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3133540153503418, "rewards_train/margins": 1.683074712753296, "rewards_train/rejected": -2.9964287281036377, "step": 1895 }, { "epoch": 0.53, "learning_rate": 1.0673266393733732e-07, "loss": 0.48, "step": 1896 }, { "epoch": 0.53, "logps_train/chosen": -57.43756103515625, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -59.90660095214844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7033750414848328, "rewards_train/margins": 1.03792005777359, "rewards_train/rejected": -1.7412950992584229, "step": 1896 }, { "epoch": 0.53, "logps_train/chosen": -39.66823196411133, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -69.99291229248047, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5634051561355591, "rewards_train/margins": 1.352292537689209, "rewards_train/rejected": -1.915697693824768, "step": 1897 }, { "epoch": 0.53, "learning_rate": 1.062259015945474e-07, "loss": 0.4262, "step": 1898 }, { "epoch": 0.53, "logps_train/chosen": -73.24278259277344, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -113.96565246582031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.681017279624939, "rewards_train/margins": 2.301095128059387, "rewards_train/rejected": -3.982112407684326, "step": 1898 }, { "epoch": 0.53, "logps_train/chosen": -30.016342163085938, "logps_train/ref_chosen": -23.125, "logps_train/ref_rejected": -29.75, "logps_train/rejected": -41.85903549194336, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6856186389923096, "rewards_train/margins": 0.5196206569671631, "rewards_train/rejected": -1.2052392959594727, "step": 1899 }, { "epoch": 0.53, "learning_rate": 1.0572002041613597e-07, "loss": 0.3987, "step": 1900 }, { "epoch": 0.53, "logps_train/chosen": -82.3949203491211, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -112.35581970214844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3531635999679565, "rewards_train/margins": 1.0581990480422974, "rewards_train/rejected": -2.411362648010254, "step": 1900 }, { "epoch": 0.53, "logps_train/chosen": -45.51484680175781, "logps_train/ref_chosen": -37.5, "logps_train/ref_rejected": -43.75, "logps_train/rejected": -69.83395385742188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8014847040176392, "rewards_train/margins": 1.8037861585617065, "rewards_train/rejected": -2.6052708625793457, "step": 1901 }, { "epoch": 0.53, "learning_rate": 1.0521502350255346e-07, "loss": 0.3813, "step": 1902 }, { "epoch": 0.53, "logps_train/chosen": -90.20823669433594, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -119.5953140258789, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5494375228881836, "rewards_train/margins": 2.3565785884857178, "rewards_train/rejected": -3.9060161113739014, "step": 1902 }, { "epoch": 0.53, "logps_train/chosen": -79.43113708496094, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -80.36499786376953, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2706531286239624, "rewards_train/margins": 1.3228775262832642, "rewards_train/rejected": -2.5935306549072266, "step": 1903 }, { "epoch": 0.53, "learning_rate": 1.0471091394883085e-07, "loss": 0.3232, "step": 1904 }, { "epoch": 0.53, "logps_train/chosen": -100.63670349121094, "logps_train/ref_chosen": -88.5, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -122.93672180175781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2228502035140991, "rewards_train/margins": 2.218478798866272, "rewards_train/rejected": -3.441329002380371, "step": 1904 }, { "epoch": 0.53, "logps_train/chosen": -56.86190414428711, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -52.75, "logps_train/rejected": -73.1705551147461, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.824276328086853, "rewards_train/margins": 1.226080060005188, "rewards_train/rejected": -2.050356388092041, "step": 1905 }, { "epoch": 0.53, "learning_rate": 1.0420769484456085e-07, "loss": 0.3196, "step": 1906 }, { "epoch": 0.53, "logps_train/chosen": -61.427406311035156, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -91.34333801269531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.938248336315155, "rewards_train/margins": 2.617960751056671, "rewards_train/rejected": -3.556209087371826, "step": 1906 }, { "epoch": 0.53, "logps_train/chosen": -67.15316772460938, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -80.15322875976562, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.393442153930664, "rewards_train/margins": 0.6664113998413086, "rewards_train/rejected": -2.0598535537719727, "step": 1907 }, { "epoch": 0.53, "learning_rate": 1.0370536927387838e-07, "loss": 0.4639, "step": 1908 }, { "epoch": 0.53, "logps_train/chosen": -106.0721206665039, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -131.2165069580078, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.347836971282959, "rewards_train/margins": 1.3841662406921387, "rewards_train/rejected": -3.7320032119750977, "step": 1908 }, { "epoch": 0.53, "logps_train/chosen": -132.96841430664062, "logps_train/ref_chosen": -100.5, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -168.9968719482422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -3.2632477283477783, "rewards_train/margins": 2.3536274433135986, "rewards_train/rejected": -5.616875171661377, "step": 1909 }, { "epoch": 0.53, "learning_rate": 1.0320394031544238e-07, "loss": 0.4148, "step": 1910 }, { "epoch": 0.53, "logps_train/chosen": -55.784812927246094, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -78.22610473632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4109032452106476, "rewards_train/margins": 2.4781133830547333, "rewards_train/rejected": -2.889016628265381, "step": 1910 }, { "epoch": 0.53, "logps_train/chosen": -85.2554702758789, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -114.61570739746094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.309140682220459, "rewards_train/margins": 1.967566967010498, "rewards_train/rejected": -4.276707649230957, "step": 1911 }, { "epoch": 0.53, "learning_rate": 1.0270341104241654e-07, "loss": 0.2673, "step": 1912 }, { "epoch": 0.53, "logps_train/chosen": -52.6467399597168, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -34.0, "logps_train/rejected": -51.80069351196289, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.644459068775177, "rewards_train/margins": 1.1443992257118225, "rewards_train/rejected": -1.7888582944869995, "step": 1912 }, { "epoch": 0.53, "logps_train/chosen": -48.017059326171875, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -46.0, "logps_train/rejected": -62.773712158203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8917452096939087, "rewards_train/margins": 0.7830871343612671, "rewards_train/rejected": -1.6748323440551758, "step": 1913 }, { "epoch": 0.53, "learning_rate": 1.0220378452245057e-07, "loss": 0.493, "step": 1914 }, { "epoch": 0.53, "logps_train/chosen": -72.70944213867188, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -98.0931625366211, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.150045394897461, "rewards_train/margins": 1.0202088356018066, "rewards_train/rejected": -3.1702542304992676, "step": 1914 }, { "epoch": 0.54, "logps_train/chosen": -56.9299201965332, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -75.45361328125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8105697631835938, "rewards_train/margins": 1.608229160308838, "rewards_train/rejected": -2.4187989234924316, "step": 1915 }, { "epoch": 0.54, "learning_rate": 1.0170506381766119e-07, "loss": 0.4271, "step": 1916 }, { "epoch": 0.54, "logps_train/chosen": -72.64109802246094, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -104.75239562988281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1496570110321045, "rewards_train/margins": 2.017770290374756, "rewards_train/rejected": -3.1674273014068604, "step": 1916 }, { "epoch": 0.54, "logps_train/chosen": -64.56470489501953, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -72.0902099609375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6527595520019531, "rewards_train/margins": 0.7912225723266602, "rewards_train/rejected": -2.4439821243286133, "step": 1917 }, { "epoch": 0.54, "learning_rate": 1.0120725198461383e-07, "loss": 0.5625, "step": 1918 }, { "epoch": 0.54, "logps_train/chosen": -83.3591537475586, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -106.28707885742188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6050564050674438, "rewards_train/margins": 1.9398621320724487, "rewards_train/rejected": -3.5449185371398926, "step": 1918 }, { "epoch": 0.54, "logps_train/chosen": -32.98208999633789, "logps_train/ref_chosen": -26.125, "logps_train/ref_rejected": -34.5, "logps_train/rejected": -54.504356384277344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6852939128875732, "rewards_train/margins": 1.3149466514587402, "rewards_train/rejected": -2.0002405643463135, "step": 1919 }, { "epoch": 0.54, "learning_rate": 1.007103520743035e-07, "loss": 0.4866, "step": 1920 }, { "epoch": 0.54, "logps_train/chosen": -107.27156066894531, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -139.72808837890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.063093662261963, "rewards_train/margins": 1.846433162689209, "rewards_train/rejected": -3.909526824951172, "step": 1920 }, { "epoch": 0.54, "logps_train/chosen": -72.44178009033203, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -92.11134338378906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9953500032424927, "rewards_train/margins": 1.080236792564392, "rewards_train/rejected": -3.0755867958068848, "step": 1921 }, { "epoch": 0.54, "learning_rate": 1.0021436713213605e-07, "loss": 0.3617, "step": 1922 }, { "epoch": 0.54, "logps_train/chosen": -84.2673110961914, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -99.67121887207031, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.077024221420288, "rewards_train/margins": 0.6986920833587646, "rewards_train/rejected": -2.7757163047790527, "step": 1922 }, { "epoch": 0.54, "logps_train/chosen": -48.90370178222656, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -66.9290771484375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6816787123680115, "rewards_train/margins": 1.1111312508583069, "rewards_train/rejected": -1.7928099632263184, "step": 1923 }, { "epoch": 0.54, "learning_rate": 9.97193001979099e-08, "loss": 0.4458, "step": 1924 }, { "epoch": 0.54, "logps_train/chosen": -90.2362060546875, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -97.33097076416016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.97967529296875, "rewards_train/margins": 0.9403355121612549, "rewards_train/rejected": -2.920010805130005, "step": 1924 }, { "epoch": 0.54, "logps_train/chosen": -89.4881362915039, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -107.73068237304688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.316781997680664, "rewards_train/margins": 1.0887079238891602, "rewards_train/rejected": -3.405489921569824, "step": 1925 }, { "epoch": 0.54, "learning_rate": 9.922515430579706e-08, "loss": 0.487, "step": 1926 }, { "epoch": 0.54, "logps_train/chosen": -54.097442626953125, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -69.08293151855469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3391391038894653, "rewards_train/margins": 0.815247654914856, "rewards_train/rejected": -2.1543867588043213, "step": 1926 }, { "epoch": 0.54, "logps_train/chosen": -33.45086669921875, "logps_train/ref_chosen": -28.125, "logps_train/ref_rejected": -21.25, "logps_train/rejected": -34.63164520263672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5357113480567932, "rewards_train/margins": 0.8057733178138733, "rewards_train/rejected": -1.3414846658706665, "step": 1927 }, { "epoch": 0.54, "learning_rate": 9.873193248432474e-08, "loss": 0.5634, "step": 1928 }, { "epoch": 0.54, "logps_train/chosen": -70.0160903930664, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -88.89080810546875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.854928970336914, "rewards_train/margins": 1.8595430850982666, "rewards_train/rejected": -3.7144720554351807, "step": 1928 }, { "epoch": 0.54, "logps_train/chosen": -36.294586181640625, "logps_train/ref_chosen": -31.5, "logps_train/ref_rejected": -27.375, "logps_train/rejected": -38.279640197753906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.47565028071403503, "rewards_train/margins": 0.6132514774799347, "rewards_train/rejected": -1.0889017581939697, "step": 1929 }, { "epoch": 0.54, "learning_rate": 9.823963775635649e-08, "loss": 0.61, "step": 1930 }, { "epoch": 0.54, "logps_train/chosen": -110.10010528564453, "logps_train/ref_chosen": -84.5, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -130.25558471679688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.5490732192993164, "rewards_train/margins": 1.4658403396606445, "rewards_train/rejected": -4.014913558959961, "step": 1930 }, { "epoch": 0.54, "logps_train/chosen": -73.57592010498047, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -42.25, "logps_train/rejected": -71.04751586914062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.130516529083252, "rewards_train/margins": 0.7465004920959473, "rewards_train/rejected": -2.877017021179199, "step": 1931 }, { "epoch": 0.54, "learning_rate": 9.774827313907402e-08, "loss": 0.6667, "step": 1932 }, { "epoch": 0.54, "logps_train/chosen": -42.27134704589844, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -40.75, "logps_train/rejected": -54.67235565185547, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6021837592124939, "rewards_train/margins": 0.7863407731056213, "rewards_train/rejected": -1.3885245323181152, "step": 1932 }, { "epoch": 0.54, "logps_train/chosen": -119.25166320800781, "logps_train/ref_chosen": -86.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -141.34066772460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -3.3017287254333496, "rewards_train/margins": 1.8729634284973145, "rewards_train/rejected": -5.174692153930664, "step": 1933 }, { "epoch": 0.54, "learning_rate": 9.725784164395869e-08, "loss": 0.3698, "step": 1934 }, { "epoch": 0.54, "logps_train/chosen": -76.71092224121094, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -98.15559387207031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.210374355316162, "rewards_train/margins": 1.2499113082885742, "rewards_train/rejected": -2.4602856636047363, "step": 1934 }, { "epoch": 0.54, "logps_train/chosen": -120.2344970703125, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -142.18515014648438, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.640246868133545, "rewards_train/margins": 1.0337355136871338, "rewards_train/rejected": -3.6739823818206787, "step": 1935 }, { "epoch": 0.54, "learning_rate": 9.67683462767726e-08, "loss": 0.5539, "step": 1936 }, { "epoch": 0.54, "logps_train/chosen": -38.442298889160156, "logps_train/ref_chosen": -35.0, "logps_train/ref_rejected": -39.75, "logps_train/rejected": -54.857452392578125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3488198220729828, "rewards_train/margins": 1.1718863546848297, "rewards_train/rejected": -1.5207061767578125, "step": 1936 }, { "epoch": 0.54, "logps_train/chosen": -72.53105163574219, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -52.75, "logps_train/rejected": -80.7066421508789, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6605265140533447, "rewards_train/margins": 2.13670015335083, "rewards_train/rejected": -2.797226667404175, "step": 1937 }, { "epoch": 0.54, "learning_rate": 9.627979003754081e-08, "loss": 0.3234, "step": 1938 }, { "epoch": 0.54, "logps_train/chosen": -45.484642028808594, "logps_train/ref_chosen": -31.375, "logps_train/ref_rejected": -25.0, "logps_train/rejected": -40.9714469909668, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.413405418395996, "rewards_train/margins": 0.1870594024658203, "rewards_train/rejected": -1.6004648208618164, "step": 1938 }, { "epoch": 0.54, "logps_train/chosen": -68.18218231201172, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -75.88032531738281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.1447806358337402, "rewards_train/margins": 0.7670800685882568, "rewards_train/rejected": -2.911860704421997, "step": 1939 }, { "epoch": 0.54, "learning_rate": 9.579217592053249e-08, "loss": 0.5756, "step": 1940 }, { "epoch": 0.54, "logps_train/chosen": -32.406002044677734, "logps_train/ref_chosen": -23.5, "logps_train/ref_rejected": -33.75, "logps_train/rejected": -49.05712127685547, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8916256427764893, "rewards_train/margins": 0.6273677349090576, "rewards_train/rejected": -1.5189933776855469, "step": 1940 }, { "epoch": 0.54, "logps_train/chosen": -64.72395324707031, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -93.2551040649414, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1455401182174683, "rewards_train/margins": 1.7887598276138306, "rewards_train/rejected": -2.934299945831299, "step": 1941 }, { "epoch": 0.54, "learning_rate": 9.530550691424283e-08, "loss": 0.4961, "step": 1942 }, { "epoch": 0.54, "logps_train/chosen": -86.69308471679688, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -131.5696563720703, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.000558853149414, "rewards_train/margins": 2.1673450469970703, "rewards_train/rejected": -4.167903900146484, "step": 1942 }, { "epoch": 0.54, "logps_train/chosen": -69.41508483886719, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -71.49929809570312, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.0921919345855713, "rewards_train/margins": 0.15363574028015137, "rewards_train/rejected": -2.2458276748657227, "step": 1943 }, { "epoch": 0.54, "learning_rate": 9.481978600137435e-08, "loss": 0.4903, "step": 1944 }, { "epoch": 0.54, "logps_train/chosen": -87.29005432128906, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -101.59136962890625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3891611099243164, "rewards_train/margins": 1.0793511867523193, "rewards_train/rejected": -2.4685122966766357, "step": 1944 }, { "epoch": 0.54, "logps_train/chosen": -65.32451629638672, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -79.38421630859375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5969047546386719, "rewards_train/margins": 1.0954232215881348, "rewards_train/rejected": -2.6923279762268066, "step": 1945 }, { "epoch": 0.54, "learning_rate": 9.433501615881923e-08, "loss": 0.516, "step": 1946 }, { "epoch": 0.54, "logps_train/chosen": -71.54859924316406, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -86.97831726074219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.762526035308838, "rewards_train/margins": 0.750784158706665, "rewards_train/rejected": -2.513310194015503, "step": 1946 }, { "epoch": 0.54, "logps_train/chosen": -94.74353790283203, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -117.25440979003906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7579474449157715, "rewards_train/margins": 1.9831185340881348, "rewards_train/rejected": -3.7410659790039062, "step": 1947 }, { "epoch": 0.54, "learning_rate": 9.385120035764057e-08, "loss": 0.4191, "step": 1948 }, { "epoch": 0.54, "logps_train/chosen": -58.763999938964844, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -75.54961395263672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9271815419197083, "rewards_train/margins": 1.110592544078827, "rewards_train/rejected": -2.037774085998535, "step": 1948 }, { "epoch": 0.54, "logps_train/chosen": -105.40151977539062, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -141.94189453125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3798977136611938, "rewards_train/margins": 2.4640473127365112, "rewards_train/rejected": -3.843945026397705, "step": 1949 }, { "epoch": 0.54, "learning_rate": 9.33683415630542e-08, "loss": 0.3266, "step": 1950 }, { "epoch": 0.54, "logps_train/chosen": -44.61640167236328, "logps_train/ref_chosen": -42.5, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -69.68132019042969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.22238191962242126, "rewards_train/margins": 0.8638648092746735, "rewards_train/rejected": -1.0862467288970947, "step": 1950 }, { "epoch": 0.55, "logps_train/chosen": -78.37663269042969, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -103.28611755371094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.0622730255126953, "rewards_train/margins": 2.4971981048583984, "rewards_train/rejected": -4.559471130371094, "step": 1951 }, { "epoch": 0.55, "learning_rate": 9.288644273441082e-08, "loss": 0.3534, "step": 1952 }, { "epoch": 0.55, "logps_train/chosen": -43.369773864746094, "logps_train/ref_chosen": -36.75, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -81.58674621582031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.652895450592041, "rewards_train/margins": 2.721795082092285, "rewards_train/rejected": -3.374690532684326, "step": 1952 }, { "epoch": 0.55, "logps_train/chosen": -43.837947845458984, "logps_train/ref_chosen": -34.25, "logps_train/ref_rejected": -41.75, "logps_train/rejected": -72.29368591308594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.962164044380188, "rewards_train/margins": 2.0916680097579956, "rewards_train/rejected": -3.0538320541381836, "step": 1953 }, { "epoch": 0.55, "learning_rate": 9.240550682517767e-08, "loss": 0.2719, "step": 1954 }, { "epoch": 0.55, "logps_train/chosen": -100.79878997802734, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -133.30992126464844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.3505821228027344, "rewards_train/margins": 2.0655670166015625, "rewards_train/rejected": -4.416149139404297, "step": 1954 }, { "epoch": 0.55, "logps_train/chosen": -72.240478515625, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -87.75106811523438, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3447513580322266, "rewards_train/margins": 1.1574063301086426, "rewards_train/rejected": -2.502157688140869, "step": 1955 }, { "epoch": 0.55, "learning_rate": 9.192553678292028e-08, "loss": 0.4237, "step": 1956 }, { "epoch": 0.55, "logps_train/chosen": -58.43614196777344, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -66.32608032226562, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6959576606750488, "rewards_train/margins": 0.8934860229492188, "rewards_train/rejected": -1.5894436836242676, "step": 1956 }, { "epoch": 0.55, "logps_train/chosen": -108.7609634399414, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -138.67153930664062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5696511268615723, "rewards_train/margins": 1.7826590538024902, "rewards_train/rejected": -3.3523101806640625, "step": 1957 }, { "epoch": 0.55, "learning_rate": 9.14465355492847e-08, "loss": 0.3997, "step": 1958 }, { "epoch": 0.55, "logps_train/chosen": -57.13661193847656, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -78.82752990722656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7325087189674377, "rewards_train/margins": 1.3393065333366394, "rewards_train/rejected": -2.071815252304077, "step": 1958 }, { "epoch": 0.55, "logps_train/chosen": -67.84454345703125, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -41.5, "logps_train/rejected": -57.69300842285156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3555479049682617, "rewards_train/margins": 0.2727372646331787, "rewards_train/rejected": -1.6282851696014404, "step": 1959 }, { "epoch": 0.55, "learning_rate": 9.09685060599793e-08, "loss": 0.4911, "step": 1960 }, { "epoch": 0.55, "logps_train/chosen": -36.913307189941406, "logps_train/ref_chosen": -25.5, "logps_train/ref_rejected": -21.25, "logps_train/rejected": -37.476165771484375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1362526416778564, "rewards_train/margins": 0.4801138639450073, "rewards_train/rejected": -1.6163665056228638, "step": 1960 }, { "epoch": 0.55, "logps_train/chosen": -109.95804595947266, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -153.1173858642578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9790079593658447, "rewards_train/margins": 2.569059133529663, "rewards_train/rejected": -4.548067092895508, "step": 1961 }, { "epoch": 0.55, "learning_rate": 9.049145124475697e-08, "loss": 0.4072, "step": 1962 }, { "epoch": 0.55, "logps_train/chosen": -97.06443786621094, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -104.98481750488281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.699608325958252, "rewards_train/margins": 1.2527806758880615, "rewards_train/rejected": -3.9523890018463135, "step": 1962 }, { "epoch": 0.55, "logps_train/chosen": -102.10678100585938, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -99.5, "logps_train/rejected": -146.4178466796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4352869987487793, "rewards_train/margins": 3.264212131500244, "rewards_train/rejected": -4.699499130249023, "step": 1963 }, { "epoch": 0.55, "learning_rate": 9.001537402739656e-08, "loss": 0.5195, "step": 1964 }, { "epoch": 0.55, "logps_train/chosen": -77.20133972167969, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -111.71385192871094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7654464244842529, "rewards_train/margins": 2.846759080886841, "rewards_train/rejected": -3.6122055053710938, "step": 1964 }, { "epoch": 0.55, "logps_train/chosen": -32.98747634887695, "logps_train/ref_chosen": -23.25, "logps_train/ref_rejected": -20.375, "logps_train/rejected": -35.0021858215332, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9791187644004822, "rewards_train/margins": 0.48169535398483276, "rewards_train/rejected": -1.460814118385315, "step": 1965 }, { "epoch": 0.55, "learning_rate": 8.95402773256859e-08, "loss": 0.3197, "step": 1966 }, { "epoch": 0.55, "logps_train/chosen": -70.00694274902344, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -86.17904663085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9338979721069336, "rewards_train/margins": 1.4931864738464355, "rewards_train/rejected": -2.427084445953369, "step": 1966 }, { "epoch": 0.55, "logps_train/chosen": -102.46730041503906, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -125.98727416992188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.1924326419830322, "rewards_train/margins": 1.3684041500091553, "rewards_train/rejected": -3.5608367919921875, "step": 1967 }, { "epoch": 0.55, "learning_rate": 8.906616405140324e-08, "loss": 0.3615, "step": 1968 }, { "epoch": 0.55, "logps_train/chosen": -84.91909790039062, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -139.67703247070312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.200113296508789, "rewards_train/margins": 3.58477783203125, "rewards_train/rejected": -4.784891128540039, "step": 1968 }, { "epoch": 0.55, "logps_train/chosen": -64.94412994384766, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -93.78373718261719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7944126129150391, "rewards_train/margins": 1.239234209060669, "rewards_train/rejected": -2.033646821975708, "step": 1969 }, { "epoch": 0.55, "learning_rate": 8.859303711029939e-08, "loss": 0.2596, "step": 1970 }, { "epoch": 0.55, "logps_train/chosen": -90.74423217773438, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -97.94482421875, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.6834075450897217, "rewards_train/margins": 0.7979888916015625, "rewards_train/rejected": -2.481396436691284, "step": 1970 }, { "epoch": 0.55, "logps_train/chosen": -76.51930236816406, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -133.88336181640625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8037859797477722, "rewards_train/margins": 2.807988226413727, "rewards_train/rejected": -3.611774206161499, "step": 1971 }, { "epoch": 0.55, "learning_rate": 8.812089940208043e-08, "loss": 0.5362, "step": 1972 }, { "epoch": 0.55, "logps_train/chosen": -74.79251098632812, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -79.21062469482422, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.2431179285049438, "rewards_train/margins": 0.5658348798751831, "rewards_train/rejected": -1.808952808380127, "step": 1972 }, { "epoch": 0.55, "logps_train/chosen": -64.97998046875, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -82.74063873291016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4780757427215576, "rewards_train/margins": 1.6629807949066162, "rewards_train/rejected": -3.141056537628174, "step": 1973 }, { "epoch": 0.55, "learning_rate": 8.764975382038942e-08, "loss": 0.4841, "step": 1974 }, { "epoch": 0.55, "logps_train/chosen": -48.83981704711914, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -74.11100006103516, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.733005166053772, "rewards_train/margins": 1.6060246229171753, "rewards_train/rejected": -2.3390297889709473, "step": 1974 }, { "epoch": 0.55, "logps_train/chosen": -90.83193969726562, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -116.63704681396484, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6828033924102783, "rewards_train/margins": 1.145744800567627, "rewards_train/rejected": -2.8285481929779053, "step": 1975 }, { "epoch": 0.55, "learning_rate": 8.717960325278904e-08, "loss": 0.3899, "step": 1976 }, { "epoch": 0.55, "logps_train/chosen": -102.88260650634766, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -108.89571380615234, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.6577913761138916, "rewards_train/margins": 1.1907646656036377, "rewards_train/rejected": -3.8485560417175293, "step": 1976 }, { "epoch": 0.55, "logps_train/chosen": -31.117368698120117, "logps_train/ref_chosen": -21.5, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -62.46642303466797, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.959783673286438, "rewards_train/margins": 0.8899835348129272, "rewards_train/rejected": -1.8497672080993652, "step": 1977 }, { "epoch": 0.55, "learning_rate": 8.671045058074342e-08, "loss": 0.4141, "step": 1978 }, { "epoch": 0.55, "logps_train/chosen": -84.71221923828125, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -104.71487426757812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.1792783737182617, "rewards_train/margins": 1.6450409889221191, "rewards_train/rejected": -3.824319362640381, "step": 1978 }, { "epoch": 0.55, "logps_train/chosen": -83.40449523925781, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -105.50485229492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8709185123443604, "rewards_train/margins": 2.112574815750122, "rewards_train/rejected": -3.9834933280944824, "step": 1979 }, { "epoch": 0.55, "learning_rate": 8.624229867960101e-08, "loss": 0.322, "step": 1980 }, { "epoch": 0.55, "logps_train/chosen": -60.2187385559082, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -80.80245971679688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8312669992446899, "rewards_train/margins": 1.6380594968795776, "rewards_train/rejected": -2.4693264961242676, "step": 1980 }, { "epoch": 0.55, "logps_train/chosen": -92.03398132324219, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -106.09092712402344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.175859212875366, "rewards_train/margins": 1.680499792098999, "rewards_train/rejected": -3.8563590049743652, "step": 1981 }, { "epoch": 0.55, "learning_rate": 8.57751504185768e-08, "loss": 0.3531, "step": 1982 }, { "epoch": 0.55, "logps_train/chosen": -49.1508674621582, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -26.375, "logps_train/rejected": -41.218502044677734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3704088926315308, "rewards_train/margins": 0.11394131183624268, "rewards_train/rejected": -1.4843502044677734, "step": 1982 }, { "epoch": 0.55, "logps_train/chosen": -77.5458755493164, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -109.19812774658203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5270484685897827, "rewards_train/margins": 2.009170174598694, "rewards_train/rejected": -3.5362186431884766, "step": 1983 }, { "epoch": 0.55, "learning_rate": 8.530900866073431e-08, "loss": 0.6593, "step": 1984 }, { "epoch": 0.55, "logps_train/chosen": -71.50971984863281, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -73.53953552246094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1530224084854126, "rewards_train/margins": 1.1723178625106812, "rewards_train/rejected": -2.3253402709960938, "step": 1984 }, { "epoch": 0.55, "logps_train/chosen": -105.9326171875, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -108.26788330078125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.2108397483825684, "rewards_train/margins": 1.617511510848999, "rewards_train/rejected": -3.8283512592315674, "step": 1985 }, { "epoch": 0.56, "learning_rate": 8.484387626296871e-08, "loss": 0.3764, "step": 1986 }, { "epoch": 0.56, "logps_train/chosen": -108.44713592529297, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -95.0, "logps_train/rejected": -135.17124938964844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.673912763595581, "rewards_train/margins": 1.3355953693389893, "rewards_train/rejected": -4.00950813293457, "step": 1986 }, { "epoch": 0.56, "logps_train/chosen": -62.282108306884766, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -77.98124694824219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0186405181884766, "rewards_train/margins": 1.3792893886566162, "rewards_train/rejected": -2.3979299068450928, "step": 1987 }, { "epoch": 0.56, "learning_rate": 8.437975607598888e-08, "loss": 0.4001, "step": 1988 }, { "epoch": 0.56, "logps_train/chosen": -38.71179962158203, "logps_train/ref_chosen": -28.875, "logps_train/ref_rejected": -31.0, "logps_train/rejected": -47.60798645019531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9852428436279297, "rewards_train/margins": 0.6798524856567383, "rewards_train/rejected": -1.665095329284668, "step": 1988 }, { "epoch": 0.56, "logps_train/chosen": -69.31269073486328, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -106.16201782226562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5469427108764648, "rewards_train/margins": 2.226533889770508, "rewards_train/rejected": -3.7734766006469727, "step": 1989 }, { "epoch": 0.56, "learning_rate": 8.391665094430023e-08, "loss": 0.425, "step": 1990 }, { "epoch": 0.56, "logps_train/chosen": -75.35893249511719, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -88.6258544921875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6272995471954346, "rewards_train/margins": 1.7893869876861572, "rewards_train/rejected": -3.416686534881592, "step": 1990 }, { "epoch": 0.56, "logps_train/chosen": -65.22088623046875, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -98.70135498046875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8096866607666016, "rewards_train/margins": 2.1319329738616943, "rewards_train/rejected": -2.941619634628296, "step": 1991 }, { "epoch": 0.56, "learning_rate": 8.345456370618659e-08, "loss": 0.3566, "step": 1992 }, { "epoch": 0.56, "logps_train/chosen": -90.94308471679688, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -98.82499694824219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6549532413482666, "rewards_train/margins": 1.2091882228851318, "rewards_train/rejected": -2.8641414642333984, "step": 1992 }, { "epoch": 0.56, "logps_train/chosen": -84.97198486328125, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -90.90570068359375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.515363097190857, "rewards_train/margins": 1.443956971168518, "rewards_train/rejected": -2.959320068359375, "step": 1993 }, { "epoch": 0.56, "learning_rate": 8.29934971936938e-08, "loss": 0.435, "step": 1994 }, { "epoch": 0.56, "logps_train/chosen": -84.12213897705078, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -118.45738220214844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3056586980819702, "rewards_train/margins": 2.090555787086487, "rewards_train/rejected": -3.396214485168457, "step": 1994 }, { "epoch": 0.56, "logps_train/chosen": -82.85786437988281, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -109.7349624633789, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1667675971984863, "rewards_train/margins": 1.7896640300750732, "rewards_train/rejected": -2.9564316272735596, "step": 1995 }, { "epoch": 0.56, "learning_rate": 8.253345423261168e-08, "loss": 0.3296, "step": 1996 }, { "epoch": 0.56, "logps_train/chosen": -127.67427825927734, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -132.77032470703125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -3.4143025875091553, "rewards_train/margins": -0.17008256912231445, "rewards_train/rejected": -3.244220018386841, "step": 1996 }, { "epoch": 0.56, "logps_train/chosen": -105.43716430664062, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -123.96332550048828, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.0671541690826416, "rewards_train/margins": 1.8073036670684814, "rewards_train/rejected": -3.874457836151123, "step": 1997 }, { "epoch": 0.56, "learning_rate": 8.207443764245705e-08, "loss": 0.9856, "step": 1998 }, { "epoch": 0.56, "logps_train/chosen": -75.80126953125, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -98.28489685058594, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8289549350738525, "rewards_train/margins": 1.5110580921173096, "rewards_train/rejected": -2.340013027191162, "step": 1998 }, { "epoch": 0.56, "logps_train/chosen": -68.47697448730469, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -91.54481506347656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.4064865112304688, "rewards_train/margins": 0.2593224048614502, "rewards_train/rejected": -2.665808916091919, "step": 1999 }, { "epoch": 0.56, "learning_rate": 8.161645023645597e-08, "loss": 0.581, "step": 2000 }, { "epoch": 0.56, "logps_train/chosen": -53.53972625732422, "logps_train/ref_chosen": -43.25, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -68.63607788085938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0336599349975586, "rewards_train/margins": 0.8455724716186523, "rewards_train/rejected": -1.879232406616211, "step": 2000 }, { "epoch": 0.56, "logps_train/chosen": -83.170166015625, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -83.50080871582031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0488529205322266, "rewards_train/margins": 1.5910711288452148, "rewards_train/rejected": -2.6399240493774414, "step": 2001 }, { "epoch": 0.56, "learning_rate": 8.115949482152709e-08, "loss": 0.4007, "step": 2002 }, { "epoch": 0.56, "logps_train/chosen": -74.61415100097656, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -85.65557861328125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.399109959602356, "rewards_train/margins": 1.2592214345932007, "rewards_train/rejected": -2.6583313941955566, "step": 2002 }, { "epoch": 0.56, "logps_train/chosen": -69.95663452148438, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -97.09745788574219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8202722072601318, "rewards_train/margins": 1.9869341850280762, "rewards_train/rejected": -3.807206392288208, "step": 2003 }, { "epoch": 0.56, "learning_rate": 8.070357419826418e-08, "loss": 0.2954, "step": 2004 }, { "epoch": 0.56, "logps_train/chosen": -92.07615661621094, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -118.43268585205078, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3320298194885254, "rewards_train/margins": 2.094050884246826, "rewards_train/rejected": -3.4260807037353516, "step": 2004 }, { "epoch": 0.56, "logps_train/chosen": -86.79302978515625, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -115.96133422851562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4421939849853516, "rewards_train/margins": 1.427767276763916, "rewards_train/rejected": -2.8699612617492676, "step": 2005 }, { "epoch": 0.56, "learning_rate": 8.024869116091879e-08, "loss": 0.347, "step": 2006 }, { "epoch": 0.56, "logps_train/chosen": -126.97574615478516, "logps_train/ref_chosen": -102.5, "logps_train/ref_rejected": -126.5, "logps_train/rejected": -168.7589111328125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.456949234008789, "rewards_train/margins": 1.759566307067871, "rewards_train/rejected": -4.21651554107666, "step": 2006 }, { "epoch": 0.56, "logps_train/chosen": -110.49139404296875, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -124.17587280273438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.5725769996643066, "rewards_train/margins": 2.064542293548584, "rewards_train/rejected": -4.637119293212891, "step": 2007 }, { "epoch": 0.56, "learning_rate": 7.979484849738344e-08, "loss": 0.367, "step": 2008 }, { "epoch": 0.56, "logps_train/chosen": -95.91465759277344, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -118.02151489257812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.2083609104156494, "rewards_train/margins": 1.537247896194458, "rewards_train/rejected": -3.7456088066101074, "step": 2008 }, { "epoch": 0.56, "logps_train/chosen": -78.37964630126953, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -85.4207763671875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.478003740310669, "rewards_train/margins": 0.887902021408081, "rewards_train/rejected": -2.36590576171875, "step": 2009 }, { "epoch": 0.56, "learning_rate": 7.93420489891744e-08, "loss": 0.5685, "step": 2010 }, { "epoch": 0.56, "logps_train/chosen": -60.42747497558594, "logps_train/ref_chosen": -43.25, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -89.84246826171875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.7189193964004517, "rewards_train/margins": 1.29794442653656, "rewards_train/rejected": -3.0168638229370117, "step": 2010 }, { "epoch": 0.56, "logps_train/chosen": -105.34133911132812, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -133.2752685546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -3.130227565765381, "rewards_train/margins": 1.6566743850708008, "rewards_train/rejected": -4.786901950836182, "step": 2011 }, { "epoch": 0.56, "learning_rate": 7.889029541141465e-08, "loss": 0.3696, "step": 2012 }, { "epoch": 0.56, "logps_train/chosen": -102.57357788085938, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -125.34063720703125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4393885135650635, "rewards_train/margins": 2.0649876594543457, "rewards_train/rejected": -3.504376173019409, "step": 2012 }, { "epoch": 0.56, "logps_train/chosen": -34.7305908203125, "logps_train/ref_chosen": -22.5, "logps_train/ref_rejected": -36.25, "logps_train/rejected": -61.56572723388672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2244750261306763, "rewards_train/margins": 1.316668152809143, "rewards_train/rejected": -2.5411431789398193, "step": 2013 }, { "epoch": 0.56, "learning_rate": 7.843959053281663e-08, "loss": 0.269, "step": 2014 }, { "epoch": 0.56, "logps_train/chosen": -64.35531616210938, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -95.31759643554688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1378759145736694, "rewards_train/margins": 1.2759398221969604, "rewards_train/rejected": -2.41381573677063, "step": 2014 }, { "epoch": 0.56, "logps_train/chosen": -48.608985900878906, "logps_train/ref_chosen": -36.0, "logps_train/ref_rejected": -38.75, "logps_train/rejected": -62.054054260253906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.266660213470459, "rewards_train/margins": 1.068042278289795, "rewards_train/rejected": -2.334702491760254, "step": 2015 }, { "epoch": 0.56, "learning_rate": 7.798993711566581e-08, "loss": 0.3833, "step": 2016 }, { "epoch": 0.56, "logps_train/chosen": -90.3540267944336, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -109.29360961914062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.173293113708496, "rewards_train/margins": 1.4865365028381348, "rewards_train/rejected": -3.659829616546631, "step": 2016 }, { "epoch": 0.56, "logps_train/chosen": -65.87181091308594, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -38.75, "logps_train/rejected": -66.69879150390625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7055405378341675, "rewards_train/margins": 1.0889474153518677, "rewards_train/rejected": -2.794487953186035, "step": 2017 }, { "epoch": 0.56, "learning_rate": 7.754133791580339e-08, "loss": 0.3551, "step": 2018 }, { "epoch": 0.56, "logps_train/chosen": -84.87860870361328, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -92.8297348022461, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9077825546264648, "rewards_train/margins": 1.2103471755981445, "rewards_train/rejected": -3.1181297302246094, "step": 2018 }, { "epoch": 0.56, "logps_train/chosen": -89.56547546386719, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -124.26695251464844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3358440399169922, "rewards_train/margins": 2.7084288597106934, "rewards_train/rejected": -4.0442728996276855, "step": 2019 }, { "epoch": 0.56, "learning_rate": 7.709379568260921e-08, "loss": 0.3807, "step": 2020 }, { "epoch": 0.56, "logps_train/chosen": -48.11452865600586, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -38.75, "logps_train/rejected": -59.88193893432617, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9053493738174438, "rewards_train/margins": 1.2148758172988892, "rewards_train/rejected": -2.120225191116333, "step": 2020 }, { "epoch": 0.56, "logps_train/chosen": -58.42257308959961, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -33.25, "logps_train/rejected": -59.07329177856445, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6925015449523926, "rewards_train/margins": 0.9008626937866211, "rewards_train/rejected": -2.5933642387390137, "step": 2021 }, { "epoch": 0.57, "learning_rate": 7.664731315898546e-08, "loss": 0.3955, "step": 2022 }, { "epoch": 0.57, "logps_train/chosen": -61.435279846191406, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -98.72279357910156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7993875741958618, "rewards_train/margins": 1.5148848295211792, "rewards_train/rejected": -3.314272403717041, "step": 2022 }, { "epoch": 0.57, "logps_train/chosen": -56.93460464477539, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -68.29438781738281, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0959997177124023, "rewards_train/margins": 0.5771892070770264, "rewards_train/rejected": -1.6731889247894287, "step": 2023 }, { "epoch": 0.57, "learning_rate": 7.620189308133943e-08, "loss": 0.4807, "step": 2024 }, { "epoch": 0.57, "logps_train/chosen": -103.25361633300781, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -132.89398193359375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.267939567565918, "rewards_train/margins": 2.0386457443237305, "rewards_train/rejected": -4.306585311889648, "step": 2024 }, { "epoch": 0.57, "logps_train/chosen": -77.77604675292969, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -101.76101684570312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7494794130325317, "rewards_train/margins": 1.9812120199203491, "rewards_train/rejected": -3.730691432952881, "step": 2025 }, { "epoch": 0.57, "learning_rate": 7.575753817956702e-08, "loss": 0.3636, "step": 2026 }, { "epoch": 0.57, "logps_train/chosen": -71.26786804199219, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -79.71773529052734, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.054130792617798, "rewards_train/margins": 0.13385415077209473, "rewards_train/rejected": -2.1879849433898926, "step": 2026 }, { "epoch": 0.57, "logps_train/chosen": -53.09149932861328, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -72.68791198730469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.880146324634552, "rewards_train/margins": 1.07927006483078, "rewards_train/rejected": -1.959416389465332, "step": 2027 }, { "epoch": 0.57, "learning_rate": 7.531425117703557e-08, "loss": 0.5769, "step": 2028 }, { "epoch": 0.57, "logps_train/chosen": -94.37026977539062, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -99.60851287841797, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.2429842948913574, "rewards_train/margins": 0.6723837852478027, "rewards_train/rejected": -2.91536808013916, "step": 2028 }, { "epoch": 0.57, "logps_train/chosen": -85.78739929199219, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -123.35035705566406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8318644762039185, "rewards_train/margins": 1.481296181678772, "rewards_train/rejected": -3.3131606578826904, "step": 2029 }, { "epoch": 0.57, "learning_rate": 7.487203479056777e-08, "loss": 0.5022, "step": 2030 }, { "epoch": 0.57, "logps_train/chosen": -61.43556213378906, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -43.75, "logps_train/rejected": -72.5194091796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9685559272766113, "rewards_train/margins": 1.9017441272735596, "rewards_train/rejected": -2.870300054550171, "step": 2030 }, { "epoch": 0.57, "logps_train/chosen": -99.33772277832031, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -89.69277954101562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3941230773925781, "rewards_train/margins": 0.8712484836578369, "rewards_train/rejected": -2.265371561050415, "step": 2031 }, { "epoch": 0.57, "learning_rate": 7.443089173042466e-08, "loss": 0.449, "step": 2032 }, { "epoch": 0.57, "logps_train/chosen": -70.18465423583984, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -86.06988525390625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.4484455585479736, "rewards_train/margins": 1.011277675628662, "rewards_train/rejected": -3.4597232341766357, "step": 2032 }, { "epoch": 0.57, "logps_train/chosen": -80.47709655761719, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -117.68765258789062, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.264530658721924, "rewards_train/margins": 2.062462329864502, "rewards_train/rejected": -4.326992988586426, "step": 2033 }, { "epoch": 0.57, "learning_rate": 7.399082470028883e-08, "loss": 0.5092, "step": 2034 }, { "epoch": 0.57, "logps_train/chosen": -130.05368041992188, "logps_train/ref_chosen": -99.0, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -122.53167724609375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -3.086618661880493, "rewards_train/margins": 1.1272914409637451, "rewards_train/rejected": -4.213910102844238, "step": 2034 }, { "epoch": 0.57, "logps_train/chosen": -40.75750732421875, "logps_train/ref_chosen": -30.25, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -87.02031707763672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0569519996643066, "rewards_train/margins": 1.7413201332092285, "rewards_train/rejected": -2.798272132873535, "step": 2035 }, { "epoch": 0.57, "learning_rate": 7.35518363972483e-08, "loss": 0.4356, "step": 2036 }, { "epoch": 0.57, "logps_train/chosen": -95.84678649902344, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -125.46881103515625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9279396533966064, "rewards_train/margins": 2.9374959468841553, "rewards_train/rejected": -4.865435600280762, "step": 2036 }, { "epoch": 0.57, "logps_train/chosen": -117.28984832763672, "logps_train/ref_chosen": -102.0, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -111.59902954101562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.520782232284546, "rewards_train/margins": 1.0416600704193115, "rewards_train/rejected": -2.5624423027038574, "step": 2037 }, { "epoch": 0.57, "learning_rate": 7.311392951177983e-08, "loss": 0.3732, "step": 2038 }, { "epoch": 0.57, "logps_train/chosen": -58.8238525390625, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -49.25, "logps_train/rejected": -69.92394256591797, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9842894673347473, "rewards_train/margins": 1.0846182703971863, "rewards_train/rejected": -2.0689077377319336, "step": 2038 }, { "epoch": 0.57, "logps_train/chosen": -70.83395385742188, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -105.24347686767578, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7454560995101929, "rewards_train/margins": 2.4327975511550903, "rewards_train/rejected": -3.178253650665283, "step": 2039 }, { "epoch": 0.57, "learning_rate": 7.267710672773211e-08, "loss": 0.3544, "step": 2040 }, { "epoch": 0.57, "logps_train/chosen": -42.925819396972656, "logps_train/ref_chosen": -33.75, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -65.96470642089844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9110389351844788, "rewards_train/margins": 1.6556467413902283, "rewards_train/rejected": -2.566685676574707, "step": 2040 }, { "epoch": 0.57, "logps_train/chosen": -84.8345947265625, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -113.7282943725586, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2553342580795288, "rewards_train/margins": 2.5889803171157837, "rewards_train/rejected": -3.8443145751953125, "step": 2041 }, { "epoch": 0.57, "learning_rate": 7.224137072230982e-08, "loss": 0.3334, "step": 2042 }, { "epoch": 0.57, "logps_train/chosen": -88.57653045654297, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -150.1007843017578, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4748404026031494, "rewards_train/margins": 2.3453941345214844, "rewards_train/rejected": -3.820234537124634, "step": 2042 }, { "epoch": 0.57, "logps_train/chosen": -46.55854034423828, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -67.77569580078125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.4599558115005493, "rewards_train/margins": 1.1422239542007446, "rewards_train/rejected": -1.602179765701294, "step": 2043 }, { "epoch": 0.57, "learning_rate": 7.180672416605687e-08, "loss": 0.4253, "step": 2044 }, { "epoch": 0.57, "logps_train/chosen": -58.505218505859375, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -36.5, "logps_train/rejected": -55.59811019897461, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8695771098136902, "rewards_train/margins": 1.038085401058197, "rewards_train/rejected": -1.9076625108718872, "step": 2044 }, { "epoch": 0.57, "logps_train/chosen": -122.59248352050781, "logps_train/ref_chosen": -104.5, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -146.1300506591797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7967479228973389, "rewards_train/margins": 2.4131319522857666, "rewards_train/rejected": -4.2098798751831055, "step": 2045 }, { "epoch": 0.57, "learning_rate": 7.137316972284027e-08, "loss": 0.3584, "step": 2046 }, { "epoch": 0.57, "logps_train/chosen": -75.13282775878906, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -98.92269897460938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.666212797164917, "rewards_train/margins": 2.1494953632354736, "rewards_train/rejected": -3.8157081604003906, "step": 2046 }, { "epoch": 0.57, "logps_train/chosen": -85.78764343261719, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -113.82390594482422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5402874946594238, "rewards_train/margins": 2.2005014419555664, "rewards_train/rejected": -3.7407889366149902, "step": 2047 }, { "epoch": 0.57, "learning_rate": 7.094071004983343e-08, "loss": 0.2601, "step": 2048 }, { "epoch": 0.57, "logps_train/chosen": -79.29083251953125, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -108.56194305419922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5841622352600098, "rewards_train/margins": 1.883164882659912, "rewards_train/rejected": -3.467327117919922, "step": 2048 }, { "epoch": 0.57, "logps_train/chosen": -68.17729187011719, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -41.5, "logps_train/rejected": -59.22962188720703, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9677292704582214, "rewards_train/margins": 0.7999593615531921, "rewards_train/rejected": -1.7676886320114136, "step": 2049 }, { "epoch": 0.57, "learning_rate": 7.050934779750029e-08, "loss": 0.3991, "step": 2050 }, { "epoch": 0.57, "logps_train/chosen": -82.71372985839844, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -100.57864379882812, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -2.6639509201049805, "rewards_train/margins": 1.369694709777832, "rewards_train/rejected": -4.0336456298828125, "step": 2050 }, { "epoch": 0.57, "logps_train/chosen": -67.38702392578125, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -98.01837158203125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6201475262641907, "rewards_train/margins": 2.3334473967552185, "rewards_train/rejected": -2.953594923019409, "step": 2051 }, { "epoch": 0.57, "learning_rate": 7.007908560957895e-08, "loss": 0.5333, "step": 2052 }, { "epoch": 0.57, "logps_train/chosen": -70.43220520019531, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -91.27249145507812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2988355159759521, "rewards_train/margins": 1.0909130573272705, "rewards_train/rejected": -2.3897485733032227, "step": 2052 }, { "epoch": 0.57, "logps_train/chosen": -63.30310821533203, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -67.20059967041016, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1099984645843506, "rewards_train/margins": 0.640920877456665, "rewards_train/rejected": -1.7509193420410156, "step": 2053 }, { "epoch": 0.57, "learning_rate": 6.964992612306525e-08, "loss": 0.4779, "step": 2054 }, { "epoch": 0.57, "logps_train/chosen": -40.83663558959961, "logps_train/ref_chosen": -30.375, "logps_train/ref_rejected": -37.75, "logps_train/rejected": -60.61116409301758, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.04176926612854, "rewards_train/margins": 1.2412221431732178, "rewards_train/rejected": -2.282991409301758, "step": 2054 }, { "epoch": 0.57, "logps_train/chosen": -48.40538024902344, "logps_train/ref_chosen": -36.0, "logps_train/ref_rejected": -47.75, "logps_train/rejected": -75.69383239746094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2327252626419067, "rewards_train/margins": 1.5532597303390503, "rewards_train/rejected": -2.785984992980957, "step": 2055 }, { "epoch": 0.57, "learning_rate": 6.92218719681969e-08, "loss": 0.4046, "step": 2056 }, { "epoch": 0.57, "logps_train/chosen": -76.61868286132812, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -89.76553344726562, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8899929523468018, "rewards_train/margins": 1.0631229877471924, "rewards_train/rejected": -2.953115940093994, "step": 2056 }, { "epoch": 0.57, "logps_train/chosen": -79.19552612304688, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -54.25, "logps_train/rejected": -82.55109405517578, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8472871780395508, "rewards_train/margins": 0.9828221797943115, "rewards_train/rejected": -2.8301093578338623, "step": 2057 }, { "epoch": 0.58, "learning_rate": 6.879492576843726e-08, "loss": 0.6968, "step": 2058 }, { "epoch": 0.58, "logps_train/chosen": -68.574462890625, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -79.17143249511719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4750241041183472, "rewards_train/margins": 1.1171191930770874, "rewards_train/rejected": -2.5921432971954346, "step": 2058 }, { "epoch": 0.58, "logps_train/chosen": -54.907867431640625, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -31.5, "logps_train/rejected": -54.651058197021484, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4823884963989258, "rewards_train/margins": 0.831545352935791, "rewards_train/rejected": -2.313933849334717, "step": 2059 }, { "epoch": 0.58, "learning_rate": 6.836909014045924e-08, "loss": 0.3963, "step": 2060 }, { "epoch": 0.58, "logps_train/chosen": -53.24628448486328, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -48.25, "logps_train/rejected": -76.54415893554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2809760570526123, "rewards_train/margins": 1.5523455142974854, "rewards_train/rejected": -2.8333215713500977, "step": 2060 }, { "epoch": 0.58, "logps_train/chosen": -113.57747650146484, "logps_train/ref_chosen": -92.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -97.16392517089844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.1120452880859375, "rewards_train/margins": 0.6478042602539062, "rewards_train/rejected": -2.7598495483398438, "step": 2061 }, { "epoch": 0.58, "learning_rate": 6.794436769412911e-08, "loss": 0.454, "step": 2062 }, { "epoch": 0.58, "logps_train/chosen": -40.35031509399414, "logps_train/ref_chosen": -31.75, "logps_train/ref_rejected": -47.5, "logps_train/rejected": -70.04084777832031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8655002117156982, "rewards_train/margins": 1.3850693702697754, "rewards_train/rejected": -2.2505695819854736, "step": 2062 }, { "epoch": 0.58, "logps_train/chosen": -42.43242263793945, "logps_train/ref_chosen": -33.0, "logps_train/ref_rejected": -34.0, "logps_train/rejected": -51.40599060058594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9502733945846558, "rewards_train/margins": 0.7848566770553589, "rewards_train/rejected": -1.7351300716400146, "step": 2063 }, { "epoch": 0.58, "learning_rate": 6.752076103249083e-08, "loss": 0.3283, "step": 2064 }, { "epoch": 0.58, "logps_train/chosen": -54.35895538330078, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -91.61186218261719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.34136441349983215, "rewards_train/margins": 2.185056656599045, "rewards_train/rejected": -2.526421070098877, "step": 2064 }, { "epoch": 0.58, "logps_train/chosen": -105.46289825439453, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -127.77777099609375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.8486335277557373, "rewards_train/margins": 1.6217710971832275, "rewards_train/rejected": -4.470404624938965, "step": 2065 }, { "epoch": 0.58, "learning_rate": 6.709827275174992e-08, "loss": 0.3239, "step": 2066 }, { "epoch": 0.58, "logps_train/chosen": -89.56805419921875, "logps_train/ref_chosen": -76.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -99.33772277832031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3353216648101807, "rewards_train/margins": 1.3476698398590088, "rewards_train/rejected": -2.6829915046691895, "step": 2066 }, { "epoch": 0.58, "logps_train/chosen": -84.03972625732422, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -95.41542053222656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6199886798858643, "rewards_train/margins": 1.028975486755371, "rewards_train/rejected": -2.6489641666412354, "step": 2067 }, { "epoch": 0.58, "learning_rate": 6.66769054412574e-08, "loss": 0.4161, "step": 2068 }, { "epoch": 0.58, "logps_train/chosen": -97.68883514404297, "logps_train/ref_chosen": -86.0, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -105.70333862304688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1757194995880127, "rewards_train/margins": 1.2481303215026855, "rewards_train/rejected": -2.4238498210906982, "step": 2068 }, { "epoch": 0.58, "logps_train/chosen": -69.78389739990234, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -90.17318725585938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.841866135597229, "rewards_train/margins": 1.7820924520492554, "rewards_train/rejected": -2.6239585876464844, "step": 2069 }, { "epoch": 0.58, "learning_rate": 6.625666168349423e-08, "loss": 0.411, "step": 2070 }, { "epoch": 0.58, "logps_train/chosen": -59.67823791503906, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -77.22866821289062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8352067470550537, "rewards_train/margins": 1.596644639968872, "rewards_train/rejected": -2.431851387023926, "step": 2070 }, { "epoch": 0.58, "logps_train/chosen": -91.96846008300781, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -117.0979995727539, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.133931040763855, "rewards_train/margins": 2.2127586603164673, "rewards_train/rejected": -3.3466897010803223, "step": 2071 }, { "epoch": 0.58, "learning_rate": 6.583754405405528e-08, "loss": 0.3199, "step": 2072 }, { "epoch": 0.58, "logps_train/chosen": -47.81785202026367, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -65.80642700195312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.482371062040329, "rewards_train/margins": 1.701983243227005, "rewards_train/rejected": -2.184354305267334, "step": 2072 }, { "epoch": 0.58, "logps_train/chosen": -54.53237533569336, "logps_train/ref_chosen": -47.25, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -70.27101135253906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.7394677400588989, "rewards_train/margins": 0.4726923704147339, "rewards_train/rejected": -1.2121601104736328, "step": 2073 }, { "epoch": 0.58, "learning_rate": 6.541955512163367e-08, "loss": 0.4126, "step": 2074 }, { "epoch": 0.58, "logps_train/chosen": -104.01774597167969, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -91.5, "logps_train/rejected": -124.22376251220703, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.2252120971679688, "rewards_train/margins": 1.036226749420166, "rewards_train/rejected": -3.2614388465881348, "step": 2074 }, { "epoch": 0.58, "logps_train/chosen": -87.72959899902344, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -114.3899154663086, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.45655357837677, "rewards_train/margins": 1.1152499914169312, "rewards_train/rejected": -2.571803569793701, "step": 2075 }, { "epoch": 0.58, "learning_rate": 6.500269744800469e-08, "loss": 0.4249, "step": 2076 }, { "epoch": 0.58, "logps_train/chosen": -39.61121368408203, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -25.125, "logps_train/rejected": -44.971614837646484, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.40379706025123596, "rewards_train/margins": 1.5770559012889862, "rewards_train/rejected": -1.9808529615402222, "step": 2076 }, { "epoch": 0.58, "logps_train/chosen": -93.03768920898438, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -117.90193176269531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4691985845565796, "rewards_train/margins": 1.1084946393966675, "rewards_train/rejected": -2.577693223953247, "step": 2077 }, { "epoch": 0.58, "learning_rate": 6.45869735880106e-08, "loss": 0.3391, "step": 2078 }, { "epoch": 0.58, "logps_train/chosen": -65.64482879638672, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -104.19233703613281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1625299453735352, "rewards_train/margins": 1.9092426300048828, "rewards_train/rejected": -3.071772575378418, "step": 2078 }, { "epoch": 0.58, "logps_train/chosen": -108.32366943359375, "logps_train/ref_chosen": -92.0, "logps_train/ref_rejected": -88.0, "logps_train/rejected": -134.6239013671875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6147891283035278, "rewards_train/margins": 3.0503371953964233, "rewards_train/rejected": -4.665126323699951, "step": 2079 }, { "epoch": 0.58, "learning_rate": 6.417238608954479e-08, "loss": 0.3345, "step": 2080 }, { "epoch": 0.58, "logps_train/chosen": -41.94489288330078, "logps_train/ref_chosen": -38.75, "logps_train/ref_rejected": -35.0, "logps_train/rejected": -50.916587829589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.32964563369750977, "rewards_train/margins": 1.2667005062103271, "rewards_train/rejected": -1.596346139907837, "step": 2080 }, { "epoch": 0.58, "logps_train/chosen": -113.23661804199219, "logps_train/ref_chosen": -94.5, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -136.02545166015625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8537397384643555, "rewards_train/margins": 1.9238042831420898, "rewards_train/rejected": -3.7775440216064453, "step": 2081 }, { "epoch": 0.58, "learning_rate": 6.375893749353578e-08, "loss": 0.3716, "step": 2082 }, { "epoch": 0.58, "logps_train/chosen": -87.25921630859375, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -111.98489379882812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5462337732315063, "rewards_train/margins": 1.2210057973861694, "rewards_train/rejected": -2.767239570617676, "step": 2082 }, { "epoch": 0.58, "logps_train/chosen": -85.58940887451172, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -100.89420318603516, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5526912212371826, "rewards_train/margins": 1.297666311264038, "rewards_train/rejected": -2.8503575325012207, "step": 2083 }, { "epoch": 0.58, "learning_rate": 6.334663033393228e-08, "loss": 0.3492, "step": 2084 }, { "epoch": 0.58, "logps_train/chosen": -49.07570266723633, "logps_train/ref_chosen": -39.75, "logps_train/ref_rejected": -41.25, "logps_train/rejected": -75.57756042480469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9356950521469116, "rewards_train/margins": 2.489639401435852, "rewards_train/rejected": -3.4253344535827637, "step": 2084 }, { "epoch": 0.58, "logps_train/chosen": -43.674495697021484, "logps_train/ref_chosen": -33.75, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -69.68412017822266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9971615672111511, "rewards_train/margins": 1.0604656338691711, "rewards_train/rejected": -2.0576272010803223, "step": 2085 }, { "epoch": 0.58, "learning_rate": 6.293546713768722e-08, "loss": 0.3387, "step": 2086 }, { "epoch": 0.58, "logps_train/chosen": -76.24918365478516, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -70.58163452148438, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4436683654785156, "rewards_train/margins": 0.9539487361907959, "rewards_train/rejected": -2.3976171016693115, "step": 2086 }, { "epoch": 0.58, "logps_train/chosen": -92.84099578857422, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -137.44235229492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9978206157684326, "rewards_train/margins": 3.044461488723755, "rewards_train/rejected": -4.0422821044921875, "step": 2087 }, { "epoch": 0.58, "learning_rate": 6.252545042474246e-08, "loss": 0.2887, "step": 2088 }, { "epoch": 0.58, "logps_train/chosen": -162.59339904785156, "logps_train/ref_chosen": -121.5, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -161.24937438964844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -4.100355625152588, "rewards_train/margins": 1.3937220573425293, "rewards_train/rejected": -5.494077682495117, "step": 2088 }, { "epoch": 0.58, "logps_train/chosen": -71.49720764160156, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -86.50640869140625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.618861436843872, "rewards_train/margins": 0.9321701526641846, "rewards_train/rejected": -2.5510315895080566, "step": 2089 }, { "epoch": 0.58, "learning_rate": 6.211658270801315e-08, "loss": 0.4803, "step": 2090 }, { "epoch": 0.58, "logps_train/chosen": -112.76863861083984, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -109.97859191894531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.7280359268188477, "rewards_train/margins": 1.2534170150756836, "rewards_train/rejected": -3.9814529418945312, "step": 2090 }, { "epoch": 0.58, "logps_train/chosen": -67.25273895263672, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -79.99827575683594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8928521275520325, "rewards_train/margins": 1.448088824748993, "rewards_train/rejected": -2.3409409523010254, "step": 2091 }, { "epoch": 0.58, "learning_rate": 6.170886649337257e-08, "loss": 0.3987, "step": 2092 }, { "epoch": 0.58, "logps_train/chosen": -53.44761657714844, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -84.1935043334961, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5974960327148438, "rewards_train/margins": 2.5835728645324707, "rewards_train/rejected": -3.1810688972473145, "step": 2092 }, { "epoch": 0.58, "logps_train/chosen": -93.46104431152344, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -123.36744689941406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6160262823104858, "rewards_train/margins": 1.082436442375183, "rewards_train/rejected": -2.698462724685669, "step": 2093 }, { "epoch": 0.59, "learning_rate": 6.13023042796367e-08, "loss": 0.313, "step": 2094 }, { "epoch": 0.59, "logps_train/chosen": -66.57969665527344, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -89.05522155761719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0910753011703491, "rewards_train/margins": 1.451165795326233, "rewards_train/rejected": -2.542241096496582, "step": 2094 }, { "epoch": 0.59, "logps_train/chosen": -85.72696685791016, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -97.59848022460938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.833828926086426, "rewards_train/margins": 1.2093195915222168, "rewards_train/rejected": -4.043148517608643, "step": 2095 }, { "epoch": 0.59, "learning_rate": 6.089689855854869e-08, "loss": 0.3826, "step": 2096 }, { "epoch": 0.59, "logps_train/chosen": -69.51953125, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -65.06510925292969, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6810541152954102, "rewards_train/margins": 0.38824963569641113, "rewards_train/rejected": -2.0693037509918213, "step": 2096 }, { "epoch": 0.59, "logps_train/chosen": -117.8916015625, "logps_train/ref_chosen": -106.0, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -126.8585433959961, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.171191930770874, "rewards_train/margins": 1.2963025569915771, "rewards_train/rejected": -2.467494487762451, "step": 2097 }, { "epoch": 0.59, "learning_rate": 6.04926518147639e-08, "loss": 0.5213, "step": 2098 }, { "epoch": 0.59, "logps_train/chosen": -100.62855529785156, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -114.99569702148438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.965492606163025, "rewards_train/margins": 1.1067334413528442, "rewards_train/rejected": -3.072226047515869, "step": 2098 }, { "epoch": 0.59, "logps_train/chosen": -98.0699462890625, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -110.64087677001953, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.9958622455596924, "rewards_train/margins": 0.7252566814422607, "rewards_train/rejected": -2.721118927001953, "step": 2099 }, { "epoch": 0.59, "learning_rate": 6.00895665258346e-08, "loss": 0.5094, "step": 2100 }, { "epoch": 0.59, "logps_train/chosen": -70.66618347167969, "logps_train/ref_chosen": -45.25, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -88.56083679199219, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.539287567138672, "rewards_train/margins": 0.6989374160766602, "rewards_train/rejected": -3.238224983215332, "step": 2100 }, { "epoch": 0.59, "logps_train/chosen": -65.05520629882812, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -76.51008605957031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4090361595153809, "rewards_train/margins": 1.5167770385742188, "rewards_train/rejected": -2.9258131980895996, "step": 2101 }, { "epoch": 0.59, "learning_rate": 5.968764516219452e-08, "loss": 0.409, "step": 2102 }, { "epoch": 0.59, "logps_train/chosen": -67.32633972167969, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -71.25092315673828, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.139665126800537, "rewards_train/margins": 0.6016381978988647, "rewards_train/rejected": -1.7413033246994019, "step": 2102 }, { "epoch": 0.59, "logps_train/chosen": -50.675941467285156, "logps_train/ref_chosen": -37.5, "logps_train/ref_rejected": -42.0, "logps_train/rejected": -64.9217529296875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3148601055145264, "rewards_train/margins": 0.9710648059844971, "rewards_train/rejected": -2.2859249114990234, "step": 2103 }, { "epoch": 0.59, "learning_rate": 5.9286890187144027e-08, "loss": 0.4437, "step": 2104 }, { "epoch": 0.59, "logps_train/chosen": -93.62046813964844, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -118.02957916259766, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.357945442199707, "rewards_train/margins": 1.7239186763763428, "rewards_train/rejected": -3.08186411857605, "step": 2104 }, { "epoch": 0.59, "logps_train/chosen": -89.52161407470703, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -136.29417419433594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0435675382614136, "rewards_train/margins": 3.3913198709487915, "rewards_train/rejected": -4.434887409210205, "step": 2105 }, { "epoch": 0.59, "learning_rate": 5.888730405683495e-08, "loss": 0.1536, "step": 2106 }, { "epoch": 0.59, "logps_train/chosen": -82.62612915039062, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -56.25, "logps_train/rejected": -86.76184844970703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.272378921508789, "rewards_train/margins": 1.7903294563293457, "rewards_train/rejected": -3.0627083778381348, "step": 2106 }, { "epoch": 0.59, "logps_train/chosen": -95.77322387695312, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -119.54524230957031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.112185001373291, "rewards_train/margins": 2.22202730178833, "rewards_train/rejected": -4.334212303161621, "step": 2107 }, { "epoch": 0.59, "learning_rate": 5.848888922025552e-08, "loss": 0.251, "step": 2108 }, { "epoch": 0.59, "logps_train/chosen": -50.7303352355957, "logps_train/ref_chosen": -41.75, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -72.72836303710938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9046742916107178, "rewards_train/margins": 1.6324200630187988, "rewards_train/rejected": -2.5370943546295166, "step": 2108 }, { "epoch": 0.59, "logps_train/chosen": -31.666881561279297, "logps_train/ref_chosen": -27.125, "logps_train/ref_rejected": -30.875, "logps_train/rejected": -39.66892623901367, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.44847527146339417, "rewards_train/margins": 0.4336518943309784, "rewards_train/rejected": -0.8821271657943726, "step": 2109 }, { "epoch": 0.59, "learning_rate": 5.8091648119215105e-08, "loss": 0.4292, "step": 2110 }, { "epoch": 0.59, "logps_train/chosen": -68.48396301269531, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -92.3371810913086, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9716391563415527, "rewards_train/margins": 1.34840726852417, "rewards_train/rejected": -3.3200464248657227, "step": 2110 }, { "epoch": 0.59, "logps_train/chosen": -35.47300338745117, "logps_train/ref_chosen": -27.875, "logps_train/ref_rejected": -26.625, "logps_train/rejected": -39.582855224609375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7613626718521118, "rewards_train/margins": 0.528563380241394, "rewards_train/rejected": -1.2899260520935059, "step": 2111 }, { "epoch": 0.59, "learning_rate": 5.769558318832965e-08, "loss": 0.4616, "step": 2112 }, { "epoch": 0.59, "logps_train/chosen": -85.51171875, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -80.0, "logps_train/rejected": -124.2002944946289, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.2382328510284424, "rewards_train/margins": 2.1925389766693115, "rewards_train/rejected": -4.430771827697754, "step": 2112 }, { "epoch": 0.59, "logps_train/chosen": -38.618709564208984, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -28.875, "logps_train/rejected": -45.32367706298828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.2907773554325104, "rewards_train/margins": 1.3490119874477386, "rewards_train/rejected": -1.639789342880249, "step": 2113 }, { "epoch": 0.59, "learning_rate": 5.7300696855006684e-08, "loss": 0.4409, "step": 2114 }, { "epoch": 0.59, "logps_train/chosen": -69.73672485351562, "logps_train/ref_chosen": -60.75, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -117.43955993652344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9086329936981201, "rewards_train/margins": 3.11686635017395, "rewards_train/rejected": -4.02549934387207, "step": 2114 }, { "epoch": 0.59, "logps_train/chosen": -49.12819290161133, "logps_train/ref_chosen": -32.75, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -76.30802154541016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6368428468704224, "rewards_train/margins": 1.4916154146194458, "rewards_train/rejected": -3.128458261489868, "step": 2115 }, { "epoch": 0.59, "learning_rate": 5.690699153942999e-08, "loss": 0.3039, "step": 2116 }, { "epoch": 0.59, "logps_train/chosen": -102.94654846191406, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -111.5, "logps_train/rejected": -146.1102294921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5367438793182373, "rewards_train/margins": 1.9109971523284912, "rewards_train/rejected": -3.4477410316467285, "step": 2116 }, { "epoch": 0.59, "logps_train/chosen": -79.35420227050781, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -117.54788208007812, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.7651073932647705, "rewards_train/margins": 2.2947590351104736, "rewards_train/rejected": -5.059866428375244, "step": 2117 }, { "epoch": 0.59, "learning_rate": 5.6514469654545424e-08, "loss": 0.3365, "step": 2118 }, { "epoch": 0.59, "logps_train/chosen": -75.69719696044922, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -97.35427856445312, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6011650562286377, "rewards_train/margins": 1.3848488330841064, "rewards_train/rejected": -2.986013889312744, "step": 2118 }, { "epoch": 0.59, "logps_train/chosen": -108.6153793334961, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -102.6696548461914, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7549954652786255, "rewards_train/margins": 1.5508369207382202, "rewards_train/rejected": -3.3058323860168457, "step": 2119 }, { "epoch": 0.59, "learning_rate": 5.61231336060457e-08, "loss": 0.3585, "step": 2120 }, { "epoch": 0.59, "logps_train/chosen": -74.99488830566406, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -62.25, "logps_train/rejected": -89.0638427734375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.1393327713012695, "rewards_train/margins": 0.5445904731750488, "rewards_train/rejected": -2.6839232444763184, "step": 2120 }, { "epoch": 0.59, "logps_train/chosen": -92.38201904296875, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -104.81034088134766, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.167889356613159, "rewards_train/margins": 1.4100196361541748, "rewards_train/rejected": -3.577908992767334, "step": 2121 }, { "epoch": 0.59, "learning_rate": 5.573298579235586e-08, "loss": 0.442, "step": 2122 }, { "epoch": 0.59, "logps_train/chosen": -34.443363189697266, "logps_train/ref_chosen": -27.25, "logps_train/ref_rejected": -33.25, "logps_train/rejected": -55.34367370605469, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.7206120491027832, "rewards_train/margins": 1.4889566898345947, "rewards_train/rejected": -2.209568738937378, "step": 2122 }, { "epoch": 0.59, "logps_train/chosen": -71.74280548095703, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -99.73445129394531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0570931434631348, "rewards_train/margins": 2.6399850845336914, "rewards_train/rejected": -3.697078227996826, "step": 2123 }, { "epoch": 0.59, "learning_rate": 5.534402860461823e-08, "loss": 0.3885, "step": 2124 }, { "epoch": 0.59, "logps_train/chosen": -47.78821563720703, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -35.5, "logps_train/rejected": -54.90350341796875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.218665599822998, "rewards_train/margins": 0.7177789211273193, "rewards_train/rejected": -1.9364445209503174, "step": 2124 }, { "epoch": 0.59, "logps_train/chosen": -74.49174499511719, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -77.91828918457031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7401896715164185, "rewards_train/margins": 0.9453891515731812, "rewards_train/rejected": -2.6855788230895996, "step": 2125 }, { "epoch": 0.59, "learning_rate": 5.495626442667825e-08, "loss": 0.4826, "step": 2126 }, { "epoch": 0.59, "logps_train/chosen": -58.563819885253906, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -37.75, "logps_train/rejected": -58.89781188964844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.5790258646011353, "rewards_train/margins": 0.5388680696487427, "rewards_train/rejected": -2.117893934249878, "step": 2126 }, { "epoch": 0.59, "logps_train/chosen": -52.63378143310547, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -97.5623550415039, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1653310060501099, "rewards_train/margins": 2.3487173318862915, "rewards_train/rejected": -3.5140483379364014, "step": 2127 }, { "epoch": 0.59, "learning_rate": 5.456969563506966e-08, "loss": 0.3885, "step": 2128 }, { "epoch": 0.59, "logps_train/chosen": -73.58062744140625, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -98.3315658569336, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.586431860923767, "rewards_train/margins": 1.281685471534729, "rewards_train/rejected": -2.868117332458496, "step": 2128 }, { "epoch": 0.6, "logps_train/chosen": -53.447994232177734, "logps_train/ref_chosen": -47.25, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -81.51532745361328, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.6170652508735657, "rewards_train/margins": 1.7473577857017517, "rewards_train/rejected": -2.3644230365753174, "step": 2129 }, { "epoch": 0.6, "learning_rate": 5.418432459899963e-08, "loss": 0.424, "step": 2130 }, { "epoch": 0.6, "logps_train/chosen": -64.28777313232422, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -88.68441772460938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3242729902267456, "rewards_train/margins": 2.565055727958679, "rewards_train/rejected": -3.889328718185425, "step": 2130 }, { "epoch": 0.6, "logps_train/chosen": -74.45040893554688, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -54.25, "logps_train/rejected": -80.51853942871094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1805874109268188, "rewards_train/margins": 1.4519306421279907, "rewards_train/rejected": -2.6325180530548096, "step": 2131 }, { "epoch": 0.6, "learning_rate": 5.380015368033475e-08, "loss": 0.3344, "step": 2132 }, { "epoch": 0.6, "logps_train/chosen": -52.31156539916992, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -64.6455078125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.0885294675827026, "rewards_train/margins": 1.076412320137024, "rewards_train/rejected": -2.1649417877197266, "step": 2132 }, { "epoch": 0.6, "logps_train/chosen": -105.92382049560547, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -97.5, "logps_train/rejected": -130.3473663330078, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1806633472442627, "rewards_train/margins": 2.1187217235565186, "rewards_train/rejected": -3.2993850708007812, "step": 2133 }, { "epoch": 0.6, "learning_rate": 5.341718523358624e-08, "loss": 0.4098, "step": 2134 }, { "epoch": 0.6, "logps_train/chosen": -62.77705383300781, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -65.87120819091797, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.801142692565918, "rewards_train/margins": 0.44847822189331055, "rewards_train/rejected": -2.2496209144592285, "step": 2134 }, { "epoch": 0.6, "logps_train/chosen": -75.49212646484375, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -116.18441772460938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.114837646484375, "rewards_train/margins": 2.6766507625579834, "rewards_train/rejected": -3.7914884090423584, "step": 2135 }, { "epoch": 0.6, "learning_rate": 5.3035421605895694e-08, "loss": 0.3943, "step": 2136 }, { "epoch": 0.6, "logps_train/chosen": -79.04789733886719, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -89.93489837646484, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.0745160579681396, "rewards_train/margins": 1.3844029903411865, "rewards_train/rejected": -3.458919048309326, "step": 2136 }, { "epoch": 0.6, "logps_train/chosen": -71.12640380859375, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -83.72441864013672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9622499942779541, "rewards_train/margins": 1.8742544651031494, "rewards_train/rejected": -2.8365044593811035, "step": 2137 }, { "epoch": 0.6, "learning_rate": 5.265486513702036e-08, "loss": 0.2997, "step": 2138 }, { "epoch": 0.6, "logps_train/chosen": -74.46654510498047, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -93.89525604248047, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.1700921058654785, "rewards_train/margins": 0.9196293354034424, "rewards_train/rejected": -3.089721441268921, "step": 2138 }, { "epoch": 0.6, "logps_train/chosen": -67.54258728027344, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -98.41825103759766, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2925889492034912, "rewards_train/margins": 2.2851741313934326, "rewards_train/rejected": -3.577763080596924, "step": 2139 }, { "epoch": 0.6, "learning_rate": 5.2275518159319244e-08, "loss": 0.4414, "step": 2140 }, { "epoch": 0.6, "logps_train/chosen": -100.00669860839844, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -120.00212860107422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2702014446258545, "rewards_train/margins": 2.686846971511841, "rewards_train/rejected": -3.9570484161376953, "step": 2140 }, { "epoch": 0.6, "logps_train/chosen": -61.78115463256836, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -54.5, "logps_train/rejected": -75.81358337402344, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.4771389961242676, "rewards_train/margins": 0.6557817459106445, "rewards_train/rejected": -2.132920742034912, "step": 2141 }, { "epoch": 0.6, "learning_rate": 5.189738299773863e-08, "loss": 0.3366, "step": 2142 }, { "epoch": 0.6, "logps_train/chosen": -57.79534912109375, "logps_train/ref_chosen": -46.25, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -80.45194244384766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1432068347930908, "rewards_train/margins": 1.410581350326538, "rewards_train/rejected": -2.553788185119629, "step": 2142 }, { "epoch": 0.6, "logps_train/chosen": -52.1115608215332, "logps_train/ref_chosen": -36.0, "logps_train/ref_rejected": -44.75, "logps_train/rejected": -68.91043090820312, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6092031002044678, "rewards_train/margins": 0.7982456684112549, "rewards_train/rejected": -2.4074487686157227, "step": 2143 }, { "epoch": 0.6, "learning_rate": 5.152046196979756e-08, "loss": 0.381, "step": 2144 }, { "epoch": 0.6, "logps_train/chosen": -67.82266235351562, "logps_train/ref_chosen": -56.75, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -78.76952362060547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1142973899841309, "rewards_train/margins": 0.46704959869384766, "rewards_train/rejected": -1.5813469886779785, "step": 2144 }, { "epoch": 0.6, "logps_train/chosen": -90.55293273925781, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -112.4375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9808785915374756, "rewards_train/margins": 1.3691213130950928, "rewards_train/rejected": -3.3499999046325684, "step": 2145 }, { "epoch": 0.6, "learning_rate": 5.114475738557414e-08, "loss": 0.4408, "step": 2146 }, { "epoch": 0.6, "logps_train/chosen": -80.70329284667969, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -97.93040466308594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.1353681087493896, "rewards_train/margins": 1.5688543319702148, "rewards_train/rejected": -3.7042224407196045, "step": 2146 }, { "epoch": 0.6, "logps_train/chosen": -94.27971649169922, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -118.33087158203125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9658623933792114, "rewards_train/margins": 2.504724383354187, "rewards_train/rejected": -4.470586776733398, "step": 2147 }, { "epoch": 0.6, "learning_rate": 5.077027154769106e-08, "loss": 0.2933, "step": 2148 }, { "epoch": 0.6, "logps_train/chosen": -71.42758178710938, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -63.5, "logps_train/rejected": -91.78971099853516, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.768930196762085, "rewards_train/margins": 1.0479319095611572, "rewards_train/rejected": -2.816862106323242, "step": 2148 }, { "epoch": 0.6, "logps_train/chosen": -116.45526123046875, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -124.108642578125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.3264341354370117, "rewards_train/margins": 1.762359619140625, "rewards_train/rejected": -4.088793754577637, "step": 2149 }, { "epoch": 0.6, "learning_rate": 5.039700675130143e-08, "loss": 0.4622, "step": 2150 }, { "epoch": 0.6, "logps_train/chosen": -47.57038116455078, "logps_train/ref_chosen": -38.25, "logps_train/ref_rejected": -40.75, "logps_train/rejected": -63.25647735595703, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9398505687713623, "rewards_train/margins": 1.3225162029266357, "rewards_train/rejected": -2.262366771697998, "step": 2150 }, { "epoch": 0.6, "logps_train/chosen": -75.62815856933594, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -103.50210571289062, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8766829967498779, "rewards_train/margins": 2.130559206008911, "rewards_train/rejected": -3.007242202758789, "step": 2151 }, { "epoch": 0.6, "learning_rate": 5.002496528407493e-08, "loss": 0.3644, "step": 2152 }, { "epoch": 0.6, "logps_train/chosen": -70.53668212890625, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -96.66780090332031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0055909156799316, "rewards_train/margins": 1.2736575603485107, "rewards_train/rejected": -2.2792484760284424, "step": 2152 }, { "epoch": 0.6, "logps_train/chosen": -86.23835754394531, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -63.5, "logps_train/rejected": -97.13589477539062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3457109928131104, "rewards_train/margins": 2.0218327045440674, "rewards_train/rejected": -3.3675436973571777, "step": 2153 }, { "epoch": 0.6, "learning_rate": 4.9654149426183675e-08, "loss": 0.3179, "step": 2154 }, { "epoch": 0.6, "logps_train/chosen": -94.10749816894531, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -113.29045104980469, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.027156114578247, "rewards_train/margins": 1.1034526824951172, "rewards_train/rejected": -3.1306087970733643, "step": 2154 }, { "epoch": 0.6, "logps_train/chosen": -41.602203369140625, "logps_train/ref_chosen": -32.25, "logps_train/ref_rejected": -54.0, "logps_train/rejected": -72.07321166992188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9360989928245544, "rewards_train/margins": 0.8774717450141907, "rewards_train/rejected": -1.8135707378387451, "step": 2155 }, { "epoch": 0.6, "learning_rate": 4.9284561450288336e-08, "loss": 0.4909, "step": 2156 }, { "epoch": 0.6, "logps_train/chosen": -50.188743591308594, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -67.2891616821289, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8723899126052856, "rewards_train/margins": 1.5221515893936157, "rewards_train/rejected": -2.3945415019989014, "step": 2156 }, { "epoch": 0.6, "logps_train/chosen": -117.48445129394531, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -145.0334930419922, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.6066479682922363, "rewards_train/margins": 1.5853729248046875, "rewards_train/rejected": -4.192020893096924, "step": 2157 }, { "epoch": 0.6, "learning_rate": 4.8916203621523846e-08, "loss": 0.6095, "step": 2158 }, { "epoch": 0.6, "logps_train/chosen": -112.71588134765625, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -141.0, "logps_train/rejected": -177.91049194335938, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -2.37471342086792, "rewards_train/margins": 1.3186795711517334, "rewards_train/rejected": -3.6933929920196533, "step": 2158 }, { "epoch": 0.6, "logps_train/chosen": -77.97830200195312, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -108.66264343261719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3766385316848755, "rewards_train/margins": 1.66618812084198, "rewards_train/rejected": -3.0428266525268555, "step": 2159 }, { "epoch": 0.6, "learning_rate": 4.854907819748605e-08, "loss": 0.4645, "step": 2160 }, { "epoch": 0.6, "logps_train/chosen": -76.81294250488281, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -89.4246826171875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4867630004882812, "rewards_train/margins": 0.7822675704956055, "rewards_train/rejected": -2.2690305709838867, "step": 2160 }, { "epoch": 0.6, "logps_train/chosen": -51.17985153198242, "logps_train/ref_chosen": -40.25, "logps_train/ref_rejected": -41.75, "logps_train/rejected": -55.96266174316406, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.0998210906982422, "rewards_train/margins": 0.31167924404144287, "rewards_train/rejected": -1.411500334739685, "step": 2161 }, { "epoch": 0.6, "learning_rate": 4.8183187428217644e-08, "loss": 0.5627, "step": 2162 }, { "epoch": 0.6, "logps_train/chosen": -89.8934326171875, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -97.5, "logps_train/rejected": -136.94467163085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2463749647140503, "rewards_train/margins": 2.6863728761672974, "rewards_train/rejected": -3.9327478408813477, "step": 2162 }, { "epoch": 0.6, "logps_train/chosen": -66.49400329589844, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -77.09996032714844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8236675262451172, "rewards_train/margins": 0.727588415145874, "rewards_train/rejected": -2.551255941390991, "step": 2163 }, { "epoch": 0.6, "learning_rate": 4.781853355619414e-08, "loss": 0.3154, "step": 2164 }, { "epoch": 0.6, "logps_train/chosen": -95.30115509033203, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -91.5, "logps_train/rejected": -138.87399291992188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.670496940612793, "rewards_train/margins": 2.0684638023376465, "rewards_train/rejected": -4.7389607429504395, "step": 2164 }, { "epoch": 0.61, "logps_train/chosen": -66.40122985839844, "logps_train/ref_chosen": -56.75, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -92.26812744140625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9758161306381226, "rewards_train/margins": 1.6861530542373657, "rewards_train/rejected": -2.6619691848754883, "step": 2165 }, { "epoch": 0.61, "learning_rate": 4.745511881631048e-08, "loss": 0.3928, "step": 2166 }, { "epoch": 0.61, "logps_train/chosen": -63.793094635009766, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -89.9686279296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8122196197509766, "rewards_train/margins": 2.1561279296875, "rewards_train/rejected": -2.9683475494384766, "step": 2166 }, { "epoch": 0.61, "logps_train/chosen": -73.35771179199219, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -101.32657623291016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8987107276916504, "rewards_train/margins": 1.4831657409667969, "rewards_train/rejected": -3.3818764686584473, "step": 2167 }, { "epoch": 0.61, "learning_rate": 4.7092945435867196e-08, "loss": 0.3073, "step": 2168 }, { "epoch": 0.61, "logps_train/chosen": -75.34603881835938, "logps_train/ref_chosen": -56.75, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -84.72315216064453, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8536467552185059, "rewards_train/margins": 0.474332332611084, "rewards_train/rejected": -2.32797908782959, "step": 2168 }, { "epoch": 0.61, "logps_train/chosen": -62.41304016113281, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -89.46566009521484, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.763472080230713, "rewards_train/margins": 1.571765661239624, "rewards_train/rejected": -3.335237741470337, "step": 2169 }, { "epoch": 0.61, "learning_rate": 4.673201563455681e-08, "loss": 0.4618, "step": 2170 }, { "epoch": 0.61, "logps_train/chosen": -71.67170715332031, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -90.49404907226562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.721272349357605, "rewards_train/margins": 1.2793046236038208, "rewards_train/rejected": -3.000576972961426, "step": 2170 }, { "epoch": 0.61, "logps_train/chosen": -80.6717529296875, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -106.77967834472656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6156131029129028, "rewards_train/margins": 2.0881351232528687, "rewards_train/rejected": -3.7037482261657715, "step": 2171 }, { "epoch": 0.61, "learning_rate": 4.637233162445001e-08, "loss": 0.3808, "step": 2172 }, { "epoch": 0.61, "logps_train/chosen": -74.60847473144531, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -90.58171844482422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9211989641189575, "rewards_train/margins": 1.737754225730896, "rewards_train/rejected": -2.6589531898498535, "step": 2172 }, { "epoch": 0.61, "logps_train/chosen": -67.87394714355469, "logps_train/ref_chosen": -55.0, "logps_train/ref_rejected": -59.25, "logps_train/rejected": -81.95368957519531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.28134024143219, "rewards_train/margins": 0.9876612424850464, "rewards_train/rejected": -2.2690014839172363, "step": 2173 }, { "epoch": 0.61, "learning_rate": 4.601389560998239e-08, "loss": 0.419, "step": 2174 }, { "epoch": 0.61, "logps_train/chosen": -53.42560577392578, "logps_train/ref_chosen": -41.75, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -76.20135498046875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1635569334030151, "rewards_train/margins": 1.0972038507461548, "rewards_train/rejected": -2.26076078414917, "step": 2174 }, { "epoch": 0.61, "logps_train/chosen": -94.52764892578125, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -99.2550048828125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5938782691955566, "rewards_train/margins": 2.3807923793792725, "rewards_train/rejected": -3.974670648574829, "step": 2175 }, { "epoch": 0.61, "learning_rate": 4.5656709787940924e-08, "loss": 0.4072, "step": 2176 }, { "epoch": 0.61, "logps_train/chosen": -57.778236389160156, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -35.0, "logps_train/rejected": -60.36033248901367, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8884190320968628, "rewards_train/margins": 0.6456612348556519, "rewards_train/rejected": -2.5340802669525146, "step": 2176 }, { "epoch": 0.61, "logps_train/chosen": -66.49990844726562, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -69.87161254882812, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.4437408447265625, "rewards_train/margins": 1.4879522323608398, "rewards_train/rejected": -1.9316930770874023, "step": 2177 }, { "epoch": 0.61, "learning_rate": 4.5300776347450054e-08, "loss": 0.5362, "step": 2178 }, { "epoch": 0.61, "logps_train/chosen": -88.06047058105469, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -82.39453125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.2029218673706055, "rewards_train/margins": 0.9341874122619629, "rewards_train/rejected": -3.1371092796325684, "step": 2178 }, { "epoch": 0.61, "logps_train/chosen": -89.73477935791016, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -110.08747100830078, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.0811927318573, "rewards_train/margins": 1.9843904972076416, "rewards_train/rejected": -4.065583229064941, "step": 2179 }, { "epoch": 0.61, "learning_rate": 4.494609746995895e-08, "loss": 0.4428, "step": 2180 }, { "epoch": 0.61, "logps_train/chosen": -59.022613525390625, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -80.35166931152344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3339993953704834, "rewards_train/margins": 1.9964802265167236, "rewards_train/rejected": -3.330479621887207, "step": 2180 }, { "epoch": 0.61, "logps_train/chosen": -78.413818359375, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -73.18446350097656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.560522437095642, "rewards_train/margins": 0.9602681398391724, "rewards_train/rejected": -2.5207905769348145, "step": 2181 }, { "epoch": 0.61, "learning_rate": 4.4592675329227644e-08, "loss": 0.4098, "step": 2182 }, { "epoch": 0.61, "logps_train/chosen": -97.47107696533203, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -146.03619384765625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.363318681716919, "rewards_train/margins": 3.4576833248138428, "rewards_train/rejected": -4.821002006530762, "step": 2182 }, { "epoch": 0.61, "logps_train/chosen": -84.58187866210938, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -41.0, "logps_train/rejected": -75.60816955566406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.045687675476074, "rewards_train/margins": 1.422257900238037, "rewards_train/rejected": -3.4679455757141113, "step": 2183 }, { "epoch": 0.61, "learning_rate": 4.424051209131399e-08, "loss": 0.2827, "step": 2184 }, { "epoch": 0.61, "logps_train/chosen": -67.81340026855469, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -86.39239501953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1239666938781738, "rewards_train/margins": 1.7319717407226562, "rewards_train/rejected": -2.85593843460083, "step": 2184 }, { "epoch": 0.61, "logps_train/chosen": -102.90721130371094, "logps_train/ref_chosen": -89.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -134.3486328125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3950178623199463, "rewards_train/margins": 1.9812510013580322, "rewards_train/rejected": -3.3762688636779785, "step": 2185 }, { "epoch": 0.61, "learning_rate": 4.388960991455998e-08, "loss": 0.2786, "step": 2186 }, { "epoch": 0.61, "logps_train/chosen": -94.52255249023438, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -104.25389099121094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6331144571304321, "rewards_train/margins": 1.6272355318069458, "rewards_train/rejected": -3.260349988937378, "step": 2186 }, { "epoch": 0.61, "logps_train/chosen": -89.76007080078125, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -129.9043731689453, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8854801654815674, "rewards_train/margins": 2.8222415447235107, "rewards_train/rejected": -4.707721710205078, "step": 2187 }, { "epoch": 0.61, "learning_rate": 4.353997094957903e-08, "loss": 0.267, "step": 2188 }, { "epoch": 0.61, "logps_train/chosen": -72.3919677734375, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -82.99869537353516, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7066773176193237, "rewards_train/margins": 1.5084270238876343, "rewards_train/rejected": -2.215104341506958, "step": 2188 }, { "epoch": 0.61, "logps_train/chosen": -92.11982727050781, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -119.62309265136719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.272139072418213, "rewards_train/margins": 1.5995445251464844, "rewards_train/rejected": -2.8716835975646973, "step": 2189 }, { "epoch": 0.61, "learning_rate": 4.31915973392426e-08, "loss": 0.344, "step": 2190 }, { "epoch": 0.61, "logps_train/chosen": -61.91041564941406, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -102.25560760498047, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.4523700177669525, "rewards_train/margins": 2.0759247839450836, "rewards_train/rejected": -2.528294801712036, "step": 2190 }, { "epoch": 0.61, "logps_train/chosen": -70.85047912597656, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -85.49187469482422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0537985563278198, "rewards_train/margins": 2.0175567865371704, "rewards_train/rejected": -3.0713553428649902, "step": 2191 }, { "epoch": 0.61, "learning_rate": 4.2844491218666986e-08, "loss": 0.2707, "step": 2192 }, { "epoch": 0.61, "logps_train/chosen": -86.39334869384766, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -103.11177062988281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6780071258544922, "rewards_train/margins": 1.2112946510314941, "rewards_train/rejected": -2.8893017768859863, "step": 2192 }, { "epoch": 0.61, "logps_train/chosen": -73.32200622558594, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -93.06844329833984, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7441143989562988, "rewards_train/margins": 0.5579445362091064, "rewards_train/rejected": -2.3020589351654053, "step": 2193 }, { "epoch": 0.61, "learning_rate": 4.2498654715200115e-08, "loss": 0.4912, "step": 2194 }, { "epoch": 0.61, "logps_train/chosen": -93.79214477539062, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -104.29537963867188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5503088235855103, "rewards_train/margins": 2.2485655546188354, "rewards_train/rejected": -3.7988743782043457, "step": 2194 }, { "epoch": 0.61, "logps_train/chosen": -78.31034088134766, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -82.18409729003906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.7984168529510498, "rewards_train/margins": 0.5113992691040039, "rewards_train/rejected": -2.3098161220550537, "step": 2195 }, { "epoch": 0.61, "learning_rate": 4.2154089948408855e-08, "loss": 0.45, "step": 2196 }, { "epoch": 0.61, "logps_train/chosen": -49.43216323852539, "logps_train/ref_chosen": -46.5, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -57.50679397583008, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.30405592918395996, "rewards_train/margins": 0.6392014026641846, "rewards_train/rejected": -0.9432573318481445, "step": 2196 }, { "epoch": 0.61, "logps_train/chosen": -84.24931335449219, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -113.93408966064453, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6593858003616333, "rewards_train/margins": 2.4887903928756714, "rewards_train/rejected": -4.148176193237305, "step": 2197 }, { "epoch": 0.61, "learning_rate": 4.181079903006587e-08, "loss": 0.3739, "step": 2198 }, { "epoch": 0.61, "logps_train/chosen": -104.45442199707031, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -123.62088775634766, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6325514316558838, "rewards_train/margins": 1.2787561416625977, "rewards_train/rejected": -2.9113075733184814, "step": 2198 }, { "epoch": 0.61, "logps_train/chosen": -85.09977722167969, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -101.44914245605469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3189617395401, "rewards_train/margins": 1.197046160697937, "rewards_train/rejected": -2.516007900238037, "step": 2199 }, { "epoch": 0.61, "learning_rate": 4.1468784064136424e-08, "loss": 0.3595, "step": 2200 }, { "epoch": 0.61, "logps_train/chosen": -57.185577392578125, "logps_train/ref_chosen": -40.25, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -86.12376403808594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.68418288230896, "rewards_train/margins": 0.493818998336792, "rewards_train/rejected": -2.178001880645752, "step": 2200 }, { "epoch": 0.62, "logps_train/chosen": -63.62837600708008, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -44.75, "logps_train/rejected": -68.22294616699219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9917436838150024, "rewards_train/margins": 1.3664888143539429, "rewards_train/rejected": -2.3582324981689453, "step": 2201 }, { "epoch": 0.62, "learning_rate": 4.112804714676593e-08, "loss": 0.5028, "step": 2202 }, { "epoch": 0.62, "logps_train/chosen": -74.61570739746094, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -86.88809204101562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5113760232925415, "rewards_train/margins": 1.9196211099624634, "rewards_train/rejected": -3.430997133255005, "step": 2202 }, { "epoch": 0.62, "logps_train/chosen": -73.47591400146484, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -123.33708190917969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3405604362487793, "rewards_train/margins": 2.7712721824645996, "rewards_train/rejected": -4.111832618713379, "step": 2203 }, { "epoch": 0.62, "learning_rate": 4.078859036626675e-08, "loss": 0.3244, "step": 2204 }, { "epoch": 0.62, "logps_train/chosen": -78.50559997558594, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -102.9476318359375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0773179531097412, "rewards_train/margins": 1.9715468883514404, "rewards_train/rejected": -3.0488648414611816, "step": 2204 }, { "epoch": 0.62, "logps_train/chosen": -99.85668182373047, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -122.10873413085938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9958239793777466, "rewards_train/margins": 1.4857524633407593, "rewards_train/rejected": -3.481576442718506, "step": 2205 }, { "epoch": 0.62, "learning_rate": 4.045041580310568e-08, "loss": 0.3996, "step": 2206 }, { "epoch": 0.62, "logps_train/chosen": -91.04232788085938, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -128.8021240234375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.47298282384872437, "rewards_train/margins": 2.8220723271369934, "rewards_train/rejected": -3.2950551509857178, "step": 2206 }, { "epoch": 0.62, "logps_train/chosen": -60.48843765258789, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -85.99115753173828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.032559871673584, "rewards_train/margins": 2.402102470397949, "rewards_train/rejected": -3.434662342071533, "step": 2207 }, { "epoch": 0.62, "learning_rate": 4.011352552989081e-08, "loss": 0.2627, "step": 2208 }, { "epoch": 0.62, "logps_train/chosen": -78.32228088378906, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -98.48217010498047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.222853422164917, "rewards_train/margins": 2.1152076721191406, "rewards_train/rejected": -3.3380610942840576, "step": 2208 }, { "epoch": 0.62, "logps_train/chosen": -121.11969757080078, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -110.13211059570312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.695563793182373, "rewards_train/margins": 1.7629599571228027, "rewards_train/rejected": -4.458523750305176, "step": 2209 }, { "epoch": 0.62, "learning_rate": 3.977792161135926e-08, "loss": 0.2788, "step": 2210 }, { "epoch": 0.62, "logps_train/chosen": -84.21195983886719, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -61.0, "logps_train/rejected": -85.9394302368164, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7102582454681396, "rewards_train/margins": 0.7754814624786377, "rewards_train/rejected": -2.4857397079467773, "step": 2210 }, { "epoch": 0.62, "logps_train/chosen": -52.243473052978516, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -56.98064422607422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7221986651420593, "rewards_train/margins": 1.2147327065467834, "rewards_train/rejected": -1.9369313716888428, "step": 2211 }, { "epoch": 0.62, "learning_rate": 3.9443606104364285e-08, "loss": 0.4414, "step": 2212 }, { "epoch": 0.62, "logps_train/chosen": -98.57444763183594, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -121.17247009277344, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -2.9412338733673096, "rewards_train/margins": 0.9576056003570557, "rewards_train/rejected": -3.8988394737243652, "step": 2212 }, { "epoch": 0.62, "logps_train/chosen": -79.18292999267578, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -112.39274597167969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3831367492675781, "rewards_train/margins": 2.326646327972412, "rewards_train/rejected": -3.7097830772399902, "step": 2213 }, { "epoch": 0.62, "learning_rate": 3.911058105786261e-08, "loss": 0.4105, "step": 2214 }, { "epoch": 0.62, "logps_train/chosen": -78.99240112304688, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -100.45001220703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.66408371925354, "rewards_train/margins": 1.2137305736541748, "rewards_train/rejected": -2.877814292907715, "step": 2214 }, { "epoch": 0.62, "logps_train/chosen": -72.00189971923828, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -80.90254974365234, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5917916297912598, "rewards_train/margins": 1.5211198329925537, "rewards_train/rejected": -3.1129114627838135, "step": 2215 }, { "epoch": 0.62, "learning_rate": 3.877884851290206e-08, "loss": 0.3255, "step": 2216 }, { "epoch": 0.62, "logps_train/chosen": -65.60671997070312, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -86.0708999633789, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1294224262237549, "rewards_train/margins": 1.5378239154815674, "rewards_train/rejected": -2.6672463417053223, "step": 2216 }, { "epoch": 0.62, "logps_train/chosen": -64.95185089111328, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -88.27638244628906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1026068925857544, "rewards_train/margins": 1.2574526071548462, "rewards_train/rejected": -2.3600594997406006, "step": 2217 }, { "epoch": 0.62, "learning_rate": 3.844841050260897e-08, "loss": 0.3354, "step": 2218 }, { "epoch": 0.62, "logps_train/chosen": -85.91168212890625, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -114.42323303222656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.7695856094360352, "rewards_train/margins": 1.7224445343017578, "rewards_train/rejected": -3.492030143737793, "step": 2218 }, { "epoch": 0.62, "logps_train/chosen": -51.87825012207031, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -41.5, "logps_train/rejected": -67.33705139160156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8430984616279602, "rewards_train/margins": 1.746954619884491, "rewards_train/rejected": -2.590053081512451, "step": 2219 }, { "epoch": 0.62, "learning_rate": 3.811926905217574e-08, "loss": 0.4107, "step": 2220 }, { "epoch": 0.62, "logps_train/chosen": -91.75385284423828, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -119.37815856933594, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.2437448501586914, "rewards_train/margins": 2.267899513244629, "rewards_train/rejected": -4.51164436340332, "step": 2220 }, { "epoch": 0.62, "logps_train/chosen": -127.87559509277344, "logps_train/ref_chosen": -96.5, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -147.115478515625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -3.1203718185424805, "rewards_train/margins": 2.3958635330200195, "rewards_train/rejected": -5.5162353515625, "step": 2221 }, { "epoch": 0.62, "learning_rate": 3.779142617884823e-08, "loss": 0.3116, "step": 2222 }, { "epoch": 0.62, "logps_train/chosen": -75.1016845703125, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -91.29086303710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.95001220703125, "rewards_train/margins": 1.3796601295471191, "rewards_train/rejected": -3.329672336578369, "step": 2222 }, { "epoch": 0.62, "logps_train/chosen": -82.7794189453125, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -101.03459167480469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0894654989242554, "rewards_train/margins": 1.869852900505066, "rewards_train/rejected": -2.9593183994293213, "step": 2223 }, { "epoch": 0.62, "learning_rate": 3.746488389191371e-08, "loss": 0.2689, "step": 2224 }, { "epoch": 0.62, "logps_train/chosen": -83.6978759765625, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -119.25350189208984, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.9529907703399658, "rewards_train/margins": 2.4247024059295654, "rewards_train/rejected": -4.377693176269531, "step": 2224 }, { "epoch": 0.62, "logps_train/chosen": -36.573944091796875, "logps_train/ref_chosen": -28.0, "logps_train/ref_rejected": -40.25, "logps_train/rejected": -59.56032943725586, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8517303466796875, "rewards_train/margins": 1.0855525732040405, "rewards_train/rejected": -1.937282919883728, "step": 2225 }, { "epoch": 0.62, "learning_rate": 3.7139644192688474e-08, "loss": 0.315, "step": 2226 }, { "epoch": 0.62, "logps_train/chosen": -75.35189819335938, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -86.87653350830078, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.269174337387085, "rewards_train/margins": 1.128244161605835, "rewards_train/rejected": -2.39741849899292, "step": 2226 }, { "epoch": 0.62, "logps_train/chosen": -53.44129943847656, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -81.5665283203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9230359792709351, "rewards_train/margins": 1.8857661485671997, "rewards_train/rejected": -2.8088021278381348, "step": 2227 }, { "epoch": 0.62, "learning_rate": 3.681570907450526e-08, "loss": 0.3966, "step": 2228 }, { "epoch": 0.62, "logps_train/chosen": -97.71565246582031, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -121.66749572753906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7164868116378784, "rewards_train/margins": 2.1275824308395386, "rewards_train/rejected": -3.844069242477417, "step": 2228 }, { "epoch": 0.62, "logps_train/chosen": -59.47755432128906, "logps_train/ref_chosen": -40.5, "logps_train/ref_rejected": -37.25, "logps_train/rejected": -65.89055633544922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8981459140777588, "rewards_train/margins": 0.9686441421508789, "rewards_train/rejected": -2.8667900562286377, "step": 2229 }, { "epoch": 0.62, "learning_rate": 3.6493080522701504e-08, "loss": 0.3777, "step": 2230 }, { "epoch": 0.62, "logps_train/chosen": -97.23069763183594, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -103.3641128540039, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.641624927520752, "rewards_train/margins": 1.6627554893493652, "rewards_train/rejected": -4.304380416870117, "step": 2230 }, { "epoch": 0.62, "logps_train/chosen": -90.7872314453125, "logps_train/ref_chosen": -81.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -83.63240051269531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.999622106552124, "rewards_train/margins": 0.7530707120895386, "rewards_train/rejected": -1.7526928186416626, "step": 2231 }, { "epoch": 0.62, "learning_rate": 3.61717605146068e-08, "loss": 0.4237, "step": 2232 }, { "epoch": 0.62, "logps_train/chosen": -89.6351318359375, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -113.39808654785156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.1514034271240234, "rewards_train/margins": 1.3364527225494385, "rewards_train/rejected": -3.487856149673462, "step": 2232 }, { "epoch": 0.62, "logps_train/chosen": -63.18183898925781, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -69.90095520019531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.025996208190918, "rewards_train/margins": 1.44378662109375, "rewards_train/rejected": -2.469782829284668, "step": 2233 }, { "epoch": 0.62, "learning_rate": 3.585175101953108e-08, "loss": 0.3815, "step": 2234 }, { "epoch": 0.62, "logps_train/chosen": -62.70296859741211, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -79.36196899414062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5584807395935059, "rewards_train/margins": 1.133868932723999, "rewards_train/rejected": -2.692349672317505, "step": 2234 }, { "epoch": 0.62, "logps_train/chosen": -47.91161346435547, "logps_train/ref_chosen": -37.0, "logps_train/ref_rejected": -30.75, "logps_train/rejected": -48.11452102661133, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.0786614418029785, "rewards_train/margins": 0.6613061428070068, "rewards_train/rejected": -1.7399675846099854, "step": 2235 }, { "epoch": 0.62, "learning_rate": 3.553305399875217e-08, "loss": 0.588, "step": 2236 }, { "epoch": 0.62, "logps_train/chosen": -60.3453483581543, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -83.7688217163086, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.336194634437561, "rewards_train/margins": 1.9405409097671509, "rewards_train/rejected": -3.276735544204712, "step": 2236 }, { "epoch": 0.63, "logps_train/chosen": -81.75450134277344, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -99.93232727050781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3727158308029175, "rewards_train/margins": 1.4931734800338745, "rewards_train/rejected": -2.865889310836792, "step": 2237 }, { "epoch": 0.63, "learning_rate": 3.521567140550413e-08, "loss": 0.3125, "step": 2238 }, { "epoch": 0.63, "logps_train/chosen": -98.9923095703125, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -110.40878295898438, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.20220947265625, "rewards_train/margins": 1.3863375186920166, "rewards_train/rejected": -3.5885469913482666, "step": 2238 }, { "epoch": 0.63, "logps_train/chosen": -91.9637451171875, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -111.10643768310547, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3811404705047607, "rewards_train/margins": 2.0095810890197754, "rewards_train/rejected": -3.390721559524536, "step": 2239 }, { "epoch": 0.63, "learning_rate": 3.4899605184965206e-08, "loss": 0.476, "step": 2240 }, { "epoch": 0.63, "logps_train/chosen": -73.35107421875, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -86.94080352783203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6032711267471313, "rewards_train/margins": 1.0263563394546509, "rewards_train/rejected": -2.6296274662017822, "step": 2240 }, { "epoch": 0.63, "logps_train/chosen": -56.36665344238281, "logps_train/ref_chosen": -43.75, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -69.5025634765625, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.2542437314987183, "rewards_train/margins": 0.8428870439529419, "rewards_train/rejected": -2.09713077545166, "step": 2241 }, { "epoch": 0.63, "learning_rate": 3.45848572742456e-08, "loss": 0.5062, "step": 2242 }, { "epoch": 0.63, "logps_train/chosen": -73.39962768554688, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -100.27967834472656, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -2.264767646789551, "rewards_train/margins": 0.7805824279785156, "rewards_train/rejected": -3.0453500747680664, "step": 2242 }, { "epoch": 0.63, "logps_train/chosen": -28.2735595703125, "logps_train/ref_chosen": -22.75, "logps_train/ref_rejected": -27.375, "logps_train/rejected": -42.428741455078125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5486938953399658, "rewards_train/margins": 0.9595122337341309, "rewards_train/rejected": -1.5082061290740967, "step": 2243 }, { "epoch": 0.63, "learning_rate": 3.427142960237609e-08, "loss": 0.4894, "step": 2244 }, { "epoch": 0.63, "logps_train/chosen": -107.04579162597656, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -86.0, "logps_train/rejected": -122.21106719970703, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.6733052730560303, "rewards_train/margins": 0.965355634689331, "rewards_train/rejected": -3.6386609077453613, "step": 2244 }, { "epoch": 0.63, "logps_train/chosen": -63.8718376159668, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -91.53450012207031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.367554783821106, "rewards_train/margins": 2.2419499158859253, "rewards_train/rejected": -3.6095046997070312, "step": 2245 }, { "epoch": 0.63, "learning_rate": 3.395932409029589e-08, "loss": 0.3596, "step": 2246 }, { "epoch": 0.63, "logps_train/chosen": -58.545536041259766, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -67.61285400390625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.3766239881515503, "rewards_train/margins": 0.8612238168716431, "rewards_train/rejected": -2.2378478050231934, "step": 2246 }, { "epoch": 0.63, "logps_train/chosen": -68.09675598144531, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -82.7144546508789, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5113364458084106, "rewards_train/margins": 1.4550312757492065, "rewards_train/rejected": -2.966367721557617, "step": 2247 }, { "epoch": 0.63, "learning_rate": 3.364854265084086e-08, "loss": 0.6393, "step": 2248 }, { "epoch": 0.63, "logps_train/chosen": -85.12940979003906, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -113.14777374267578, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0652844905853271, "rewards_train/margins": 1.9237120151519775, "rewards_train/rejected": -2.9889965057373047, "step": 2248 }, { "epoch": 0.63, "logps_train/chosen": -102.07742309570312, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -126.52885437011719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.695193290710449, "rewards_train/margins": 2.2426528930664062, "rewards_train/rejected": -4.9378461837768555, "step": 2249 }, { "epoch": 0.63, "learning_rate": 3.333908718873191e-08, "loss": 0.3309, "step": 2250 }, { "epoch": 0.63, "logps_train/chosen": -93.69108581542969, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -98.12279510498047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4114912748336792, "rewards_train/margins": 2.0351635217666626, "rewards_train/rejected": -3.446654796600342, "step": 2250 }, { "epoch": 0.63, "logps_train/chosen": -108.47863006591797, "logps_train/ref_chosen": -99.0, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -128.9418182373047, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9377063512802124, "rewards_train/margins": 2.1350878477096558, "rewards_train/rejected": -3.072794198989868, "step": 2251 }, { "epoch": 0.63, "learning_rate": 3.303095960056332e-08, "loss": 0.2968, "step": 2252 }, { "epoch": 0.63, "logps_train/chosen": -120.46194458007812, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -152.6195831298828, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.637600898742676, "rewards_train/margins": 3.0729904174804688, "rewards_train/rejected": -5.7105913162231445, "step": 2252 }, { "epoch": 0.63, "logps_train/chosen": -95.0181655883789, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -98.46448516845703, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.1252541542053223, "rewards_train/margins": 0.46494388580322266, "rewards_train/rejected": -2.590198040008545, "step": 2253 }, { "epoch": 0.63, "learning_rate": 3.2724161774791146e-08, "loss": 0.4819, "step": 2254 }, { "epoch": 0.63, "logps_train/chosen": -41.65080642700195, "logps_train/ref_chosen": -30.25, "logps_train/ref_rejected": -29.875, "logps_train/rejected": -45.3927001953125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1373463869094849, "rewards_train/margins": 0.4144233465194702, "rewards_train/rejected": -1.551769733428955, "step": 2254 }, { "epoch": 0.63, "logps_train/chosen": -95.94683074951172, "logps_train/ref_chosen": -85.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -106.769287109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0665581226348877, "rewards_train/margins": 2.7142765522003174, "rewards_train/rejected": -3.780834674835205, "step": 2255 }, { "epoch": 0.63, "learning_rate": 3.241869559172136e-08, "loss": 0.3635, "step": 2256 }, { "epoch": 0.63, "logps_train/chosen": -70.94738006591797, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -120.03895568847656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1283314228057861, "rewards_train/margins": 2.3808376789093018, "rewards_train/rejected": -3.509169101715088, "step": 2256 }, { "epoch": 0.63, "logps_train/chosen": -66.00940704345703, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -86.06417083740234, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.513245701789856, "rewards_train/margins": 1.0896552801132202, "rewards_train/rejected": -2.602900981903076, "step": 2257 }, { "epoch": 0.63, "learning_rate": 3.211456292349876e-08, "loss": 0.4352, "step": 2258 }, { "epoch": 0.63, "logps_train/chosen": -61.45050811767578, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -93.33035278320312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3827158212661743, "rewards_train/margins": 1.8770774602890015, "rewards_train/rejected": -3.259793281555176, "step": 2258 }, { "epoch": 0.63, "logps_train/chosen": -63.052833557128906, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -91.94204711914062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4310648441314697, "rewards_train/margins": 1.5191943645477295, "rewards_train/rejected": -2.950259208679199, "step": 2259 }, { "epoch": 0.63, "learning_rate": 3.1811765634095265e-08, "loss": 0.3445, "step": 2260 }, { "epoch": 0.63, "logps_train/chosen": -48.07798385620117, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -86.21844482421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.585923433303833, "rewards_train/margins": 2.124983310699463, "rewards_train/rejected": -2.710906744003296, "step": 2260 }, { "epoch": 0.63, "logps_train/chosen": -31.856260299682617, "logps_train/ref_chosen": -28.25, "logps_train/ref_rejected": -34.0, "logps_train/rejected": -49.76077651977539, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.362432599067688, "rewards_train/margins": 1.201340675354004, "rewards_train/rejected": -1.563773274421692, "step": 2261 }, { "epoch": 0.63, "learning_rate": 3.151030557929829e-08, "loss": 0.2969, "step": 2262 }, { "epoch": 0.63, "logps_train/chosen": -117.81269073486328, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -151.4298095703125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.3849802017211914, "rewards_train/margins": 2.5712828636169434, "rewards_train/rejected": -4.956263065338135, "step": 2262 }, { "epoch": 0.63, "logps_train/chosen": -94.64985656738281, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -114.19483947753906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.7395955324172974, "rewards_train/margins": 2.332231879234314, "rewards_train/rejected": -4.071827411651611, "step": 2263 }, { "epoch": 0.63, "learning_rate": 3.121018460669986e-08, "loss": 0.3789, "step": 2264 }, { "epoch": 0.63, "logps_train/chosen": -64.57235717773438, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -70.3760986328125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.2156345844268799, "rewards_train/margins": 0.9360380172729492, "rewards_train/rejected": -2.151672601699829, "step": 2264 }, { "epoch": 0.63, "logps_train/chosen": -98.49313354492188, "logps_train/ref_chosen": -78.5, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -132.10076904296875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9899380207061768, "rewards_train/margins": 2.3795154094696045, "rewards_train/rejected": -4.369453430175781, "step": 2265 }, { "epoch": 0.63, "learning_rate": 3.091140455568489e-08, "loss": 0.3682, "step": 2266 }, { "epoch": 0.63, "logps_train/chosen": -131.7357635498047, "logps_train/ref_chosen": -117.0, "logps_train/ref_rejected": -120.0, "logps_train/rejected": -152.8302459716797, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4923266172409058, "rewards_train/margins": 1.803197979927063, "rewards_train/rejected": -3.2955245971679688, "step": 2266 }, { "epoch": 0.63, "logps_train/chosen": -53.98360061645508, "logps_train/ref_chosen": -41.25, "logps_train/ref_rejected": -46.5, "logps_train/rejected": -67.46401977539062, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.2785238027572632, "rewards_train/margins": 0.8058539628982544, "rewards_train/rejected": -2.0843777656555176, "step": 2267 }, { "epoch": 0.63, "learning_rate": 3.0613967257420074e-08, "loss": 0.3973, "step": 2268 }, { "epoch": 0.63, "logps_train/chosen": -65.583251953125, "logps_train/ref_chosen": -52.0, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -86.12519073486328, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3612544536590576, "rewards_train/margins": 0.8661086559295654, "rewards_train/rejected": -2.227363109588623, "step": 2268 }, { "epoch": 0.63, "logps_train/chosen": -42.63085174560547, "logps_train/ref_chosen": -32.75, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -76.93778228759766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9796866774559021, "rewards_train/margins": 2.458622992038727, "rewards_train/rejected": -3.438309669494629, "step": 2269 }, { "epoch": 0.63, "learning_rate": 3.031787453484255e-08, "loss": 0.382, "step": 2270 }, { "epoch": 0.63, "logps_train/chosen": -84.20780944824219, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -108.0483169555664, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3858203887939453, "rewards_train/margins": 2.2135426998138428, "rewards_train/rejected": -3.599363088607788, "step": 2270 }, { "epoch": 0.63, "logps_train/chosen": -41.86841583251953, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -80.5899658203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6313731670379639, "rewards_train/margins": 2.1152215003967285, "rewards_train/rejected": -2.7465946674346924, "step": 2271 }, { "epoch": 0.63, "learning_rate": 3.002312820264893e-08, "loss": 0.2475, "step": 2272 }, { "epoch": 0.63, "logps_train/chosen": -97.06907653808594, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -110.30889892578125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.563938617706299, "rewards_train/margins": 0.9935135841369629, "rewards_train/rejected": -3.5574522018432617, "step": 2272 }, { "epoch": 0.64, "logps_train/chosen": -83.07379150390625, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -110.21211242675781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3450748920440674, "rewards_train/margins": 2.406604528427124, "rewards_train/rejected": -3.7516794204711914, "step": 2273 }, { "epoch": 0.64, "learning_rate": 2.972973006728399e-08, "loss": 0.4133, "step": 2274 }, { "epoch": 0.64, "logps_train/chosen": -98.0738754272461, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -142.01951599121094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5839498043060303, "rewards_train/margins": 2.246126651763916, "rewards_train/rejected": -3.8300764560699463, "step": 2274 }, { "epoch": 0.64, "logps_train/chosen": -52.19287872314453, "logps_train/ref_chosen": -43.5, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -72.36558532714844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8724128603935242, "rewards_train/margins": 1.1766451001167297, "rewards_train/rejected": -2.049057960510254, "step": 2275 }, { "epoch": 0.64, "learning_rate": 2.943768192692958e-08, "loss": 0.2993, "step": 2276 }, { "epoch": 0.64, "logps_train/chosen": -38.568092346191406, "logps_train/ref_chosen": -26.875, "logps_train/ref_rejected": -35.0, "logps_train/rejected": -53.184696197509766, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1704810857772827, "rewards_train/margins": 0.6430573463439941, "rewards_train/rejected": -1.8135384321212769, "step": 2276 }, { "epoch": 0.64, "logps_train/chosen": -100.71672058105469, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -111.54247283935547, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.1747965812683105, "rewards_train/margins": 0.7583575248718262, "rewards_train/rejected": -2.9331541061401367, "step": 2277 }, { "epoch": 0.64, "learning_rate": 2.914698557149381e-08, "loss": 0.5478, "step": 2278 }, { "epoch": 0.64, "logps_train/chosen": -59.154361724853516, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -64.31646728515625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8205142021179199, "rewards_train/margins": 0.7283201217651367, "rewards_train/rejected": -1.5488343238830566, "step": 2278 }, { "epoch": 0.64, "logps_train/chosen": -69.17066955566406, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -92.26918029785156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7604263424873352, "rewards_train/margins": 1.5942253470420837, "rewards_train/rejected": -2.354651689529419, "step": 2279 }, { "epoch": 0.64, "learning_rate": 2.885764278259989e-08, "loss": 0.4812, "step": 2280 }, { "epoch": 0.64, "logps_train/chosen": -80.02699279785156, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -114.78129577636719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8563127517700195, "rewards_train/margins": 2.429922580718994, "rewards_train/rejected": -4.286235332489014, "step": 2280 }, { "epoch": 0.64, "logps_train/chosen": -82.1644058227539, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -143.25636291503906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4097998142242432, "rewards_train/margins": 3.8181798458099365, "rewards_train/rejected": -5.22797966003418, "step": 2281 }, { "epoch": 0.64, "learning_rate": 2.8569655333575388e-08, "loss": 0.2186, "step": 2282 }, { "epoch": 0.64, "logps_train/chosen": -85.30421447753906, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -104.56698608398438, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.151515007019043, "rewards_train/margins": 1.4945387840270996, "rewards_train/rejected": -2.6460537910461426, "step": 2282 }, { "epoch": 0.64, "logps_train/chosen": -70.06663513183594, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -89.6580810546875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.3867413997650146, "rewards_train/margins": 1.272425889968872, "rewards_train/rejected": -2.6591672897338867, "step": 2283 }, { "epoch": 0.64, "learning_rate": 2.8283024989441017e-08, "loss": 0.4312, "step": 2284 }, { "epoch": 0.64, "logps_train/chosen": -38.46830749511719, "logps_train/ref_chosen": -28.75, "logps_train/ref_rejected": -36.5, "logps_train/rejected": -53.23004150390625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9751753807067871, "rewards_train/margins": 0.7005630731582642, "rewards_train/rejected": -1.6757384538650513, "step": 2284 }, { "epoch": 0.64, "logps_train/chosen": -56.60980987548828, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -37.0, "logps_train/rejected": -60.609840393066406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1805121898651123, "rewards_train/margins": 1.1748082637786865, "rewards_train/rejected": -2.355320453643799, "step": 2285 }, { "epoch": 0.64, "learning_rate": 2.7997753506900284e-08, "loss": 0.4174, "step": 2286 }, { "epoch": 0.64, "logps_train/chosen": -61.47783279418945, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -83.35697174072266, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1370413303375244, "rewards_train/margins": 1.5540273189544678, "rewards_train/rejected": -2.691068649291992, "step": 2286 }, { "epoch": 0.64, "logps_train/chosen": -87.39089965820312, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -80.72539520263672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5031523108482361, "rewards_train/margins": 0.8396994471549988, "rewards_train/rejected": -1.3428517580032349, "step": 2287 }, { "epoch": 0.64, "learning_rate": 2.771384263432838e-08, "loss": 0.4489, "step": 2288 }, { "epoch": 0.64, "logps_train/chosen": -72.82838439941406, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -92.01563262939453, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.4658713340759277, "rewards_train/margins": 1.5831775665283203, "rewards_train/rejected": -3.049048900604248, "step": 2288 }, { "epoch": 0.64, "logps_train/chosen": -95.0842514038086, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -117.84736633300781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.3092551231384277, "rewards_train/margins": 1.1809983253479004, "rewards_train/rejected": -3.490253448486328, "step": 2289 }, { "epoch": 0.64, "learning_rate": 2.7431294111761538e-08, "loss": 0.4206, "step": 2290 }, { "epoch": 0.64, "logps_train/chosen": -35.47691345214844, "logps_train/ref_chosen": -29.375, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -67.27271270751953, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6105333566665649, "rewards_train/margins": 2.0660542249679565, "rewards_train/rejected": -2.6765875816345215, "step": 2290 }, { "epoch": 0.64, "logps_train/chosen": -85.71358489990234, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -104.96987915039062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.313155174255371, "rewards_train/margins": 0.964301586151123, "rewards_train/rejected": -3.277456760406494, "step": 2291 }, { "epoch": 0.64, "learning_rate": 2.715010967088646e-08, "loss": 0.3828, "step": 2292 }, { "epoch": 0.64, "logps_train/chosen": -105.21588134765625, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -143.63766479492188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5610418319702148, "rewards_train/margins": 2.458975315093994, "rewards_train/rejected": -4.020017147064209, "step": 2292 }, { "epoch": 0.64, "logps_train/chosen": -55.39488983154297, "logps_train/ref_chosen": -49.75, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -65.77626037597656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5595819354057312, "rewards_train/margins": 1.0641135573387146, "rewards_train/rejected": -1.6236954927444458, "step": 2293 }, { "epoch": 0.64, "learning_rate": 2.6870291035029718e-08, "loss": 0.3276, "step": 2294 }, { "epoch": 0.64, "logps_train/chosen": -81.55626678466797, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -84.76385498046875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8581660985946655, "rewards_train/margins": 0.8850165605545044, "rewards_train/rejected": -2.74318265914917, "step": 2294 }, { "epoch": 0.64, "logps_train/chosen": -64.18229675292969, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -48.0, "logps_train/rejected": -75.52268981933594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6174484491348267, "rewards_train/margins": 1.1398991346359253, "rewards_train/rejected": -2.757347583770752, "step": 2295 }, { "epoch": 0.64, "learning_rate": 2.659183991914696e-08, "loss": 0.4717, "step": 2296 }, { "epoch": 0.64, "logps_train/chosen": -68.45669555664062, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -58.75, "logps_train/rejected": -81.3840103149414, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.764907956123352, "rewards_train/margins": 0.494195818901062, "rewards_train/rejected": -2.259103775024414, "step": 2296 }, { "epoch": 0.64, "logps_train/chosen": -85.42061614990234, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -102.02793884277344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0028035640716553, "rewards_train/margins": 2.16424822807312, "rewards_train/rejected": -3.1670517921447754, "step": 2297 }, { "epoch": 0.64, "learning_rate": 2.631475802981267e-08, "loss": 0.3543, "step": 2298 }, { "epoch": 0.64, "logps_train/chosen": -123.10936737060547, "logps_train/ref_chosen": -95.5, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -124.00788116455078, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.76747989654541, "rewards_train/margins": 0.8004956245422363, "rewards_train/rejected": -3.5679755210876465, "step": 2298 }, { "epoch": 0.64, "logps_train/chosen": -112.47196197509766, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -92.0, "logps_train/rejected": -130.05885314941406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.2675087451934814, "rewards_train/margins": 1.5563452243804932, "rewards_train/rejected": -3.8238539695739746, "step": 2299 }, { "epoch": 0.64, "learning_rate": 2.6039047065209567e-08, "loss": 0.5269, "step": 2300 }, { "epoch": 0.64, "logps_train/chosen": -97.63093566894531, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -108.96878814697266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.0855538845062256, "rewards_train/margins": 1.4871065616607666, "rewards_train/rejected": -3.572660446166992, "step": 2300 }, { "epoch": 0.64, "logps_train/chosen": -69.18180084228516, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -56.75, "logps_train/rejected": -91.53018188476562, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.5590007305145264, "rewards_train/margins": 1.916283130645752, "rewards_train/rejected": -3.4752838611602783, "step": 2301 }, { "epoch": 0.64, "learning_rate": 2.576470871511832e-08, "loss": 0.4149, "step": 2302 }, { "epoch": 0.64, "logps_train/chosen": -68.69903564453125, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -55.75, "logps_train/rejected": -76.16432189941406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1012508869171143, "rewards_train/margins": 0.9362752437591553, "rewards_train/rejected": -2.0375261306762695, "step": 2302 }, { "epoch": 0.64, "logps_train/chosen": -48.255455017089844, "logps_train/ref_chosen": -36.0, "logps_train/ref_rejected": -45.75, "logps_train/rejected": -77.14627075195312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.226326823234558, "rewards_train/margins": 1.9146674871444702, "rewards_train/rejected": -3.1409943103790283, "step": 2303 }, { "epoch": 0.64, "learning_rate": 2.5491744660906922e-08, "loss": 0.4538, "step": 2304 }, { "epoch": 0.64, "logps_train/chosen": -41.622154235839844, "logps_train/ref_chosen": -30.0, "logps_train/ref_rejected": -42.75, "logps_train/rejected": -65.2881851196289, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1606160402297974, "rewards_train/margins": 1.097267985343933, "rewards_train/rejected": -2.2578840255737305, "step": 2304 }, { "epoch": 0.64, "logps_train/chosen": -86.71805572509766, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -108.43025207519531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6573522090911865, "rewards_train/margins": 1.0544235706329346, "rewards_train/rejected": -2.711775779724121, "step": 2305 }, { "epoch": 0.64, "learning_rate": 2.522015657552068e-08, "loss": 0.5079, "step": 2306 }, { "epoch": 0.64, "logps_train/chosen": -79.84477233886719, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -133.5365447998047, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.478226900100708, "rewards_train/margins": 2.4129269123077393, "rewards_train/rejected": -3.8911538124084473, "step": 2306 }, { "epoch": 0.64, "logps_train/chosen": -52.64592742919922, "logps_train/ref_chosen": -43.75, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -72.86075592041016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8976004719734192, "rewards_train/margins": 0.7845690846443176, "rewards_train/rejected": -1.6821695566177368, "step": 2307 }, { "epoch": 0.65, "learning_rate": 2.4949946123471928e-08, "loss": 0.4362, "step": 2308 }, { "epoch": 0.65, "logps_train/chosen": -112.13893127441406, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -119.48912811279297, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8115500211715698, "rewards_train/margins": 1.4623628854751587, "rewards_train/rejected": -3.2739129066467285, "step": 2308 }, { "epoch": 0.65, "logps_train/chosen": -77.41275024414062, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -108.7606201171875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5029933452606201, "rewards_train/margins": 1.9535377025604248, "rewards_train/rejected": -3.456531047821045, "step": 2309 }, { "epoch": 0.65, "learning_rate": 2.468111496082953e-08, "loss": 0.393, "step": 2310 }, { "epoch": 0.65, "logps_train/chosen": -76.18523406982422, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -59.75, "logps_train/rejected": -86.7576904296875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3306331634521484, "rewards_train/margins": 1.3666200637817383, "rewards_train/rejected": -2.6972532272338867, "step": 2310 }, { "epoch": 0.65, "logps_train/chosen": -74.35615539550781, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -85.40081787109375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.3793771266937256, "rewards_train/margins": 0.906029462814331, "rewards_train/rejected": -3.2854065895080566, "step": 2311 }, { "epoch": 0.65, "learning_rate": 2.441366473520909e-08, "loss": 0.438, "step": 2312 }, { "epoch": 0.65, "logps_train/chosen": -84.054931640625, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -111.26505279541016, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.768969178199768, "rewards_train/margins": 2.6528486013412476, "rewards_train/rejected": -4.421817779541016, "step": 2312 }, { "epoch": 0.65, "logps_train/chosen": -97.89682006835938, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -88.00159454345703, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3671231269836426, "rewards_train/margins": 1.2402629852294922, "rewards_train/rejected": -2.6073861122131348, "step": 2313 }, { "epoch": 0.65, "learning_rate": 2.414759708576272e-08, "loss": 0.4338, "step": 2314 }, { "epoch": 0.65, "logps_train/chosen": -85.98214721679688, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -108.347900390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1966524124145508, "rewards_train/margins": 2.472512722015381, "rewards_train/rejected": -3.6691651344299316, "step": 2314 }, { "epoch": 0.65, "logps_train/chosen": -101.44063568115234, "logps_train/ref_chosen": -91.5, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -105.71339416503906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9948445558547974, "rewards_train/margins": 1.640947937965393, "rewards_train/rejected": -2.6357924938201904, "step": 2315 }, { "epoch": 0.65, "learning_rate": 2.3882913643168996e-08, "loss": 0.2656, "step": 2316 }, { "epoch": 0.65, "logps_train/chosen": -54.69548034667969, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -73.16571044921875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.041813850402832, "rewards_train/margins": 1.148585557937622, "rewards_train/rejected": -2.190399408340454, "step": 2316 }, { "epoch": 0.65, "logps_train/chosen": -82.53874969482422, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -101.69741821289062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4109060764312744, "rewards_train/margins": 1.3307111263275146, "rewards_train/rejected": -2.741617202758789, "step": 2317 }, { "epoch": 0.65, "learning_rate": 2.3619616029622786e-08, "loss": 0.378, "step": 2318 }, { "epoch": 0.65, "logps_train/chosen": -59.0460205078125, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -53.0, "logps_train/rejected": -77.77056121826172, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1014772653579712, "rewards_train/margins": 1.3862236738204956, "rewards_train/rejected": -2.487700939178467, "step": 2318 }, { "epoch": 0.65, "logps_train/chosen": -106.45872497558594, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -128.98675537109375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.2654037475585938, "rewards_train/margins": 1.5731151103973389, "rewards_train/rejected": -3.8385188579559326, "step": 2319 }, { "epoch": 0.65, "learning_rate": 2.3357705858825626e-08, "loss": 0.3599, "step": 2320 }, { "epoch": 0.65, "logps_train/chosen": -54.24959182739258, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -36.75, "logps_train/rejected": -51.97197341918945, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6323809027671814, "rewards_train/margins": 0.8976289629936218, "rewards_train/rejected": -1.5300098657608032, "step": 2320 }, { "epoch": 0.65, "logps_train/chosen": -85.1417236328125, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -86.06590270996094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1801878213882446, "rewards_train/margins": 1.1674178838729858, "rewards_train/rejected": -2.3476057052612305, "step": 2321 }, { "epoch": 0.65, "learning_rate": 2.3097184735975715e-08, "loss": 0.4628, "step": 2322 }, { "epoch": 0.65, "logps_train/chosen": -50.962223052978516, "logps_train/ref_chosen": -41.25, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -63.62197494506836, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9616520404815674, "rewards_train/margins": 0.4386312961578369, "rewards_train/rejected": -1.4002833366394043, "step": 2322 }, { "epoch": 0.65, "logps_train/chosen": -45.97882843017578, "logps_train/ref_chosen": -37.25, "logps_train/ref_rejected": -34.0, "logps_train/rejected": -46.01702880859375, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.8758121132850647, "rewards_train/margins": 0.31807857751846313, "rewards_train/rejected": -1.1938906908035278, "step": 2323 }, { "epoch": 0.65, "learning_rate": 2.283805425775784e-08, "loss": 0.6033, "step": 2324 }, { "epoch": 0.65, "logps_train/chosen": -80.25047302246094, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -53.75, "logps_train/rejected": -84.49087524414062, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.0461416244506836, "rewards_train/margins": 1.0320472717285156, "rewards_train/rejected": -3.078188896179199, "step": 2324 }, { "epoch": 0.65, "logps_train/chosen": -33.4067497253418, "logps_train/ref_chosen": -26.125, "logps_train/ref_rejected": -34.0, "logps_train/rejected": -52.737266540527344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7299327254295349, "rewards_train/margins": 1.1340284943580627, "rewards_train/rejected": -1.8639612197875977, "step": 2325 }, { "epoch": 0.65, "learning_rate": 2.2580316012333983e-08, "loss": 0.3952, "step": 2326 }, { "epoch": 0.65, "logps_train/chosen": -63.234134674072266, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -86.70317077636719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8007569909095764, "rewards_train/margins": 2.040653169155121, "rewards_train/rejected": -2.8414101600646973, "step": 2326 }, { "epoch": 0.65, "logps_train/chosen": -111.7594985961914, "logps_train/ref_chosen": -98.5, "logps_train/ref_rejected": -113.0, "logps_train/rejected": -152.4572296142578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3231786489486694, "rewards_train/margins": 2.63852322101593, "rewards_train/rejected": -3.9617018699645996, "step": 2327 }, { "epoch": 0.65, "learning_rate": 2.232397157933333e-08, "loss": 0.2995, "step": 2328 }, { "epoch": 0.65, "logps_train/chosen": -112.3330307006836, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -154.49700927734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -3.4161157608032227, "rewards_train/margins": 1.9484291076660156, "rewards_train/rejected": -5.364544868469238, "step": 2328 }, { "epoch": 0.65, "logps_train/chosen": -59.38545227050781, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -55.0, "logps_train/rejected": -82.02438354492188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3947954177856445, "rewards_train/margins": 1.3133068084716797, "rewards_train/rejected": -2.708102226257324, "step": 2329 }, { "epoch": 0.65, "learning_rate": 2.206902252984266e-08, "loss": 0.3211, "step": 2330 }, { "epoch": 0.65, "logps_train/chosen": -99.88095092773438, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -94.0, "logps_train/rejected": -132.04046630859375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6930516958236694, "rewards_train/margins": 2.106575608253479, "rewards_train/rejected": -3.7996273040771484, "step": 2330 }, { "epoch": 0.65, "logps_train/chosen": -91.5267333984375, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -124.65767669677734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4155640602111816, "rewards_train/margins": 2.719344139099121, "rewards_train/rejected": -4.134908199310303, "step": 2331 }, { "epoch": 0.65, "learning_rate": 2.1815470426396615e-08, "loss": 0.3932, "step": 2332 }, { "epoch": 0.65, "logps_train/chosen": -46.769317626953125, "logps_train/ref_chosen": -41.5, "logps_train/ref_rejected": -51.75, "logps_train/rejected": -72.10181427001953, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5208772420883179, "rewards_train/margins": 1.517428994178772, "rewards_train/rejected": -2.03830623626709, "step": 2332 }, { "epoch": 0.65, "logps_train/chosen": -58.29115295410156, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -41.5, "logps_train/rejected": -62.80207061767578, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9801895618438721, "rewards_train/margins": 1.1518728733062744, "rewards_train/rejected": -2.1320624351501465, "step": 2333 }, { "epoch": 0.65, "learning_rate": 2.1563316822968332e-08, "loss": 0.3587, "step": 2334 }, { "epoch": 0.65, "logps_train/chosen": -116.30047607421875, "logps_train/ref_chosen": -93.5, "logps_train/ref_rejected": -113.5, "logps_train/rejected": -161.61383056640625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.290984869003296, "rewards_train/margins": 2.540710210800171, "rewards_train/rejected": -4.831695079803467, "step": 2334 }, { "epoch": 0.65, "logps_train/chosen": -89.87202453613281, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -99.28591918945312, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.212398052215576, "rewards_train/margins": 1.0849440097808838, "rewards_train/rejected": -3.29734206199646, "step": 2335 }, { "epoch": 0.65, "learning_rate": 2.1312563264959837e-08, "loss": 0.6399, "step": 2336 }, { "epoch": 0.65, "logps_train/chosen": -69.43641662597656, "logps_train/ref_chosen": -50.5, "logps_train/ref_rejected": -57.25, "logps_train/rejected": -90.65304565429688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.901381015777588, "rewards_train/margins": 1.4276933670043945, "rewards_train/rejected": -3.3290743827819824, "step": 2336 }, { "epoch": 0.65, "logps_train/chosen": -66.98248291015625, "logps_train/ref_chosen": -56.25, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -120.43529510498047, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0648493766784668, "rewards_train/margins": 2.090008020401001, "rewards_train/rejected": -3.1548573970794678, "step": 2337 }, { "epoch": 0.65, "learning_rate": 2.1063211289192363e-08, "loss": 0.3742, "step": 2338 }, { "epoch": 0.65, "logps_train/chosen": -37.199623107910156, "logps_train/ref_chosen": -32.75, "logps_train/ref_rejected": -34.0, "logps_train/rejected": -48.03995132446289, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.44222787022590637, "rewards_train/margins": 0.9666502177715302, "rewards_train/rejected": -1.4088780879974365, "step": 2338 }, { "epoch": 0.65, "logps_train/chosen": -120.96432495117188, "logps_train/ref_chosen": -106.0, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -147.673828125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.485103964805603, "rewards_train/margins": 2.4358195066452026, "rewards_train/rejected": -3.9209234714508057, "step": 2339 }, { "epoch": 0.65, "learning_rate": 2.081526242389728e-08, "loss": 0.3163, "step": 2340 }, { "epoch": 0.65, "logps_train/chosen": -61.40909957885742, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -67.73588562011719, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2549724578857422, "rewards_train/margins": 0.6457642316818237, "rewards_train/rejected": -1.900736689567566, "step": 2340 }, { "epoch": 0.65, "logps_train/chosen": -65.85340881347656, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -59.5, "logps_train/rejected": -85.075439453125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.009070873260498, "rewards_train/margins": 1.5371451377868652, "rewards_train/rejected": -2.5462160110473633, "step": 2341 }, { "epoch": 0.65, "learning_rate": 2.056871818870648e-08, "loss": 0.4376, "step": 2342 }, { "epoch": 0.65, "logps_train/chosen": -70.17825317382812, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -70.79692077636719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4225132465362549, "rewards_train/margins": 1.450049877166748, "rewards_train/rejected": -1.872563123703003, "step": 2342 }, { "epoch": 0.65, "logps_train/chosen": -89.3247299194336, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -121.79609680175781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.335378646850586, "rewards_train/margins": 1.4989194869995117, "rewards_train/rejected": -3.8342981338500977, "step": 2343 }, { "epoch": 0.66, "learning_rate": 2.0323580094643215e-08, "loss": 0.2979, "step": 2344 }, { "epoch": 0.66, "logps_train/chosen": -104.042236328125, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -74.5, "logps_train/rejected": -115.4459228515625, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -2.83164119720459, "rewards_train/margins": 1.2770862579345703, "rewards_train/rejected": -4.10872745513916, "step": 2344 }, { "epoch": 0.66, "logps_train/chosen": -87.92204284667969, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -105.17977142333984, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.6062674522399902, "rewards_train/margins": 1.1886749267578125, "rewards_train/rejected": -3.7949423789978027, "step": 2345 }, { "epoch": 0.66, "learning_rate": 2.0079849644112636e-08, "loss": 0.4891, "step": 2346 }, { "epoch": 0.66, "logps_train/chosen": -62.77349090576172, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -90.38088989257812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8043019771575928, "rewards_train/margins": 1.4115214347839355, "rewards_train/rejected": -3.2158234119415283, "step": 2346 }, { "epoch": 0.66, "logps_train/chosen": -81.14441680908203, "logps_train/ref_chosen": -59.25, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -125.37225341796875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.182410717010498, "rewards_train/margins": 1.6298151016235352, "rewards_train/rejected": -3.812225818634033, "step": 2347 }, { "epoch": 0.66, "learning_rate": 1.9837528330892776e-08, "loss": 0.4502, "step": 2348 }, { "epoch": 0.66, "logps_train/chosen": -62.4871711730957, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -54.25, "logps_train/rejected": -81.77409362792969, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9696152806282043, "rewards_train/margins": 1.782012164592743, "rewards_train/rejected": -2.7516274452209473, "step": 2348 }, { "epoch": 0.66, "logps_train/chosen": -102.28630065917969, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -116.01897430419922, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.920329213142395, "rewards_train/margins": 1.644849181175232, "rewards_train/rejected": -3.565178394317627, "step": 2349 }, { "epoch": 0.66, "learning_rate": 1.9596617640125463e-08, "loss": 0.4125, "step": 2350 }, { "epoch": 0.66, "logps_train/chosen": -144.18722534179688, "logps_train/ref_chosen": -105.5, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -146.3778076171875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -3.8687233924865723, "rewards_train/margins": 1.2151508331298828, "rewards_train/rejected": -5.083874225616455, "step": 2350 }, { "epoch": 0.66, "logps_train/chosen": -72.83572387695312, "logps_train/ref_chosen": -48.0, "logps_train/ref_rejected": -44.75, "logps_train/rejected": -71.47396850585938, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -2.483181953430176, "rewards_train/margins": 0.19907808303833008, "rewards_train/rejected": -2.682260036468506, "step": 2351 }, { "epoch": 0.66, "learning_rate": 1.935711904830681e-08, "loss": 0.6555, "step": 2352 }, { "epoch": 0.66, "logps_train/chosen": -60.31793212890625, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -80.93270874023438, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.746246337890625, "rewards_train/margins": 1.3808135986328125, "rewards_train/rejected": -3.1270599365234375, "step": 2352 }, { "epoch": 0.66, "logps_train/chosen": -79.99787902832031, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -116.02961730957031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.081282377243042, "rewards_train/margins": 2.1707513332366943, "rewards_train/rejected": -3.2520337104797363, "step": 2353 }, { "epoch": 0.66, "learning_rate": 1.9119034023278634e-08, "loss": 0.3205, "step": 2354 }, { "epoch": 0.66, "logps_train/chosen": -70.44905853271484, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -94.92647552490234, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1918785572052002, "rewards_train/margins": 1.392761468887329, "rewards_train/rejected": -2.5846400260925293, "step": 2354 }, { "epoch": 0.66, "logps_train/chosen": -52.95862579345703, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -82.2431640625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6912485361099243, "rewards_train/margins": 2.4679068326950073, "rewards_train/rejected": -3.1591553688049316, "step": 2355 }, { "epoch": 0.66, "learning_rate": 1.8882364024219306e-08, "loss": 0.367, "step": 2356 }, { "epoch": 0.66, "logps_train/chosen": -85.4339599609375, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -104.77973175048828, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.910583257675171, "rewards_train/margins": 1.3658277988433838, "rewards_train/rejected": -3.2764110565185547, "step": 2356 }, { "epoch": 0.66, "logps_train/chosen": -79.79117584228516, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -101.0180435180664, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8884929418563843, "rewards_train/margins": 0.6673704385757446, "rewards_train/rejected": -2.555863380432129, "step": 2357 }, { "epoch": 0.66, "learning_rate": 1.864711050163456e-08, "loss": 0.5869, "step": 2358 }, { "epoch": 0.66, "logps_train/chosen": -47.60871124267578, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -56.75, "logps_train/rejected": -82.54953002929688, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.899396538734436, "rewards_train/margins": 1.6905664205551147, "rewards_train/rejected": -2.589962959289551, "step": 2358 }, { "epoch": 0.66, "logps_train/chosen": -54.427406311035156, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -90.47377014160156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6716465950012207, "rewards_train/margins": 2.7245097160339355, "rewards_train/rejected": -3.3961563110351562, "step": 2359 }, { "epoch": 0.66, "learning_rate": 1.8413274897349036e-08, "loss": 0.3164, "step": 2360 }, { "epoch": 0.66, "logps_train/chosen": -74.3843994140625, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -42.25, "logps_train/rejected": -60.552337646484375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1689088344573975, "rewards_train/margins": 0.6492154598236084, "rewards_train/rejected": -1.8181242942810059, "step": 2360 }, { "epoch": 0.66, "logps_train/chosen": -75.90766906738281, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -88.56928253173828, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.0731887817382812, "rewards_train/margins": 1.0165519714355469, "rewards_train/rejected": -3.089740753173828, "step": 2361 }, { "epoch": 0.66, "learning_rate": 1.818085864449709e-08, "loss": 0.4651, "step": 2362 }, { "epoch": 0.66, "logps_train/chosen": -50.30708312988281, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -39.5, "logps_train/rejected": -56.24629211425781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.228315830230713, "rewards_train/margins": 0.4563720226287842, "rewards_train/rejected": -1.684687852859497, "step": 2362 }, { "epoch": 0.66, "logps_train/chosen": -54.42514419555664, "logps_train/ref_chosen": -43.0, "logps_train/ref_rejected": -39.25, "logps_train/rejected": -63.159305572509766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1516941785812378, "rewards_train/margins": 1.2406038045883179, "rewards_train/rejected": -2.3922979831695557, "step": 2363 }, { "epoch": 0.66, "learning_rate": 1.7949863167514207e-08, "loss": 0.4729, "step": 2364 }, { "epoch": 0.66, "logps_train/chosen": -54.35980224609375, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -62.448402404785156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5064880847930908, "rewards_train/margins": 0.3297581672668457, "rewards_train/rejected": -1.8362462520599365, "step": 2364 }, { "epoch": 0.66, "logps_train/chosen": -88.7553939819336, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -130.08148193359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.585695743560791, "rewards_train/margins": 3.8771395683288574, "rewards_train/rejected": -5.462835311889648, "step": 2365 }, { "epoch": 0.66, "learning_rate": 1.7720289882128092e-08, "loss": 0.3635, "step": 2366 }, { "epoch": 0.66, "logps_train/chosen": -115.94821166992188, "logps_train/ref_chosen": -96.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -155.0402069091797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.00575852394104, "rewards_train/margins": 3.8826377391815186, "rewards_train/rejected": -5.888396263122559, "step": 2366 }, { "epoch": 0.66, "logps_train/chosen": -30.946998596191406, "logps_train/ref_chosen": -22.125, "logps_train/ref_rejected": -32.25, "logps_train/rejected": -49.723960876464844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8831765055656433, "rewards_train/margins": 0.8765245079994202, "rewards_train/rejected": -1.7597010135650635, "step": 2367 }, { "epoch": 0.66, "learning_rate": 1.749214019535028e-08, "loss": 0.2652, "step": 2368 }, { "epoch": 0.66, "logps_train/chosen": -103.5555419921875, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -128.9398956298828, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.375476837158203, "rewards_train/margins": 2.0962467193603516, "rewards_train/rejected": -4.471723556518555, "step": 2368 }, { "epoch": 0.66, "logps_train/chosen": -69.94168853759766, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -63.5, "logps_train/rejected": -82.79621887207031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5803020000457764, "rewards_train/margins": 1.3461954593658447, "rewards_train/rejected": -1.926497459411621, "step": 2369 }, { "epoch": 0.66, "learning_rate": 1.7265415505467202e-08, "loss": 0.4227, "step": 2370 }, { "epoch": 0.66, "logps_train/chosen": -59.2435302734375, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -57.10908508300781, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -1.37298583984375, "rewards_train/margins": 0.28577423095703125, "rewards_train/rejected": -1.6587600708007812, "step": 2370 }, { "epoch": 0.66, "logps_train/chosen": -48.712791442871094, "logps_train/ref_chosen": -40.0, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -60.24412536621094, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8595608472824097, "rewards_train/margins": 0.6164143085479736, "rewards_train/rejected": -1.4759751558303833, "step": 2371 }, { "epoch": 0.66, "learning_rate": 1.7040117202031774e-08, "loss": 0.5745, "step": 2372 }, { "epoch": 0.66, "logps_train/chosen": -54.472015380859375, "logps_train/ref_chosen": -43.25, "logps_train/ref_rejected": -40.75, "logps_train/rejected": -61.366668701171875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1296234130859375, "rewards_train/margins": 0.9418091773986816, "rewards_train/rejected": -2.071432590484619, "step": 2372 }, { "epoch": 0.66, "logps_train/chosen": -51.196144104003906, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -41.75, "logps_train/rejected": -67.7552719116211, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5069681406021118, "rewards_train/margins": 1.1002975702285767, "rewards_train/rejected": -2.6072657108306885, "step": 2373 }, { "epoch": 0.66, "learning_rate": 1.6816246665854905e-08, "loss": 0.4071, "step": 2374 }, { "epoch": 0.66, "logps_train/chosen": -67.62458801269531, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -59.0, "logps_train/rejected": -78.61567687988281, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8679275512695312, "rewards_train/margins": 1.0901976823806763, "rewards_train/rejected": -1.9581252336502075, "step": 2374 }, { "epoch": 0.66, "logps_train/chosen": -121.6823501586914, "logps_train/ref_chosen": -101.5, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -146.6358184814453, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.001828670501709, "rewards_train/margins": 2.202378749847412, "rewards_train/rejected": -4.204207420349121, "step": 2375 }, { "epoch": 0.66, "learning_rate": 1.6593805268996952e-08, "loss": 0.3113, "step": 2376 }, { "epoch": 0.66, "logps_train/chosen": -39.511253356933594, "logps_train/ref_chosen": -29.875, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -58.38397979736328, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9601094722747803, "rewards_train/margins": 0.8243824243545532, "rewards_train/rejected": -1.7844918966293335, "step": 2376 }, { "epoch": 0.66, "logps_train/chosen": -38.205841064453125, "logps_train/ref_chosen": -30.25, "logps_train/ref_rejected": -37.75, "logps_train/rejected": -58.22001266479492, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8016389608383179, "rewards_train/margins": 1.2359873056411743, "rewards_train/rejected": -2.037626266479492, "step": 2377 }, { "epoch": 0.66, "learning_rate": 1.6372794374759385e-08, "loss": 0.5008, "step": 2378 }, { "epoch": 0.66, "logps_train/chosen": -57.011898040771484, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -72.63736724853516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9588067531585693, "rewards_train/margins": 1.4664533138275146, "rewards_train/rejected": -2.425260066986084, "step": 2378 }, { "epoch": 0.66, "logps_train/chosen": -70.20790100097656, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -102.10855102539062, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9248183965682983, "rewards_train/margins": 2.0233408212661743, "rewards_train/rejected": -2.9481592178344727, "step": 2379 }, { "epoch": 0.67, "learning_rate": 1.615321533767633e-08, "loss": 0.331, "step": 2380 }, { "epoch": 0.67, "logps_train/chosen": -83.17626953125, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -109.589111328125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.453174352645874, "rewards_train/margins": 1.6352295875549316, "rewards_train/rejected": -3.0884039402008057, "step": 2380 }, { "epoch": 0.67, "logps_train/chosen": -83.79940795898438, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -115.67716979980469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8533779382705688, "rewards_train/margins": 3.0080899000167847, "rewards_train/rejected": -4.8614678382873535, "step": 2381 }, { "epoch": 0.67, "learning_rate": 1.5935069503506317e-08, "loss": 0.3274, "step": 2382 }, { "epoch": 0.67, "logps_train/chosen": -64.24397277832031, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -39.5, "logps_train/rejected": -56.80262756347656, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1977365016937256, "rewards_train/margins": 0.5394594669342041, "rewards_train/rejected": -1.7371959686279297, "step": 2382 }, { "epoch": 0.67, "logps_train/chosen": -86.50545501708984, "logps_train/ref_chosen": -59.75, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -95.79205322265625, "rewards_train/accuracies": 0.25, "rewards_train/chosen": -2.671053647994995, "rewards_train/margins": 0.4847145080566406, "rewards_train/rejected": -3.1557681560516357, "step": 2383 }, { "epoch": 0.67, "learning_rate": 1.5718358209224153e-08, "loss": 0.6025, "step": 2384 }, { "epoch": 0.67, "logps_train/chosen": -78.83937072753906, "logps_train/ref_chosen": -60.25, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -109.42338562011719, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8662126064300537, "rewards_train/margins": 1.6802761554718018, "rewards_train/rejected": -3.5464887619018555, "step": 2384 }, { "epoch": 0.67, "logps_train/chosen": -74.46876525878906, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -86.62158203125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.0051772594451904, "rewards_train/margins": 1.1073722839355469, "rewards_train/rejected": -3.1125495433807373, "step": 2385 }, { "epoch": 0.67, "learning_rate": 1.5503082783012546e-08, "loss": 0.3948, "step": 2386 }, { "epoch": 0.67, "logps_train/chosen": -72.53384399414062, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -88.03909301757812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.4690089225769043, "rewards_train/margins": 0.7313849925994873, "rewards_train/rejected": -3.2003939151763916, "step": 2386 }, { "epoch": 0.67, "logps_train/chosen": -74.85980987548828, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -83.0, "logps_train/rejected": -111.21375274658203, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2601996660232544, "rewards_train/margins": 1.556097388267517, "rewards_train/rejected": -2.8162970542907715, "step": 2387 }, { "epoch": 0.67, "learning_rate": 1.5289244544254036e-08, "loss": 0.4011, "step": 2388 }, { "epoch": 0.67, "logps_train/chosen": -71.07477569580078, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -92.66500854492188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3321850299835205, "rewards_train/margins": 1.4595105648040771, "rewards_train/rejected": -2.7916955947875977, "step": 2388 }, { "epoch": 0.67, "logps_train/chosen": -53.74809265136719, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -58.0, "logps_train/rejected": -77.03923797607422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9548389911651611, "rewards_train/margins": 0.9410768747329712, "rewards_train/rejected": -1.8959158658981323, "step": 2389 }, { "epoch": 0.67, "learning_rate": 1.507684480352292e-08, "loss": 0.3496, "step": 2390 }, { "epoch": 0.67, "logps_train/chosen": -39.05058288574219, "logps_train/ref_chosen": -21.0, "logps_train/ref_rejected": -23.25, "logps_train/rejected": -39.60066223144531, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.7998826503753662, "rewards_train/margins": -0.16611015796661377, "rewards_train/rejected": -1.6337724924087524, "step": 2390 }, { "epoch": 0.67, "logps_train/chosen": -91.94862365722656, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -63.25, "logps_train/rejected": -82.77864074707031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6413460969924927, "rewards_train/margins": 0.300189733505249, "rewards_train/rejected": -1.9415358304977417, "step": 2391 }, { "epoch": 0.67, "learning_rate": 1.4865884862577254e-08, "loss": 0.7812, "step": 2392 }, { "epoch": 0.67, "logps_train/chosen": -36.97472381591797, "logps_train/ref_chosen": -27.875, "logps_train/ref_rejected": -35.0, "logps_train/rejected": -52.20393371582031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9113396406173706, "rewards_train/margins": 0.8012415170669556, "rewards_train/rejected": -1.7125811576843262, "step": 2392 }, { "epoch": 0.67, "logps_train/chosen": -51.82833480834961, "logps_train/ref_chosen": -40.75, "logps_train/ref_rejected": -42.0, "logps_train/rejected": -61.646697998046875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1044155359268188, "rewards_train/margins": 0.8557621240615845, "rewards_train/rejected": -1.9601776599884033, "step": 2393 }, { "epoch": 0.67, "learning_rate": 1.4656366014350746e-08, "loss": 0.4348, "step": 2394 }, { "epoch": 0.67, "logps_train/chosen": -95.79158020019531, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -121.02859497070312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.239314556121826, "rewards_train/margins": 2.274287223815918, "rewards_train/rejected": -4.513601779937744, "step": 2394 }, { "epoch": 0.67, "logps_train/chosen": -84.07028198242188, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -124.91248321533203, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7320280075073242, "rewards_train/margins": 2.346329689025879, "rewards_train/rejected": -4.078357696533203, "step": 2395 }, { "epoch": 0.67, "learning_rate": 1.4448289542944997e-08, "loss": 0.3491, "step": 2396 }, { "epoch": 0.67, "logps_train/chosen": -97.93675231933594, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -106.16511535644531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.622971773147583, "rewards_train/margins": 1.857015609741211, "rewards_train/rejected": -3.479987382888794, "step": 2396 }, { "epoch": 0.67, "logps_train/chosen": -45.19966125488281, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -25.125, "logps_train/rejected": -45.684791564941406, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.9754351377487183, "rewards_train/margins": 1.0760518312454224, "rewards_train/rejected": -2.0514869689941406, "step": 2397 }, { "epoch": 0.67, "learning_rate": 1.4241656723621515e-08, "loss": 0.4637, "step": 2398 }, { "epoch": 0.67, "logps_train/chosen": -87.3427734375, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -126.21279907226562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.769433617591858, "rewards_train/margins": 2.011221766471863, "rewards_train/rejected": -3.7806553840637207, "step": 2398 }, { "epoch": 0.67, "logps_train/chosen": -100.74357604980469, "logps_train/ref_chosen": -87.5, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -136.00559997558594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3298254013061523, "rewards_train/margins": 1.7863597869873047, "rewards_train/rejected": -3.116185188293457, "step": 2399 }, { "epoch": 0.67, "learning_rate": 1.4036468822793967e-08, "loss": 0.3085, "step": 2400 }, { "epoch": 0.67, "logps_train/chosen": -66.79975891113281, "logps_train/ref_chosen": -61.25, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -90.70407104492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5506794452667236, "rewards_train/margins": 1.7453136444091797, "rewards_train/rejected": -2.2959930896759033, "step": 2400 }, { "epoch": 0.67, "logps_train/chosen": -83.78105163574219, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -114.36720275878906, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.472440719604492, "rewards_train/margins": 1.049436092376709, "rewards_train/rejected": -3.521876811981201, "step": 2401 }, { "epoch": 0.67, "learning_rate": 1.383272709802033e-08, "loss": 0.3326, "step": 2402 }, { "epoch": 0.67, "logps_train/chosen": -73.37391662597656, "logps_train/ref_chosen": -63.0, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -89.45342254638672, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0397354364395142, "rewards_train/margins": 2.0501378774642944, "rewards_train/rejected": -3.0898733139038086, "step": 2402 }, { "epoch": 0.67, "logps_train/chosen": -69.87894439697266, "logps_train/ref_chosen": -55.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -107.5324935913086, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4334388971328735, "rewards_train/margins": 1.7757552862167358, "rewards_train/rejected": -3.2091941833496094, "step": 2403 }, { "epoch": 0.67, "learning_rate": 1.3630432797995251e-08, "loss": 0.3009, "step": 2404 }, { "epoch": 0.67, "logps_train/chosen": -63.660797119140625, "logps_train/ref_chosen": -53.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -98.97848510742188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.012954831123352, "rewards_train/margins": 1.8061822652816772, "rewards_train/rejected": -2.8191370964050293, "step": 2404 }, { "epoch": 0.67, "logps_train/chosen": -51.920997619628906, "logps_train/ref_chosen": -42.75, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -65.692138671875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9235451817512512, "rewards_train/margins": 1.3056291937828064, "rewards_train/rejected": -2.2291743755340576, "step": 2405 }, { "epoch": 0.67, "learning_rate": 1.34295871625425e-08, "loss": 0.3286, "step": 2406 }, { "epoch": 0.67, "logps_train/chosen": -67.31393432617188, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -55.25, "logps_train/rejected": -83.87126922607422, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8841273784637451, "rewards_train/margins": 0.9834682941436768, "rewards_train/rejected": -2.867595672607422, "step": 2406 }, { "epoch": 0.67, "logps_train/chosen": -101.5591049194336, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -166.26010131835938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.001710891723633, "rewards_train/margins": 3.502619743347168, "rewards_train/rejected": -5.504330635070801, "step": 2407 }, { "epoch": 0.67, "learning_rate": 1.3230191422607063e-08, "loss": 0.2789, "step": 2408 }, { "epoch": 0.67, "logps_train/chosen": -67.17030334472656, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -75.45077514648438, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6024301052093506, "rewards_train/margins": 1.0479693412780762, "rewards_train/rejected": -2.6503994464874268, "step": 2408 }, { "epoch": 0.67, "logps_train/chosen": -59.81527328491211, "logps_train/ref_chosen": -47.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -101.85376739501953, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2873866558074951, "rewards_train/margins": 1.9499430656433105, "rewards_train/rejected": -3.2373297214508057, "step": 2409 }, { "epoch": 0.67, "learning_rate": 1.303224680024792e-08, "loss": 0.4324, "step": 2410 }, { "epoch": 0.67, "logps_train/chosen": -64.89202880859375, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -75.514892578125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9850528240203857, "rewards_train/margins": 1.0158016681671143, "rewards_train/rejected": -3.0008544921875, "step": 2410 }, { "epoch": 0.67, "logps_train/chosen": -103.09857940673828, "logps_train/ref_chosen": -95.5, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -116.10062408447266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7422800064086914, "rewards_train/margins": 2.121102809906006, "rewards_train/rejected": -2.8633828163146973, "step": 2411 }, { "epoch": 0.67, "learning_rate": 1.2835754508630392e-08, "loss": 0.3235, "step": 2412 }, { "epoch": 0.67, "logps_train/chosen": -74.21398162841797, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -96.1976318359375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8448357582092285, "rewards_train/margins": 1.417896032333374, "rewards_train/rejected": -2.2627317905426025, "step": 2412 }, { "epoch": 0.67, "logps_train/chosen": -97.18973541259766, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -138.10935974121094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8095985651016235, "rewards_train/margins": 2.70250928401947, "rewards_train/rejected": -4.512107849121094, "step": 2413 }, { "epoch": 0.67, "learning_rate": 1.2640715752018777e-08, "loss": 0.2717, "step": 2414 }, { "epoch": 0.67, "logps_train/chosen": -58.3160400390625, "logps_train/ref_chosen": -47.25, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -62.888755798339844, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.1085573434829712, "rewards_train/margins": 0.2411586046218872, "rewards_train/rejected": -1.3497159481048584, "step": 2414 }, { "epoch": 0.67, "logps_train/chosen": -86.14068603515625, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -99.3829345703125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3257873058319092, "rewards_train/margins": 1.9703190326690674, "rewards_train/rejected": -3.2961063385009766, "step": 2415 }, { "epoch": 0.68, "learning_rate": 1.2447131725768806e-08, "loss": 0.5444, "step": 2416 }, { "epoch": 0.68, "logps_train/chosen": -90.866455078125, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -91.5, "logps_train/rejected": -126.46876525878906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9405517578125, "rewards_train/margins": 2.5719494819641113, "rewards_train/rejected": -3.5125012397766113, "step": 2416 }, { "epoch": 0.68, "logps_train/chosen": -37.76715087890625, "logps_train/ref_chosen": -22.875, "logps_train/ref_rejected": -35.5, "logps_train/rejected": -61.101043701171875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4909729957580566, "rewards_train/margins": 1.0597565174102783, "rewards_train/rejected": -2.550729513168335, "step": 2417 }, { "epoch": 0.68, "learning_rate": 1.2255003616320592e-08, "loss": 0.3856, "step": 2418 }, { "epoch": 0.68, "logps_train/chosen": -55.25697326660156, "logps_train/ref_chosen": -44.75, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -91.46592712402344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0396625995635986, "rewards_train/margins": 1.8676722049713135, "rewards_train/rejected": -2.907334804534912, "step": 2418 }, { "epoch": 0.68, "logps_train/chosen": -82.95384216308594, "logps_train/ref_chosen": -60.5, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -103.22655487060547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2568106651306152, "rewards_train/margins": 1.695141315460205, "rewards_train/rejected": -3.9519519805908203, "step": 2419 }, { "epoch": 0.68, "learning_rate": 1.2064332601191163e-08, "loss": 0.3177, "step": 2420 }, { "epoch": 0.68, "logps_train/chosen": -78.79591369628906, "logps_train/ref_chosen": -57.25, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -100.02770233154297, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.1458516120910645, "rewards_train/margins": 1.7836523056030273, "rewards_train/rejected": -3.929503917694092, "step": 2420 }, { "epoch": 0.68, "logps_train/chosen": -65.70633697509766, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -56.5, "logps_train/rejected": -70.92654418945312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6659465432167053, "rewards_train/margins": 0.7820789217948914, "rewards_train/rejected": -1.4480254650115967, "step": 2421 }, { "epoch": 0.68, "learning_rate": 1.187511984896719e-08, "loss": 0.4298, "step": 2422 }, { "epoch": 0.68, "logps_train/chosen": -67.92259216308594, "logps_train/ref_chosen": -55.25, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -99.214599609375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2611314058303833, "rewards_train/margins": 2.0294686555862427, "rewards_train/rejected": -3.290600061416626, "step": 2422 }, { "epoch": 0.68, "logps_train/chosen": -51.32395935058594, "logps_train/ref_chosen": -38.5, "logps_train/ref_rejected": -34.25, "logps_train/rejected": -56.72447967529297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2930402755737305, "rewards_train/margins": 0.9630014896392822, "rewards_train/rejected": -2.2560417652130127, "step": 2423 }, { "epoch": 0.68, "learning_rate": 1.1687366519298053e-08, "loss": 0.3201, "step": 2424 }, { "epoch": 0.68, "logps_train/chosen": -87.52438354492188, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -81.0, "logps_train/rejected": -100.53943634033203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.0840210914611816, "rewards_train/margins": 0.8536046743392944, "rewards_train/rejected": -1.937625765800476, "step": 2424 }, { "epoch": 0.68, "logps_train/chosen": -78.11419677734375, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -99.64435577392578, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3555598258972168, "rewards_train/margins": 1.7846570014953613, "rewards_train/rejected": -3.140216827392578, "step": 2425 }, { "epoch": 0.68, "learning_rate": 1.1501073762888559e-08, "loss": 0.3919, "step": 2426 }, { "epoch": 0.68, "logps_train/chosen": -35.351890563964844, "logps_train/ref_chosen": -31.625, "logps_train/ref_rejected": -43.25, "logps_train/rejected": -63.67839813232422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.36946624517440796, "rewards_train/margins": 1.6733737587928772, "rewards_train/rejected": -2.042840003967285, "step": 2426 }, { "epoch": 0.68, "logps_train/chosen": -62.18972396850586, "logps_train/ref_chosen": -51.25, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -71.81778717041016, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.101003646850586, "rewards_train/margins": 1.4165170192718506, "rewards_train/rejected": -2.5175206661224365, "step": 2427 }, { "epoch": 0.68, "learning_rate": 1.1316242721491954e-08, "loss": 0.351, "step": 2428 }, { "epoch": 0.68, "logps_train/chosen": -75.05734252929688, "logps_train/ref_chosen": -59.0, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -101.861083984375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6071021556854248, "rewards_train/margins": 2.1075222492218018, "rewards_train/rejected": -3.7146244049072266, "step": 2428 }, { "epoch": 0.68, "logps_train/chosen": -88.50202941894531, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -110.95804595947266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9464918375015259, "rewards_train/margins": 1.4665004014968872, "rewards_train/rejected": -3.412992238998413, "step": 2429 }, { "epoch": 0.68, "learning_rate": 1.113287452790282e-08, "loss": 0.3863, "step": 2430 }, { "epoch": 0.68, "logps_train/chosen": -90.89009094238281, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -87.5, "logps_train/rejected": -123.1275863647461, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.7041456699371338, "rewards_train/margins": 1.8570501804351807, "rewards_train/rejected": -3.5611958503723145, "step": 2430 }, { "epoch": 0.68, "logps_train/chosen": -96.65910339355469, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -72.0, "logps_train/rejected": -109.24962615966797, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.3877854347229004, "rewards_train/margins": 1.326240062713623, "rewards_train/rejected": -3.7140254974365234, "step": 2431 }, { "epoch": 0.68, "learning_rate": 1.0950970305950352e-08, "loss": 0.351, "step": 2432 }, { "epoch": 0.68, "logps_train/chosen": -60.00078201293945, "logps_train/ref_chosen": -49.25, "logps_train/ref_rejected": -34.75, "logps_train/rejected": -64.7528305053711, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0823049545288086, "rewards_train/margins": 1.9136815071105957, "rewards_train/rejected": -2.9959864616394043, "step": 2432 }, { "epoch": 0.68, "logps_train/chosen": -63.58074951171875, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -80.39212036132812, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.7463560104370117, "rewards_train/margins": 1.0858254432678223, "rewards_train/rejected": -2.832181453704834, "step": 2433 }, { "epoch": 0.68, "learning_rate": 1.0770531170491287e-08, "loss": 0.3667, "step": 2434 }, { "epoch": 0.68, "logps_train/chosen": -91.77403259277344, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -108.3724365234375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6567487716674805, "rewards_train/margins": 2.376051902770996, "rewards_train/rejected": -4.032800674438477, "step": 2434 }, { "epoch": 0.68, "logps_train/chosen": -78.766845703125, "logps_train/ref_chosen": -62.5, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -93.99950408935547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.624145746231079, "rewards_train/margins": 0.8383047580718994, "rewards_train/rejected": -2.4624505043029785, "step": 2435 }, { "epoch": 0.68, "learning_rate": 1.059155822740304e-08, "loss": 0.31, "step": 2436 }, { "epoch": 0.68, "logps_train/chosen": -95.65286254882812, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -92.5, "logps_train/rejected": -130.82110595703125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.56919264793396, "rewards_train/margins": 2.257840633392334, "rewards_train/rejected": -3.827033281326294, "step": 2436 }, { "epoch": 0.68, "logps_train/chosen": -99.4281005859375, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -127.45632934570312, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1505249738693237, "rewards_train/margins": 1.4359287023544312, "rewards_train/rejected": -2.586453676223755, "step": 2437 }, { "epoch": 0.68, "learning_rate": 1.0414052573577136e-08, "loss": 0.3998, "step": 2438 }, { "epoch": 0.68, "logps_train/chosen": -40.05958557128906, "logps_train/ref_chosen": -31.5, "logps_train/ref_rejected": -30.625, "logps_train/rejected": -46.46166229248047, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8520525693893433, "rewards_train/margins": 0.7300509214401245, "rewards_train/rejected": -1.5821034908294678, "step": 2438 }, { "epoch": 0.68, "logps_train/chosen": -96.32131958007812, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -90.0, "logps_train/rejected": -115.99043273925781, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.13877272605896, "rewards_train/margins": 1.467301368713379, "rewards_train/rejected": -2.606074094772339, "step": 2439 }, { "epoch": 0.68, "learning_rate": 1.0238015296912345e-08, "loss": 0.4685, "step": 2440 }, { "epoch": 0.68, "logps_train/chosen": -75.55229187011719, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -109.10525512695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9167520999908447, "rewards_train/margins": 2.6869375705718994, "rewards_train/rejected": -3.603689670562744, "step": 2440 }, { "epoch": 0.68, "logps_train/chosen": -57.32146072387695, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -102.18501281738281, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -0.8899586200714111, "rewards_train/margins": 1.5357694625854492, "rewards_train/rejected": -2.4257280826568604, "step": 2441 }, { "epoch": 0.68, "learning_rate": 1.0063447476307886e-08, "loss": 0.3098, "step": 2442 }, { "epoch": 0.68, "logps_train/chosen": -132.6461181640625, "logps_train/ref_chosen": -93.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -125.29907989501953, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -3.954845666885376, "rewards_train/margins": 0.7719371318817139, "rewards_train/rejected": -4.72678279876709, "step": 2442 }, { "epoch": 0.68, "logps_train/chosen": -67.41500854492188, "logps_train/ref_chosen": -52.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -85.75605773925781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4844694137573242, "rewards_train/margins": 1.384300708770752, "rewards_train/rejected": -2.868770122528076, "step": 2443 }, { "epoch": 0.68, "learning_rate": 9.890350181657126e-09, "loss": 0.4678, "step": 2444 }, { "epoch": 0.68, "logps_train/chosen": -85.39179992675781, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -74.0, "logps_train/rejected": -110.13862609863281, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.985273838043213, "rewards_train/margins": 1.6444087028503418, "rewards_train/rejected": -3.6296825408935547, "step": 2444 }, { "epoch": 0.68, "logps_train/chosen": -107.15191650390625, "logps_train/ref_chosen": -82.0, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -131.0458221435547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.5230047702789307, "rewards_train/margins": 1.701108694076538, "rewards_train/rejected": -4.224113464355469, "step": 2445 }, { "epoch": 0.68, "learning_rate": 9.718724473840784e-09, "loss": 0.3512, "step": 2446 }, { "epoch": 0.68, "logps_train/chosen": -79.40707397460938, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -104.53474426269531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0442235469818115, "rewards_train/margins": 2.2678444385528564, "rewards_train/rejected": -3.312067985534668, "step": 2446 }, { "epoch": 0.68, "logps_train/chosen": -70.65157318115234, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -57.5, "logps_train/rejected": -95.6650161743164, "rewards_train/accuracies": 0.375, "rewards_train/chosen": -2.3841028213500977, "rewards_train/margins": 1.440992832183838, "rewards_train/rejected": -3.8250956535339355, "step": 2447 }, { "epoch": 0.68, "learning_rate": 9.54857140472054e-09, "loss": 0.5246, "step": 2448 }, { "epoch": 0.68, "logps_train/chosen": -113.29756164550781, "logps_train/ref_chosen": -101.5, "logps_train/ref_rejected": -114.0, "logps_train/rejected": -149.53109741210938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.168135166168213, "rewards_train/margins": 2.3838024139404297, "rewards_train/rejected": -3.5519375801086426, "step": 2448 }, { "epoch": 0.68, "logps_train/chosen": -119.50375366210938, "logps_train/ref_chosen": -96.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -125.91334533691406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.349594831466675, "rewards_train/margins": 1.2878334522247314, "rewards_train/rejected": -3.6374282836914062, "step": 2449 }, { "epoch": 0.68, "learning_rate": 9.379892017132463e-09, "loss": 0.321, "step": 2450 }, { "epoch": 0.68, "logps_train/chosen": -59.321170806884766, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -89.38465881347656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1541874408721924, "rewards_train/margins": 1.7940442562103271, "rewards_train/rejected": -2.9482316970825195, "step": 2450 }, { "epoch": 0.69, "logps_train/chosen": -82.37020111083984, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -63.0, "logps_train/rejected": -98.11495971679688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5903408527374268, "rewards_train/margins": 1.909045934677124, "rewards_train/rejected": -3.499386787414551, "step": 2451 }, { "epoch": 0.69, "learning_rate": 9.21268734488076e-09, "loss": 0.3678, "step": 2452 }, { "epoch": 0.69, "logps_train/chosen": -66.78101348876953, "logps_train/ref_chosen": -53.25, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -94.06474304199219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3488047122955322, "rewards_train/margins": 1.8186075687408447, "rewards_train/rejected": -3.167412281036377, "step": 2452 }, { "epoch": 0.69, "logps_train/chosen": -47.781639099121094, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -75.32060241699219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5617574453353882, "rewards_train/margins": 1.2742092609405518, "rewards_train/rejected": -1.83596670627594, "step": 2453 }, { "epoch": 0.69, "learning_rate": 9.046958412731454e-09, "loss": 0.3313, "step": 2454 }, { "epoch": 0.69, "logps_train/chosen": -113.29007720947266, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -80.5, "logps_train/rejected": -108.9620590209961, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -2.6899452209472656, "rewards_train/margins": 0.1687610149383545, "rewards_train/rejected": -2.85870623588562, "step": 2454 }, { "epoch": 0.69, "logps_train/chosen": -74.42170715332031, "logps_train/ref_chosen": -56.0, "logps_train/ref_rejected": -62.0, "logps_train/rejected": -102.59025573730469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.8433425426483154, "rewards_train/margins": 2.214120626449585, "rewards_train/rejected": -4.0574631690979, "step": 2455 }, { "epoch": 0.69, "learning_rate": 8.882706236405885e-09, "loss": 0.4355, "step": 2456 }, { "epoch": 0.69, "logps_train/chosen": -76.48384857177734, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -108.02183532714844, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.20658802986145, "rewards_train/margins": 2.171766996383667, "rewards_train/rejected": -4.378355026245117, "step": 2456 }, { "epoch": 0.69, "logps_train/chosen": -95.93270874023438, "logps_train/ref_chosen": -79.0, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -113.05723571777344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6924901008605957, "rewards_train/margins": 0.9589362144470215, "rewards_train/rejected": -2.651426315307617, "step": 2457 }, { "epoch": 0.69, "learning_rate": 8.719931822574716e-09, "loss": 0.4546, "step": 2458 }, { "epoch": 0.69, "logps_train/chosen": -68.79032897949219, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -36.25, "logps_train/rejected": -66.95207977294922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4576951265335083, "rewards_train/margins": 1.623987078666687, "rewards_train/rejected": -3.0816822052001953, "step": 2458 }, { "epoch": 0.69, "logps_train/chosen": -54.415740966796875, "logps_train/ref_chosen": -41.75, "logps_train/ref_rejected": -41.75, "logps_train/rejected": -59.605167388916016, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2603728771209717, "rewards_train/margins": 0.5190891027450562, "rewards_train/rejected": -1.7794619798660278, "step": 2459 }, { "epoch": 0.69, "learning_rate": 8.558636168851745e-09, "loss": 0.3846, "step": 2460 }, { "epoch": 0.69, "logps_train/chosen": -109.34850311279297, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -116.5679931640625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -3.1129746437072754, "rewards_train/margins": 0.7996840476989746, "rewards_train/rejected": -3.91265869140625, "step": 2460 }, { "epoch": 0.69, "logps_train/chosen": -75.21438598632812, "logps_train/ref_chosen": -61.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -94.56011962890625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4101901054382324, "rewards_train/margins": 1.4834990501403809, "rewards_train/rejected": -2.8936891555786133, "step": 2461 }, { "epoch": 0.69, "learning_rate": 8.39882026378766e-09, "loss": 0.4584, "step": 2462 }, { "epoch": 0.69, "logps_train/chosen": -61.35057830810547, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -44.0, "logps_train/rejected": -69.88313293457031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1280266046524048, "rewards_train/margins": 1.4540365934371948, "rewards_train/rejected": -2.5820631980895996, "step": 2462 }, { "epoch": 0.69, "logps_train/chosen": -76.86917114257812, "logps_train/ref_chosen": -63.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -107.19986724853516, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3388700485229492, "rewards_train/margins": 1.6354131698608398, "rewards_train/rejected": -2.974283218383789, "step": 2463 }, { "epoch": 0.69, "learning_rate": 8.240485086864007e-09, "loss": 0.3694, "step": 2464 }, { "epoch": 0.69, "logps_train/chosen": -63.74079895019531, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -80.04426574707031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5031814575195312, "rewards_train/margins": 0.8016355037689209, "rewards_train/rejected": -2.304816961288452, "step": 2464 }, { "epoch": 0.69, "logps_train/chosen": -76.92068481445312, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -90.85658264160156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2391629219055176, "rewards_train/margins": 1.473839282989502, "rewards_train/rejected": -2.7130022048950195, "step": 2465 }, { "epoch": 0.69, "learning_rate": 8.083631608487268e-09, "loss": 0.482, "step": 2466 }, { "epoch": 0.69, "logps_train/chosen": -52.597015380859375, "logps_train/ref_chosen": -47.25, "logps_train/ref_rejected": -48.5, "logps_train/rejected": -70.30706787109375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5288236141204834, "rewards_train/margins": 1.6440527439117432, "rewards_train/rejected": -2.1728763580322266, "step": 2466 }, { "epoch": 0.69, "logps_train/chosen": -41.66633605957031, "logps_train/ref_chosen": -29.75, "logps_train/ref_rejected": -29.5, "logps_train/rejected": -50.57468032836914, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1877273321151733, "rewards_train/margins": 0.9138811826705933, "rewards_train/rejected": -2.1016085147857666, "step": 2467 }, { "epoch": 0.69, "learning_rate": 7.928260789982932e-09, "loss": 0.4205, "step": 2468 }, { "epoch": 0.69, "logps_train/chosen": -64.90805053710938, "logps_train/ref_chosen": -52.25, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -81.55690002441406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.2579922676086426, "rewards_train/margins": 0.8851981163024902, "rewards_train/rejected": -2.143190383911133, "step": 2468 }, { "epoch": 0.69, "logps_train/chosen": -62.74514389038086, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -104.23786163330078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.254934549331665, "rewards_train/margins": 2.454007863998413, "rewards_train/rejected": -3.708942413330078, "step": 2469 }, { "epoch": 0.69, "learning_rate": 7.774373583589455e-09, "loss": 0.3021, "step": 2470 }, { "epoch": 0.69, "logps_train/chosen": -90.30842590332031, "logps_train/ref_chosen": -74.5, "logps_train/ref_rejected": -52.0, "logps_train/rejected": -69.93529510498047, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.580256700515747, "rewards_train/margins": 0.21385860443115234, "rewards_train/rejected": -1.7941153049468994, "step": 2470 }, { "epoch": 0.69, "logps_train/chosen": -91.61563110351562, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -115.80494689941406, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8113679885864258, "rewards_train/margins": 1.9533071517944336, "rewards_train/rejected": -3.7646751403808594, "step": 2471 }, { "epoch": 0.69, "learning_rate": 7.621970932452536e-09, "loss": 0.6114, "step": 2472 }, { "epoch": 0.69, "logps_train/chosen": -83.10690307617188, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -75.5, "logps_train/rejected": -117.90853881835938, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5819799900054932, "rewards_train/margins": 2.6479365825653076, "rewards_train/rejected": -4.229916572570801, "step": 2472 }, { "epoch": 0.69, "logps_train/chosen": -51.63884353637695, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -74.3192138671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.9325367212295532, "rewards_train/margins": 1.8177446126937866, "rewards_train/rejected": -2.75028133392334, "step": 2473 }, { "epoch": 0.69, "learning_rate": 7.47105377061935e-09, "loss": 0.2614, "step": 2474 }, { "epoch": 0.69, "logps_train/chosen": -80.03575897216797, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -107.07798767089844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.789513111114502, "rewards_train/margins": 2.2671141624450684, "rewards_train/rejected": -4.05662727355957, "step": 2474 }, { "epoch": 0.69, "logps_train/chosen": -33.94361877441406, "logps_train/ref_chosen": -25.5, "logps_train/ref_rejected": -26.625, "logps_train/rejected": -42.953208923339844, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8385021686553955, "rewards_train/margins": 0.7933425903320312, "rewards_train/rejected": -1.6318447589874268, "step": 2475 }, { "epoch": 0.69, "learning_rate": 7.321623023032797e-09, "loss": 0.3948, "step": 2476 }, { "epoch": 0.69, "logps_train/chosen": -101.93995666503906, "logps_train/ref_chosen": -77.5, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -113.46424102783203, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.4521994590759277, "rewards_train/margins": 0.7137560844421387, "rewards_train/rejected": -3.1659555435180664, "step": 2476 }, { "epoch": 0.69, "logps_train/chosen": -117.43596649169922, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -140.4024658203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.3443779945373535, "rewards_train/margins": 2.070868492126465, "rewards_train/rejected": -4.415246486663818, "step": 2477 }, { "epoch": 0.69, "learning_rate": 7.1736796055257285e-09, "loss": 0.3932, "step": 2478 }, { "epoch": 0.69, "logps_train/chosen": -68.93977355957031, "logps_train/ref_chosen": -50.0, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -88.85133361816406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.894759178161621, "rewards_train/margins": 0.8424253463745117, "rewards_train/rejected": -2.737184524536133, "step": 2478 }, { "epoch": 0.69, "logps_train/chosen": -62.757423400878906, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -74.76496887207031, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.0546488761901855, "rewards_train/margins": 0.9101297855377197, "rewards_train/rejected": -2.9647786617279053, "step": 2479 }, { "epoch": 0.69, "learning_rate": 7.027224424815542e-09, "loss": 0.4252, "step": 2480 }, { "epoch": 0.69, "logps_train/chosen": -108.4504165649414, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -133.88917541503906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.838791847229004, "rewards_train/margins": 1.2126259803771973, "rewards_train/rejected": -3.051417827606201, "step": 2480 }, { "epoch": 0.69, "logps_train/chosen": -44.91813278198242, "logps_train/ref_chosen": -32.5, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -69.77384948730469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2348432540893555, "rewards_train/margins": 0.6959223747253418, "rewards_train/rejected": -1.9307656288146973, "step": 2481 }, { "epoch": 0.69, "learning_rate": 6.882258378498457e-09, "loss": 0.4873, "step": 2482 }, { "epoch": 0.69, "logps_train/chosen": -91.01998138427734, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -119.45240783691406, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.745943307876587, "rewards_train/margins": 1.5054504871368408, "rewards_train/rejected": -3.2513937950134277, "step": 2482 }, { "epoch": 0.69, "logps_train/chosen": -61.96266174316406, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -77.428955078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4462659358978271, "rewards_train/margins": 0.9817855358123779, "rewards_train/rejected": -2.428051471710205, "step": 2483 }, { "epoch": 0.69, "learning_rate": 6.738782355044048e-09, "loss": 0.3518, "step": 2484 }, { "epoch": 0.69, "logps_train/chosen": -69.67442321777344, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -45.0, "logps_train/rejected": -63.17634963989258, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.5654404163360596, "rewards_train/margins": 0.2586398124694824, "rewards_train/rejected": -1.824080228805542, "step": 2484 }, { "epoch": 0.69, "logps_train/chosen": -109.82067108154297, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -137.20709228515625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9070677757263184, "rewards_train/margins": 2.1558289527893066, "rewards_train/rejected": -4.062896728515625, "step": 2485 }, { "epoch": 0.69, "learning_rate": 6.596797233789863e-09, "loss": 0.5463, "step": 2486 }, { "epoch": 0.69, "logps_train/chosen": -39.382389068603516, "logps_train/ref_chosen": -33.0, "logps_train/ref_rejected": -40.5, "logps_train/rejected": -63.26960754394531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6440982818603516, "rewards_train/margins": 1.630518913269043, "rewards_train/rejected": -2.2746171951293945, "step": 2486 }, { "epoch": 0.7, "logps_train/chosen": -73.81800842285156, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -113.29178619384766, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1682264804840088, "rewards_train/margins": 2.5187647342681885, "rewards_train/rejected": -3.6869912147521973, "step": 2487 }, { "epoch": 0.7, "learning_rate": 6.45630388493601e-09, "loss": 0.2414, "step": 2488 }, { "epoch": 0.7, "logps_train/chosen": -68.95889282226562, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -68.53378295898438, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.294912815093994, "rewards_train/margins": 0.24713706970214844, "rewards_train/rejected": -2.5420498847961426, "step": 2488 }, { "epoch": 0.7, "logps_train/chosen": -60.429569244384766, "logps_train/ref_chosen": -47.5, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -65.44770050048828, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.300281286239624, "rewards_train/margins": 0.7160708904266357, "rewards_train/rejected": -2.0163521766662598, "step": 2489 }, { "epoch": 0.7, "learning_rate": 6.317303169539739e-09, "loss": 0.5899, "step": 2490 }, { "epoch": 0.7, "logps_train/chosen": -48.39255142211914, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -45.5, "logps_train/rejected": -63.63333511352539, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6046849489212036, "rewards_train/margins": 1.203863501548767, "rewards_train/rejected": -1.8085484504699707, "step": 2490 }, { "epoch": 0.7, "logps_train/chosen": -30.971542358398438, "logps_train/ref_chosen": -25.75, "logps_train/ref_rejected": -20.125, "logps_train/rejected": -32.14308166503906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5250838994979858, "rewards_train/margins": 0.6758455038070679, "rewards_train/rejected": -1.2009294033050537, "step": 2491 }, { "epoch": 0.7, "learning_rate": 6.179795939510263e-09, "loss": 0.4329, "step": 2492 }, { "epoch": 0.7, "logps_train/chosen": -48.52174377441406, "logps_train/ref_chosen": -35.0, "logps_train/ref_rejected": -39.75, "logps_train/rejected": -62.720191955566406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.355494499206543, "rewards_train/margins": 0.9501185417175293, "rewards_train/rejected": -2.3056130409240723, "step": 2492 }, { "epoch": 0.7, "logps_train/chosen": -23.89380645751953, "logps_train/ref_chosen": -19.0, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -57.6026496887207, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.4851815700531006, "rewards_train/margins": 0.8491067886352539, "rewards_train/rejected": -1.3342883586883545, "step": 2493 }, { "epoch": 0.7, "learning_rate": 6.043783037603472e-09, "loss": 0.4688, "step": 2494 }, { "epoch": 0.7, "logps_train/chosen": -34.178165435791016, "logps_train/ref_chosen": -25.0, "logps_train/ref_rejected": -37.25, "logps_train/rejected": -55.25493621826172, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9166446924209595, "rewards_train/margins": 0.8803333044052124, "rewards_train/rejected": -1.7969779968261719, "step": 2494 }, { "epoch": 0.7, "logps_train/chosen": -62.27611541748047, "logps_train/ref_chosen": -50.75, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -92.29428100585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.1438229084014893, "rewards_train/margins": 1.6285741329193115, "rewards_train/rejected": -2.772397041320801, "step": 2495 }, { "epoch": 0.7, "learning_rate": 5.909265297416921e-09, "loss": 0.357, "step": 2496 }, { "epoch": 0.7, "logps_train/chosen": -60.37770080566406, "logps_train/ref_chosen": -49.0, "logps_train/ref_rejected": -51.5, "logps_train/rejected": -80.50286865234375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1416759490966797, "rewards_train/margins": 1.761735200881958, "rewards_train/rejected": -2.9034111499786377, "step": 2496 }, { "epoch": 0.7, "logps_train/chosen": -78.12179565429688, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -136.73385620117188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.4813199043273926, "rewards_train/margins": 3.231128215789795, "rewards_train/rejected": -4.7124481201171875, "step": 2497 }, { "epoch": 0.7, "learning_rate": 5.776243543384435e-09, "loss": 0.3082, "step": 2498 }, { "epoch": 0.7, "logps_train/chosen": -73.89704895019531, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -56.75, "logps_train/rejected": -91.88042449951172, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6803300380706787, "rewards_train/margins": 1.821774959564209, "rewards_train/rejected": -3.5021049976348877, "step": 2498 }, { "epoch": 0.7, "logps_train/chosen": -80.3949203491211, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -116.55427551269531, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.4425199031829834, "rewards_train/margins": 2.521501302719116, "rewards_train/rejected": -3.9640212059020996, "step": 2499 }, { "epoch": 0.7, "learning_rate": 5.644718590771341e-09, "loss": 0.34, "step": 2500 }, { "epoch": 0.7, "logps_train/chosen": -55.54356384277344, "logps_train/ref_chosen": -46.0, "logps_train/ref_rejected": -54.75, "logps_train/rejected": -69.06123352050781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9430282711982727, "rewards_train/margins": 0.4974704384803772, "rewards_train/rejected": -1.44049870967865, "step": 2500 }, { "epoch": 0.7, "logps_train/chosen": -109.88554382324219, "logps_train/ref_chosen": -97.0, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -114.14254760742188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2955855131149292, "rewards_train/margins": 1.2350753545761108, "rewards_train/rejected": -2.53066086769104, "step": 2501 }, { "epoch": 0.7, "learning_rate": 5.514691245669279e-09, "loss": 0.4098, "step": 2502 }, { "epoch": 0.7, "logps_train/chosen": -117.96464538574219, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -127.90826416015625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.4212193489074707, "rewards_train/margins": 1.5367944240570068, "rewards_train/rejected": -3.9580137729644775, "step": 2502 }, { "epoch": 0.7, "logps_train/chosen": -80.95791625976562, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -60.25, "logps_train/rejected": -90.92584228515625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.09735369682312, "rewards_train/margins": 0.9616367816925049, "rewards_train/rejected": -3.058990478515625, "step": 2503 }, { "epoch": 0.7, "learning_rate": 5.386162304991393e-09, "loss": 0.4264, "step": 2504 }, { "epoch": 0.7, "logps_train/chosen": -64.06707763671875, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -90.91767883300781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6518247127532959, "rewards_train/margins": 2.4473655223846436, "rewards_train/rejected": -3.0991902351379395, "step": 2504 }, { "epoch": 0.7, "logps_train/chosen": -114.7740249633789, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -135.31288146972656, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.3707621097564697, "rewards_train/margins": 1.3536896705627441, "rewards_train/rejected": -3.724451780319214, "step": 2505 }, { "epoch": 0.7, "learning_rate": 5.259132556467316e-09, "loss": 0.2807, "step": 2506 }, { "epoch": 0.7, "logps_train/chosen": -73.70887756347656, "logps_train/ref_chosen": -59.5, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -91.44155883789062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4282361268997192, "rewards_train/margins": 1.0232445001602173, "rewards_train/rejected": -2.4514806270599365, "step": 2506 }, { "epoch": 0.7, "logps_train/chosen": -76.83233642578125, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -62.75, "logps_train/rejected": -91.50949096679688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.363507866859436, "rewards_train/margins": 1.5085357427597046, "rewards_train/rejected": -2.8720436096191406, "step": 2507 }, { "epoch": 0.7, "learning_rate": 5.133602778638474e-09, "loss": 0.3832, "step": 2508 }, { "epoch": 0.7, "logps_train/chosen": -67.82963562011719, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -75.24113464355469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1230028867721558, "rewards_train/margins": 1.0438843965530396, "rewards_train/rejected": -2.1668872833251953, "step": 2508 }, { "epoch": 0.7, "logps_train/chosen": -45.889976501464844, "logps_train/ref_chosen": -34.5, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -66.9637222290039, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1408531665802002, "rewards_train/margins": 0.8887221813201904, "rewards_train/rejected": -2.0295753479003906, "step": 2509 }, { "epoch": 0.7, "learning_rate": 5.009573740853313e-09, "loss": 0.4551, "step": 2510 }, { "epoch": 0.7, "logps_train/chosen": -49.59282302856445, "logps_train/ref_chosen": -36.0, "logps_train/ref_rejected": -37.25, "logps_train/rejected": -58.731849670410156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3594775199890137, "rewards_train/margins": 0.7906608581542969, "rewards_train/rejected": -2.1501383781433105, "step": 2510 }, { "epoch": 0.7, "logps_train/chosen": -118.69239807128906, "logps_train/ref_chosen": -95.5, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -143.94659423828125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.3348653316497803, "rewards_train/margins": 1.9566686153411865, "rewards_train/rejected": -4.291533946990967, "step": 2511 }, { "epoch": 0.7, "learning_rate": 4.887046203262357e-09, "loss": 0.4218, "step": 2512 }, { "epoch": 0.7, "logps_train/chosen": -72.07984924316406, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -93.16964721679688, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.031886100769043, "rewards_train/margins": 0.603804349899292, "rewards_train/rejected": -2.635690450668335, "step": 2512 }, { "epoch": 0.7, "logps_train/chosen": -86.69515228271484, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -126.12506103515625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.1351399421691895, "rewards_train/margins": 2.3808813095092773, "rewards_train/rejected": -3.516021251678467, "step": 2513 }, { "epoch": 0.7, "learning_rate": 4.7660209168138285e-09, "loss": 0.4646, "step": 2514 }, { "epoch": 0.7, "logps_train/chosen": -64.74380493164062, "logps_train/ref_chosen": -53.75, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -70.17572021484375, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.0950838327407837, "rewards_train/margins": 1.245388388633728, "rewards_train/rejected": -2.3404722213745117, "step": 2514 }, { "epoch": 0.7, "logps_train/chosen": -87.3644790649414, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -56.0, "logps_train/rejected": -91.03467559814453, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.362717628479004, "rewards_train/margins": 1.1411406993865967, "rewards_train/rejected": -3.5038583278656006, "step": 2515 }, { "epoch": 0.7, "learning_rate": 4.646498623248979e-09, "loss": 0.4612, "step": 2516 }, { "epoch": 0.7, "logps_train/chosen": -99.24951171875, "logps_train/ref_chosen": -83.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -146.4476318359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6050289869308472, "rewards_train/margins": 3.420204281806946, "rewards_train/rejected": -5.025233268737793, "step": 2516 }, { "epoch": 0.7, "logps_train/chosen": -84.79946899414062, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -94.22405242919922, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9858070611953735, "rewards_train/margins": 0.9464620351791382, "rewards_train/rejected": -2.9322690963745117, "step": 2517 }, { "epoch": 0.7, "learning_rate": 4.528480055097372e-09, "loss": 0.257, "step": 2518 }, { "epoch": 0.7, "logps_train/chosen": -89.5019302368164, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -115.20271301269531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.750583291053772, "rewards_train/margins": 2.248594641685486, "rewards_train/rejected": -3.999177932739258, "step": 2518 }, { "epoch": 0.7, "logps_train/chosen": -73.71375274658203, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -100.21600341796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0963748693466187, "rewards_train/margins": 1.8213196992874146, "rewards_train/rejected": -2.917694568634033, "step": 2519 }, { "epoch": 0.7, "learning_rate": 4.41196593567264e-09, "loss": 0.2622, "step": 2520 }, { "epoch": 0.7, "logps_train/chosen": -60.51446533203125, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -50.25, "logps_train/rejected": -73.4498519897461, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9528138637542725, "rewards_train/margins": 1.3773272037506104, "rewards_train/rejected": -2.330141067504883, "step": 2520 }, { "epoch": 0.7, "logps_train/chosen": -65.96441650390625, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -86.78656768798828, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.142828345298767, "rewards_train/margins": 1.0246959924697876, "rewards_train/rejected": -2.1675243377685547, "step": 2521 }, { "epoch": 0.7, "learning_rate": 4.2969569790678454e-09, "loss": 0.3736, "step": 2522 }, { "epoch": 0.7, "logps_train/chosen": -31.64518928527832, "logps_train/ref_chosen": -23.75, "logps_train/ref_rejected": -30.875, "logps_train/rejected": -49.446434020996094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7922532558441162, "rewards_train/margins": 1.0594215393066406, "rewards_train/rejected": -1.8516747951507568, "step": 2522 }, { "epoch": 0.71, "logps_train/chosen": -77.87290954589844, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -58.5, "logps_train/rejected": -92.91309356689453, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9161972999572754, "rewards_train/margins": 1.5172991752624512, "rewards_train/rejected": -3.4334964752197266, "step": 2523 }, { "epoch": 0.71, "learning_rate": 4.183453890151289e-09, "loss": 0.3236, "step": 2524 }, { "epoch": 0.71, "logps_train/chosen": -58.016319274902344, "logps_train/ref_chosen": -47.75, "logps_train/ref_rejected": -52.75, "logps_train/rejected": -74.41314697265625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0327842235565186, "rewards_train/margins": 1.125718355178833, "rewards_train/rejected": -2.1585025787353516, "step": 2524 }, { "epoch": 0.71, "logps_train/chosen": -81.72956085205078, "logps_train/ref_chosen": -62.25, "logps_train/ref_rejected": -53.25, "logps_train/rejected": -89.55940246582031, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.950885534286499, "rewards_train/margins": 1.691382884979248, "rewards_train/rejected": -3.642268419265747, "step": 2525 }, { "epoch": 0.71, "learning_rate": 4.0714573645619645e-09, "loss": 0.4365, "step": 2526 }, { "epoch": 0.71, "logps_train/chosen": -125.89964294433594, "logps_train/ref_chosen": -103.0, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -151.95408630371094, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.2696521282196045, "rewards_train/margins": 2.822632074356079, "rewards_train/rejected": -5.092284202575684, "step": 2526 }, { "epoch": 0.71, "logps_train/chosen": -107.92918395996094, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -84.5, "logps_train/rejected": -119.59950256347656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8476057052612305, "rewards_train/margins": 1.6846108436584473, "rewards_train/rejected": -3.5322165489196777, "step": 2527 }, { "epoch": 0.71, "learning_rate": 3.960968088705524e-09, "loss": 0.3241, "step": 2528 }, { "epoch": 0.71, "logps_train/chosen": -57.016517639160156, "logps_train/ref_chosen": -44.25, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -82.94845581054688, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.274308204650879, "rewards_train/margins": 1.2373342514038086, "rewards_train/rejected": -2.5116424560546875, "step": 2528 }, { "epoch": 0.71, "logps_train/chosen": -105.4737777709961, "logps_train/ref_chosen": -84.5, "logps_train/ref_rejected": -68.0, "logps_train/rejected": -106.50234985351562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.0741353034973145, "rewards_train/margins": 1.7837176322937012, "rewards_train/rejected": -3.8578529357910156, "step": 2529 }, { "epoch": 0.71, "learning_rate": 3.85198673974993e-09, "loss": 0.3638, "step": 2530 }, { "epoch": 0.71, "logps_train/chosen": -82.160888671875, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -50.75, "logps_train/rejected": -73.44884490966797, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.467456340789795, "rewards_train/margins": 0.8141472339630127, "rewards_train/rejected": -2.2816035747528076, "step": 2530 }, { "epoch": 0.71, "logps_train/chosen": -76.90202331542969, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -70.87318420410156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9978537559509277, "rewards_train/margins": 0.7424285411834717, "rewards_train/rejected": -1.7402822971343994, "step": 2531 }, { "epoch": 0.71, "learning_rate": 3.744513985621339e-09, "loss": 0.5711, "step": 2532 }, { "epoch": 0.71, "logps_train/chosen": -91.89335632324219, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -96.53278350830078, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.4848430156707764, "rewards_train/margins": 0.6058864593505859, "rewards_train/rejected": -3.0907294750213623, "step": 2532 }, { "epoch": 0.71, "logps_train/chosen": -48.245994567871094, "logps_train/ref_chosen": -34.5, "logps_train/ref_rejected": -38.5, "logps_train/rejected": -58.76687240600586, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.3714743852615356, "rewards_train/margins": 0.6583379507064819, "rewards_train/rejected": -2.0298123359680176, "step": 2533 }, { "epoch": 0.71, "learning_rate": 3.63855048500003e-09, "loss": 0.5459, "step": 2534 }, { "epoch": 0.71, "logps_train/chosen": -111.29991912841797, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -83.5, "logps_train/rejected": -122.70579528808594, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.4971795082092285, "rewards_train/margins": 1.4253535270690918, "rewards_train/rejected": -3.9225330352783203, "step": 2534 }, { "epoch": 0.71, "logps_train/chosen": -52.40454864501953, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -42.5, "logps_train/rejected": -53.28978729248047, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.6623297929763794, "rewards_train/margins": 0.42866051197052, "rewards_train/rejected": -1.0909903049468994, "step": 2535 }, { "epoch": 0.71, "learning_rate": 3.5340968873163457e-09, "loss": 0.4747, "step": 2536 }, { "epoch": 0.71, "logps_train/chosen": -63.09726333618164, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -99.51697540283203, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1542572975158691, "rewards_train/margins": 2.721268653869629, "rewards_train/rejected": -3.875525951385498, "step": 2536 }, { "epoch": 0.71, "logps_train/chosen": -50.594722747802734, "logps_train/ref_chosen": -34.75, "logps_train/ref_rejected": -45.75, "logps_train/rejected": -69.31079864501953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5809564590454102, "rewards_train/margins": 0.785670280456543, "rewards_train/rejected": -2.366626739501953, "step": 2537 }, { "epoch": 0.71, "learning_rate": 3.4311538327467525e-09, "loss": 0.331, "step": 2538 }, { "epoch": 0.71, "logps_train/chosen": -43.12479782104492, "logps_train/ref_chosen": -39.5, "logps_train/ref_rejected": -41.5, "logps_train/rejected": -55.727699279785156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.3640422523021698, "rewards_train/margins": 1.0585326254367828, "rewards_train/rejected": -1.4225748777389526, "step": 2538 }, { "epoch": 0.71, "logps_train/chosen": -42.5294189453125, "logps_train/ref_chosen": -37.25, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -61.11363220214844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.5201295614242554, "rewards_train/margins": 0.8588117361068726, "rewards_train/rejected": -1.378941297531128, "step": 2539 }, { "epoch": 0.71, "learning_rate": 3.3297219522098185e-09, "loss": 0.4105, "step": 2540 }, { "epoch": 0.71, "logps_train/chosen": -73.4512939453125, "logps_train/ref_chosen": -54.5, "logps_train/ref_rejected": -61.25, "logps_train/rejected": -97.45016479492188, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8943475484848022, "rewards_train/margins": 1.713950753211975, "rewards_train/rejected": -3.6082983016967773, "step": 2540 }, { "epoch": 0.71, "logps_train/chosen": -83.4524154663086, "logps_train/ref_chosen": -65.5, "logps_train/ref_rejected": -85.0, "logps_train/rejected": -112.43692016601562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8120381832122803, "rewards_train/margins": 0.9123179912567139, "rewards_train/rejected": -2.724356174468994, "step": 2541 }, { "epoch": 0.71, "learning_rate": 3.229801867362436e-09, "loss": 0.3953, "step": 2542 }, { "epoch": 0.71, "logps_train/chosen": -100.3590316772461, "logps_train/ref_chosen": -74.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -114.94544982910156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.642935037612915, "rewards_train/margins": 1.7891104221343994, "rewards_train/rejected": -4.4320454597473145, "step": 2542 }, { "epoch": 0.71, "logps_train/chosen": -59.420745849609375, "logps_train/ref_chosen": -46.75, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -76.69956970214844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2723479270935059, "rewards_train/margins": 1.3438987731933594, "rewards_train/rejected": -2.6162467002868652, "step": 2543 }, { "epoch": 0.71, "learning_rate": 3.131394190595993e-09, "loss": 0.3992, "step": 2544 }, { "epoch": 0.71, "logps_train/chosen": -85.40623474121094, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -62.5, "logps_train/rejected": -82.9942398071289, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.5839824676513672, "rewards_train/margins": 0.4677853584289551, "rewards_train/rejected": -2.0517678260803223, "step": 2544 }, { "epoch": 0.71, "logps_train/chosen": -74.05406188964844, "logps_train/ref_chosen": -63.75, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -112.29505920410156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.039976716041565, "rewards_train/margins": 3.261355757713318, "rewards_train/rejected": -4.301332473754883, "step": 2545 }, { "epoch": 0.71, "learning_rate": 3.0344995250326245e-09, "loss": 0.4002, "step": 2546 }, { "epoch": 0.71, "logps_train/chosen": -108.33120727539062, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -115.80293273925781, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -3.568178653717041, "rewards_train/margins": 0.8703188896179199, "rewards_train/rejected": -4.438497543334961, "step": 2546 }, { "epoch": 0.71, "logps_train/chosen": -60.30493927001953, "logps_train/ref_chosen": -48.25, "logps_train/ref_rejected": -47.0, "logps_train/rejected": -74.990966796875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2004163265228271, "rewards_train/margins": 1.5908684730529785, "rewards_train/rejected": -2.7912847995758057, "step": 2547 }, { "epoch": 0.71, "learning_rate": 2.9391184645214952e-09, "loss": 0.4832, "step": 2548 }, { "epoch": 0.71, "logps_train/chosen": -38.85019302368164, "logps_train/ref_chosen": -25.75, "logps_train/ref_rejected": -30.0, "logps_train/rejected": -53.136985778808594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3068944215774536, "rewards_train/margins": 1.00485098361969, "rewards_train/rejected": -2.3117454051971436, "step": 2548 }, { "epoch": 0.71, "logps_train/chosen": -75.02616882324219, "logps_train/ref_chosen": -60.0, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -110.71455383300781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5113331079483032, "rewards_train/margins": 2.076332926750183, "rewards_train/rejected": -3.5876660346984863, "step": 2549 }, { "epoch": 0.71, "learning_rate": 2.845251593635134e-09, "loss": 0.303, "step": 2550 }, { "epoch": 0.71, "logps_train/chosen": -73.92768096923828, "logps_train/ref_chosen": -58.75, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -99.3314208984375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.50937020778656, "rewards_train/margins": 1.98568594455719, "rewards_train/rejected": -3.49505615234375, "step": 2550 }, { "epoch": 0.71, "logps_train/chosen": -80.48007202148438, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -77.5, "logps_train/rejected": -101.70580291748047, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3081636428833008, "rewards_train/margins": 1.1171047687530518, "rewards_train/rejected": -2.4252684116363525, "step": 2551 }, { "epoch": 0.71, "learning_rate": 2.7528994876659663e-09, "loss": 0.4651, "step": 2552 }, { "epoch": 0.71, "logps_train/chosen": -86.15945434570312, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -61.75, "logps_train/rejected": -86.7913818359375, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.3490512371063232, "rewards_train/margins": 1.1609461307525635, "rewards_train/rejected": -2.5099973678588867, "step": 2552 }, { "epoch": 0.71, "logps_train/chosen": -66.86172485351562, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -108.44461822509766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8227444887161255, "rewards_train/margins": 2.2795294523239136, "rewards_train/rejected": -4.102273941040039, "step": 2553 }, { "epoch": 0.71, "learning_rate": 2.6620627126226205e-09, "loss": 0.4658, "step": 2554 }, { "epoch": 0.71, "logps_train/chosen": -114.74082946777344, "logps_train/ref_chosen": -98.0, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -111.30467224121094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6873644590377808, "rewards_train/margins": 1.7747431993484497, "rewards_train/rejected": -3.4621076583862305, "step": 2554 }, { "epoch": 0.71, "logps_train/chosen": -48.35145950317383, "logps_train/ref_chosen": -35.5, "logps_train/ref_rejected": -36.0, "logps_train/rejected": -52.562843322753906, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.280165433883667, "rewards_train/margins": 0.377974271774292, "rewards_train/rejected": -1.658139705657959, "step": 2555 }, { "epoch": 0.71, "learning_rate": 2.5727418252266265e-09, "loss": 0.374, "step": 2556 }, { "epoch": 0.71, "logps_train/chosen": -96.28713989257812, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -84.0, "logps_train/rejected": -128.15771484375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4005889892578125, "rewards_train/margins": 3.024655342102051, "rewards_train/rejected": -4.425244331359863, "step": 2556 }, { "epoch": 0.71, "logps_train/chosen": -94.68815612792969, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -147.94271850585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4899098873138428, "rewards_train/margins": 3.2996747493743896, "rewards_train/rejected": -4.789584636688232, "step": 2557 }, { "epoch": 0.71, "learning_rate": 2.484937372908835e-09, "loss": 0.1877, "step": 2558 }, { "epoch": 0.71, "logps_train/chosen": -99.77741241455078, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -87.0, "logps_train/rejected": -127.62248229980469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7371158599853516, "rewards_train/margins": 2.3470077514648438, "rewards_train/rejected": -4.084123611450195, "step": 2558 }, { "epoch": 0.72, "logps_train/chosen": -50.96588897705078, "logps_train/ref_chosen": -44.5, "logps_train/ref_rejected": -52.25, "logps_train/rejected": -74.98343658447266, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6544015407562256, "rewards_train/margins": 1.6210906505584717, "rewards_train/rejected": -2.2754921913146973, "step": 2559 }, { "epoch": 0.72, "learning_rate": 2.3986498938062537e-09, "loss": 0.2611, "step": 2560 }, { "epoch": 0.72, "logps_train/chosen": -53.0804557800293, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -37.5, "logps_train/rejected": -53.82887268066406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7264049053192139, "rewards_train/margins": 0.9108771085739136, "rewards_train/rejected": -1.6372820138931274, "step": 2560 }, { "epoch": 0.72, "logps_train/chosen": -62.467803955078125, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -82.0, "logps_train/rejected": -114.39625549316406, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.7627962827682495, "rewards_train/margins": 2.4947978258132935, "rewards_train/rejected": -3.257594108581543, "step": 2561 }, { "epoch": 0.72, "learning_rate": 2.3138799167585488e-09, "loss": 0.3925, "step": 2562 }, { "epoch": 0.72, "logps_train/chosen": -84.17210388183594, "logps_train/ref_chosen": -64.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -107.87734985351562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.0143778324127197, "rewards_train/margins": 1.4485528469085693, "rewards_train/rejected": -3.462930679321289, "step": 2562 }, { "epoch": 0.72, "logps_train/chosen": -75.2925033569336, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -93.16082763671875, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.767727255821228, "rewards_train/margins": 0.875503659248352, "rewards_train/rejected": -2.64323091506958, "step": 2563 }, { "epoch": 0.72, "learning_rate": 2.2306279613049926e-09, "loss": 0.4548, "step": 2564 }, { "epoch": 0.72, "logps_train/chosen": -77.18624877929688, "logps_train/ref_chosen": -58.25, "logps_train/ref_rejected": -42.0, "logps_train/rejected": -69.58381652832031, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.8937714099884033, "rewards_train/margins": 0.8580672740936279, "rewards_train/rejected": -2.7518386840820312, "step": 2564 }, { "epoch": 0.72, "logps_train/chosen": -85.76945495605469, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -85.5, "logps_train/rejected": -120.61970520019531, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4031178951263428, "rewards_train/margins": 2.1205708980560303, "rewards_train/rejected": -3.523688793182373, "step": 2565 }, { "epoch": 0.72, "learning_rate": 2.1488945376810785e-09, "loss": 0.5526, "step": 2566 }, { "epoch": 0.72, "logps_train/chosen": -117.46063232421875, "logps_train/ref_chosen": -94.5, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -142.2309112548828, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.2866880893707275, "rewards_train/margins": 2.3489034175872803, "rewards_train/rejected": -4.635591506958008, "step": 2566 }, { "epoch": 0.72, "logps_train/chosen": -102.19227600097656, "logps_train/ref_chosen": -86.0, "logps_train/ref_rejected": -73.0, "logps_train/rejected": -99.00234985351562, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.6325082778930664, "rewards_train/margins": 0.9575705528259277, "rewards_train/rejected": -2.590078830718994, "step": 2567 }, { "epoch": 0.72, "learning_rate": 2.06868014681566e-09, "loss": 0.4992, "step": 2568 }, { "epoch": 0.72, "logps_train/chosen": -47.828514099121094, "logps_train/ref_chosen": -36.25, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -68.7930679321289, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1507221460342407, "rewards_train/margins": 0.863301157951355, "rewards_train/rejected": -2.0140233039855957, "step": 2568 }, { "epoch": 0.72, "logps_train/chosen": -83.5516357421875, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -135.83253479003906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2588741779327393, "rewards_train/margins": 3.0040667057037354, "rewards_train/rejected": -4.262940883636475, "step": 2569 }, { "epoch": 0.72, "learning_rate": 1.989985280327566e-09, "loss": 0.3847, "step": 2570 }, { "epoch": 0.72, "logps_train/chosen": -80.92173767089844, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -67.0, "logps_train/rejected": -108.75089263916016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7683460712432861, "rewards_train/margins": 2.3823297023773193, "rewards_train/rejected": -4.1506757736206055, "step": 2570 }, { "epoch": 0.72, "logps_train/chosen": -97.18829345703125, "logps_train/ref_chosen": -84.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -94.86675262451172, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.2614080905914307, "rewards_train/margins": 1.3947980403900146, "rewards_train/rejected": -2.6562061309814453, "step": 2571 }, { "epoch": 0.72, "learning_rate": 1.9128104205228534e-09, "loss": 0.275, "step": 2572 }, { "epoch": 0.72, "logps_train/chosen": -36.382652282714844, "logps_train/ref_chosen": -33.0, "logps_train/ref_rejected": -43.5, "logps_train/rejected": -65.79957580566406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.33304083347320557, "rewards_train/margins": 1.8988693952560425, "rewards_train/rejected": -2.231910228729248, "step": 2572 }, { "epoch": 0.72, "logps_train/chosen": -65.64483642578125, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -99.76315307617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.298858642578125, "rewards_train/margins": 1.9219884872436523, "rewards_train/rejected": -3.2208471298217773, "step": 2573 }, { "epoch": 0.72, "learning_rate": 1.8371560403916963e-09, "loss": 0.2957, "step": 2574 }, { "epoch": 0.72, "logps_train/chosen": -61.229835510253906, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -104.45869445800781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8568215370178223, "rewards_train/margins": 2.9730324745178223, "rewards_train/rejected": -3.8298540115356445, "step": 2574 }, { "epoch": 0.72, "logps_train/chosen": -77.876953125, "logps_train/ref_chosen": -54.25, "logps_train/ref_rejected": -50.5, "logps_train/rejected": -81.69902038574219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.372558832168579, "rewards_train/margins": 0.7455852031707764, "rewards_train/rejected": -3.1181440353393555, "step": 2575 }, { "epoch": 0.72, "learning_rate": 1.7630226036055295e-09, "loss": 0.482, "step": 2576 }, { "epoch": 0.72, "logps_train/chosen": -107.43798828125, "logps_train/ref_chosen": -76.5, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -166.614501953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -3.076221466064453, "rewards_train/margins": 3.1914777755737305, "rewards_train/rejected": -6.267699241638184, "step": 2576 }, { "epoch": 0.72, "logps_train/chosen": -55.397789001464844, "logps_train/ref_chosen": -45.75, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -82.51815032958984, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9651696681976318, "rewards_train/margins": 1.2850830554962158, "rewards_train/rejected": -2.2502527236938477, "step": 2577 }, { "epoch": 0.72, "learning_rate": 1.690410564514244e-09, "loss": 0.2142, "step": 2578 }, { "epoch": 0.72, "logps_train/chosen": -84.96260070800781, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -61.5, "logps_train/rejected": -91.90412902832031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.000166893005371, "rewards_train/margins": 2.0347776412963867, "rewards_train/rejected": -3.034944534301758, "step": 2578 }, { "epoch": 0.72, "logps_train/chosen": -43.4246826171875, "logps_train/ref_chosen": -31.5, "logps_train/ref_rejected": -51.25, "logps_train/rejected": -82.26408386230469, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.188562273979187, "rewards_train/margins": 1.9011269807815552, "rewards_train/rejected": -3.089689254760742, "step": 2579 }, { "epoch": 0.72, "learning_rate": 1.6193203681433842e-09, "loss": 0.2988, "step": 2580 }, { "epoch": 0.72, "logps_train/chosen": -62.937217712402344, "logps_train/ref_chosen": -54.0, "logps_train/ref_rejected": -53.5, "logps_train/rejected": -72.88941192626953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.8898157477378845, "rewards_train/margins": 1.0434615015983582, "rewards_train/rejected": -1.9332772493362427, "step": 2580 }, { "epoch": 0.72, "logps_train/chosen": -108.91561126708984, "logps_train/ref_chosen": -86.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -117.49498748779297, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.241560935974121, "rewards_train/margins": 1.604032039642334, "rewards_train/rejected": -3.845592975616455, "step": 2581 }, { "epoch": 0.72, "learning_rate": 1.5497524501913162e-09, "loss": 0.3374, "step": 2582 }, { "epoch": 0.72, "logps_train/chosen": -23.017427444458008, "logps_train/ref_chosen": -18.5, "logps_train/ref_rejected": -20.0, "logps_train/rejected": -32.56489181518555, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.44817832112312317, "rewards_train/margins": 0.8055765330791473, "rewards_train/rejected": -1.2537548542022705, "step": 2582 }, { "epoch": 0.72, "logps_train/chosen": -83.52470397949219, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -65.5, "logps_train/rejected": -116.595458984375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.1735641956329346, "rewards_train/margins": 2.918989896774292, "rewards_train/rejected": -5.092554092407227, "step": 2583 }, { "epoch": 0.72, "learning_rate": 1.481707237026758e-09, "loss": 0.3747, "step": 2584 }, { "epoch": 0.72, "logps_train/chosen": -42.2349853515625, "logps_train/ref_chosen": -36.0, "logps_train/ref_rejected": -28.375, "logps_train/rejected": -49.74072265625, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6160765886306763, "rewards_train/margins": 1.5263549089431763, "rewards_train/rejected": -2.1424314975738525, "step": 2584 }, { "epoch": 0.72, "logps_train/chosen": -92.0221939086914, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -98.5, "logps_train/rejected": -142.62228393554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.937375783920288, "rewards_train/margins": 2.4701645374298096, "rewards_train/rejected": -4.407540321350098, "step": 2585 }, { "epoch": 0.72, "learning_rate": 1.4151851456859765e-09, "loss": 0.2493, "step": 2586 }, { "epoch": 0.72, "logps_train/chosen": -74.80470275878906, "logps_train/ref_chosen": -64.5, "logps_train/ref_rejected": -77.0, "logps_train/rejected": -100.12959289550781, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.05058753490448, "rewards_train/margins": 1.2408875226974487, "rewards_train/rejected": -2.2914750576019287, "step": 2586 }, { "epoch": 0.72, "logps_train/chosen": -113.98797607421875, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -144.82891845703125, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.9788753986358643, "rewards_train/margins": 1.0831177234649658, "rewards_train/rejected": -4.06199312210083, "step": 2587 }, { "epoch": 0.72, "learning_rate": 1.3501865838703718e-09, "loss": 0.4007, "step": 2588 }, { "epoch": 0.72, "logps_train/chosen": -113.89677429199219, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -89.0, "logps_train/rejected": -132.39170837402344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2963175773620605, "rewards_train/margins": 2.0303540229797363, "rewards_train/rejected": -4.326671600341797, "step": 2588 }, { "epoch": 0.72, "logps_train/chosen": -152.4592742919922, "logps_train/ref_chosen": -124.0, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -149.7528839111328, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -2.8599905967712402, "rewards_train/margins": 1.2711572647094727, "rewards_train/rejected": -4.131147861480713, "step": 2589 }, { "epoch": 0.72, "learning_rate": 1.2867119499438973e-09, "loss": 0.4554, "step": 2590 }, { "epoch": 0.72, "logps_train/chosen": -81.25377655029297, "logps_train/ref_chosen": -62.0, "logps_train/ref_rejected": -63.75, "logps_train/rejected": -94.271728515625, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9240100383758545, "rewards_train/margins": 1.115809440612793, "rewards_train/rejected": -3.0398194789886475, "step": 2590 }, { "epoch": 0.72, "logps_train/chosen": -71.81185150146484, "logps_train/ref_chosen": -57.0, "logps_train/ref_rejected": -66.0, "logps_train/rejected": -103.9495849609375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.4714198112487793, "rewards_train/margins": 2.3200230598449707, "rewards_train/rejected": -3.79144287109375, "step": 2591 }, { "epoch": 0.72, "learning_rate": 1.2247616329305876e-09, "loss": 0.4398, "step": 2592 }, { "epoch": 0.72, "logps_train/chosen": -97.08218383789062, "logps_train/ref_chosen": -79.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -116.63507080078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.7507970333099365, "rewards_train/margins": 2.0326321125030518, "rewards_train/rejected": -3.7834291458129883, "step": 2592 }, { "epoch": 0.72, "logps_train/chosen": -53.58721923828125, "logps_train/ref_chosen": -48.75, "logps_train/ref_rejected": -49.5, "logps_train/rejected": -64.55078125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.49036306142807007, "rewards_train/margins": 1.0194998383522034, "rewards_train/rejected": -1.5098628997802734, "step": 2593 }, { "epoch": 0.72, "learning_rate": 1.1643360125123125e-09, "loss": 0.3161, "step": 2594 }, { "epoch": 0.72, "logps_train/chosen": -42.81266403198242, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -49.0, "logps_train/rejected": -78.10739135742188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.4896649122238159, "rewards_train/margins": 2.4249807596206665, "rewards_train/rejected": -2.9146456718444824, "step": 2594 }, { "epoch": 0.73, "logps_train/chosen": -124.36282348632812, "logps_train/ref_chosen": -94.5, "logps_train/ref_rejected": -131.0, "logps_train/rejected": -187.78466796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.990774631500244, "rewards_train/margins": 2.725193500518799, "rewards_train/rejected": -5.715968132019043, "step": 2595 }, { "epoch": 0.73, "learning_rate": 1.105435459026305e-09, "loss": 0.2363, "step": 2596 }, { "epoch": 0.73, "logps_train/chosen": -79.27178955078125, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -70.0, "logps_train/rejected": -101.41029357910156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2160460948944092, "rewards_train/margins": 1.918341875076294, "rewards_train/rejected": -3.134387969970703, "step": 2596 }, { "epoch": 0.73, "logps_train/chosen": -83.04338073730469, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -75.0, "logps_train/rejected": -110.1080322265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.6672286987304688, "rewards_train/margins": 1.820918083190918, "rewards_train/rejected": -3.4881467819213867, "step": 2597 }, { "epoch": 0.73, "learning_rate": 1.048060333462969e-09, "loss": 0.2407, "step": 2598 }, { "epoch": 0.73, "logps_train/chosen": -65.78709411621094, "logps_train/ref_chosen": -47.25, "logps_train/ref_rejected": -26.375, "logps_train/rejected": -50.349822998046875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.8619128465652466, "rewards_train/margins": 0.5308820009231567, "rewards_train/rejected": -2.3927948474884033, "step": 2598 }, { "epoch": 0.73, "logps_train/chosen": -85.44850158691406, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -90.5, "logps_train/rejected": -138.4266357421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.5466082096099854, "rewards_train/margins": 3.2335546016693115, "rewards_train/rejected": -4.780162811279297, "step": 2599 }, { "epoch": 0.73, "learning_rate": 9.922109874636875e-10, "loss": 0.3591, "step": 2600 }, { "epoch": 0.73, "logps_train/chosen": -64.54884338378906, "logps_train/ref_chosen": -51.0, "logps_train/ref_rejected": -52.5, "logps_train/rejected": -72.33523559570312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.3623065948486328, "rewards_train/margins": 0.6255141496658325, "rewards_train/rejected": -1.9878207445144653, "step": 2600 }, { "epoch": 0.73, "logps_train/chosen": -90.21068572998047, "logps_train/ref_chosen": -78.0, "logps_train/ref_rejected": -69.5, "logps_train/rejected": -96.4468994140625, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.2124139070510864, "rewards_train/margins": 1.480554461479187, "rewards_train/rejected": -2.6929683685302734, "step": 2601 }, { "epoch": 0.73, "learning_rate": 9.378877633185178e-10, "loss": 0.4334, "step": 2602 }, { "epoch": 0.73, "logps_train/chosen": -106.14705657958984, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -79.5, "logps_train/rejected": -113.75762176513672, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.2240805625915527, "rewards_train/margins": 1.2075417041778564, "rewards_train/rejected": -3.431622266769409, "step": 2602 }, { "epoch": 0.73, "logps_train/chosen": -91.43550872802734, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -110.93279266357422, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.2091755867004395, "rewards_train/margins": 2.1323466300964355, "rewards_train/rejected": -4.341522216796875, "step": 2603 }, { "epoch": 0.73, "learning_rate": 8.85090993964277e-10, "loss": 0.3414, "step": 2604 }, { "epoch": 0.73, "logps_train/chosen": -54.87451934814453, "logps_train/ref_chosen": -42.25, "logps_train/ref_rejected": -38.5, "logps_train/rejected": -53.47871780395508, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.2569833993911743, "rewards_train/margins": 0.2319040298461914, "rewards_train/rejected": -1.4888874292373657, "step": 2604 }, { "epoch": 0.73, "logps_train/chosen": -89.64334106445312, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -99.81838989257812, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.39353346824646, "rewards_train/margins": 0.425805926322937, "rewards_train/rejected": -1.819339394569397, "step": 2605 }, { "epoch": 0.73, "learning_rate": 8.338210029824877e-10, "loss": 0.6774, "step": 2606 }, { "epoch": 0.73, "logps_train/chosen": -103.01151275634766, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -97.0, "logps_train/rejected": -158.04318237304688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2964632511138916, "rewards_train/margins": 3.812737226486206, "rewards_train/rejected": -6.109200477600098, "step": 2606 }, { "epoch": 0.73, "logps_train/chosen": -65.50096893310547, "logps_train/ref_chosen": -50.25, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -94.0868148803711, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5175776481628418, "rewards_train/margins": 1.2215728759765625, "rewards_train/rejected": -2.7391505241394043, "step": 2607 }, { "epoch": 0.73, "learning_rate": 7.840781045972134e-10, "loss": 0.2404, "step": 2608 }, { "epoch": 0.73, "logps_train/chosen": -44.519287109375, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -57.75, "logps_train/rejected": -79.1734848022461, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6569089293479919, "rewards_train/margins": 1.4751859307289124, "rewards_train/rejected": -2.1320948600769043, "step": 2608 }, { "epoch": 0.73, "logps_train/chosen": -98.72407531738281, "logps_train/ref_chosen": -75.5, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -102.75523376464844, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.331782817840576, "rewards_train/margins": 1.4964756965637207, "rewards_train/rejected": -3.828258514404297, "step": 2609 }, { "epoch": 0.73, "learning_rate": 7.358626036733373e-10, "loss": 0.3815, "step": 2610 }, { "epoch": 0.73, "logps_train/chosen": -46.78388977050781, "logps_train/ref_chosen": -41.25, "logps_train/ref_rejected": -34.0, "logps_train/rejected": -59.62596893310547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.5613968372344971, "rewards_train/margins": 2.0004189014434814, "rewards_train/rejected": -2.5618157386779785, "step": 2610 }, { "epoch": 0.73, "logps_train/chosen": -64.04884338378906, "logps_train/ref_chosen": -52.75, "logps_train/ref_rejected": -58.25, "logps_train/rejected": -72.33780670166016, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.1375988721847534, "rewards_train/margins": 0.2727445363998413, "rewards_train/rejected": -1.4103434085845947, "step": 2611 }, { "epoch": 0.73, "learning_rate": 6.891747957145921e-10, "loss": 0.4822, "step": 2612 }, { "epoch": 0.73, "logps_train/chosen": -90.50855255126953, "logps_train/ref_chosen": -62.75, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -104.34603881835938, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.787964344024658, "rewards_train/margins": 1.141561508178711, "rewards_train/rejected": -3.929525852203369, "step": 2612 }, { "epoch": 0.73, "logps_train/chosen": -45.346405029296875, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -47.25, "logps_train/rejected": -70.21902465820312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.602023184299469, "rewards_train/margins": 1.6985411047935486, "rewards_train/rejected": -2.3005642890930176, "step": 2613 }, { "epoch": 0.73, "learning_rate": 6.440149668617556e-10, "loss": 0.379, "step": 2614 }, { "epoch": 0.73, "logps_train/chosen": -32.510215759277344, "logps_train/ref_chosen": -26.5, "logps_train/ref_rejected": -24.0, "logps_train/rejected": -35.33387756347656, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.6028770208358765, "rewards_train/margins": 0.5250420570373535, "rewards_train/rejected": -1.12791907787323, "step": 2614 }, { "epoch": 0.73, "logps_train/chosen": -62.134742736816406, "logps_train/ref_chosen": -51.5, "logps_train/ref_rejected": -49.75, "logps_train/rejected": -70.88180541992188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.052536964416504, "rewards_train/margins": 1.064549446105957, "rewards_train/rejected": -2.117086410522461, "step": 2615 }, { "epoch": 0.73, "learning_rate": 6.003833938908742e-10, "loss": 0.4904, "step": 2616 }, { "epoch": 0.73, "logps_train/chosen": -72.50592041015625, "logps_train/ref_chosen": -57.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -96.1620864868164, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4924870729446411, "rewards_train/margins": 2.4229406118392944, "rewards_train/rejected": -3.9154276847839355, "step": 2616 }, { "epoch": 0.73, "logps_train/chosen": -80.89410400390625, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -94.4085922241211, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.099176287651062, "rewards_train/margins": 1.5959798097610474, "rewards_train/rejected": -2.6951560974121094, "step": 2617 }, { "epoch": 0.73, "learning_rate": 5.58280344211709e-10, "loss": 0.2502, "step": 2618 }, { "epoch": 0.73, "logps_train/chosen": -41.728641510009766, "logps_train/ref_chosen": -31.75, "logps_train/ref_rejected": -28.625, "logps_train/rejected": -42.172752380371094, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.000537633895874, "rewards_train/margins": 0.3592545986175537, "rewards_train/rejected": -1.3597922325134277, "step": 2618 }, { "epoch": 0.73, "logps_train/chosen": -122.71369171142578, "logps_train/ref_chosen": -98.0, "logps_train/ref_rejected": -91.0, "logps_train/rejected": -139.3235321044922, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.4494946002960205, "rewards_train/margins": 2.359421491622925, "rewards_train/rejected": -4.808916091918945, "step": 2619 }, { "epoch": 0.73, "learning_rate": 5.177060758659036e-10, "loss": 0.3948, "step": 2620 }, { "epoch": 0.73, "logps_train/chosen": -64.05299377441406, "logps_train/ref_chosen": -51.75, "logps_train/ref_rejected": -44.25, "logps_train/rejected": -58.91072082519531, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2408467531204224, "rewards_train/margins": 0.2281550168991089, "rewards_train/rejected": -1.4690017700195312, "step": 2620 }, { "epoch": 0.73, "logps_train/chosen": -133.38526916503906, "logps_train/ref_chosen": -94.0, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -154.12063598632812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -3.9533708095550537, "rewards_train/margins": 1.56553053855896, "rewards_train/rejected": -5.518901348114014, "step": 2621 }, { "epoch": 0.73, "learning_rate": 4.786608375254853e-10, "loss": 0.5219, "step": 2622 }, { "epoch": 0.73, "logps_train/chosen": -73.6378173828125, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -60.5, "logps_train/rejected": -84.26182556152344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.5983517169952393, "rewards_train/margins": 0.7883777618408203, "rewards_train/rejected": -2.3867294788360596, "step": 2622 }, { "epoch": 0.73, "logps_train/chosen": -68.96737670898438, "logps_train/ref_chosen": -57.75, "logps_train/ref_rejected": -50.0, "logps_train/rejected": -71.81686401367188, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.123691439628601, "rewards_train/margins": 1.0577021837234497, "rewards_train/rejected": -2.181393623352051, "step": 2623 }, { "epoch": 0.73, "learning_rate": 4.4114486849136656e-10, "loss": 0.4835, "step": 2624 }, { "epoch": 0.73, "logps_train/chosen": -106.56568908691406, "logps_train/ref_chosen": -77.0, "logps_train/ref_rejected": -86.5, "logps_train/rejected": -135.99807739257812, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.9518814086914062, "rewards_train/margins": 2.0166759490966797, "rewards_train/rejected": -4.968557357788086, "step": 2624 }, { "epoch": 0.73, "logps_train/chosen": -73.24900817871094, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -57.0, "logps_train/rejected": -87.77178955078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.4643534421920776, "rewards_train/margins": 1.6003261804580688, "rewards_train/rejected": -3.0646796226501465, "step": 2625 }, { "epoch": 0.73, "learning_rate": 4.0515839869190163e-10, "loss": 0.3318, "step": 2626 }, { "epoch": 0.73, "logps_train/chosen": -89.29573822021484, "logps_train/ref_chosen": -61.75, "logps_train/ref_rejected": -64.5, "logps_train/rejected": -104.28428649902344, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.748079538345337, "rewards_train/margins": 1.2377712726593018, "rewards_train/rejected": -3.9858508110046387, "step": 2626 }, { "epoch": 0.73, "logps_train/chosen": -45.72940444946289, "logps_train/ref_chosen": -39.0, "logps_train/ref_rejected": -35.25, "logps_train/rejected": -58.500343322753906, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.6848545074462891, "rewards_train/margins": 1.64369535446167, "rewards_train/rejected": -2.328549861907959, "step": 2627 }, { "epoch": 0.73, "learning_rate": 3.707016486813597e-10, "loss": 0.3812, "step": 2628 }, { "epoch": 0.73, "logps_train/chosen": -37.667354583740234, "logps_train/ref_chosen": -35.25, "logps_train/ref_rejected": -23.375, "logps_train/rejected": -31.715017318725586, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.252868115901947, "rewards_train/margins": 0.5866023898124695, "rewards_train/rejected": -0.8394705057144165, "step": 2628 }, { "epoch": 0.73, "logps_train/chosen": -97.2012939453125, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -68.5, "logps_train/rejected": -104.85595703125, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.655089855194092, "rewards_train/margins": 0.9578502178192139, "rewards_train/rejected": -3.6129400730133057, "step": 2629 }, { "epoch": 0.74, "learning_rate": 3.377748296386762e-10, "loss": 0.5732, "step": 2630 }, { "epoch": 0.74, "logps_train/chosen": -97.13841247558594, "logps_train/ref_chosen": -80.0, "logps_train/ref_rejected": -97.5, "logps_train/rejected": -149.4308624267578, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.711400032043457, "rewards_train/margins": 3.483640193939209, "rewards_train/rejected": -5.195040225982666, "step": 2630 }, { "epoch": 0.74, "logps_train/chosen": -100.67832946777344, "logps_train/ref_chosen": -84.0, "logps_train/ref_rejected": -67.5, "logps_train/rejected": -97.8401107788086, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6525983810424805, "rewards_train/margins": 1.3847332000732422, "rewards_train/rejected": -3.0373315811157227, "step": 2631 }, { "epoch": 0.74, "learning_rate": 3.0637814336612053e-10, "loss": 0.2806, "step": 2632 }, { "epoch": 0.74, "logps_train/chosen": -78.55406951904297, "logps_train/ref_chosen": -58.5, "logps_train/ref_rejected": -60.75, "logps_train/rejected": -100.8720932006836, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.9934930801391602, "rewards_train/margins": 2.019594669342041, "rewards_train/rejected": -4.013087749481201, "step": 2632 }, { "epoch": 0.74, "logps_train/chosen": -49.850643157958984, "logps_train/ref_chosen": -35.75, "logps_train/ref_rejected": -26.5, "logps_train/rejected": -49.26601028442383, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.39939546585083, "rewards_train/margins": 0.8737633228302002, "rewards_train/rejected": -2.2731587886810303, "step": 2633 }, { "epoch": 0.74, "learning_rate": 2.765117822880747e-10, "loss": 0.4619, "step": 2634 }, { "epoch": 0.74, "logps_train/chosen": -111.60806274414062, "logps_train/ref_chosen": -89.5, "logps_train/ref_rejected": -76.0, "logps_train/rejected": -110.72080993652344, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.213834285736084, "rewards_train/margins": 1.2634224891662598, "rewards_train/rejected": -3.4772567749023438, "step": 2634 }, { "epoch": 0.74, "logps_train/chosen": -85.77435302734375, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -79.0, "logps_train/rejected": -104.79190063476562, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.9071223735809326, "rewards_train/margins": 0.6876928806304932, "rewards_train/rejected": -2.594815254211426, "step": 2635 }, { "epoch": 0.74, "learning_rate": 2.481759294498398e-10, "loss": 0.6999, "step": 2636 }, { "epoch": 0.74, "logps_train/chosen": -119.15330505371094, "logps_train/ref_chosen": -90.0, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -146.40960693359375, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.920018196105957, "rewards_train/margins": 1.6740684509277344, "rewards_train/rejected": -4.594086647033691, "step": 2636 }, { "epoch": 0.74, "logps_train/chosen": -70.18196105957031, "logps_train/ref_chosen": -53.0, "logps_train/ref_rejected": -69.0, "logps_train/rejected": -101.83208465576172, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.7195870876312256, "rewards_train/margins": 1.5519025325775146, "rewards_train/rejected": -3.2714896202087402, "step": 2637 }, { "epoch": 0.74, "learning_rate": 2.2137075851652587e-10, "loss": 0.3861, "step": 2638 }, { "epoch": 0.74, "logps_train/chosen": -72.2225112915039, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -73.5, "logps_train/rejected": -96.84630584716797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.47615736722946167, "rewards_train/margins": 1.8420670628547668, "rewards_train/rejected": -2.3182244300842285, "step": 2638 }, { "epoch": 0.74, "logps_train/chosen": -47.962032318115234, "logps_train/ref_chosen": -38.0, "logps_train/ref_rejected": -32.75, "logps_train/rejected": -46.229488372802734, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.9924926161766052, "rewards_train/margins": 0.35311228036880493, "rewards_train/rejected": -1.3456048965454102, "step": 2639 }, { "epoch": 0.74, "learning_rate": 1.960964337719695e-10, "loss": 0.4215, "step": 2640 }, { "epoch": 0.74, "logps_train/chosen": -87.33999633789062, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -70.5, "logps_train/rejected": -103.14662170410156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.2516021728515625, "rewards_train/margins": 2.0252671241760254, "rewards_train/rejected": -3.276869297027588, "step": 2640 }, { "epoch": 0.74, "logps_train/chosen": -113.96342468261719, "logps_train/ref_chosen": -88.0, "logps_train/ref_rejected": -89.5, "logps_train/rejected": -131.891357421875, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.593217372894287, "rewards_train/margins": 1.622480869293213, "rewards_train/rejected": -4.2156982421875, "step": 2641 }, { "epoch": 0.74, "learning_rate": 1.7235311011778998e-10, "loss": 0.3257, "step": 2642 }, { "epoch": 0.74, "logps_train/chosen": -56.97750473022461, "logps_train/ref_chosen": -45.5, "logps_train/ref_rejected": -64.0, "logps_train/rejected": -86.05448913574219, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1559534072875977, "rewards_train/margins": 1.0323081016540527, "rewards_train/rejected": -2.1882615089416504, "step": 2642 }, { "epoch": 0.74, "logps_train/chosen": -57.61783981323242, "logps_train/ref_chosen": -48.5, "logps_train/ref_rejected": -71.5, "logps_train/rejected": -105.30703735351562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.9145675301551819, "rewards_train/margins": 2.4689196944236755, "rewards_train/rejected": -3.3834872245788574, "step": 2643 }, { "epoch": 0.74, "learning_rate": 1.5014093307227915e-10, "loss": 0.3497, "step": 2644 }, { "epoch": 0.74, "logps_train/chosen": -80.65318298339844, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -98.0, "logps_train/rejected": -124.08656311035156, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.1981306076049805, "rewards_train/margins": 1.4226343631744385, "rewards_train/rejected": -2.620764970779419, "step": 2644 }, { "epoch": 0.74, "logps_train/chosen": -63.08063888549805, "logps_train/ref_chosen": -55.75, "logps_train/ref_rejected": -46.75, "logps_train/rejected": -65.68746948242188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.740876317024231, "rewards_train/margins": 1.1562882661819458, "rewards_train/rejected": -1.8971645832061768, "step": 2645 }, { "epoch": 0.74, "learning_rate": 1.2946003876976308e-10, "loss": 0.3455, "step": 2646 }, { "epoch": 0.74, "logps_train/chosen": -16.557817459106445, "logps_train/ref_chosen": -11.9375, "logps_train/ref_rejected": -13.4375, "logps_train/rejected": -26.84172821044922, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.46125054359436035, "rewards_train/margins": 0.8781956434249878, "rewards_train/rejected": -1.3394461870193481, "step": 2646 }, { "epoch": 0.74, "logps_train/chosen": -52.47710418701172, "logps_train/ref_chosen": -45.0, "logps_train/ref_rejected": -55.5, "logps_train/rejected": -84.27853393554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.7590386867523193, "rewards_train/margins": 2.109830617904663, "rewards_train/rejected": -2.8688693046569824, "step": 2647 }, { "epoch": 0.74, "learning_rate": 1.1031055395946398e-10, "loss": 0.2915, "step": 2648 }, { "epoch": 0.74, "logps_train/chosen": -104.32331085205078, "logps_train/ref_chosen": -81.5, "logps_train/ref_rejected": -95.5, "logps_train/rejected": -138.26004028320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -2.2963931560516357, "rewards_train/margins": 1.9561731815338135, "rewards_train/rejected": -4.252566337585449, "step": 2648 }, { "epoch": 0.74, "logps_train/chosen": -50.243953704833984, "logps_train/ref_chosen": -42.0, "logps_train/ref_rejected": -41.75, "logps_train/rejected": -73.75155639648438, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -0.8341608047485352, "rewards_train/margins": 2.3757612705230713, "rewards_train/rejected": -3.2099220752716064, "step": 2649 }, { "epoch": 0.74, "learning_rate": 9.269259600505618e-11, "loss": 0.2996, "step": 2650 }, { "epoch": 0.74, "logps_train/chosen": -105.0044937133789, "logps_train/ref_chosen": -85.0, "logps_train/ref_rejected": -76.5, "logps_train/rejected": -110.92948150634766, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.9816014766693115, "rewards_train/margins": 1.481659173965454, "rewards_train/rejected": -3.4632606506347656, "step": 2650 }, { "epoch": 0.74, "logps_train/chosen": -52.51024627685547, "logps_train/ref_chosen": -44.0, "logps_train/ref_rejected": -43.0, "logps_train/rejected": -61.42645263671875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.8603506088256836, "rewards_train/margins": 0.9895212650299072, "rewards_train/rejected": -1.8498718738555908, "step": 2651 }, { "epoch": 0.74, "learning_rate": 7.660627288361143e-11, "loss": 0.3913, "step": 2652 }, { "epoch": 0.74, "logps_train/chosen": -79.01078796386719, "logps_train/ref_chosen": -65.0, "logps_train/ref_rejected": -72.5, "logps_train/rejected": -89.0014419555664, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.3760783672332764, "rewards_train/margins": 0.261566162109375, "rewards_train/rejected": -1.6376445293426514, "step": 2652 }, { "epoch": 0.74, "logps_train/chosen": -131.6029052734375, "logps_train/ref_chosen": -107.5, "logps_train/ref_rejected": -113.5, "logps_train/rejected": -157.19580078125, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -2.3950564861297607, "rewards_train/margins": 1.9878051280975342, "rewards_train/rejected": -4.382861614227295, "step": 2653 }, { "epoch": 0.74, "learning_rate": 6.205168318523802e-11, "loss": 0.4755, "step": 2654 }, { "epoch": 0.74, "logps_train/chosen": -62.2870979309082, "logps_train/ref_chosen": -49.5, "logps_train/ref_rejected": -48.75, "logps_train/rejected": -74.03931427001953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.290428638458252, "rewards_train/margins": 1.2308855056762695, "rewards_train/rejected": -2.5213141441345215, "step": 2654 }, { "epoch": 0.74, "logps_train/chosen": -39.191017150878906, "logps_train/ref_chosen": -34.0, "logps_train/ref_rejected": -45.25, "logps_train/rejected": -61.628631591796875, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -0.5071878433227539, "rewards_train/margins": 1.1421985626220703, "rewards_train/rejected": -1.6493864059448242, "step": 2655 }, { "epoch": 0.74, "learning_rate": 4.9028916112220374e-11, "loss": 0.3524, "step": 2656 }, { "epoch": 0.74, "logps_train/chosen": -93.15544128417969, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -88.5, "logps_train/rejected": -114.2016372680664, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.0510915517807007, "rewards_train/margins": 1.5128222703933716, "rewards_train/rejected": -2.5639138221740723, "step": 2656 }, { "epoch": 0.74, "logps_train/chosen": -55.345123291015625, "logps_train/ref_chosen": -39.25, "logps_train/ref_rejected": -35.25, "logps_train/rejected": -59.86796188354492, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.6190825700759888, "rewards_train/margins": 0.8321670293807983, "rewards_train/rejected": -2.451249599456787, "step": 2657 }, { "epoch": 0.74, "learning_rate": 3.7538051478686053e-11, "loss": 0.4123, "step": 2658 }, { "epoch": 0.74, "logps_train/chosen": -97.30484008789062, "logps_train/ref_chosen": -82.5, "logps_train/ref_rejected": -81.5, "logps_train/rejected": -112.37997436523438, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.487417221069336, "rewards_train/margins": 1.5802674293518066, "rewards_train/rejected": -3.0676846504211426, "step": 2658 }, { "epoch": 0.74, "logps_train/chosen": -47.846046447753906, "logps_train/ref_chosen": -41.0, "logps_train/ref_rejected": -37.75, "logps_train/rejected": -54.81316375732422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.6760110259056091, "rewards_train/margins": 1.0334305167198181, "rewards_train/rejected": -1.7094415426254272, "step": 2659 }, { "epoch": 0.74, "learning_rate": 2.757915970996727e-11, "loss": 0.4072, "step": 2660 }, { "epoch": 0.74, "logps_train/chosen": -82.4228744506836, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -94.5, "logps_train/rejected": -130.42816162109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.0680689811706543, "rewards_train/margins": 2.5282626152038574, "rewards_train/rejected": -3.5963315963745117, "step": 2660 }, { "epoch": 0.74, "logps_train/chosen": -65.0551986694336, "logps_train/ref_chosen": -54.75, "logps_train/ref_rejected": -65.0, "logps_train/rejected": -106.5331802368164, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.022121548652649, "rewards_train/margins": 3.1245564222335815, "rewards_train/rejected": -4.1466779708862305, "step": 2661 }, { "epoch": 0.74, "learning_rate": 1.915230184224015e-11, "loss": 0.2196, "step": 2662 }, { "epoch": 0.74, "logps_train/chosen": -31.293441772460938, "logps_train/ref_chosen": -24.75, "logps_train/ref_rejected": -33.0, "logps_train/rejected": -46.453125, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.6559067964553833, "rewards_train/margins": 0.70073401927948, "rewards_train/rejected": -1.3566408157348633, "step": 2662 }, { "epoch": 0.74, "logps_train/chosen": -69.38417053222656, "logps_train/ref_chosen": -56.5, "logps_train/ref_rejected": -51.0, "logps_train/rejected": -75.73250579833984, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.2813860177993774, "rewards_train/margins": 1.1850284337997437, "rewards_train/rejected": -2.466414451599121, "step": 2663 }, { "epoch": 0.74, "learning_rate": 1.2257529522108346e-11, "loss": 0.4508, "step": 2664 }, { "epoch": 0.74, "logps_train/chosen": -93.66033935546875, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -71.0, "logps_train/rejected": -104.41761779785156, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -2.036736488342285, "rewards_train/margins": 1.322603464126587, "rewards_train/rejected": -3.359339952468872, "step": 2664 }, { "epoch": 0.74, "logps_train/chosen": -76.0361328125, "logps_train/ref_chosen": -63.25, "logps_train/ref_rejected": -78.5, "logps_train/rejected": -115.82254028320312, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.26904296875, "rewards_train/margins": 2.4534459114074707, "rewards_train/rejected": -3.7224888801574707, "step": 2665 }, { "epoch": 0.75, "learning_rate": 6.894885006381024e-12, "loss": 0.3754, "step": 2666 }, { "epoch": 0.75, "logps_train/chosen": -89.47756958007812, "logps_train/ref_chosen": -80.5, "logps_train/ref_rejected": -66.5, "logps_train/rejected": -80.26798248291016, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.9157260060310364, "rewards_train/margins": 0.460388720035553, "rewards_train/rejected": -1.3761147260665894, "step": 2666 }, { "epoch": 0.75, "logps_train/chosen": -80.7105712890625, "logps_train/ref_chosen": -61.5, "logps_train/ref_rejected": -60.0, "logps_train/rejected": -85.43064880371094, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -1.9282830953598022, "rewards_train/margins": 0.6216179132461548, "rewards_train/rejected": -2.549901008605957, "step": 2667 }, { "epoch": 0.75, "learning_rate": 3.0644011616287603e-12, "loss": 0.6348, "step": 2668 }, { "epoch": 0.75, "logps_train/chosen": -84.00387573242188, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -96.5, "logps_train/rejected": -124.59268188476562, "rewards_train/accuracies": 0.875, "rewards_train/chosen": -1.0566372871398926, "rewards_train/margins": 1.7485294342041016, "rewards_train/rejected": -2.805166721343994, "step": 2668 }, { "epoch": 0.75, "logps_train/chosen": -89.93424224853516, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -93.0, "logps_train/rejected": -117.02482604980469, "rewards_train/accuracies": 0.625, "rewards_train/chosen": -1.6844398975372314, "rewards_train/margins": 0.7242922782897949, "rewards_train/rejected": -2.4087321758270264, "step": 2669 }, { "epoch": 0.75, "learning_rate": 7.661014642390551e-13, "loss": 0.6116, "step": 2670 }, { "epoch": 0.75, "logps_train/chosen": -104.87940979003906, "logps_train/ref_chosen": -91.0, "logps_train/ref_rejected": -96.0, "logps_train/rejected": -134.00120544433594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -1.3824727535247803, "rewards_train/margins": 2.4367880821228027, "rewards_train/rejected": -3.819260835647583, "step": 2670 }, { "epoch": 0.75, "logps_train/chosen": -93.1978988647461, "logps_train/ref_chosen": -75.0, "logps_train/ref_rejected": -78.0, "logps_train/rejected": -113.64189147949219, "rewards_train/accuracies": 0.75, "rewards_train/chosen": -1.8246729373931885, "rewards_train/margins": 1.763441801071167, "rewards_train/rejected": -3.5881147384643555, "step": 2671 }, { "epoch": 0.75, "learning_rate": 0.0, "loss": 0.3292, "step": 2672 }, { "epoch": 0.75, "step": 2672, "total_flos": 0.0, "train_loss": 0.4759224293876194, "train_runtime": 5798.6467, "train_samples_per_second": 3.686, "train_steps_per_second": 0.461 } ], "logging_steps": 2, "max_steps": 2672, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }