diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,40108 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7467859139183901, + "eval_steps": 500, + "global_step": 2672, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "logps_train/chosen": -28.009265899658203, + "logps_train/ref_chosen": -27.875, + "logps_train/ref_rejected": -34.25, + "logps_train/rejected": -34.33583068847656, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.0073716905899345875, + "rewards_train/margins": -0.0024997838772833347, + "rewards_train/rejected": -0.004871906712651253, + "step": 0 + }, + { + "epoch": 0.0, + "logps_train/chosen": -73.56502532958984, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -81.08031463623047, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.03540842980146408, + "rewards_train/margins": -0.03733780421316624, + "rewards_train/rejected": 0.001929374411702156, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 7.462686567164179e-09, + "loss": 0.7033, + "step": 2 + }, + { + "epoch": 0.0, + "logps_train/chosen": -67.06131744384766, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -58.305389404296875, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.004414868541061878, + "rewards_train/margins": -0.012702328152954578, + "rewards_train/rejected": 0.017117196694016457, + "step": 2 + }, + { + "epoch": 0.0, + "logps_train/chosen": -74.58602905273438, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -72.01849365234375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.004287677817046642, + "rewards_train/margins": -0.014956123195588589, + "rewards_train/rejected": 0.01924380101263523, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 1.4925373134328357e-08, + "loss": 0.7004, + "step": 4 + }, + { + "epoch": 0.0, + "logps_train/chosen": -96.714111328125, + "logps_train/ref_chosen": -96.5, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -94.39057159423828, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.017505139112472534, + "rewards_train/margins": -0.02043974446132779, + "rewards_train/rejected": 0.002934605348855257, + "step": 4 + }, + { + "epoch": 0.0, + "logps_train/chosen": -30.27285385131836, + "logps_train/ref_chosen": -30.25, + "logps_train/ref_rejected": -31.125, + "logps_train/rejected": -31.02100944519043, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.0027926864568144083, + "rewards_train/margins": -0.00448125577531755, + "rewards_train/rejected": 0.007273942232131958, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 2.2388059701492534e-08, + "loss": 0.7019, + "step": 6 + }, + { + "epoch": 0.0, + "logps_train/chosen": -52.10371017456055, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -55.562103271484375, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.01896473951637745, + "rewards_train/margins": -0.026817021891474724, + "rewards_train/rejected": 0.007852282375097275, + "step": 6 + }, + { + "epoch": 0.0, + "logps_train/chosen": -71.28314208984375, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -76.08761596679688, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.01777965947985649, + "rewards_train/margins": 0.03591597452759743, + "rewards_train/rejected": -0.018136315047740936, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 2.9850746268656714e-08, + "loss": 0.6922, + "step": 8 + }, + { + "epoch": 0.0, + "logps_train/chosen": -76.19033813476562, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -79.52376556396484, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.0020599185954779387, + "rewards_train/margins": 0.017619952326640487, + "rewards_train/rejected": -0.015560033731162548, + "step": 8 + }, + { + "epoch": 0.0, + "logps_train/chosen": -54.57809829711914, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -62.02259063720703, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.007614586036652327, + "rewards_train/margins": 0.005191568750888109, + "rewards_train/rejected": -0.012806154787540436, + "step": 9 + }, + { + "epoch": 0.0, + "learning_rate": 3.731343283582089e-08, + "loss": 0.687, + "step": 10 + }, + { + "epoch": 0.0, + "logps_train/chosen": -70.93370819091797, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -66.47660064697266, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.001964431954547763, + "rewards_train/margins": -0.003913437249138951, + "rewards_train/rejected": 0.0019490052945911884, + "step": 10 + }, + { + "epoch": 0.0, + "logps_train/chosen": -51.025726318359375, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -62.484683990478516, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.013052631169557571, + "rewards_train/margins": 0.0199194997549057, + "rewards_train/rejected": -0.006866868585348129, + "step": 11 + }, + { + "epoch": 0.0, + "learning_rate": 4.477611940298507e-08, + "loss": 0.6889, + "step": 12 + }, + { + "epoch": 0.0, + "logps_train/chosen": -52.511112213134766, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -51.727561950683594, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.004747912287712097, + "rewards_train/margins": 0.019887147471308708, + "rewards_train/rejected": -0.015139235183596611, + "step": 12 + }, + { + "epoch": 0.0, + "logps_train/chosen": -63.97336959838867, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -80.89689636230469, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.007350319065153599, + "rewards_train/margins": 0.003485799767076969, + "rewards_train/rejected": 0.0038645192980766296, + "step": 13 + }, + { + "epoch": 0.0, + "learning_rate": 5.223880597014925e-08, + "loss": 0.6875, + "step": 14 + }, + { + "epoch": 0.0, + "logps_train/chosen": -31.11260223388672, + "logps_train/ref_chosen": -31.0, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -47.75293731689453, + "rewards_train/accuracies": 0.125, + "rewards_train/chosen": -0.0066704899072647095, + "rewards_train/margins": -0.021025076508522034, + "rewards_train/rejected": 0.014354586601257324, + "step": 14 + }, + { + "epoch": 0.0, + "logps_train/chosen": -62.03797912597656, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -76.19300842285156, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.00223541259765625, + "rewards_train/margins": -0.006372492760419846, + "rewards_train/rejected": 0.004137080162763596, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 5.970149253731343e-08, + "loss": 0.6999, + "step": 16 + }, + { + "epoch": 0.0, + "logps_train/chosen": -45.3233528137207, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -52.9857177734375, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.008873285725712776, + "rewards_train/margins": -0.014622984454035759, + "rewards_train/rejected": 0.005749698728322983, + "step": 16 + }, + { + "epoch": 0.0, + "logps_train/chosen": -43.823646545410156, + "logps_train/ref_chosen": -43.75, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -45.151485443115234, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.015372293069958687, + "rewards_train/margins": -0.006864186376333237, + "rewards_train/rejected": -0.00850810669362545, + "step": 17 + }, + { + "epoch": 0.01, + "learning_rate": 6.71641791044776e-08, + "loss": 0.6986, + "step": 18 + }, + { + "epoch": 0.01, + "logps_train/chosen": -46.67842102050781, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -45.35725021362305, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.017704779282212257, + "rewards_train/margins": 0.009679672308266163, + "rewards_train/rejected": 0.008025106973946095, + "step": 18 + }, + { + "epoch": 0.01, + "logps_train/chosen": -55.24577331542969, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -60.35206604003906, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.0006179455667734146, + "rewards_train/margins": 0.008089966606348753, + "rewards_train/rejected": -0.007472021039575338, + "step": 19 + }, + { + "epoch": 0.01, + "learning_rate": 7.462686567164178e-08, + "loss": 0.6891, + "step": 20 + }, + { + "epoch": 0.01, + "logps_train/chosen": -70.94389343261719, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -83.84043884277344, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.008793413639068604, + "rewards_train/margins": -0.014788252301514149, + "rewards_train/rejected": 0.005994838662445545, + "step": 20 + }, + { + "epoch": 0.01, + "logps_train/chosen": -57.3712272644043, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -69.30262756347656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.010169684886932373, + "rewards_train/margins": 0.016967594623565674, + "rewards_train/rejected": -0.027137279510498047, + "step": 21 + }, + { + "epoch": 0.01, + "learning_rate": 8.208955223880596e-08, + "loss": 0.6928, + "step": 22 + }, + { + "epoch": 0.01, + "logps_train/chosen": -45.63951873779297, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -71.93592071533203, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.013489503413438797, + "rewards_train/margins": 0.002784440293908119, + "rewards_train/rejected": 0.010705063119530678, + "step": 22 + }, + { + "epoch": 0.01, + "logps_train/chosen": -43.61445617675781, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -50.658485412597656, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": 0.0010543353855609894, + "rewards_train/margins": -0.006143870297819376, + "rewards_train/rejected": 0.007198205683380365, + "step": 23 + }, + { + "epoch": 0.01, + "learning_rate": 8.955223880597014e-08, + "loss": 0.694, + "step": 24 + }, + { + "epoch": 0.01, + "logps_train/chosen": -54.579505920410156, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -60.249229431152344, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.009708481840789318, + "rewards_train/margins": -0.012520214542746544, + "rewards_train/rejected": 0.002811732701957226, + "step": 24 + }, + { + "epoch": 0.01, + "logps_train/chosen": -44.96370315551758, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -50.72367858886719, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.002693634247407317, + "rewards_train/margins": 0.0003871179651468992, + "rewards_train/rejected": -0.0030807522125542164, + "step": 25 + }, + { + "epoch": 0.01, + "learning_rate": 9.701492537313432e-08, + "loss": 0.6957, + "step": 26 + }, + { + "epoch": 0.01, + "logps_train/chosen": -52.113609313964844, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -55.80499267578125, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.0025722477585077286, + "rewards_train/margins": 0.009958336129784584, + "rewards_train/rejected": -0.012530583888292313, + "step": 26 + }, + { + "epoch": 0.01, + "logps_train/chosen": -78.57445526123047, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -71.58308410644531, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.011304108425974846, + "rewards_train/margins": 0.008577450178563595, + "rewards_train/rejected": 0.002726658247411251, + "step": 27 + }, + { + "epoch": 0.01, + "learning_rate": 1.044776119402985e-07, + "loss": 0.6886, + "step": 28 + }, + { + "epoch": 0.01, + "logps_train/chosen": -48.44830322265625, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -42.25, + "logps_train/rejected": -42.34334182739258, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.015338033437728882, + "rewards_train/margins": -0.016941360663622618, + "rewards_train/rejected": 0.0016033272258937359, + "step": 28 + }, + { + "epoch": 0.01, + "logps_train/chosen": -72.24885559082031, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -65.16725158691406, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.024398073554039, + "rewards_train/margins": -0.003961820155382156, + "rewards_train/rejected": -0.020436253398656845, + "step": 29 + }, + { + "epoch": 0.01, + "learning_rate": 1.1194029850746268e-07, + "loss": 0.6985, + "step": 30 + }, + { + "epoch": 0.01, + "logps_train/chosen": -48.282257080078125, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -51.52762985229492, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.025192007422447205, + "rewards_train/margins": 0.00940013863146305, + "rewards_train/rejected": 0.015791868790984154, + "step": 30 + }, + { + "epoch": 0.01, + "logps_train/chosen": -69.8888931274414, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -71.6630859375, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.0028590518049895763, + "rewards_train/margins": -0.012228913139551878, + "rewards_train/rejected": 0.015087964944541454, + "step": 31 + }, + { + "epoch": 0.01, + "learning_rate": 1.1940298507462686e-07, + "loss": 0.6941, + "step": 32 + }, + { + "epoch": 0.01, + "logps_train/chosen": -61.05165481567383, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -48.58993911743164, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.017588702961802483, + "rewards_train/margins": 0.022090469021350145, + "rewards_train/rejected": -0.004501766059547663, + "step": 32 + }, + { + "epoch": 0.01, + "logps_train/chosen": -65.1072006225586, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -65.76799011230469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.0034931949339807034, + "rewards_train/margins": -0.010873754974454641, + "rewards_train/rejected": 0.007380560040473938, + "step": 33 + }, + { + "epoch": 0.01, + "learning_rate": 1.2686567164179106e-07, + "loss": 0.69, + "step": 34 + }, + { + "epoch": 0.01, + "logps_train/chosen": -79.93659973144531, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -91.6839599609375, + "rewards_train/accuracies": 0.125, + "rewards_train/chosen": -0.030379492789506912, + "rewards_train/margins": -0.06432736292481422, + "rewards_train/rejected": 0.03394787013530731, + "step": 34 + }, + { + "epoch": 0.01, + "logps_train/chosen": -70.9692611694336, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -69.50221252441406, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.005519765429198742, + "rewards_train/margins": 0.0036860769614577293, + "rewards_train/rejected": -0.009205842390656471, + "step": 35 + }, + { + "epoch": 0.01, + "learning_rate": 1.343283582089552e-07, + "loss": 0.7098, + "step": 36 + }, + { + "epoch": 0.01, + "logps_train/chosen": -20.790733337402344, + "logps_train/ref_chosen": -20.875, + "logps_train/ref_rejected": -34.5, + "logps_train/rejected": -34.51048278808594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.005765479989349842, + "rewards_train/margins": 0.005568873370066285, + "rewards_train/rejected": 0.00019660661928355694, + "step": 36 + }, + { + "epoch": 0.01, + "logps_train/chosen": -60.28750991821289, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -69.29464721679688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.009041781537234783, + "rewards_train/margins": 0.010185684077441692, + "rewards_train/rejected": -0.0011439025402069092, + "step": 37 + }, + { + "epoch": 0.01, + "learning_rate": 1.4179104477611938e-07, + "loss": 0.6896, + "step": 38 + }, + { + "epoch": 0.01, + "logps_train/chosen": -53.85614013671875, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -76.01130676269531, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.005633586551994085, + "rewards_train/margins": -0.013291806448251009, + "rewards_train/rejected": 0.007658219896256924, + "step": 38 + }, + { + "epoch": 0.01, + "logps_train/chosen": -89.0808334350586, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -98.09664154052734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.011349696666002274, + "rewards_train/margins": 0.018670076970010996, + "rewards_train/rejected": -0.007320380304008722, + "step": 39 + }, + { + "epoch": 0.01, + "learning_rate": 1.4925373134328355e-07, + "loss": 0.692, + "step": 40 + }, + { + "epoch": 0.01, + "logps_train/chosen": -23.839229583740234, + "logps_train/ref_chosen": -24.0, + "logps_train/ref_rejected": -25.875, + "logps_train/rejected": -25.802457809448242, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.009827196598052979, + "rewards_train/margins": 0.0015963371843099594, + "rewards_train/rejected": 0.008230859413743019, + "step": 40 + }, + { + "epoch": 0.01, + "logps_train/chosen": -69.20179748535156, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -86.10008239746094, + "rewards_train/accuracies": 0.125, + "rewards_train/chosen": -0.01432049460709095, + "rewards_train/margins": -0.0254062432795763, + "rewards_train/rejected": 0.011085748672485352, + "step": 41 + }, + { + "epoch": 0.01, + "learning_rate": 1.5671641791044775e-07, + "loss": 0.6993, + "step": 42 + }, + { + "epoch": 0.01, + "logps_train/chosen": -61.325164794921875, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -78.37467956542969, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.002786031924188137, + "rewards_train/margins": 0.01662126276642084, + "rewards_train/rejected": -0.013835230842232704, + "step": 42 + }, + { + "epoch": 0.01, + "logps_train/chosen": -61.91954040527344, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -53.99800491333008, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.011192599311470985, + "rewards_train/margins": -0.01178260799497366, + "rewards_train/rejected": 0.0005900086835026741, + "step": 43 + }, + { + "epoch": 0.01, + "learning_rate": 1.6417910447761193e-07, + "loss": 0.6922, + "step": 44 + }, + { + "epoch": 0.01, + "logps_train/chosen": -61.60581588745117, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -85.86632537841797, + "rewards_train/accuracies": 0.0, + "rewards_train/chosen": -0.011753350496292114, + "rewards_train/margins": -0.02785518765449524, + "rewards_train/rejected": 0.016101837158203125, + "step": 44 + }, + { + "epoch": 0.01, + "logps_train/chosen": -48.868995666503906, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -50.02741241455078, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.005552059970796108, + "rewards_train/margins": -0.00515446113422513, + "rewards_train/rejected": -0.00039759883657097816, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 1.716417910447761e-07, + "loss": 0.7018, + "step": 46 + }, + { + "epoch": 0.01, + "logps_train/chosen": -59.111244201660156, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -67.70795440673828, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.010750353336334229, + "rewards_train/margins": -0.0006807064637541771, + "rewards_train/rejected": 0.011431059800088406, + "step": 46 + }, + { + "epoch": 0.01, + "logps_train/chosen": -53.983726501464844, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -75.58424377441406, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.009602963924407959, + "rewards_train/margins": -0.005280325189232826, + "rewards_train/rejected": -0.004322638735175133, + "step": 47 + }, + { + "epoch": 0.01, + "learning_rate": 1.7910447761194027e-07, + "loss": 0.6947, + "step": 48 + }, + { + "epoch": 0.01, + "logps_train/chosen": -56.18229675292969, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -61.73432922363281, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.00553455064073205, + "rewards_train/margins": -0.013693366665393114, + "rewards_train/rejected": 0.008158816024661064, + "step": 48 + }, + { + "epoch": 0.01, + "logps_train/chosen": -58.342559814453125, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -61.901893615722656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.009103439748287201, + "rewards_train/margins": 0.017651867121458054, + "rewards_train/rejected": -0.008548427373170853, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 1.8656716417910447e-07, + "loss": 0.6924, + "step": 50 + }, + { + "epoch": 0.01, + "logps_train/chosen": -75.02145385742188, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -74.90623474121094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.023635398596525192, + "rewards_train/margins": 0.01660336134955287, + "rewards_train/rejected": 0.0070320372469723225, + "step": 50 + }, + { + "epoch": 0.01, + "logps_train/chosen": -75.75387573242188, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -73.04302978515625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.007620600052177906, + "rewards_train/margins": 0.013486066833138466, + "rewards_train/rejected": -0.00586546678096056, + "step": 51 + }, + { + "epoch": 0.01, + "learning_rate": 1.9402985074626865e-07, + "loss": 0.6858, + "step": 52 + }, + { + "epoch": 0.01, + "logps_train/chosen": -57.362266540527344, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -62.26457595825195, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.0058142924681305885, + "rewards_train/margins": -0.0009311474859714508, + "rewards_train/rejected": 0.006745439954102039, + "step": 52 + }, + { + "epoch": 0.01, + "logps_train/chosen": -76.48269653320312, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -67.6187973022461, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.016237949952483177, + "rewards_train/margins": -0.030920982360839844, + "rewards_train/rejected": 0.014683032408356667, + "step": 53 + }, + { + "epoch": 0.02, + "learning_rate": 2.0149253731343282e-07, + "loss": 0.7013, + "step": 54 + }, + { + "epoch": 0.02, + "logps_train/chosen": -72.03915405273438, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -69.89439392089844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.003134148893877864, + "rewards_train/margins": -0.01096053677611053, + "rewards_train/rejected": 0.007826387882232666, + "step": 54 + }, + { + "epoch": 0.02, + "logps_train/chosen": -85.34034729003906, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -86.39530181884766, + "rewards_train/accuracies": 0.125, + "rewards_train/chosen": -0.01801927760243416, + "rewards_train/margins": -0.03161392919719219, + "rewards_train/rejected": 0.013594651594758034, + "step": 55 + }, + { + "epoch": 0.02, + "learning_rate": 2.08955223880597e-07, + "loss": 0.7039, + "step": 56 + }, + { + "epoch": 0.02, + "logps_train/chosen": -35.24477767944336, + "logps_train/ref_chosen": -35.25, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -47.85071563720703, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.005209779366850853, + "rewards_train/margins": -0.005617326125502586, + "rewards_train/rejected": 0.01082710549235344, + "step": 56 + }, + { + "epoch": 0.02, + "logps_train/chosen": -36.646060943603516, + "logps_train/ref_chosen": -36.75, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -54.13223648071289, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.009026802144944668, + "rewards_train/margins": 0.011508381925523281, + "rewards_train/rejected": -0.0024815797805786133, + "step": 57 + }, + { + "epoch": 0.02, + "learning_rate": 2.1641791044776117e-07, + "loss": 0.692, + "step": 58 + }, + { + "epoch": 0.02, + "logps_train/chosen": -63.958457946777344, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -68.56193542480469, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.004244078882038593, + "rewards_train/margins": -0.023246058262884617, + "rewards_train/rejected": 0.019001979380846024, + "step": 58 + }, + { + "epoch": 0.02, + "logps_train/chosen": -70.13823699951172, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -75.07279205322266, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.01538639422506094, + "rewards_train/margins": -0.01924017141573131, + "rewards_train/rejected": 0.003853777190670371, + "step": 59 + }, + { + "epoch": 0.02, + "learning_rate": 2.2388059701492537e-07, + "loss": 0.7041, + "step": 60 + }, + { + "epoch": 0.02, + "logps_train/chosen": -50.05101776123047, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -58.0960693359375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.024194950237870216, + "rewards_train/margins": 0.008801520802080631, + "rewards_train/rejected": 0.015393429435789585, + "step": 60 + }, + { + "epoch": 0.02, + "logps_train/chosen": -62.10710906982422, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -51.14201736450195, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.007258105091750622, + "rewards_train/margins": 0.012280058581382036, + "rewards_train/rejected": -0.005021953489631414, + "step": 61 + }, + { + "epoch": 0.02, + "learning_rate": 2.3134328358208954e-07, + "loss": 0.6879, + "step": 62 + }, + { + "epoch": 0.02, + "logps_train/chosen": -79.69609069824219, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -100.82936096191406, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.010859489440917969, + "rewards_train/margins": 0.0031700842082500458, + "rewards_train/rejected": 0.007689405232667923, + "step": 62 + }, + { + "epoch": 0.02, + "logps_train/chosen": -50.9639778137207, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -54.70915222167969, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.0046273572370409966, + "rewards_train/margins": -0.0018990645185112953, + "rewards_train/rejected": 0.006526421755552292, + "step": 63 + }, + { + "epoch": 0.02, + "learning_rate": 2.388059701492537e-07, + "loss": 0.6933, + "step": 64 + }, + { + "epoch": 0.02, + "logps_train/chosen": -54.156246185302734, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -77.61766052246094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.0162116177380085, + "rewards_train/margins": 0.016259407624602318, + "rewards_train/rejected": -4.7789886593818665e-05, + "step": 64 + }, + { + "epoch": 0.02, + "logps_train/chosen": -56.522613525390625, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -60.18783950805664, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.01241797860711813, + "rewards_train/margins": -0.0030089272186160088, + "rewards_train/rejected": -0.009409051388502121, + "step": 65 + }, + { + "epoch": 0.02, + "learning_rate": 2.4626865671641786e-07, + "loss": 0.6905, + "step": 66 + }, + { + "epoch": 0.02, + "logps_train/chosen": -36.38053512573242, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -33.75, + "logps_train/rejected": -33.81393814086914, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.004655003547668457, + "rewards_train/margins": -0.0015814690850675106, + "rewards_train/rejected": -0.0030735344626009464, + "step": 66 + }, + { + "epoch": 0.02, + "logps_train/chosen": -115.5427474975586, + "logps_train/ref_chosen": -115.5, + "logps_train/ref_rejected": -127.5, + "logps_train/rejected": -127.41462707519531, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": 0.0004127498250454664, + "rewards_train/margins": -0.026874830247834325, + "rewards_train/rejected": 0.02728758007287979, + "step": 67 + }, + { + "epoch": 0.02, + "learning_rate": 2.537313432835821e-07, + "loss": 0.7003, + "step": 68 + }, + { + "epoch": 0.02, + "logps_train/chosen": -80.20258331298828, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -74.4958267211914, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.00424259714782238, + "rewards_train/margins": 0.0011994852684438229, + "rewards_train/rejected": -0.005442082416266203, + "step": 68 + }, + { + "epoch": 0.02, + "logps_train/chosen": -42.610416412353516, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -51.10821533203125, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.012200661934912205, + "rewards_train/margins": 0.0036860527470707893, + "rewards_train/rejected": 0.008514609187841415, + "step": 69 + }, + { + "epoch": 0.02, + "learning_rate": 2.611940298507462e-07, + "loss": 0.6925, + "step": 70 + }, + { + "epoch": 0.02, + "logps_train/chosen": -39.376792907714844, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -27.125, + "logps_train/rejected": -27.271896362304688, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.00021116758580319583, + "rewards_train/margins": 0.01660984705085866, + "rewards_train/rejected": -0.016398679465055466, + "step": 70 + }, + { + "epoch": 0.02, + "logps_train/chosen": -74.99695587158203, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -83.50621032714844, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.013757944107055664, + "rewards_train/margins": -0.018800931982696056, + "rewards_train/rejected": 0.005042987875640392, + "step": 71 + }, + { + "epoch": 0.02, + "learning_rate": 2.686567164179104e-07, + "loss": 0.6938, + "step": 72 + }, + { + "epoch": 0.02, + "logps_train/chosen": -85.22811889648438, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -84.8520278930664, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.01890520006418228, + "rewards_train/margins": -0.010264983400702477, + "rewards_train/rejected": -0.008640216663479805, + "step": 72 + }, + { + "epoch": 0.02, + "logps_train/chosen": -42.7127685546875, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -62.7537956237793, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.017492689192295074, + "rewards_train/margins": 0.019776830216869712, + "rewards_train/rejected": -0.0022841410245746374, + "step": 73 + }, + { + "epoch": 0.02, + "learning_rate": 2.761194029850746e-07, + "loss": 0.6908, + "step": 74 + }, + { + "epoch": 0.02, + "logps_train/chosen": -97.81999206542969, + "logps_train/ref_chosen": -97.5, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -85.24695587158203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.015202022157609463, + "rewards_train/margins": 0.0016810325905680656, + "rewards_train/rejected": -0.01688305474817753, + "step": 74 + }, + { + "epoch": 0.02, + "logps_train/chosen": -26.67871856689453, + "logps_train/ref_chosen": -26.75, + "logps_train/ref_rejected": -32.5, + "logps_train/rejected": -32.54246520996094, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.004229029640555382, + "rewards_train/margins": 0.013328035362064838, + "rewards_train/rejected": -0.009099005721509457, + "step": 75 + }, + { + "epoch": 0.02, + "learning_rate": 2.8358208955223876e-07, + "loss": 0.6896, + "step": 76 + }, + { + "epoch": 0.02, + "logps_train/chosen": -56.448753356933594, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -52.06647491455078, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.0031714322976768017, + "rewards_train/margins": -0.01283704349771142, + "rewards_train/rejected": 0.016008475795388222, + "step": 76 + }, + { + "epoch": 0.02, + "logps_train/chosen": -51.58002471923828, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -57.23174285888672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.009184528142213821, + "rewards_train/margins": 0.016293874010443687, + "rewards_train/rejected": -0.007109345868229866, + "step": 77 + }, + { + "epoch": 0.02, + "learning_rate": 2.9104477611940296e-07, + "loss": 0.6923, + "step": 78 + }, + { + "epoch": 0.02, + "logps_train/chosen": -47.17328643798828, + "logps_train/ref_chosen": -47.25, + "logps_train/ref_rejected": -34.5, + "logps_train/rejected": -34.302825927734375, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.0014215768314898014, + "rewards_train/margins": -0.008920720312744379, + "rewards_train/rejected": 0.01034229714423418, + "step": 78 + }, + { + "epoch": 0.02, + "logps_train/chosen": -88.59147644042969, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -86.37312316894531, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.01891244947910309, + "rewards_train/margins": -0.030819376930594444, + "rewards_train/rejected": 0.011906927451491356, + "step": 79 + }, + { + "epoch": 0.02, + "learning_rate": 2.985074626865671e-07, + "loss": 0.7041, + "step": 80 + }, + { + "epoch": 0.02, + "logps_train/chosen": -37.71403503417969, + "logps_train/ref_chosen": -37.75, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -42.998077392578125, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": 0.0020339488983154297, + "rewards_train/margins": 0.001060450915247202, + "rewards_train/rejected": 0.0009734979830682278, + "step": 80 + }, + { + "epoch": 0.02, + "logps_train/chosen": -42.956787109375, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -43.76375198364258, + "rewards_train/accuracies": 0.125, + "rewards_train/chosen": 0.0027585094794631004, + "rewards_train/margins": -0.021647651679813862, + "rewards_train/rejected": 0.024406161159276962, + "step": 81 + }, + { + "epoch": 0.02, + "learning_rate": 3.059701492537313e-07, + "loss": 0.6986, + "step": 82 + }, + { + "epoch": 0.02, + "logps_train/chosen": -52.473175048828125, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -58.66938018798828, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.014303699135780334, + "rewards_train/margins": 0.0017494112253189087, + "rewards_train/rejected": 0.012554287910461426, + "step": 82 + }, + { + "epoch": 0.02, + "logps_train/chosen": -11.788549423217773, + "logps_train/ref_chosen": -11.6875, + "logps_train/ref_rejected": -14.6875, + "logps_train/rejected": -14.75306224822998, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.008640158921480179, + "rewards_train/margins": 0.00035746581852436066, + "rewards_train/rejected": -0.00899762474000454, + "step": 83 + }, + { + "epoch": 0.02, + "learning_rate": 3.134328358208955e-07, + "loss": 0.6927, + "step": 84 + }, + { + "epoch": 0.02, + "logps_train/chosen": -65.22269439697266, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -73.28553009033203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.0037143828812986612, + "rewards_train/margins": -0.004116818541660905, + "rewards_train/rejected": 0.00040243566036224365, + "step": 84 + }, + { + "epoch": 0.02, + "logps_train/chosen": -33.343101501464844, + "logps_train/ref_chosen": -33.5, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -52.500579833984375, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": 0.010611563920974731, + "rewards_train/margins": 0.00031818635761737823, + "rewards_train/rejected": 0.010293377563357353, + "step": 85 + }, + { + "epoch": 0.02, + "learning_rate": 3.2089552238805965e-07, + "loss": 0.6938, + "step": 86 + }, + { + "epoch": 0.02, + "logps_train/chosen": -80.40919494628906, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -66.40739440917969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.022752420976758003, + "rewards_train/margins": 0.032046287320554256, + "rewards_train/rejected": -0.009293866343796253, + "step": 86 + }, + { + "epoch": 0.02, + "logps_train/chosen": -80.67681884765625, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -98.67953491210938, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.015911972150206566, + "rewards_train/margins": 0.02058399934321642, + "rewards_train/rejected": -0.004672027193009853, + "step": 87 + }, + { + "epoch": 0.02, + "learning_rate": 3.2835820895522385e-07, + "loss": 0.6803, + "step": 88 + }, + { + "epoch": 0.02, + "logps_train/chosen": -54.21306228637695, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -59.89112854003906, + "rewards_train/accuracies": 0.125, + "rewards_train/chosen": -0.0027515410911291838, + "rewards_train/margins": -0.024185542715713382, + "rewards_train/rejected": 0.021434001624584198, + "step": 88 + }, + { + "epoch": 0.02, + "logps_train/chosen": -48.04336929321289, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -48.124664306640625, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.011758768931031227, + "rewards_train/margins": 0.012621596455574036, + "rewards_train/rejected": -0.024380365386605263, + "step": 89 + }, + { + "epoch": 0.03, + "learning_rate": 3.3582089552238805e-07, + "loss": 0.6965, + "step": 90 + }, + { + "epoch": 0.03, + "logps_train/chosen": -74.77471923828125, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -90.97662353515625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.00803880300372839, + "rewards_train/margins": 0.0036856932565569878, + "rewards_train/rejected": -0.011724496260285378, + "step": 90 + }, + { + "epoch": 0.03, + "logps_train/chosen": -43.102508544921875, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -40.36244201660156, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.0009797094389796257, + "rewards_train/margins": -0.007697904482483864, + "rewards_train/rejected": 0.00867761392146349, + "step": 91 + }, + { + "epoch": 0.03, + "learning_rate": 3.432835820895522e-07, + "loss": 0.6943, + "step": 92 + }, + { + "epoch": 0.03, + "logps_train/chosen": -62.44756317138672, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -57.75743103027344, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.020868778228759766, + "rewards_train/margins": 0.007549166679382324, + "rewards_train/rejected": 0.013319611549377441, + "step": 92 + }, + { + "epoch": 0.03, + "logps_train/chosen": -42.98347473144531, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -20.25, + "logps_train/rejected": -20.200942993164062, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.003996431827545166, + "rewards_train/margins": -0.002471828367561102, + "rewards_train/rejected": 0.006468260195106268, + "step": 93 + }, + { + "epoch": 0.03, + "learning_rate": 3.507462686567164e-07, + "loss": 0.6918, + "step": 94 + }, + { + "epoch": 0.03, + "logps_train/chosen": -47.029605865478516, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -63.656864166259766, + "rewards_train/accuracies": 0.125, + "rewards_train/chosen": -0.02981605753302574, + "rewards_train/margins": -0.05631690286099911, + "rewards_train/rejected": 0.026500845327973366, + "step": 94 + }, + { + "epoch": 0.03, + "logps_train/chosen": -37.82858657836914, + "logps_train/ref_chosen": -37.75, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -43.48558807373047, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.0007352705579251051, + "rewards_train/margins": -0.0038309639785438776, + "rewards_train/rejected": 0.004566234536468983, + "step": 95 + }, + { + "epoch": 0.03, + "learning_rate": 3.5820895522388055e-07, + "loss": 0.7088, + "step": 96 + }, + { + "epoch": 0.03, + "logps_train/chosen": -90.15899658203125, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -109.81600952148438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.002459877170622349, + "rewards_train/margins": 0.024685590527951717, + "rewards_train/rejected": -0.02222571335732937, + "step": 96 + }, + { + "epoch": 0.03, + "logps_train/chosen": -74.28402709960938, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -92.08799743652344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.016909385100007057, + "rewards_train/margins": 0.022584796883165836, + "rewards_train/rejected": -0.005675411783158779, + "step": 97 + }, + { + "epoch": 0.03, + "learning_rate": 3.6567164179104475e-07, + "loss": 0.6816, + "step": 98 + }, + { + "epoch": 0.03, + "logps_train/chosen": -69.00167846679688, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -67.96739959716797, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.013449341058731079, + "rewards_train/margins": -0.006944017950445414, + "rewards_train/rejected": -0.0065053231082856655, + "step": 98 + }, + { + "epoch": 0.03, + "logps_train/chosen": -54.449981689453125, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -59.801513671875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.01300974190235138, + "rewards_train/margins": 0.008004629984498024, + "rewards_train/rejected": 0.005005111917853355, + "step": 99 + }, + { + "epoch": 0.03, + "learning_rate": 3.7313432835820895e-07, + "loss": 0.6937, + "step": 100 + }, + { + "epoch": 0.03, + "logps_train/chosen": -62.00507354736328, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -46.25, + "logps_train/rejected": -46.26144790649414, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.00011680088937282562, + "rewards_train/margins": -0.004440450109541416, + "rewards_train/rejected": 0.0043236492201685905, + "step": 100 + }, + { + "epoch": 0.03, + "logps_train/chosen": -17.897796630859375, + "logps_train/ref_chosen": -17.875, + "logps_train/ref_rejected": -28.375, + "logps_train/rejected": -28.47299575805664, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.0002105669118463993, + "rewards_train/margins": 0.009375241119414568, + "rewards_train/rejected": -0.009164674207568169, + "step": 101 + }, + { + "epoch": 0.03, + "learning_rate": 3.805970149253731e-07, + "loss": 0.6908, + "step": 102 + }, + { + "epoch": 0.03, + "logps_train/chosen": -64.4922866821289, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -86.14122009277344, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.00448232889175415, + "rewards_train/margins": 0.01547945849597454, + "rewards_train/rejected": -0.01099712960422039, + "step": 102 + }, + { + "epoch": 0.03, + "logps_train/chosen": -59.390933990478516, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -63.25, + "logps_train/rejected": -63.29236602783203, + "rewards_train/accuracies": 0.125, + "rewards_train/chosen": 0.007195710204541683, + "rewards_train/margins": 0.007623705081641674, + "rewards_train/rejected": -0.00042799487709999084, + "step": 103 + }, + { + "epoch": 0.03, + "learning_rate": 3.880597014925373e-07, + "loss": 0.6884, + "step": 104 + }, + { + "epoch": 0.03, + "logps_train/chosen": -90.15403747558594, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -70.99836730957031, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.0037373658269643784, + "rewards_train/margins": -0.005800902843475342, + "rewards_train/rejected": 0.00953826867043972, + "step": 104 + }, + { + "epoch": 0.03, + "logps_train/chosen": -101.05010223388672, + "logps_train/ref_chosen": -101.0, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -96.0403060913086, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.00012723170220851898, + "rewards_train/margins": 0.032711880281567574, + "rewards_train/rejected": -0.03283911198377609, + "step": 105 + }, + { + "epoch": 0.03, + "learning_rate": 3.9552238805970144e-07, + "loss": 0.6871, + "step": 106 + }, + { + "epoch": 0.03, + "logps_train/chosen": -74.55607604980469, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -73.98163604736328, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.0059983693063259125, + "rewards_train/margins": 0.009547839872539043, + "rewards_train/rejected": -0.015546209178864956, + "step": 106 + }, + { + "epoch": 0.03, + "logps_train/chosen": -88.17259216308594, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -104.19441986083984, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.0030534989200532436, + "rewards_train/margins": -0.00641061132773757, + "rewards_train/rejected": 0.009464110247790813, + "step": 107 + }, + { + "epoch": 0.03, + "learning_rate": 4.0298507462686564e-07, + "loss": 0.6928, + "step": 108 + }, + { + "epoch": 0.03, + "logps_train/chosen": -105.41600036621094, + "logps_train/ref_chosen": -105.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -89.28736877441406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.007032524794340134, + "rewards_train/margins": 0.03362088464200497, + "rewards_train/rejected": -0.026588359847664833, + "step": 108 + }, + { + "epoch": 0.03, + "logps_train/chosen": -41.34018325805664, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -41.07966995239258, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.01754409447312355, + "rewards_train/margins": 0.02336266729980707, + "rewards_train/rejected": -0.005818572826683521, + "step": 109 + }, + { + "epoch": 0.03, + "learning_rate": 4.1044776119402984e-07, + "loss": 0.6789, + "step": 110 + }, + { + "epoch": 0.03, + "logps_train/chosen": -76.86898803710938, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -74.01725769042969, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.01905249059200287, + "rewards_train/margins": -0.006267772987484932, + "rewards_train/rejected": -0.012784717604517937, + "step": 110 + }, + { + "epoch": 0.03, + "logps_train/chosen": -86.36917114257812, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -82.92460632324219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.008792471140623093, + "rewards_train/margins": -0.008519744966179132, + "rewards_train/rejected": -0.0002727261744439602, + "step": 111 + }, + { + "epoch": 0.03, + "learning_rate": 4.17910447761194e-07, + "loss": 0.6968, + "step": 112 + }, + { + "epoch": 0.03, + "logps_train/chosen": -52.108375549316406, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -71.63522338867188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.012404504232108593, + "rewards_train/margins": 0.010302287060767412, + "rewards_train/rejected": 0.002102217171341181, + "step": 112 + }, + { + "epoch": 0.03, + "logps_train/chosen": -65.410400390625, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -67.90836334228516, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.003759864717721939, + "rewards_train/margins": 0.0019203806295990944, + "rewards_train/rejected": -0.0056802453473210335, + "step": 113 + }, + { + "epoch": 0.03, + "learning_rate": 4.253731343283582e-07, + "loss": 0.6902, + "step": 114 + }, + { + "epoch": 0.03, + "logps_train/chosen": -64.06271362304688, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -65.58354187011719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.01599377952516079, + "rewards_train/margins": 0.029817133210599422, + "rewards_train/rejected": -0.013823353685438633, + "step": 114 + }, + { + "epoch": 0.03, + "logps_train/chosen": -72.0600814819336, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -61.60381317138672, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.025241797789931297, + "rewards_train/margins": 0.01003727875649929, + "rewards_train/rejected": 0.015204519033432007, + "step": 115 + }, + { + "epoch": 0.03, + "learning_rate": 4.3283582089552234e-07, + "loss": 0.6834, + "step": 116 + }, + { + "epoch": 0.03, + "logps_train/chosen": -53.74980545043945, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -49.438079833984375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.01935528591275215, + "rewards_train/margins": 0.009647496044635773, + "rewards_train/rejected": 0.009707789868116379, + "step": 116 + }, + { + "epoch": 0.03, + "logps_train/chosen": -44.96778869628906, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -56.75, + "logps_train/rejected": -56.949012756347656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.004197769798338413, + "rewards_train/margins": 0.025466148741543293, + "rewards_train/rejected": -0.02126837894320488, + "step": 117 + }, + { + "epoch": 0.03, + "learning_rate": 4.4029850746268654e-07, + "loss": 0.6848, + "step": 118 + }, + { + "epoch": 0.03, + "logps_train/chosen": -25.349895477294922, + "logps_train/ref_chosen": -25.5, + "logps_train/ref_rejected": -27.5, + "logps_train/rejected": -27.3612060546875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.01569410413503647, + "rewards_train/margins": 0.003084232099354267, + "rewards_train/rejected": 0.012609872035682201, + "step": 118 + }, + { + "epoch": 0.03, + "logps_train/chosen": -97.0093994140625, + "logps_train/ref_chosen": -97.5, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -121.2686538696289, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.02640366740524769, + "rewards_train/margins": 0.057956697419285774, + "rewards_train/rejected": -0.031553030014038086, + "step": 119 + }, + { + "epoch": 0.03, + "learning_rate": 4.4776119402985074e-07, + "loss": 0.6782, + "step": 120 + }, + { + "epoch": 0.03, + "logps_train/chosen": -80.8241195678711, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -72.63106536865234, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.007431835867464542, + "rewards_train/margins": 0.06506937090307474, + "rewards_train/rejected": -0.0576375350356102, + "step": 120 + }, + { + "epoch": 0.03, + "logps_train/chosen": -88.81282806396484, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -77.29328155517578, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.01754564791917801, + "rewards_train/margins": 0.024217775091528893, + "rewards_train/rejected": -0.0066721271723508835, + "step": 121 + }, + { + "epoch": 0.03, + "learning_rate": 4.552238805970149e-07, + "loss": 0.6719, + "step": 122 + }, + { + "epoch": 0.03, + "logps_train/chosen": -59.9490966796875, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -59.63578414916992, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.026183905079960823, + "rewards_train/margins": 0.016910739243030548, + "rewards_train/rejected": 0.009273165836930275, + "step": 122 + }, + { + "epoch": 0.03, + "logps_train/chosen": -41.998470306396484, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -55.53712463378906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.011090368032455444, + "rewards_train/margins": 0.02329868171364069, + "rewards_train/rejected": -0.012208313681185246, + "step": 123 + }, + { + "epoch": 0.03, + "learning_rate": 4.626865671641791e-07, + "loss": 0.6833, + "step": 124 + }, + { + "epoch": 0.03, + "logps_train/chosen": -43.328521728515625, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -36.5, + "logps_train/rejected": -36.337039947509766, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.022860581055283546, + "rewards_train/margins": 0.017941400408744812, + "rewards_train/rejected": 0.0049191806465387344, + "step": 124 + }, + { + "epoch": 0.03, + "logps_train/chosen": -67.00100708007812, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -63.06188201904297, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.013180387206375599, + "rewards_train/margins": 0.03597052674740553, + "rewards_train/rejected": -0.02279013954102993, + "step": 125 + }, + { + "epoch": 0.04, + "learning_rate": 4.701492537313433e-07, + "loss": 0.6796, + "step": 126 + }, + { + "epoch": 0.04, + "logps_train/chosen": -21.85276985168457, + "logps_train/ref_chosen": -22.0, + "logps_train/ref_rejected": -5.84375, + "logps_train/rejected": -5.905564308166504, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.013258203864097595, + "rewards_train/margins": 0.019097805954515934, + "rewards_train/rejected": -0.005839602090418339, + "step": 126 + }, + { + "epoch": 0.04, + "logps_train/chosen": -78.06990051269531, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -61.922767639160156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.015666604042053223, + "rewards_train/margins": 0.016537124291062355, + "rewards_train/rejected": -0.0008705202490091324, + "step": 127 + }, + { + "epoch": 0.04, + "learning_rate": 4.776119402985074e-07, + "loss": 0.684, + "step": 128 + }, + { + "epoch": 0.04, + "logps_train/chosen": -54.64369201660156, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -62.104461669921875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.04009862244129181, + "rewards_train/margins": 0.05611103028059006, + "rewards_train/rejected": -0.01601240783929825, + "step": 128 + }, + { + "epoch": 0.04, + "logps_train/chosen": -84.65701293945312, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -79.7423095703125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.027267076075077057, + "rewards_train/margins": 0.01243535429239273, + "rewards_train/rejected": 0.014831721782684326, + "step": 129 + }, + { + "epoch": 0.04, + "learning_rate": 4.850746268656717e-07, + "loss": 0.6764, + "step": 130 + }, + { + "epoch": 0.04, + "logps_train/chosen": -33.27048873901367, + "logps_train/ref_chosen": -33.25, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -49.85738754272461, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.0020977838430553675, + "rewards_train/margins": 0.01118008024059236, + "rewards_train/rejected": -0.013277864083647728, + "step": 130 + }, + { + "epoch": 0.04, + "logps_train/chosen": -41.979339599609375, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -49.16737365722656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.013589856214821339, + "rewards_train/margins": 0.02290544006973505, + "rewards_train/rejected": -0.009315583854913712, + "step": 131 + }, + { + "epoch": 0.04, + "learning_rate": 4.925373134328357e-07, + "loss": 0.6848, + "step": 132 + }, + { + "epoch": 0.04, + "logps_train/chosen": -86.48685455322266, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -89.16781616210938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.000589694594964385, + "rewards_train/margins": 0.02039074362255633, + "rewards_train/rejected": -0.020980438217520714, + "step": 132 + }, + { + "epoch": 0.04, + "logps_train/chosen": -69.70428466796875, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -70.12745666503906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.025470493361353874, + "rewards_train/margins": 0.0425125639885664, + "rewards_train/rejected": -0.017042070627212524, + "step": 133 + }, + { + "epoch": 0.04, + "learning_rate": 5e-07, + "loss": 0.678, + "step": 134 + }, + { + "epoch": 0.04, + "logps_train/chosen": -58.901615142822266, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -68.68511962890625, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.0001215348020195961, + "rewards_train/margins": 0.016546146012842655, + "rewards_train/rejected": -0.01642461121082306, + "step": 134 + }, + { + "epoch": 0.04, + "logps_train/chosen": -62.59575653076172, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -65.994140625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.026752160862088203, + "rewards_train/margins": 0.0330016715452075, + "rewards_train/rejected": -0.006249510683119297, + "step": 135 + }, + { + "epoch": 0.04, + "learning_rate": 4.999992338985357e-07, + "loss": 0.6812, + "step": 136 + }, + { + "epoch": 0.04, + "logps_train/chosen": -53.07342529296875, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -52.71923828125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.0067200311459600925, + "rewards_train/margins": 0.023077736143022776, + "rewards_train/rejected": -0.016357704997062683, + "step": 136 + }, + { + "epoch": 0.04, + "logps_train/chosen": -51.292930603027344, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -50.846290588378906, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.029105406254529953, + "rewards_train/margins": 0.025648584589362144, + "rewards_train/rejected": 0.0034568216651678085, + "step": 137 + }, + { + "epoch": 0.04, + "learning_rate": 4.999969355988384e-07, + "loss": 0.6813, + "step": 138 + }, + { + "epoch": 0.04, + "logps_train/chosen": -66.85665130615234, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -62.703922271728516, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.01667836867272854, + "rewards_train/margins": 0.03902385197579861, + "rewards_train/rejected": -0.02234548330307007, + "step": 138 + }, + { + "epoch": 0.04, + "logps_train/chosen": -70.43873596191406, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -81.95301818847656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.0010485467500984669, + "rewards_train/margins": 0.0041623483411967754, + "rewards_train/rejected": -0.0031138015910983086, + "step": 139 + }, + { + "epoch": 0.04, + "learning_rate": 4.999931051149936e-07, + "loss": 0.6826, + "step": 140 + }, + { + "epoch": 0.04, + "logps_train/chosen": -70.70523834228516, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -74.788818359375, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.013362860307097435, + "rewards_train/margins": 0.032869674265384674, + "rewards_train/rejected": -0.01950681395828724, + "step": 140 + }, + { + "epoch": 0.04, + "logps_train/chosen": -37.05867004394531, + "logps_train/ref_chosen": -37.25, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -50.647212982177734, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.024699414148926735, + "rewards_train/margins": 0.007780121639370918, + "rewards_train/rejected": 0.016919292509555817, + "step": 141 + }, + { + "epoch": 0.04, + "learning_rate": 4.999877424704779e-07, + "loss": 0.6833, + "step": 142 + }, + { + "epoch": 0.04, + "logps_train/chosen": -59.707279205322266, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -81.51679992675781, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.021068911999464035, + "rewards_train/margins": 0.046185821294784546, + "rewards_train/rejected": -0.02511690929532051, + "step": 142 + }, + { + "epoch": 0.04, + "logps_train/chosen": -62.5545768737793, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -51.992942810058594, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.004307818599045277, + "rewards_train/margins": 0.01766448002308607, + "rewards_train/rejected": -0.013356661424040794, + "step": 143 + }, + { + "epoch": 0.04, + "learning_rate": 4.999808476981578e-07, + "loss": 0.6769, + "step": 144 + }, + { + "epoch": 0.04, + "logps_train/chosen": -63.847530364990234, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -59.586456298828125, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.004699907265603542, + "rewards_train/margins": -0.015462937764823437, + "rewards_train/rejected": 0.02016284503042698, + "step": 144 + }, + { + "epoch": 0.04, + "logps_train/chosen": -62.57245635986328, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -62.590755462646484, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.00017600646242499352, + "rewards_train/margins": -0.008717326913028955, + "rewards_train/rejected": 0.008893333375453949, + "step": 145 + }, + { + "epoch": 0.04, + "learning_rate": 4.9997242084029e-07, + "loss": 0.6989, + "step": 146 + }, + { + "epoch": 0.04, + "logps_train/chosen": -74.72541809082031, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -75.61740112304688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.0003094852436333895, + "rewards_train/margins": 0.02454940532334149, + "rewards_train/rejected": -0.0242399200797081, + "step": 146 + }, + { + "epoch": 0.04, + "logps_train/chosen": -33.403953552246094, + "logps_train/ref_chosen": -33.5, + "logps_train/ref_rejected": -29.625, + "logps_train/rejected": -29.620080947875977, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.0037940330803394318, + "rewards_train/margins": 0.00794072262942791, + "rewards_train/rejected": -0.004146689549088478, + "step": 147 + }, + { + "epoch": 0.04, + "learning_rate": 4.999624619485213e-07, + "loss": 0.6852, + "step": 148 + }, + { + "epoch": 0.04, + "logps_train/chosen": -84.8711929321289, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -73.16226196289062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.022255651652812958, + "rewards_train/margins": 0.05723201483488083, + "rewards_train/rejected": -0.03497636318206787, + "step": 148 + }, + { + "epoch": 0.04, + "logps_train/chosen": -89.43215942382812, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -90.07511138916016, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.019625797867774963, + "rewards_train/margins": 0.011316696181893349, + "rewards_train/rejected": 0.008309101685881615, + "step": 149 + }, + { + "epoch": 0.04, + "learning_rate": 4.999509710838877e-07, + "loss": 0.676, + "step": 150 + }, + { + "epoch": 0.04, + "logps_train/chosen": -80.99070739746094, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -56.01921844482422, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.01577312871813774, + "rewards_train/margins": 0.054804813116788864, + "rewards_train/rejected": -0.03903168439865112, + "step": 150 + }, + { + "epoch": 0.04, + "logps_train/chosen": -96.13180541992188, + "logps_train/ref_chosen": -96.5, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -84.63067626953125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.03056936338543892, + "rewards_train/margins": 0.02879274054430425, + "rewards_train/rejected": 0.0017766228411346674, + "step": 151 + }, + { + "epoch": 0.04, + "learning_rate": 4.999379483168148e-07, + "loss": 0.6732, + "step": 152 + }, + { + "epoch": 0.04, + "logps_train/chosen": -76.49810791015625, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -71.49435424804688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.024993985891342163, + "rewards_train/margins": 0.07130471616983414, + "rewards_train/rejected": -0.046310730278491974, + "step": 152 + }, + { + "epoch": 0.04, + "logps_train/chosen": -75.715087890625, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -87.75228118896484, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.009741455316543579, + "rewards_train/margins": 0.007625645026564598, + "rewards_train/rejected": 0.002115810289978981, + "step": 153 + }, + { + "epoch": 0.04, + "learning_rate": 4.999233937271163e-07, + "loss": 0.6737, + "step": 154 + }, + { + "epoch": 0.04, + "logps_train/chosen": -67.73640441894531, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -79.92431640625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.0232345350086689, + "rewards_train/margins": 0.06644811853766441, + "rewards_train/rejected": -0.043213583528995514, + "step": 154 + }, + { + "epoch": 0.04, + "logps_train/chosen": -61.9312629699707, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -67.37472534179688, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.021815136075019836, + "rewards_train/margins": 0.014268056489527225, + "rewards_train/rejected": 0.007547079585492611, + "step": 155 + }, + { + "epoch": 0.04, + "learning_rate": 4.999073074039949e-07, + "loss": 0.6738, + "step": 156 + }, + { + "epoch": 0.04, + "logps_train/chosen": -70.95030212402344, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -80.04240417480469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.044813372194767, + "rewards_train/margins": 0.06311655044555664, + "rewards_train/rejected": -0.018303178250789642, + "step": 156 + }, + { + "epoch": 0.04, + "logps_train/chosen": -66.55253601074219, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -77.7836685180664, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.025996200740337372, + "rewards_train/margins": 0.07428490370512009, + "rewards_train/rejected": -0.048288702964782715, + "step": 157 + }, + { + "epoch": 0.04, + "learning_rate": 4.998896894460405e-07, + "loss": 0.6609, + "step": 158 + }, + { + "epoch": 0.04, + "logps_train/chosen": -54.9752311706543, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -70.57875061035156, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.0013316869735717773, + "rewards_train/margins": 0.010840454138815403, + "rewards_train/rejected": -0.01217214111238718, + "step": 158 + }, + { + "epoch": 0.04, + "logps_train/chosen": -47.71549987792969, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -56.431739807128906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.05091074854135513, + "rewards_train/margins": 0.07377275638282299, + "rewards_train/rejected": -0.022862007841467857, + "step": 159 + }, + { + "epoch": 0.04, + "learning_rate": 4.998705399612302e-07, + "loss": 0.6734, + "step": 160 + }, + { + "epoch": 0.04, + "logps_train/chosen": -63.73102951049805, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -79.51908111572266, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.01244371011853218, + "rewards_train/margins": 0.020601927302777767, + "rewards_train/rejected": -0.008158217184245586, + "step": 160 + }, + { + "epoch": 0.04, + "logps_train/chosen": -53.09801483154297, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -55.18875503540039, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.03355775773525238, + "rewards_train/margins": 0.03407386876642704, + "rewards_train/rejected": -0.0005161110311746597, + "step": 161 + }, + { + "epoch": 0.05, + "learning_rate": 4.998498590669277e-07, + "loss": 0.6805, + "step": 162 + }, + { + "epoch": 0.05, + "logps_train/chosen": -38.73344421386719, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -45.12217330932617, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.026655657216906548, + "rewards_train/margins": 0.025982449762523174, + "rewards_train/rejected": 0.0006732074543833733, + "step": 162 + }, + { + "epoch": 0.05, + "logps_train/chosen": -45.36915588378906, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -37.75, + "logps_train/rejected": -38.06843948364258, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.016209345310926437, + "rewards_train/margins": 0.03594367578625679, + "rewards_train/rejected": -0.019734330475330353, + "step": 163 + }, + { + "epoch": 0.05, + "learning_rate": 4.998276468898822e-07, + "loss": 0.6786, + "step": 164 + }, + { + "epoch": 0.05, + "logps_train/chosen": -73.39313507080078, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -82.00084686279297, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.011321071535348892, + "rewards_train/margins": 0.06414046883583069, + "rewards_train/rejected": -0.052819397300481796, + "step": 164 + }, + { + "epoch": 0.05, + "logps_train/chosen": -69.62464141845703, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -72.80537414550781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.05394202470779419, + "rewards_train/margins": 0.07510420121252537, + "rewards_train/rejected": -0.02116217650473118, + "step": 165 + }, + { + "epoch": 0.05, + "learning_rate": 4.99803903566228e-07, + "loss": 0.66, + "step": 166 + }, + { + "epoch": 0.05, + "logps_train/chosen": -34.17655563354492, + "logps_train/ref_chosen": -34.5, + "logps_train/ref_rejected": -34.75, + "logps_train/rejected": -35.15182876586914, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.021407097578048706, + "rewards_train/margins": 0.05534002184867859, + "rewards_train/rejected": -0.03393292427062988, + "step": 166 + }, + { + "epoch": 0.05, + "logps_train/chosen": -68.01045989990234, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -73.16434478759766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.05207864195108414, + "rewards_train/margins": 0.0966385267674923, + "rewards_train/rejected": -0.04455988481640816, + "step": 167 + }, + { + "epoch": 0.05, + "learning_rate": 4.997786292414834e-07, + "loss": 0.6576, + "step": 168 + }, + { + "epoch": 0.05, + "logps_train/chosen": -22.09493637084961, + "logps_train/ref_chosen": -22.25, + "logps_train/ref_rejected": -28.375, + "logps_train/rejected": -28.5806827545166, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.015311000868678093, + "rewards_train/margins": 0.03900415636599064, + "rewards_train/rejected": -0.023693155497312546, + "step": 168 + }, + { + "epoch": 0.05, + "logps_train/chosen": -103.13506317138672, + "logps_train/ref_chosen": -103.5, + "logps_train/ref_rejected": -110.5, + "logps_train/rejected": -110.44572448730469, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.03766520321369171, + "rewards_train/margins": 0.0269642174243927, + "rewards_train/rejected": 0.010700985789299011, + "step": 169 + }, + { + "epoch": 0.05, + "learning_rate": 4.997518240705501e-07, + "loss": 0.6786, + "step": 170 + }, + { + "epoch": 0.05, + "logps_train/chosen": -68.88632202148438, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -77.92741394042969, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.006650085560977459, + "rewards_train/margins": 0.03765397239476442, + "rewards_train/rejected": -0.04430405795574188, + "step": 170 + }, + { + "epoch": 0.05, + "logps_train/chosen": -73.45475769042969, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -83.04298400878906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.006086999084800482, + "rewards_train/margins": 0.09085501125082374, + "rewards_train/rejected": -0.08476801216602325, + "step": 171 + }, + { + "epoch": 0.05, + "learning_rate": 4.99723488217712e-07, + "loss": 0.6623, + "step": 172 + }, + { + "epoch": 0.05, + "logps_train/chosen": -75.52366638183594, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -71.59895324707031, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.01169600524008274, + "rewards_train/margins": 0.03565373457968235, + "rewards_train/rejected": -0.02395772933959961, + "step": 172 + }, + { + "epoch": 0.05, + "logps_train/chosen": -87.9087905883789, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -81.4451904296875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.011191445402801037, + "rewards_train/margins": 0.03371789027005434, + "rewards_train/rejected": -0.04490933567285538, + "step": 173 + }, + { + "epoch": 0.05, + "learning_rate": 4.996936218566339e-07, + "loss": 0.6769, + "step": 174 + }, + { + "epoch": 0.05, + "logps_train/chosen": -87.11066436767578, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -98.43775177001953, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.055828407406806946, + "rewards_train/margins": 0.10761188715696335, + "rewards_train/rejected": -0.0517834797501564, + "step": 174 + }, + { + "epoch": 0.05, + "logps_train/chosen": -67.88616943359375, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -60.13406753540039, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.05044550448656082, + "rewards_train/margins": 0.08357885107398033, + "rewards_train/rejected": -0.03313334658741951, + "step": 175 + }, + { + "epoch": 0.05, + "learning_rate": 4.996622251703612e-07, + "loss": 0.6478, + "step": 176 + }, + { + "epoch": 0.05, + "logps_train/chosen": -42.33866882324219, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -58.38355255126953, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.0025344383902847767, + "rewards_train/margins": 0.04460058035328984, + "rewards_train/rejected": -0.042066141963005066, + "step": 176 + }, + { + "epoch": 0.05, + "logps_train/chosen": -44.17629623413086, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -43.33050537109375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.025827443227171898, + "rewards_train/margins": 0.05087011493742466, + "rewards_train/rejected": -0.025042671710252762, + "step": 177 + }, + { + "epoch": 0.05, + "learning_rate": 4.996292983513186e-07, + "loss": 0.6706, + "step": 178 + }, + { + "epoch": 0.05, + "logps_train/chosen": -61.72535705566406, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -72.10646057128906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.033714115619659424, + "rewards_train/margins": 0.04436032846570015, + "rewards_train/rejected": -0.010646212846040726, + "step": 178 + }, + { + "epoch": 0.05, + "logps_train/chosen": -32.68050003051758, + "logps_train/ref_chosen": -32.75, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -46.875099182128906, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.010270203463733196, + "rewards_train/margins": 0.0329365162178874, + "rewards_train/rejected": -0.022666312754154205, + "step": 179 + }, + { + "epoch": 0.05, + "learning_rate": 4.99594841601308e-07, + "loss": 0.675, + "step": 180 + }, + { + "epoch": 0.05, + "logps_train/chosen": -26.811080932617188, + "logps_train/ref_chosen": -26.75, + "logps_train/ref_rejected": -23.875, + "logps_train/rejected": -23.95778465270996, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.0036788261495530605, + "rewards_train/margins": 0.007834318559616804, + "rewards_train/rejected": -0.011513144709169865, + "step": 180 + }, + { + "epoch": 0.05, + "logps_train/chosen": -72.8066177368164, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -73.80039978027344, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.028542205691337585, + "rewards_train/margins": 0.046252816915512085, + "rewards_train/rejected": -0.0177106112241745, + "step": 181 + }, + { + "epoch": 0.05, + "learning_rate": 4.995588551315086e-07, + "loss": 0.6809, + "step": 182 + }, + { + "epoch": 0.05, + "logps_train/chosen": -52.16310119628906, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -79.22138214111328, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.03623219206929207, + "rewards_train/margins": 0.011687222868204117, + "rewards_train/rejected": -0.047919414937496185, + "step": 182 + }, + { + "epoch": 0.05, + "logps_train/chosen": -84.68603515625, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -87.58245849609375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.03999017924070358, + "rewards_train/margins": 0.05604833923280239, + "rewards_train/rejected": -0.01605815999209881, + "step": 183 + }, + { + "epoch": 0.05, + "learning_rate": 4.995213391624745e-07, + "loss": 0.6773, + "step": 184 + }, + { + "epoch": 0.05, + "logps_train/chosen": -77.65924072265625, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -106.0, + "logps_train/rejected": -106.4644775390625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.003998493775725365, + "rewards_train/margins": 0.06392284296452999, + "rewards_train/rejected": -0.059924349188804626, + "step": 184 + }, + { + "epoch": 0.05, + "logps_train/chosen": -64.07603454589844, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -72.57796478271484, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.016615189611911774, + "rewards_train/margins": 0.041013384237885475, + "rewards_train/rejected": -0.0243981946259737, + "step": 185 + }, + { + "epoch": 0.05, + "learning_rate": 4.994822939241341e-07, + "loss": 0.6671, + "step": 186 + }, + { + "epoch": 0.05, + "logps_train/chosen": -81.4778060913086, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -82.53826904296875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.030202491208910942, + "rewards_train/margins": 0.1001871544867754, + "rewards_train/rejected": -0.13038964569568634, + "step": 186 + }, + { + "epoch": 0.05, + "logps_train/chosen": -56.910369873046875, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -58.9641227722168, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.0221464354544878, + "rewards_train/margins": 0.08301190473139286, + "rewards_train/rejected": -0.06086546927690506, + "step": 187 + }, + { + "epoch": 0.05, + "learning_rate": 4.994417196557883e-07, + "loss": 0.6515, + "step": 188 + }, + { + "epoch": 0.05, + "logps_train/chosen": -98.9520263671875, + "logps_train/ref_chosen": -100.0, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -98.87950134277344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.08292235434055328, + "rewards_train/margins": 0.1208723932504654, + "rewards_train/rejected": -0.03795003890991211, + "step": 188 + }, + { + "epoch": 0.05, + "logps_train/chosen": -41.69307327270508, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -48.171234130859375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.025809943675994873, + "rewards_train/margins": 0.03160558361560106, + "rewards_train/rejected": -0.00579563993960619, + "step": 189 + }, + { + "epoch": 0.05, + "learning_rate": 4.993996166061091e-07, + "loss": 0.657, + "step": 190 + }, + { + "epoch": 0.05, + "logps_train/chosen": -67.85884094238281, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -84.90817260742188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.04145927354693413, + "rewards_train/margins": 0.17368273064494133, + "rewards_train/rejected": -0.1322234570980072, + "step": 190 + }, + { + "epoch": 0.05, + "logps_train/chosen": -74.41243743896484, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -71.4466323852539, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.032650504261255264, + "rewards_train/margins": 0.019825126975774765, + "rewards_train/rejected": -0.05247563123703003, + "step": 191 + }, + { + "epoch": 0.05, + "learning_rate": 4.993559850331383e-07, + "loss": 0.6496, + "step": 192 + }, + { + "epoch": 0.05, + "logps_train/chosen": -73.65574645996094, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -72.97344207763672, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.03657417371869087, + "rewards_train/margins": 0.08010983094573021, + "rewards_train/rejected": -0.04353565722703934, + "step": 192 + }, + { + "epoch": 0.05, + "logps_train/chosen": -81.60496520996094, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -68.50057983398438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.06127751246094704, + "rewards_train/margins": 0.005968581885099411, + "rewards_train/rejected": -0.06724609434604645, + "step": 193 + }, + { + "epoch": 0.05, + "learning_rate": 4.993108252042853e-07, + "loss": 0.6731, + "step": 194 + }, + { + "epoch": 0.05, + "logps_train/chosen": -70.02510070800781, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -77.16969299316406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.017744647338986397, + "rewards_train/margins": 0.013287629932165146, + "rewards_train/rejected": -0.031032277271151543, + "step": 194 + }, + { + "epoch": 0.05, + "logps_train/chosen": -29.98483657836914, + "logps_train/ref_chosen": -30.125, + "logps_train/ref_rejected": -33.0, + "logps_train/rejected": -33.236244201660156, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.018703678622841835, + "rewards_train/margins": 0.033734289929270744, + "rewards_train/rejected": -0.01503061130642891, + "step": 195 + }, + { + "epoch": 0.05, + "learning_rate": 4.992641373963267e-07, + "loss": 0.6824, + "step": 196 + }, + { + "epoch": 0.05, + "logps_train/chosen": -23.881338119506836, + "logps_train/ref_chosen": -23.875, + "logps_train/ref_rejected": -32.25, + "logps_train/rejected": -32.399864196777344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.004418111406266689, + "rewards_train/margins": 0.009591526351869106, + "rewards_train/rejected": -0.014009637758135796, + "step": 196 + }, + { + "epoch": 0.06, + "logps_train/chosen": -60.460777282714844, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -72.92411804199219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.03865598142147064, + "rewards_train/margins": 0.012349851429462433, + "rewards_train/rejected": -0.051005832850933075, + "step": 197 + }, + { + "epoch": 0.06, + "learning_rate": 4.992159218954027e-07, + "loss": 0.6882, + "step": 198 + }, + { + "epoch": 0.06, + "logps_train/chosen": -59.503578186035156, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -68.3902587890625, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.002767347265034914, + "rewards_train/margins": 0.03583629475906491, + "rewards_train/rejected": -0.03306894749403, + "step": 198 + }, + { + "epoch": 0.06, + "logps_train/chosen": -75.33126831054688, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -76.33258056640625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.06867402046918869, + "rewards_train/margins": 0.026302769780158997, + "rewards_train/rejected": -0.09497679024934769, + "step": 199 + }, + { + "epoch": 0.06, + "learning_rate": 4.991661789970175e-07, + "loss": 0.679, + "step": 200 + }, + { + "epoch": 0.06, + "logps_train/chosen": -62.34615707397461, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -64.10647583007812, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.03335317596793175, + "rewards_train/margins": 0.017437901347875595, + "rewards_train/rejected": 0.015915274620056152, + "step": 200 + }, + { + "epoch": 0.06, + "logps_train/chosen": -32.947509765625, + "logps_train/ref_chosen": -33.25, + "logps_train/ref_rejected": -36.75, + "logps_train/rejected": -37.5911750793457, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.03171394020318985, + "rewards_train/margins": 0.11231562495231628, + "rewards_train/rejected": -0.08060168474912643, + "step": 201 + }, + { + "epoch": 0.06, + "learning_rate": 4.991149090060357e-07, + "loss": 0.666, + "step": 202 + }, + { + "epoch": 0.06, + "logps_train/chosen": -52.979347229003906, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -66.29036712646484, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.053188204765319824, + "rewards_train/margins": 0.1179671660065651, + "rewards_train/rejected": -0.06477896124124527, + "step": 202 + }, + { + "epoch": 0.06, + "logps_train/chosen": -40.81906509399414, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -40.9730339050293, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.010866811498999596, + "rewards_train/margins": 0.08453742228448391, + "rewards_train/rejected": -0.07367061078548431, + "step": 203 + }, + { + "epoch": 0.06, + "learning_rate": 4.990621122366814e-07, + "loss": 0.6465, + "step": 204 + }, + { + "epoch": 0.06, + "logps_train/chosen": -72.531982421875, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -81.16976165771484, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.0594484768807888, + "rewards_train/margins": -0.012833558022975922, + "rewards_train/rejected": -0.04661491885781288, + "step": 204 + }, + { + "epoch": 0.06, + "logps_train/chosen": -60.44779968261719, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -65.13034057617188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.017329489812254906, + "rewards_train/margins": 0.1112231370061636, + "rewards_train/rejected": -0.09389364719390869, + "step": 205 + }, + { + "epoch": 0.06, + "learning_rate": 4.990077890125363e-07, + "loss": 0.6729, + "step": 206 + }, + { + "epoch": 0.06, + "logps_train/chosen": -68.99644470214844, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -71.91361999511719, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.013706970028579235, + "rewards_train/margins": 0.046600774861872196, + "rewards_train/rejected": -0.06030774489045143, + "step": 206 + }, + { + "epoch": 0.06, + "logps_train/chosen": -80.01974487304688, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -83.99603271484375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.03044714406132698, + "rewards_train/margins": 0.10719845816493034, + "rewards_train/rejected": -0.07675131410360336, + "step": 207 + }, + { + "epoch": 0.06, + "learning_rate": 4.98951939666537e-07, + "loss": 0.6567, + "step": 208 + }, + { + "epoch": 0.06, + "logps_train/chosen": -59.470733642578125, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -60.334495544433594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.00019833073019981384, + "rewards_train/margins": 0.05414910987019539, + "rewards_train/rejected": -0.0543474406003952, + "step": 208 + }, + { + "epoch": 0.06, + "logps_train/chosen": -60.944374084472656, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -69.1351089477539, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.0071250684559345245, + "rewards_train/margins": 0.053448259830474854, + "rewards_train/rejected": -0.04632319137454033, + "step": 209 + }, + { + "epoch": 0.06, + "learning_rate": 4.988945645409737e-07, + "loss": 0.6679, + "step": 210 + }, + { + "epoch": 0.06, + "logps_train/chosen": -74.97129821777344, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -78.39900970458984, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.08724522590637207, + "rewards_train/margins": 0.17402145266532898, + "rewards_train/rejected": -0.08677622675895691, + "step": 210 + }, + { + "epoch": 0.06, + "logps_train/chosen": -37.18720245361328, + "logps_train/ref_chosen": -37.0, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -43.60913848876953, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.01207956112921238, + "rewards_train/margins": -0.010540693998336792, + "rewards_train/rejected": -0.0015388671308755875, + "step": 211 + }, + { + "epoch": 0.06, + "learning_rate": 4.988356639874877e-07, + "loss": 0.6595, + "step": 212 + }, + { + "epoch": 0.06, + "logps_train/chosen": -60.066497802734375, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -69.7305908203125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.06444393843412399, + "rewards_train/margins": 0.08906573243439198, + "rewards_train/rejected": -0.024621794000267982, + "step": 212 + }, + { + "epoch": 0.06, + "logps_train/chosen": -39.71125030517578, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -45.317726135253906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.0685858279466629, + "rewards_train/margins": 0.07217095792293549, + "rewards_train/rejected": -0.1407567858695984, + "step": 213 + }, + { + "epoch": 0.06, + "learning_rate": 4.987752383670694e-07, + "loss": 0.6567, + "step": 214 + }, + { + "epoch": 0.06, + "logps_train/chosen": -102.42160034179688, + "logps_train/ref_chosen": -102.5, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -99.19660949707031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.018777890130877495, + "rewards_train/margins": 0.07906324602663517, + "rewards_train/rejected": -0.060285355895757675, + "step": 214 + }, + { + "epoch": 0.06, + "logps_train/chosen": -74.23822784423828, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -64.2943344116211, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.024702059105038643, + "rewards_train/margins": 0.09457517974078655, + "rewards_train/rejected": -0.1192772388458252, + "step": 215 + }, + { + "epoch": 0.06, + "learning_rate": 4.987132880500561e-07, + "loss": 0.6532, + "step": 216 + }, + { + "epoch": 0.06, + "logps_train/chosen": -100.21539306640625, + "logps_train/ref_chosen": -100.5, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -87.05062866210938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.04408545047044754, + "rewards_train/margins": 0.20149267464876175, + "rewards_train/rejected": -0.1574072241783142, + "step": 216 + }, + { + "epoch": 0.06, + "logps_train/chosen": -73.68387603759766, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -69.42728424072266, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.059794262051582336, + "rewards_train/margins": 0.052856430411338806, + "rewards_train/rejected": -0.11265069246292114, + "step": 217 + }, + { + "epoch": 0.06, + "learning_rate": 4.986498134161296e-07, + "loss": 0.6343, + "step": 218 + }, + { + "epoch": 0.06, + "logps_train/chosen": -87.93561553955078, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -100.5460205078125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.04434296116232872, + "rewards_train/margins": 0.11963364854454994, + "rewards_train/rejected": -0.16397660970687866, + "step": 218 + }, + { + "epoch": 0.06, + "logps_train/chosen": -76.35858154296875, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -74.37849426269531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.0811704471707344, + "rewards_train/margins": 0.09535088390111923, + "rewards_train/rejected": -0.17652133107185364, + "step": 219 + }, + { + "epoch": 0.06, + "learning_rate": 4.98584814854314e-07, + "loss": 0.6434, + "step": 220 + }, + { + "epoch": 0.06, + "logps_train/chosen": -70.59697723388672, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -84.3460922241211, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.04680757224559784, + "rewards_train/margins": 0.09483274817466736, + "rewards_train/rejected": -0.1416403204202652, + "step": 220 + }, + { + "epoch": 0.06, + "logps_train/chosen": -44.067588806152344, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -45.22989273071289, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.010818934068083763, + "rewards_train/margins": 0.036151934415102005, + "rewards_train/rejected": -0.025333000347018242, + "step": 221 + }, + { + "epoch": 0.06, + "learning_rate": 4.985182927629732e-07, + "loss": 0.6628, + "step": 222 + }, + { + "epoch": 0.06, + "logps_train/chosen": -59.0699577331543, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -52.7005500793457, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.019300485029816628, + "rewards_train/margins": 0.09919190965592861, + "rewards_train/rejected": -0.11849239468574524, + "step": 222 + }, + { + "epoch": 0.06, + "logps_train/chosen": -76.71279907226562, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -89.87239837646484, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.05411018803715706, + "rewards_train/margins": 0.15736595168709755, + "rewards_train/rejected": -0.10325576364994049, + "step": 223 + }, + { + "epoch": 0.06, + "learning_rate": 4.984502475498087e-07, + "loss": 0.6353, + "step": 224 + }, + { + "epoch": 0.06, + "logps_train/chosen": -61.84689712524414, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -64.61875915527344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.06618925929069519, + "rewards_train/margins": 0.12904177606105804, + "rewards_train/rejected": -0.06285251677036285, + "step": 224 + }, + { + "epoch": 0.06, + "logps_train/chosen": -37.708656311035156, + "logps_train/ref_chosen": -36.75, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -38.8215217590332, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.0836707204580307, + "rewards_train/margins": 0.047309741377830505, + "rewards_train/rejected": -0.1309804618358612, + "step": 225 + }, + { + "epoch": 0.06, + "learning_rate": 4.983806796318566e-07, + "loss": 0.6547, + "step": 226 + }, + { + "epoch": 0.06, + "logps_train/chosen": -35.12958526611328, + "logps_train/ref_chosen": -35.25, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -40.443180084228516, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.012431969866156578, + "rewards_train/margins": 0.029650233685970306, + "rewards_train/rejected": -0.01721826381981373, + "step": 226 + }, + { + "epoch": 0.06, + "logps_train/chosen": -65.00015258789062, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -63.44427490234375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.01639087125658989, + "rewards_train/margins": 0.09441191330552101, + "rewards_train/rejected": -0.07802104204893112, + "step": 227 + }, + { + "epoch": 0.06, + "learning_rate": 4.983095894354857e-07, + "loss": 0.6635, + "step": 228 + }, + { + "epoch": 0.06, + "logps_train/chosen": -69.90158081054688, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -77.40280151367188, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.015212714672088623, + "rewards_train/margins": 0.08293427526950836, + "rewards_train/rejected": -0.06772156059741974, + "step": 228 + }, + { + "epoch": 0.06, + "logps_train/chosen": -72.91961669921875, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -76.48533630371094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.06702268868684769, + "rewards_train/margins": 0.11711882799863815, + "rewards_train/rejected": -0.050096139311790466, + "step": 229 + }, + { + "epoch": 0.06, + "learning_rate": 4.982369773963945e-07, + "loss": 0.6479, + "step": 230 + }, + { + "epoch": 0.06, + "logps_train/chosen": -64.00090026855469, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -51.01182556152344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.006941433064639568, + "rewards_train/margins": 0.0829286826774478, + "rewards_train/rejected": -0.07598724961280823, + "step": 230 + }, + { + "epoch": 0.06, + "logps_train/chosen": -55.70654296875, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -74.71240234375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.019970979541540146, + "rewards_train/margins": 0.1654302142560482, + "rewards_train/rejected": -0.14545923471450806, + "step": 231 + }, + { + "epoch": 0.06, + "learning_rate": 4.981628439596082e-07, + "loss": 0.6359, + "step": 232 + }, + { + "epoch": 0.06, + "logps_train/chosen": -73.67041015625, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -93.5, + "logps_train/rejected": -96.10404205322266, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.03256802633404732, + "rewards_train/margins": 0.2894815690815449, + "rewards_train/rejected": -0.25691354274749756, + "step": 232 + }, + { + "epoch": 0.07, + "logps_train/chosen": -70.38328552246094, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -93.5, + "logps_train/rejected": -96.76373291015625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.04887603595852852, + "rewards_train/margins": 0.2696845047175884, + "rewards_train/rejected": -0.31856054067611694, + "step": 233 + }, + { + "epoch": 0.07, + "learning_rate": 4.980871895794771e-07, + "loss": 0.5744, + "step": 234 + }, + { + "epoch": 0.07, + "logps_train/chosen": -45.762168884277344, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -55.0173454284668, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.02378300204873085, + "rewards_train/margins": -0.015107657760381699, + "rewards_train/rejected": 0.03889065980911255, + "step": 234 + }, + { + "epoch": 0.07, + "logps_train/chosen": -49.5078239440918, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -44.71027374267578, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.08339961618185043, + "rewards_train/margins": 0.07788651436567307, + "rewards_train/rejected": -0.1612861305475235, + "step": 235 + }, + { + "epoch": 0.07, + "learning_rate": 4.980100147196724e-07, + "loss": 0.6802, + "step": 236 + }, + { + "epoch": 0.07, + "logps_train/chosen": -39.74016571044922, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -35.5, + "logps_train/rejected": -36.35296630859375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.0015552621334791183, + "rewards_train/margins": 0.07983522303402424, + "rewards_train/rejected": -0.08139048516750336, + "step": 236 + }, + { + "epoch": 0.07, + "logps_train/chosen": -74.76644134521484, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -72.96209716796875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.015413643792271614, + "rewards_train/margins": 0.07229947857558727, + "rewards_train/rejected": -0.08771312236785889, + "step": 237 + }, + { + "epoch": 0.07, + "learning_rate": 4.979313198531843e-07, + "loss": 0.6575, + "step": 238 + }, + { + "epoch": 0.07, + "logps_train/chosen": -63.63264083862305, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -70.26762390136719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.08458728343248367, + "rewards_train/margins": 0.1047093328088522, + "rewards_train/rejected": -0.020122049376368523, + "step": 238 + }, + { + "epoch": 0.07, + "logps_train/chosen": -60.88203811645508, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -57.22417449951172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.011836709454655647, + "rewards_train/margins": 0.0781101118773222, + "rewards_train/rejected": -0.08994682133197784, + "step": 239 + }, + { + "epoch": 0.07, + "learning_rate": 4.978511054623189e-07, + "loss": 0.65, + "step": 240 + }, + { + "epoch": 0.07, + "logps_train/chosen": -73.17652130126953, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -86.74775695800781, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.041332535445690155, + "rewards_train/margins": 0.07313960790634155, + "rewards_train/rejected": -0.0318070724606514, + "step": 240 + }, + { + "epoch": 0.07, + "logps_train/chosen": -17.81582260131836, + "logps_train/ref_chosen": -17.875, + "logps_train/ref_rejected": -22.25, + "logps_train/rejected": -23.40126609802246, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.005038968753069639, + "rewards_train/margins": 0.11733342660591006, + "rewards_train/rejected": -0.11229445785284042, + "step": 241 + }, + { + "epoch": 0.07, + "learning_rate": 4.97769372038695e-07, + "loss": 0.6524, + "step": 242 + }, + { + "epoch": 0.07, + "logps_train/chosen": -78.91831970214844, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -67.61292266845703, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.10123417526483536, + "rewards_train/margins": 0.1330341175198555, + "rewards_train/rejected": -0.03179994225502014, + "step": 242 + }, + { + "epoch": 0.07, + "logps_train/chosen": -66.00843048095703, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -74.28964233398438, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.01685863733291626, + "rewards_train/margins": 0.031246401369571686, + "rewards_train/rejected": -0.048105038702487946, + "step": 243 + }, + { + "epoch": 0.07, + "learning_rate": 4.976861200832414e-07, + "loss": 0.656, + "step": 244 + }, + { + "epoch": 0.07, + "logps_train/chosen": -39.82799530029297, + "logps_train/ref_chosen": -40.25, + "logps_train/ref_rejected": -35.0, + "logps_train/rejected": -35.4915771484375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.04122381657361984, + "rewards_train/margins": 0.08457089215517044, + "rewards_train/rejected": -0.0433470755815506, + "step": 244 + }, + { + "epoch": 0.07, + "logps_train/chosen": -45.840084075927734, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -36.0, + "logps_train/rejected": -36.725608825683594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.04528848081827164, + "rewards_train/margins": 0.1261502429842949, + "rewards_train/rejected": -0.08086176216602325, + "step": 245 + }, + { + "epoch": 0.07, + "learning_rate": 4.976013501061937e-07, + "loss": 0.6443, + "step": 246 + }, + { + "epoch": 0.07, + "logps_train/chosen": -33.44615173339844, + "logps_train/ref_chosen": -32.5, + "logps_train/ref_rejected": -44.5, + "logps_train/rejected": -46.146392822265625, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.10545528680086136, + "rewards_train/margins": 0.04668416827917099, + "rewards_train/rejected": -0.15213945508003235, + "step": 246 + }, + { + "epoch": 0.07, + "logps_train/chosen": -85.43083190917969, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -101.01116943359375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.05519186705350876, + "rewards_train/margins": 0.1408468261361122, + "rewards_train/rejected": -0.19603869318962097, + "step": 247 + }, + { + "epoch": 0.07, + "learning_rate": 4.975150626270911e-07, + "loss": 0.6505, + "step": 248 + }, + { + "epoch": 0.07, + "logps_train/chosen": -74.1454086303711, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -79.21577453613281, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.05557667464017868, + "rewards_train/margins": 0.18652979284524918, + "rewards_train/rejected": -0.1309531182050705, + "step": 248 + }, + { + "epoch": 0.07, + "logps_train/chosen": -65.63245391845703, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -76.48052978515625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.09722961485385895, + "rewards_train/margins": 0.06556923687458038, + "rewards_train/rejected": -0.16279885172843933, + "step": 249 + }, + { + "epoch": 0.07, + "learning_rate": 4.974272581747734e-07, + "loss": 0.6403, + "step": 250 + }, + { + "epoch": 0.07, + "logps_train/chosen": -125.66022491455078, + "logps_train/ref_chosen": -124.5, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -115.13243103027344, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.11914749443531036, + "rewards_train/margins": -0.0027793869376182556, + "rewards_train/rejected": -0.11636810749769211, + "step": 250 + }, + { + "epoch": 0.07, + "logps_train/chosen": -34.89984130859375, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -32.25, + "logps_train/rejected": -32.87923049926758, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.07290676981210709, + "rewards_train/margins": 0.14266571402549744, + "rewards_train/rejected": -0.06975894421339035, + "step": 251 + }, + { + "epoch": 0.07, + "learning_rate": 4.973379372873774e-07, + "loss": 0.6682, + "step": 252 + }, + { + "epoch": 0.07, + "logps_train/chosen": -54.228675842285156, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -55.161293029785156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.02830410562455654, + "rewards_train/margins": 0.16044876538217068, + "rewards_train/rejected": -0.13214465975761414, + "step": 252 + }, + { + "epoch": 0.07, + "logps_train/chosen": -96.91339874267578, + "logps_train/ref_chosen": -96.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -101.10609436035156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.09382425993680954, + "rewards_train/margins": 0.12551989406347275, + "rewards_train/rejected": -0.2193441540002823, + "step": 253 + }, + { + "epoch": 0.07, + "learning_rate": 4.97247100512334e-07, + "loss": 0.6293, + "step": 254 + }, + { + "epoch": 0.07, + "logps_train/chosen": -65.65286254882812, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -83.21630096435547, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.1160673275589943, + "rewards_train/margins": 0.30956657975912094, + "rewards_train/rejected": -0.42563390731811523, + "step": 254 + }, + { + "epoch": 0.07, + "logps_train/chosen": -61.75201416015625, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -62.04127883911133, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.016399795189499855, + "rewards_train/margins": 0.20919951237738132, + "rewards_train/rejected": -0.19279971718788147, + "step": 255 + }, + { + "epoch": 0.07, + "learning_rate": 4.971547484063648e-07, + "loss": 0.5835, + "step": 256 + }, + { + "epoch": 0.07, + "logps_train/chosen": -58.18988037109375, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -63.04719543457031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.018511950969696045, + "rewards_train/margins": 0.17791859805583954, + "rewards_train/rejected": -0.1594066470861435, + "step": 256 + }, + { + "epoch": 0.07, + "logps_train/chosen": -54.394195556640625, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -54.88017654418945, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.032455600798130035, + "rewards_train/margins": 0.08062924444675446, + "rewards_train/rejected": -0.04817364364862442, + "step": 257 + }, + { + "epoch": 0.07, + "learning_rate": 4.970608815354785e-07, + "loss": 0.634, + "step": 258 + }, + { + "epoch": 0.07, + "logps_train/chosen": -77.7452392578125, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -103.17654418945312, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.09405508637428284, + "rewards_train/margins": 0.15172407031059265, + "rewards_train/rejected": -0.2457791566848755, + "step": 258 + }, + { + "epoch": 0.07, + "logps_train/chosen": -12.239175796508789, + "logps_train/ref_chosen": -12.0625, + "logps_train/ref_rejected": -10.125, + "logps_train/rejected": -10.303321838378906, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.017667582258582115, + "rewards_train/margins": -0.0015444457530975342, + "rewards_train/rejected": -0.01612313650548458, + "step": 259 + }, + { + "epoch": 0.07, + "learning_rate": 4.969655004749673e-07, + "loss": 0.6598, + "step": 260 + }, + { + "epoch": 0.07, + "logps_train/chosen": -58.5343132019043, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -63.327796936035156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.07266954332590103, + "rewards_train/margins": 0.009719453752040863, + "rewards_train/rejected": -0.0823889970779419, + "step": 260 + }, + { + "epoch": 0.07, + "logps_train/chosen": -36.77006149291992, + "logps_train/ref_chosen": -37.25, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -44.284690856933594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.04052316024899483, + "rewards_train/margins": 0.1600080542266369, + "rewards_train/rejected": -0.11948489397764206, + "step": 261 + }, + { + "epoch": 0.07, + "learning_rate": 4.96868605809404e-07, + "loss": 0.6581, + "step": 262 + }, + { + "epoch": 0.07, + "logps_train/chosen": -45.656219482421875, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -43.79771423339844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.06055013835430145, + "rewards_train/margins": 0.14735297858715057, + "rewards_train/rejected": -0.08680284023284912, + "step": 262 + }, + { + "epoch": 0.07, + "logps_train/chosen": -77.39680480957031, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -65.07958984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.06546133756637573, + "rewards_train/margins": 0.20655959844589233, + "rewards_train/rejected": -0.27202093601226807, + "step": 263 + }, + { + "epoch": 0.07, + "learning_rate": 4.967701981326376e-07, + "loss": 0.6144, + "step": 264 + }, + { + "epoch": 0.07, + "logps_train/chosen": -52.240684509277344, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -63.36178207397461, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.007962603121995926, + "rewards_train/margins": 0.042383063584566116, + "rewards_train/rejected": -0.03442046046257019, + "step": 264 + }, + { + "epoch": 0.07, + "logps_train/chosen": -38.47343826293945, + "logps_train/ref_chosen": -37.75, + "logps_train/ref_rejected": -37.25, + "logps_train/rejected": -39.063697814941406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.07917973399162292, + "rewards_train/margins": 0.10121335089206696, + "rewards_train/rejected": -0.18039308488368988, + "step": 265 + }, + { + "epoch": 0.07, + "learning_rate": 4.966702780477901e-07, + "loss": 0.6603, + "step": 266 + }, + { + "epoch": 0.07, + "logps_train/chosen": -51.64854431152344, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -60.975067138671875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.0629773736000061, + "rewards_train/margins": 0.1491558775305748, + "rewards_train/rejected": -0.0861785039305687, + "step": 266 + }, + { + "epoch": 0.07, + "logps_train/chosen": -68.49447631835938, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -69.90038299560547, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.05686892941594124, + "rewards_train/margins": 0.07618732377886772, + "rewards_train/rejected": -0.13305625319480896, + "step": 267 + }, + { + "epoch": 0.07, + "learning_rate": 4.965688461672532e-07, + "loss": 0.6429, + "step": 268 + }, + { + "epoch": 0.07, + "logps_train/chosen": -80.12190246582031, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -71.27377319335938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.032893478870391846, + "rewards_train/margins": 0.10581210255622864, + "rewards_train/rejected": -0.13870558142662048, + "step": 268 + }, + { + "epoch": 0.08, + "logps_train/chosen": -70.51513671875, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -88.27554321289062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.02465803734958172, + "rewards_train/margins": 0.27164660580456257, + "rewards_train/rejected": -0.2963046431541443, + "step": 269 + }, + { + "epoch": 0.08, + "learning_rate": 4.964659031126836e-07, + "loss": 0.6145, + "step": 270 + }, + { + "epoch": 0.08, + "logps_train/chosen": -55.628578186035156, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -68.12471008300781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.08304059505462646, + "rewards_train/margins": 0.21445679664611816, + "rewards_train/rejected": -0.1314162015914917, + "step": 270 + }, + { + "epoch": 0.08, + "logps_train/chosen": -66.52167510986328, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -71.10826110839844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.017754392698407173, + "rewards_train/margins": 0.1602208036929369, + "rewards_train/rejected": -0.14246641099452972, + "step": 271 + }, + { + "epoch": 0.08, + "learning_rate": 4.963614495149999e-07, + "loss": 0.6084, + "step": 272 + }, + { + "epoch": 0.08, + "logps_train/chosen": -46.289825439453125, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -48.580875396728516, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.10007592290639877, + "rewards_train/margins": -0.013765774667263031, + "rewards_train/rejected": -0.08631014823913574, + "step": 272 + }, + { + "epoch": 0.08, + "logps_train/chosen": -20.95589828491211, + "logps_train/ref_chosen": -20.875, + "logps_train/ref_rejected": -32.0, + "logps_train/rejected": -33.178314208984375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.006576208863407373, + "rewards_train/margins": 0.10148985078558326, + "rewards_train/rejected": -0.10806605964899063, + "step": 273 + }, + { + "epoch": 0.08, + "learning_rate": 4.962554860143786e-07, + "loss": 0.6745, + "step": 274 + }, + { + "epoch": 0.08, + "logps_train/chosen": -26.65819549560547, + "logps_train/ref_chosen": -26.625, + "logps_train/ref_rejected": -42.75, + "logps_train/rejected": -44.0958366394043, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.0029288511723279953, + "rewards_train/margins": 0.12711374647915363, + "rewards_train/rejected": -0.13004259765148163, + "step": 274 + }, + { + "epoch": 0.08, + "logps_train/chosen": -68.20321655273438, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -55.19052505493164, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.03594617545604706, + "rewards_train/margins": 0.08369223028421402, + "rewards_train/rejected": -0.11963840574026108, + "step": 275 + }, + { + "epoch": 0.08, + "learning_rate": 4.9614801326025e-07, + "loss": 0.6465, + "step": 276 + }, + { + "epoch": 0.08, + "logps_train/chosen": -37.916133880615234, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -31.375, + "logps_train/rejected": -33.3668098449707, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.002136575523763895, + "rewards_train/margins": 0.19584876531735063, + "rewards_train/rejected": -0.19371218979358673, + "step": 276 + }, + { + "epoch": 0.08, + "logps_train/chosen": -106.4279556274414, + "logps_train/ref_chosen": -105.0, + "logps_train/ref_rejected": -100.0, + "logps_train/rejected": -104.23807525634766, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.12677955627441406, + "rewards_train/margins": 0.32046520709991455, + "rewards_train/rejected": -0.4472447633743286, + "step": 277 + }, + { + "epoch": 0.08, + "learning_rate": 4.960390319112945e-07, + "loss": 0.5839, + "step": 278 + }, + { + "epoch": 0.08, + "logps_train/chosen": -67.5575942993164, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -85.05168151855469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.03388417139649391, + "rewards_train/margins": 0.10136232897639275, + "rewards_train/rejected": -0.13524650037288666, + "step": 278 + }, + { + "epoch": 0.08, + "logps_train/chosen": -50.628753662109375, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -59.204593658447266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.0066254232078790665, + "rewards_train/margins": 0.20289664529263973, + "rewards_train/rejected": -0.2095220685005188, + "step": 279 + }, + { + "epoch": 0.08, + "learning_rate": 4.959285426354381e-07, + "loss": 0.6273, + "step": 280 + }, + { + "epoch": 0.08, + "logps_train/chosen": -59.6366081237793, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -63.785099029541016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.06610220670700073, + "rewards_train/margins": 0.1313529908657074, + "rewards_train/rejected": -0.19745519757270813, + "step": 280 + }, + { + "epoch": 0.08, + "logps_train/chosen": -45.673583984375, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -57.671844482421875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.021899394690990448, + "rewards_train/margins": 0.21134945005178452, + "rewards_train/rejected": -0.18945005536079407, + "step": 281 + }, + { + "epoch": 0.08, + "learning_rate": 4.958165461098487e-07, + "loss": 0.6204, + "step": 282 + }, + { + "epoch": 0.08, + "logps_train/chosen": -47.9825553894043, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -50.561790466308594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.021766599267721176, + "rewards_train/margins": 0.04613130912184715, + "rewards_train/rejected": -0.06789790838956833, + "step": 282 + }, + { + "epoch": 0.08, + "logps_train/chosen": -23.648746490478516, + "logps_train/ref_chosen": -23.375, + "logps_train/ref_rejected": -16.75, + "logps_train/rejected": -17.18783187866211, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.02698415145277977, + "rewards_train/margins": 0.017580367624759674, + "rewards_train/rejected": -0.044564519077539444, + "step": 283 + }, + { + "epoch": 0.08, + "learning_rate": 4.957030430209321e-07, + "loss": 0.679, + "step": 284 + }, + { + "epoch": 0.08, + "logps_train/chosen": -54.955718994140625, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -54.25, + "logps_train/rejected": -54.96346664428711, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.016342464834451675, + "rewards_train/margins": 0.08407585695385933, + "rewards_train/rejected": -0.06773339211940765, + "step": 284 + }, + { + "epoch": 0.08, + "logps_train/chosen": -51.135501861572266, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -58.292991638183594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.07956597208976746, + "rewards_train/margins": 0.2786390483379364, + "rewards_train/rejected": -0.35820502042770386, + "step": 285 + }, + { + "epoch": 0.08, + "learning_rate": 4.955880340643274e-07, + "loss": 0.6148, + "step": 286 + }, + { + "epoch": 0.08, + "logps_train/chosen": -47.648345947265625, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -62.818199157714844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.06387611478567123, + "rewards_train/margins": 0.22928988188505173, + "rewards_train/rejected": -0.1654137670993805, + "step": 286 + }, + { + "epoch": 0.08, + "logps_train/chosen": -49.7845458984375, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -75.52261352539062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.05282023176550865, + "rewards_train/margins": 0.1923610307276249, + "rewards_train/rejected": -0.24518126249313354, + "step": 287 + }, + { + "epoch": 0.08, + "learning_rate": 4.954715199449026e-07, + "loss": 0.5995, + "step": 288 + }, + { + "epoch": 0.08, + "logps_train/chosen": -27.164216995239258, + "logps_train/ref_chosen": -27.375, + "logps_train/ref_rejected": -29.75, + "logps_train/rejected": -30.333599090576172, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.016781475394964218, + "rewards_train/margins": 0.07279767468571663, + "rewards_train/rejected": -0.05601619929075241, + "step": 288 + }, + { + "epoch": 0.08, + "logps_train/chosen": -62.513511657714844, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -77.88041687011719, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.03540075942873955, + "rewards_train/margins": 0.2973349802196026, + "rewards_train/rejected": -0.26193422079086304, + "step": 289 + }, + { + "epoch": 0.08, + "learning_rate": 4.95353501376751e-07, + "loss": 0.615, + "step": 290 + }, + { + "epoch": 0.08, + "logps_train/chosen": -59.4865837097168, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -62.95096969604492, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.1172129362821579, + "rewards_train/margins": 0.037844955921173096, + "rewards_train/rejected": -0.155057892203331, + "step": 290 + }, + { + "epoch": 0.08, + "logps_train/chosen": -105.95268249511719, + "logps_train/ref_chosen": -105.5, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -115.05427551269531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.06558090448379517, + "rewards_train/margins": 0.15039342641830444, + "rewards_train/rejected": -0.2159743309020996, + "step": 291 + }, + { + "epoch": 0.08, + "learning_rate": 4.952339790831861e-07, + "loss": 0.6507, + "step": 292 + }, + { + "epoch": 0.08, + "logps_train/chosen": -59.823951721191406, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -64.58686065673828, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.038547221571207047, + "rewards_train/margins": 0.1658419780433178, + "rewards_train/rejected": -0.20438919961452484, + "step": 292 + }, + { + "epoch": 0.08, + "logps_train/chosen": -56.13823699951172, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -68.07988739013672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.0319875106215477, + "rewards_train/margins": 0.21579626947641373, + "rewards_train/rejected": -0.24778378009796143, + "step": 293 + }, + { + "epoch": 0.08, + "learning_rate": 4.951129537967377e-07, + "loss": 0.6119, + "step": 294 + }, + { + "epoch": 0.08, + "logps_train/chosen": -49.520912170410156, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -33.75, + "logps_train/rejected": -35.58644485473633, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.007088212296366692, + "rewards_train/margins": 0.19073268584907055, + "rewards_train/rejected": -0.18364447355270386, + "step": 294 + }, + { + "epoch": 0.08, + "logps_train/chosen": -67.98999786376953, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -70.90355682373047, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.003296487033367157, + "rewards_train/margins": 0.15151257067918777, + "rewards_train/rejected": -0.15480905771255493, + "step": 295 + }, + { + "epoch": 0.08, + "learning_rate": 4.949904262591467e-07, + "loss": 0.6205, + "step": 296 + }, + { + "epoch": 0.08, + "logps_train/chosen": -52.28907012939453, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -51.18754196166992, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.02148505672812462, + "rewards_train/margins": 0.1144569106400013, + "rewards_train/rejected": -0.13594196736812592, + "step": 296 + }, + { + "epoch": 0.08, + "logps_train/chosen": -34.808326721191406, + "logps_train/ref_chosen": -34.25, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -37.986000061035156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.0535435788333416, + "rewards_train/margins": 0.17733753100037575, + "rewards_train/rejected": -0.23088110983371735, + "step": 297 + }, + { + "epoch": 0.08, + "learning_rate": 4.948663972213615e-07, + "loss": 0.6307, + "step": 298 + }, + { + "epoch": 0.08, + "logps_train/chosen": -79.06289672851562, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -70.95683288574219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.05082162469625473, + "rewards_train/margins": 0.30560410767793655, + "rewards_train/rejected": -0.3564257323741913, + "step": 298 + }, + { + "epoch": 0.08, + "logps_train/chosen": -33.221107482910156, + "logps_train/ref_chosen": -33.0, + "logps_train/ref_rejected": -21.875, + "logps_train/rejected": -22.736474990844727, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.022855589166283607, + "rewards_train/margins": 0.05798187665641308, + "rewards_train/rejected": -0.08083746582269669, + "step": 299 + }, + { + "epoch": 0.08, + "learning_rate": 4.947408674435326e-07, + "loss": 0.614, + "step": 300 + }, + { + "epoch": 0.08, + "logps_train/chosen": -94.10527038574219, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -107.86579895019531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.010526798665523529, + "rewards_train/margins": 0.3217567577958107, + "rewards_train/rejected": -0.33228355646133423, + "step": 300 + }, + { + "epoch": 0.08, + "logps_train/chosen": -48.57219696044922, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -63.75008010864258, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.009186375886201859, + "rewards_train/margins": 0.26476090028882027, + "rewards_train/rejected": -0.2555745244026184, + "step": 301 + }, + { + "epoch": 0.08, + "learning_rate": 4.946138376950086e-07, + "loss": 0.5724, + "step": 302 + }, + { + "epoch": 0.08, + "logps_train/chosen": -45.28582763671875, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -62.547607421875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.019989222288131714, + "rewards_train/margins": 0.10234978795051575, + "rewards_train/rejected": -0.12233901023864746, + "step": 302 + }, + { + "epoch": 0.08, + "logps_train/chosen": -87.74296569824219, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -81.56104278564453, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.03542996197938919, + "rewards_train/margins": 0.23336955159902573, + "rewards_train/rejected": -0.2687995135784149, + "step": 303 + }, + { + "epoch": 0.08, + "learning_rate": 4.944853087543306e-07, + "loss": 0.6248, + "step": 304 + }, + { + "epoch": 0.08, + "logps_train/chosen": -62.05071258544922, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -68.63275909423828, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.0007883422076702118, + "rewards_train/margins": 0.0510755218565464, + "rewards_train/rejected": -0.05028717964887619, + "step": 304 + }, + { + "epoch": 0.09, + "logps_train/chosen": -84.78604125976562, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -95.69252014160156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.18797874450683594, + "rewards_train/margins": 0.39611655473709106, + "rewards_train/rejected": -0.584095299243927, + "step": 305 + }, + { + "epoch": 0.09, + "learning_rate": 4.943552814092287e-07, + "loss": 0.6015, + "step": 306 + }, + { + "epoch": 0.09, + "logps_train/chosen": -66.4135971069336, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -101.26895141601562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.1567896604537964, + "rewards_train/margins": 0.3613162040710449, + "rewards_train/rejected": -0.5181058645248413, + "step": 306 + }, + { + "epoch": 0.09, + "logps_train/chosen": -64.56097412109375, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -69.8199691772461, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.17797234654426575, + "rewards_train/margins": 0.08468848466873169, + "rewards_train/rejected": -0.26266083121299744, + "step": 307 + }, + { + "epoch": 0.09, + "learning_rate": 4.942237564566155e-07, + "loss": 0.6147, + "step": 308 + }, + { + "epoch": 0.09, + "logps_train/chosen": -95.0046157836914, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -112.35191345214844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.0785864070057869, + "rewards_train/margins": 0.27848007529973984, + "rewards_train/rejected": -0.35706648230552673, + "step": 308 + }, + { + "epoch": 0.09, + "logps_train/chosen": -44.23193359375, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -59.32194137573242, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.020947443321347237, + "rewards_train/margins": 0.12267252989113331, + "rewards_train/rejected": -0.10172508656978607, + "step": 309 + }, + { + "epoch": 0.09, + "learning_rate": 4.94090734702583e-07, + "loss": 0.6114, + "step": 310 + }, + { + "epoch": 0.09, + "logps_train/chosen": -88.3235092163086, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -87.81587219238281, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.035227496176958084, + "rewards_train/margins": 0.3707207925617695, + "rewards_train/rejected": -0.3354932963848114, + "step": 310 + }, + { + "epoch": 0.09, + "logps_train/chosen": -108.91020202636719, + "logps_train/ref_chosen": -107.5, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -118.88829040527344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.12227039039134979, + "rewards_train/margins": 0.19155924022197723, + "rewards_train/rejected": -0.313829630613327, + "step": 311 + }, + { + "epoch": 0.09, + "learning_rate": 4.939562169623964e-07, + "loss": 0.5769, + "step": 312 + }, + { + "epoch": 0.09, + "logps_train/chosen": -65.96340942382812, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -63.5, + "logps_train/rejected": -67.19998168945312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.021529756486415863, + "rewards_train/margins": 0.39426176995038986, + "rewards_train/rejected": -0.372732013463974, + "step": 312 + }, + { + "epoch": 0.09, + "logps_train/chosen": -62.58346176147461, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -65.02446746826172, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.12126119434833527, + "rewards_train/margins": 0.1448579579591751, + "rewards_train/rejected": -0.2661191523075104, + "step": 313 + }, + { + "epoch": 0.09, + "learning_rate": 4.938202040604898e-07, + "loss": 0.589, + "step": 314 + }, + { + "epoch": 0.09, + "logps_train/chosen": -63.953369140625, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -85.4097671508789, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.13088364899158478, + "rewards_train/margins": 0.24603049457073212, + "rewards_train/rejected": -0.3769141435623169, + "step": 314 + }, + { + "epoch": 0.09, + "logps_train/chosen": -92.89512634277344, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -95.0, + "logps_train/rejected": -101.70654296875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.20591847598552704, + "rewards_train/margins": 0.47957928478717804, + "rewards_train/rejected": -0.6854977607727051, + "step": 315 + }, + { + "epoch": 0.09, + "learning_rate": 4.936826968304603e-07, + "loss": 0.5441, + "step": 316 + }, + { + "epoch": 0.09, + "logps_train/chosen": -94.77378845214844, + "logps_train/ref_chosen": -92.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -100.09722137451172, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.2539416551589966, + "rewards_train/margins": 0.1714058220386505, + "rewards_train/rejected": -0.4253474771976471, + "step": 316 + }, + { + "epoch": 0.09, + "logps_train/chosen": -82.51100158691406, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -81.36175537109375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3784438371658325, + "rewards_train/margins": 0.017107129096984863, + "rewards_train/rejected": -0.3955509662628174, + "step": 317 + }, + { + "epoch": 0.09, + "learning_rate": 4.935436961150639e-07, + "loss": 0.6562, + "step": 318 + }, + { + "epoch": 0.09, + "logps_train/chosen": -41.34101486206055, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -44.5, + "logps_train/rejected": -45.80457305908203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.046601392328739166, + "rewards_train/margins": 0.07291863113641739, + "rewards_train/rejected": -0.11952002346515656, + "step": 318 + }, + { + "epoch": 0.09, + "logps_train/chosen": -38.202938079833984, + "logps_train/ref_chosen": -38.75, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -49.329376220703125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.06486262381076813, + "rewards_train/margins": 0.13608145713806152, + "rewards_train/rejected": -0.0712188333272934, + "step": 319 + }, + { + "epoch": 0.09, + "learning_rate": 4.934032027662101e-07, + "loss": 0.6493, + "step": 320 + }, + { + "epoch": 0.09, + "logps_train/chosen": -37.03022766113281, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -47.34885787963867, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.12919463217258453, + "rewards_train/margins": 0.05471464991569519, + "rewards_train/rejected": -0.18390928208827972, + "step": 320 + }, + { + "epoch": 0.09, + "logps_train/chosen": -67.58390045166016, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -67.95511627197266, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.29706206917762756, + "rewards_train/margins": 0.11837100982666016, + "rewards_train/rejected": -0.4154330790042877, + "step": 321 + }, + { + "epoch": 0.09, + "learning_rate": 4.932612176449559e-07, + "loss": 0.6543, + "step": 322 + }, + { + "epoch": 0.09, + "logps_train/chosen": -28.910926818847656, + "logps_train/ref_chosen": -29.625, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -52.609092712402344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.0702354684472084, + "rewards_train/margins": 0.15204314142465591, + "rewards_train/rejected": -0.08180767297744751, + "step": 322 + }, + { + "epoch": 0.09, + "logps_train/chosen": -53.39405822753906, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -51.8156852722168, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.06371934711933136, + "rewards_train/margins": 0.2859126329421997, + "rewards_train/rejected": -0.22219328582286835, + "step": 323 + }, + { + "epoch": 0.09, + "learning_rate": 4.931177416215015e-07, + "loss": 0.5955, + "step": 324 + }, + { + "epoch": 0.09, + "logps_train/chosen": -69.10913848876953, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -73.91200256347656, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.12517178058624268, + "rewards_train/margins": 0.17325466871261597, + "rewards_train/rejected": -0.29842644929885864, + "step": 324 + }, + { + "epoch": 0.09, + "logps_train/chosen": -27.53668785095215, + "logps_train/ref_chosen": -27.875, + "logps_train/ref_rejected": -32.75, + "logps_train/rejected": -33.74585723876953, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.03175593167543411, + "rewards_train/margins": 0.12196669727563858, + "rewards_train/rejected": -0.09021076560020447, + "step": 325 + }, + { + "epoch": 0.09, + "learning_rate": 4.929727755751845e-07, + "loss": 0.6329, + "step": 326 + }, + { + "epoch": 0.09, + "logps_train/chosen": -78.49880981445312, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -82.13140869140625, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.12956848740577698, + "rewards_train/margins": 0.08318158984184265, + "rewards_train/rejected": -0.21275007724761963, + "step": 326 + }, + { + "epoch": 0.09, + "logps_train/chosen": -55.102596282958984, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -63.5, + "logps_train/rejected": -68.35601806640625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.11397083103656769, + "rewards_train/margins": 0.36076705157756805, + "rewards_train/rejected": -0.47473788261413574, + "step": 327 + }, + { + "epoch": 0.09, + "learning_rate": 4.928263203944743e-07, + "loss": 0.6162, + "step": 328 + }, + { + "epoch": 0.09, + "logps_train/chosen": -79.43524169921875, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -139.0, + "logps_train/rejected": -142.58819580078125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2659849226474762, + "rewards_train/margins": 0.13189652562141418, + "rewards_train/rejected": -0.3978814482688904, + "step": 328 + }, + { + "epoch": 0.09, + "logps_train/chosen": -109.83356475830078, + "logps_train/ref_chosen": -109.5, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -78.52315521240234, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.04702841490507126, + "rewards_train/margins": 0.15997455269098282, + "rewards_train/rejected": -0.20700296759605408, + "step": 329 + }, + { + "epoch": 0.09, + "learning_rate": 4.926783769769671e-07, + "loss": 0.6316, + "step": 330 + }, + { + "epoch": 0.09, + "logps_train/chosen": -79.29072570800781, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -69.26737976074219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.018582772463560104, + "rewards_train/margins": 0.0820390097796917, + "rewards_train/rejected": -0.06345623731613159, + "step": 330 + }, + { + "epoch": 0.09, + "logps_train/chosen": -74.45913696289062, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -82.04327392578125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.13341361284255981, + "rewards_train/margins": 0.2119290828704834, + "rewards_train/rejected": -0.3453426957130432, + "step": 331 + }, + { + "epoch": 0.09, + "learning_rate": 4.925289462293807e-07, + "loss": 0.6314, + "step": 332 + }, + { + "epoch": 0.09, + "logps_train/chosen": -78.71355438232422, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -93.92138671875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.10963641852140427, + "rewards_train/margins": 0.38992414623498917, + "rewards_train/rejected": -0.49956056475639343, + "step": 332 + }, + { + "epoch": 0.09, + "logps_train/chosen": -49.360774993896484, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -59.33673095703125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.0645933449268341, + "rewards_train/margins": 0.16575965285301208, + "rewards_train/rejected": -0.2303529977798462, + "step": 333 + }, + { + "epoch": 0.09, + "learning_rate": 4.923780290675475e-07, + "loss": 0.5829, + "step": 334 + }, + { + "epoch": 0.09, + "logps_train/chosen": -36.918785095214844, + "logps_train/ref_chosen": -37.25, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -74.87519836425781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.03780872002243996, + "rewards_train/margins": 0.24407846108078957, + "rewards_train/rejected": -0.2062697410583496, + "step": 334 + }, + { + "epoch": 0.09, + "logps_train/chosen": -60.549774169921875, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -63.61624526977539, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.0905240923166275, + "rewards_train/margins": 0.12559273838996887, + "rewards_train/rejected": -0.21611683070659637, + "step": 335 + }, + { + "epoch": 0.09, + "learning_rate": 4.922256264164105e-07, + "loss": 0.6111, + "step": 336 + }, + { + "epoch": 0.09, + "logps_train/chosen": -49.96208190917969, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -69.14873504638672, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.08068116009235382, + "rewards_train/margins": 0.1533328741788864, + "rewards_train/rejected": -0.23401403427124023, + "step": 336 + }, + { + "epoch": 0.09, + "logps_train/chosen": -75.82608032226562, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -96.15160369873047, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.0439356230199337, + "rewards_train/margins": 0.25169334188103676, + "rewards_train/rejected": -0.29562896490097046, + "step": 337 + }, + { + "epoch": 0.09, + "learning_rate": 4.92071739210017e-07, + "loss": 0.6082, + "step": 338 + }, + { + "epoch": 0.09, + "logps_train/chosen": -70.35716247558594, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -74.14249420166016, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.2540755271911621, + "rewards_train/margins": -0.005450919270515442, + "rewards_train/rejected": -0.24862460792064667, + "step": 338 + }, + { + "epoch": 0.09, + "logps_train/chosen": -31.80899429321289, + "logps_train/ref_chosen": -31.75, + "logps_train/ref_rejected": -29.125, + "logps_train/rejected": -30.475730895996094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.008633816614747047, + "rewards_train/margins": 0.12116583250463009, + "rewards_train/rejected": -0.12979964911937714, + "step": 339 + }, + { + "epoch": 0.1, + "learning_rate": 4.919163683915127e-07, + "loss": 0.6779, + "step": 340 + }, + { + "epoch": 0.1, + "logps_train/chosen": -23.782291412353516, + "logps_train/ref_chosen": -22.375, + "logps_train/ref_rejected": -25.25, + "logps_train/rejected": -26.922977447509766, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.13618811964988708, + "rewards_train/margins": 0.02671496570110321, + "rewards_train/rejected": -0.1629030853509903, + "step": 340 + }, + { + "epoch": 0.1, + "logps_train/chosen": -79.93000793457031, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -76.40746307373047, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.21702466905117035, + "rewards_train/margins": 0.04422964155673981, + "rewards_train/rejected": -0.26125431060791016, + "step": 341 + }, + { + "epoch": 0.1, + "learning_rate": 4.91759514913136e-07, + "loss": 0.6793, + "step": 342 + }, + { + "epoch": 0.1, + "logps_train/chosen": -63.07942199707031, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -61.959495544433594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.06868436187505722, + "rewards_train/margins": 0.17960888892412186, + "rewards_train/rejected": -0.24829325079917908, + "step": 342 + }, + { + "epoch": 0.1, + "logps_train/chosen": -68.63347625732422, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -74.12576293945312, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.10494876652956009, + "rewards_train/margins": 0.557627372443676, + "rewards_train/rejected": -0.6625761389732361, + "step": 343 + }, + { + "epoch": 0.1, + "learning_rate": 4.916011797362123e-07, + "loss": 0.5576, + "step": 344 + }, + { + "epoch": 0.1, + "logps_train/chosen": -49.114723205566406, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -45.75, + "logps_train/rejected": -47.364410400390625, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.040378522127866745, + "rewards_train/margins": 0.11871876195073128, + "rewards_train/rejected": -0.15909728407859802, + "step": 344 + }, + { + "epoch": 0.1, + "logps_train/chosen": -22.078798294067383, + "logps_train/ref_chosen": -22.125, + "logps_train/ref_rejected": -27.125, + "logps_train/rejected": -27.537137985229492, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -6.734486669301987e-05, + "rewards_train/margins": 0.046224416233599186, + "rewards_train/rejected": -0.046291761100292206, + "step": 345 + }, + { + "epoch": 0.1, + "learning_rate": 4.914413638311482e-07, + "loss": 0.6558, + "step": 346 + }, + { + "epoch": 0.1, + "logps_train/chosen": -71.45697021484375, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -68.04414367675781, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.2310480773448944, + "rewards_train/margins": -0.025852814316749573, + "rewards_train/rejected": -0.20519526302814484, + "step": 346 + }, + { + "epoch": 0.1, + "logps_train/chosen": -60.23451614379883, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -76.00108337402344, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.05177208036184311, + "rewards_train/margins": 0.26786769181489944, + "rewards_train/rejected": -0.31963977217674255, + "step": 347 + }, + { + "epoch": 0.1, + "learning_rate": 4.912800681774253e-07, + "loss": 0.6695, + "step": 348 + }, + { + "epoch": 0.1, + "logps_train/chosen": -55.41716766357422, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -52.72454833984375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.20265452563762665, + "rewards_train/margins": 0.09440977871417999, + "rewards_train/rejected": -0.29706430435180664, + "step": 348 + }, + { + "epoch": 0.1, + "logps_train/chosen": -87.39714050292969, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -87.9417724609375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2334636151790619, + "rewards_train/margins": 0.24196359515190125, + "rewards_train/rejected": -0.47542721033096313, + "step": 349 + }, + { + "epoch": 0.1, + "learning_rate": 4.911172937635942e-07, + "loss": 0.622, + "step": 350 + }, + { + "epoch": 0.1, + "logps_train/chosen": -73.04360961914062, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -73.84121704101562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.14186100661754608, + "rewards_train/margins": 0.2610108405351639, + "rewards_train/rejected": -0.40287184715270996, + "step": 350 + }, + { + "epoch": 0.1, + "logps_train/chosen": -81.9167709350586, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -85.91574096679688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.026052236557006836, + "rewards_train/margins": 0.25341176986694336, + "rewards_train/rejected": -0.2794640064239502, + "step": 351 + }, + { + "epoch": 0.1, + "learning_rate": 4.909530415872685e-07, + "loss": 0.5867, + "step": 352 + }, + { + "epoch": 0.1, + "logps_train/chosen": -39.090415954589844, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -53.72549057006836, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.030021198093891144, + "rewards_train/margins": 0.2693668082356453, + "rewards_train/rejected": -0.23934561014175415, + "step": 352 + }, + { + "epoch": 0.1, + "logps_train/chosen": -63.3445930480957, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -72.13990783691406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.11004532873630524, + "rewards_train/margins": 0.36100123822689056, + "rewards_train/rejected": -0.4710465669631958, + "step": 353 + }, + { + "epoch": 0.1, + "learning_rate": 4.907873126551192e-07, + "loss": 0.5637, + "step": 354 + }, + { + "epoch": 0.1, + "logps_train/chosen": -65.56784057617188, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -39.25541305541992, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.0036589615046977997, + "rewards_train/margins": 0.18203851953148842, + "rewards_train/rejected": -0.18569748103618622, + "step": 354 + }, + { + "epoch": 0.1, + "logps_train/chosen": -64.6380615234375, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -92.01697540283203, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.020056378096342087, + "rewards_train/margins": 0.2699230797588825, + "rewards_train/rejected": -0.2899794578552246, + "step": 355 + }, + { + "epoch": 0.1, + "learning_rate": 4.906201079828676e-07, + "loss": 0.5964, + "step": 356 + }, + { + "epoch": 0.1, + "logps_train/chosen": -73.18698120117188, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -76.12444305419922, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.005715329200029373, + "rewards_train/margins": 0.10956589505076408, + "rewards_train/rejected": -0.10385056585073471, + "step": 356 + }, + { + "epoch": 0.1, + "logps_train/chosen": -60.72767639160156, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -68.98823547363281, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.2723768949508667, + "rewards_train/margins": 0.040704816579818726, + "rewards_train/rejected": -0.3130817115306854, + "step": 357 + }, + { + "epoch": 0.1, + "learning_rate": 4.904514285952794e-07, + "loss": 0.6646, + "step": 358 + }, + { + "epoch": 0.1, + "logps_train/chosen": -61.012474060058594, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -65.67412567138672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.016135694459080696, + "rewards_train/margins": 0.6413607615977526, + "rewards_train/rejected": -0.6252250671386719, + "step": 358 + }, + { + "epoch": 0.1, + "logps_train/chosen": -50.726600646972656, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -52.99345397949219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.0968787670135498, + "rewards_train/margins": 0.16848242282867432, + "rewards_train/rejected": -0.2653611898422241, + "step": 359 + }, + { + "epoch": 0.1, + "learning_rate": 4.902812755261591e-07, + "loss": 0.5466, + "step": 360 + }, + { + "epoch": 0.1, + "logps_train/chosen": -58.21422576904297, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -55.1922607421875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.04739930480718613, + "rewards_train/margins": 0.25253967195749283, + "rewards_train/rejected": -0.29993897676467896, + "step": 360 + }, + { + "epoch": 0.1, + "logps_train/chosen": -76.33403778076172, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -51.67599105834961, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.021684732288122177, + "rewards_train/margins": 0.2488439716398716, + "rewards_train/rejected": -0.2705287039279938, + "step": 361 + }, + { + "epoch": 0.1, + "learning_rate": 4.901096498183428e-07, + "loss": 0.5973, + "step": 362 + }, + { + "epoch": 0.1, + "logps_train/chosen": -27.518657684326172, + "logps_train/ref_chosen": -26.875, + "logps_train/ref_rejected": -24.75, + "logps_train/rejected": -26.936294555664062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.062070704996585846, + "rewards_train/margins": 0.15226199477910995, + "rewards_train/rejected": -0.2143326997756958, + "step": 362 + }, + { + "epoch": 0.1, + "logps_train/chosen": -35.827030181884766, + "logps_train/ref_chosen": -36.75, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -56.23878479003906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.0893670916557312, + "rewards_train/margins": 0.6168099045753479, + "rewards_train/rejected": -0.5274428129196167, + "step": 363 + }, + { + "epoch": 0.1, + "learning_rate": 4.899365525236921e-07, + "loss": 0.5601, + "step": 364 + }, + { + "epoch": 0.1, + "logps_train/chosen": -77.91535186767578, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -85.88685607910156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.0022149011492729187, + "rewards_train/margins": 0.5287915393710136, + "rewards_train/rejected": -0.5265766382217407, + "step": 364 + }, + { + "epoch": 0.1, + "logps_train/chosen": -94.54623413085938, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -96.4705581665039, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.29993563890457153, + "rewards_train/margins": 0.3475103974342346, + "rewards_train/rejected": -0.6474460363388062, + "step": 365 + }, + { + "epoch": 0.1, + "learning_rate": 4.897619847030876e-07, + "loss": 0.5368, + "step": 366 + }, + { + "epoch": 0.1, + "logps_train/chosen": -100.95994567871094, + "logps_train/ref_chosen": -97.0, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -100.17292785644531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.4110339879989624, + "rewards_train/margins": 0.18516451120376587, + "rewards_train/rejected": -0.5961984992027283, + "step": 366 + }, + { + "epoch": 0.1, + "logps_train/chosen": -63.71215057373047, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -68.8509521484375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.13527780771255493, + "rewards_train/margins": 0.23848959803581238, + "rewards_train/rejected": -0.3737674057483673, + "step": 367 + }, + { + "epoch": 0.1, + "learning_rate": 4.895859474264228e-07, + "loss": 0.6032, + "step": 368 + }, + { + "epoch": 0.1, + "logps_train/chosen": -61.28565979003906, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -51.18120574951172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.037550076842308044, + "rewards_train/margins": 0.20063889026641846, + "rewards_train/rejected": -0.2381889671087265, + "step": 368 + }, + { + "epoch": 0.1, + "logps_train/chosen": -68.31848907470703, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -82.45947265625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.033238910138607025, + "rewards_train/margins": 0.3307482525706291, + "rewards_train/rejected": -0.2975093424320221, + "step": 369 + }, + { + "epoch": 0.1, + "learning_rate": 4.894084417725969e-07, + "loss": 0.5941, + "step": 370 + }, + { + "epoch": 0.1, + "logps_train/chosen": -108.39409637451172, + "logps_train/ref_chosen": -105.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -108.1373291015625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.35815951228141785, + "rewards_train/margins": 0.03682336211204529, + "rewards_train/rejected": -0.39498287439346313, + "step": 370 + }, + { + "epoch": 0.1, + "logps_train/chosen": -45.12579345703125, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -50.849544525146484, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.01613139919936657, + "rewards_train/margins": 0.44864421896636486, + "rewards_train/rejected": -0.4325128197669983, + "step": 371 + }, + { + "epoch": 0.1, + "learning_rate": 4.892294688295088e-07, + "loss": 0.6145, + "step": 372 + }, + { + "epoch": 0.1, + "logps_train/chosen": -62.0494270324707, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -86.26089477539062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.1879505217075348, + "rewards_train/margins": 0.37935033440589905, + "rewards_train/rejected": -0.5673008561134338, + "step": 372 + }, + { + "epoch": 0.1, + "logps_train/chosen": -99.60758209228516, + "logps_train/ref_chosen": -94.5, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -99.30949401855469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5310704112052917, + "rewards_train/margins": 0.18523025512695312, + "rewards_train/rejected": -0.7163006663322449, + "step": 373 + }, + { + "epoch": 0.1, + "learning_rate": 4.890490296940496e-07, + "loss": 0.5772, + "step": 374 + }, + { + "epoch": 0.1, + "logps_train/chosen": -79.73228454589844, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -81.31240844726562, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.23709505796432495, + "rewards_train/margins": -0.09638087451457977, + "rewards_train/rejected": -0.14071418344974518, + "step": 374 + }, + { + "epoch": 0.1, + "logps_train/chosen": -68.92910766601562, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -86.09484100341797, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.08158338069915771, + "rewards_train/margins": 0.3589552640914917, + "rewards_train/rejected": -0.4405386447906494, + "step": 375 + }, + { + "epoch": 0.11, + "learning_rate": 4.888671254720972e-07, + "loss": 0.6605, + "step": 376 + }, + { + "epoch": 0.11, + "logps_train/chosen": -44.20768737792969, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -52.269378662109375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.02493463270366192, + "rewards_train/margins": 0.19913811795413494, + "rewards_train/rejected": -0.17420348525047302, + "step": 376 + }, + { + "epoch": 0.11, + "logps_train/chosen": -71.08357238769531, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -59.199913024902344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.22769232094287872, + "rewards_train/margins": 0.21515028178691864, + "rewards_train/rejected": -0.44284260272979736, + "step": 377 + }, + { + "epoch": 0.11, + "learning_rate": 4.886837572785081e-07, + "loss": 0.611, + "step": 378 + }, + { + "epoch": 0.11, + "logps_train/chosen": -50.59214401245117, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -55.63287353515625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.0373397022485733, + "rewards_train/margins": 0.38141684234142303, + "rewards_train/rejected": -0.41875654458999634, + "step": 378 + }, + { + "epoch": 0.11, + "logps_train/chosen": -36.52409744262695, + "logps_train/ref_chosen": -33.75, + "logps_train/ref_rejected": -30.375, + "logps_train/rejected": -33.19350814819336, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.285368412733078, + "rewards_train/margins": -0.009327977895736694, + "rewards_train/rejected": -0.2760404348373413, + "step": 379 + }, + { + "epoch": 0.11, + "learning_rate": 4.884989262371114e-07, + "loss": 0.6264, + "step": 380 + }, + { + "epoch": 0.11, + "logps_train/chosen": -37.08723068237305, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -38.5, + "logps_train/rejected": -39.5583610534668, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.07593493163585663, + "rewards_train/margins": 0.02108776569366455, + "rewards_train/rejected": -0.09702269732952118, + "step": 380 + }, + { + "epoch": 0.11, + "logps_train/chosen": -78.44664001464844, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -81.58367919921875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.10560118407011032, + "rewards_train/margins": 0.40745461732149124, + "rewards_train/rejected": -0.5130558013916016, + "step": 381 + }, + { + "epoch": 0.11, + "learning_rate": 4.883126334807019e-07, + "loss": 0.6138, + "step": 382 + }, + { + "epoch": 0.11, + "logps_train/chosen": -74.96675872802734, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -77.48548889160156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3568321764469147, + "rewards_train/margins": 0.15304431319236755, + "rewards_train/rejected": -0.5098764896392822, + "step": 382 + }, + { + "epoch": 0.11, + "logps_train/chosen": -79.03826141357422, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -82.42130279541016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.12234561890363693, + "rewards_train/margins": 0.6360576674342155, + "rewards_train/rejected": -0.5137120485305786, + "step": 383 + }, + { + "epoch": 0.11, + "learning_rate": 4.881248801510328e-07, + "loss": 0.5603, + "step": 384 + }, + { + "epoch": 0.11, + "logps_train/chosen": -77.80622863769531, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -77.42886352539062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.042341478168964386, + "rewards_train/margins": 0.4349202439188957, + "rewards_train/rejected": -0.4772617220878601, + "step": 384 + }, + { + "epoch": 0.11, + "logps_train/chosen": -46.81436538696289, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -49.15264129638672, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.08514738827943802, + "rewards_train/margins": 0.05882773548364639, + "rewards_train/rejected": -0.1439751237630844, + "step": 385 + }, + { + "epoch": 0.11, + "learning_rate": 4.879356673988089e-07, + "loss": 0.5965, + "step": 386 + }, + { + "epoch": 0.11, + "logps_train/chosen": -33.68955612182617, + "logps_train/ref_chosen": -34.25, + "logps_train/ref_rejected": -23.75, + "logps_train/rejected": -24.28402328491211, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.06151317059993744, + "rewards_train/margins": 0.11745448410511017, + "rewards_train/rejected": -0.05594131350517273, + "step": 386 + }, + { + "epoch": 0.11, + "logps_train/chosen": -111.4737548828125, + "logps_train/ref_chosen": -109.5, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -105.9298095703125, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.18487553298473358, + "rewards_train/margins": 0.05068403482437134, + "rewards_train/rejected": -0.23555956780910492, + "step": 387 + }, + { + "epoch": 0.11, + "learning_rate": 4.877449963836794e-07, + "loss": 0.6609, + "step": 388 + }, + { + "epoch": 0.11, + "logps_train/chosen": -93.16571044921875, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -106.4239501953125, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.6222352385520935, + "rewards_train/margins": 0.13461285829544067, + "rewards_train/rejected": -0.7568480968475342, + "step": 388 + }, + { + "epoch": 0.11, + "logps_train/chosen": -45.30501174926758, + "logps_train/ref_chosen": -43.75, + "logps_train/ref_rejected": -45.75, + "logps_train/rejected": -48.52457046508789, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.16722005605697632, + "rewards_train/margins": 0.10828381776809692, + "rewards_train/rejected": -0.27550387382507324, + "step": 389 + }, + { + "epoch": 0.11, + "learning_rate": 4.875528682742312e-07, + "loss": 0.6591, + "step": 390 + }, + { + "epoch": 0.11, + "logps_train/chosen": -119.17372131347656, + "logps_train/ref_chosen": -117.5, + "logps_train/ref_rejected": -108.0, + "logps_train/rejected": -114.12166595458984, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.19002825021743774, + "rewards_train/margins": 0.4022170901298523, + "rewards_train/rejected": -0.59224534034729, + "step": 390 + }, + { + "epoch": 0.11, + "logps_train/chosen": -78.75126647949219, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -97.5, + "logps_train/rejected": -103.34577941894531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.29938769340515137, + "rewards_train/margins": 0.2758580446243286, + "rewards_train/rejected": -0.57524573802948, + "step": 391 + }, + { + "epoch": 0.11, + "learning_rate": 4.873592842479813e-07, + "loss": 0.5755, + "step": 392 + }, + { + "epoch": 0.11, + "logps_train/chosen": -73.84769439697266, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -80.33969116210938, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.07715222239494324, + "rewards_train/margins": 0.42830124497413635, + "rewards_train/rejected": -0.5054534673690796, + "step": 392 + }, + { + "epoch": 0.11, + "logps_train/chosen": -61.32831954956055, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -74.7039794921875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.07267564535140991, + "rewards_train/margins": 0.3262382745742798, + "rewards_train/rejected": -0.3989139199256897, + "step": 393 + }, + { + "epoch": 0.11, + "learning_rate": 4.871642454913696e-07, + "loss": 0.5709, + "step": 394 + }, + { + "epoch": 0.11, + "logps_train/chosen": -38.112831115722656, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -30.125, + "logps_train/rejected": -32.09352493286133, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.004837889224290848, + "rewards_train/margins": 0.19621382281184196, + "rewards_train/rejected": -0.2010517120361328, + "step": 394 + }, + { + "epoch": 0.11, + "logps_train/chosen": -60.59162902832031, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -61.164512634277344, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.3570142984390259, + "rewards_train/margins": -0.11595363914966583, + "rewards_train/rejected": -0.24106065928936005, + "step": 395 + }, + { + "epoch": 0.11, + "learning_rate": 4.86967753199752e-07, + "loss": 0.6936, + "step": 396 + }, + { + "epoch": 0.11, + "logps_train/chosen": -54.332664489746094, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -62.183677673339844, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.09698053449392319, + "rewards_train/margins": 0.4324652776122093, + "rewards_train/rejected": -0.33548474311828613, + "step": 396 + }, + { + "epoch": 0.11, + "logps_train/chosen": -67.56103515625, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -75.46615600585938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.1305174082517624, + "rewards_train/margins": 0.43875421583652496, + "rewards_train/rejected": -0.5692716240882874, + "step": 397 + }, + { + "epoch": 0.11, + "learning_rate": 4.867698085773929e-07, + "loss": 0.5435, + "step": 398 + }, + { + "epoch": 0.11, + "logps_train/chosen": -74.09577941894531, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -91.5, + "logps_train/rejected": -99.85299682617188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.16739077866077423, + "rewards_train/margins": 0.6913464516401291, + "rewards_train/rejected": -0.8587372303009033, + "step": 398 + }, + { + "epoch": 0.11, + "logps_train/chosen": -72.29281616210938, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -83.37358856201172, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.19061006605625153, + "rewards_train/margins": 0.3238971084356308, + "rewards_train/rejected": -0.5145071744918823, + "step": 399 + }, + { + "epoch": 0.11, + "learning_rate": 4.865704128374575e-07, + "loss": 0.5364, + "step": 400 + }, + { + "epoch": 0.11, + "logps_train/chosen": -102.74913024902344, + "logps_train/ref_chosen": -98.5, + "logps_train/ref_rejected": -99.5, + "logps_train/rejected": -107.50867462158203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.4030378460884094, + "rewards_train/margins": 0.4126731753349304, + "rewards_train/rejected": -0.8157110214233398, + "step": 400 + }, + { + "epoch": 0.11, + "logps_train/chosen": -56.70863342285156, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -69.8048095703125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.21910548210144043, + "rewards_train/margins": 0.12368011474609375, + "rewards_train/rejected": -0.3427855968475342, + "step": 401 + }, + { + "epoch": 0.11, + "learning_rate": 4.863695672020047e-07, + "loss": 0.6287, + "step": 402 + }, + { + "epoch": 0.11, + "logps_train/chosen": -102.02717590332031, + "logps_train/ref_chosen": -99.5, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -113.07080841064453, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.24256093800067902, + "rewards_train/margins": 0.6547542065382004, + "rewards_train/rejected": -0.8973151445388794, + "step": 402 + }, + { + "epoch": 0.11, + "logps_train/chosen": -53.998497009277344, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -73.76364135742188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.004244105890393257, + "rewards_train/margins": 0.172120226547122, + "rewards_train/rejected": -0.17636433243751526, + "step": 403 + }, + { + "epoch": 0.11, + "learning_rate": 4.861672729019796e-07, + "loss": 0.5442, + "step": 404 + }, + { + "epoch": 0.11, + "logps_train/chosen": -59.30846405029297, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -62.81505584716797, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.0035284943878650665, + "rewards_train/margins": 0.47038574889302254, + "rewards_train/rejected": -0.46685725450515747, + "step": 404 + }, + { + "epoch": 0.11, + "logps_train/chosen": -37.09577941894531, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -46.77699279785156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.14551538228988647, + "rewards_train/margins": 0.1204647421836853, + "rewards_train/rejected": -0.2659801244735718, + "step": 405 + }, + { + "epoch": 0.11, + "learning_rate": 4.85963531177206e-07, + "loss": 0.5697, + "step": 406 + }, + { + "epoch": 0.11, + "logps_train/chosen": -67.38511657714844, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -84.91424560546875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.029918041080236435, + "rewards_train/margins": 0.24138892069458961, + "rewards_train/rejected": -0.27130696177482605, + "step": 406 + }, + { + "epoch": 0.11, + "logps_train/chosen": -35.23558044433594, + "logps_train/ref_chosen": -34.5, + "logps_train/ref_rejected": -39.25, + "logps_train/rejected": -42.9957160949707, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.08527660369873047, + "rewards_train/margins": 0.28695130348205566, + "rewards_train/rejected": -0.37222790718078613, + "step": 407 + }, + { + "epoch": 0.11, + "learning_rate": 4.857583432763784e-07, + "loss": 0.5887, + "step": 408 + }, + { + "epoch": 0.11, + "logps_train/chosen": -37.568763732910156, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -34.75, + "logps_train/rejected": -39.81353759765625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.1369543969631195, + "rewards_train/margins": 0.3692041337490082, + "rewards_train/rejected": -0.5061585307121277, + "step": 408 + }, + { + "epoch": 0.11, + "logps_train/chosen": -45.81025695800781, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -42.25, + "logps_train/rejected": -44.82408142089844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.22760777175426483, + "rewards_train/margins": 0.01769109070301056, + "rewards_train/rejected": -0.2452988624572754, + "step": 409 + }, + { + "epoch": 0.11, + "learning_rate": 4.85551710457055e-07, + "loss": 0.618, + "step": 410 + }, + { + "epoch": 0.11, + "logps_train/chosen": -65.6756362915039, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -75.97378540039062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.20076701045036316, + "rewards_train/margins": 0.34036150574684143, + "rewards_train/rejected": -0.5411285161972046, + "step": 410 + }, + { + "epoch": 0.11, + "logps_train/chosen": -52.825531005859375, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -58.52812194824219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.12093216925859451, + "rewards_train/margins": 0.13520055264234543, + "rewards_train/rejected": -0.25613272190093994, + "step": 411 + }, + { + "epoch": 0.12, + "learning_rate": 4.853436339856493e-07, + "loss": 0.598, + "step": 412 + }, + { + "epoch": 0.12, + "logps_train/chosen": -83.06622314453125, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -78.86665344238281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.06873153150081635, + "rewards_train/margins": 0.3175438195466995, + "rewards_train/rejected": -0.38627535104751587, + "step": 412 + }, + { + "epoch": 0.12, + "logps_train/chosen": -45.289215087890625, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -51.860042572021484, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.03924257308244705, + "rewards_train/margins": 0.06938788667321205, + "rewards_train/rejected": -0.030145313590765, + "step": 413 + }, + { + "epoch": 0.12, + "learning_rate": 4.851341151374227e-07, + "loss": 0.6123, + "step": 414 + }, + { + "epoch": 0.12, + "logps_train/chosen": -88.4419937133789, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -110.30561828613281, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.09366677701473236, + "rewards_train/margins": 0.9019871801137924, + "rewards_train/rejected": -0.8083204030990601, + "step": 414 + }, + { + "epoch": 0.12, + "logps_train/chosen": -40.033206939697266, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -46.84977722167969, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.04980486258864403, + "rewards_train/margins": 0.02706756815314293, + "rewards_train/rejected": -0.07687243074178696, + "step": 415 + }, + { + "epoch": 0.12, + "learning_rate": 4.849231551964771e-07, + "loss": 0.5419, + "step": 416 + }, + { + "epoch": 0.12, + "logps_train/chosen": -51.32529830932617, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -57.75691604614258, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.13409240543842316, + "rewards_train/margins": 0.33700962364673615, + "rewards_train/rejected": -0.4711020290851593, + "step": 416 + }, + { + "epoch": 0.12, + "logps_train/chosen": -37.329917907714844, + "logps_train/ref_chosen": -37.5, + "logps_train/ref_rejected": -37.25, + "logps_train/rejected": -41.282283782958984, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.006852060556411743, + "rewards_train/margins": 0.4186740517616272, + "rewards_train/rejected": -0.41182199120521545, + "step": 417 + }, + { + "epoch": 0.12, + "learning_rate": 4.84710755455746e-07, + "loss": 0.5501, + "step": 418 + }, + { + "epoch": 0.12, + "logps_train/chosen": -67.50133514404297, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -66.35977172851562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.21087563037872314, + "rewards_train/margins": 0.4042034149169922, + "rewards_train/rejected": -0.6150790452957153, + "step": 418 + }, + { + "epoch": 0.12, + "logps_train/chosen": -27.96630859375, + "logps_train/ref_chosen": -28.5, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -41.9415397644043, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.05063493549823761, + "rewards_train/margins": 0.20885145664215088, + "rewards_train/rejected": -0.15821652114391327, + "step": 419 + }, + { + "epoch": 0.12, + "learning_rate": 4.844969172169875e-07, + "loss": 0.5743, + "step": 420 + }, + { + "epoch": 0.12, + "logps_train/chosen": -85.80335998535156, + "logps_train/ref_chosen": -84.5, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -108.86079406738281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.12642979621887207, + "rewards_train/margins": 0.6698055863380432, + "rewards_train/rejected": -0.7962353825569153, + "step": 420 + }, + { + "epoch": 0.12, + "logps_train/chosen": -60.522071838378906, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -92.09263610839844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.06099611148238182, + "rewards_train/margins": 0.6867439188063145, + "rewards_train/rejected": -0.7477400302886963, + "step": 421 + }, + { + "epoch": 0.12, + "learning_rate": 4.842816417907758e-07, + "loss": 0.4661, + "step": 422 + }, + { + "epoch": 0.12, + "logps_train/chosen": -67.54059600830078, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -73.53221893310547, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2579660415649414, + "rewards_train/margins": 0.5901778340339661, + "rewards_train/rejected": -0.8481438755989075, + "step": 422 + }, + { + "epoch": 0.12, + "logps_train/chosen": -54.35505676269531, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -73.50029754638672, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.25972452759742737, + "rewards_train/margins": 0.36061760783195496, + "rewards_train/rejected": -0.6203421354293823, + "step": 423 + }, + { + "epoch": 0.12, + "learning_rate": 4.840649304964937e-07, + "loss": 0.5545, + "step": 424 + }, + { + "epoch": 0.12, + "logps_train/chosen": -27.04102325439453, + "logps_train/ref_chosen": -25.75, + "logps_train/ref_rejected": -25.875, + "logps_train/rejected": -28.863727569580078, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.12929782271385193, + "rewards_train/margins": 0.1678171157836914, + "rewards_train/rejected": -0.29711493849754333, + "step": 424 + }, + { + "epoch": 0.12, + "logps_train/chosen": -65.1640625, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -63.27779769897461, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.28906217217445374, + "rewards_train/margins": 0.1396942138671875, + "rewards_train/rejected": -0.42875638604164124, + "step": 425 + }, + { + "epoch": 0.12, + "learning_rate": 4.838467846623237e-07, + "loss": 0.6549, + "step": 426 + }, + { + "epoch": 0.12, + "logps_train/chosen": -59.83967590332031, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -78.50164031982422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.11013912409543991, + "rewards_train/margins": 0.7677593305706978, + "rewards_train/rejected": -0.8778984546661377, + "step": 426 + }, + { + "epoch": 0.12, + "logps_train/chosen": -66.80098724365234, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -85.57430267333984, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.209786057472229, + "rewards_train/margins": 0.7109251022338867, + "rewards_train/rejected": -0.9207111597061157, + "step": 427 + }, + { + "epoch": 0.12, + "learning_rate": 4.836272056252406e-07, + "loss": 0.4639, + "step": 428 + }, + { + "epoch": 0.12, + "logps_train/chosen": -50.29276657104492, + "logps_train/ref_chosen": -46.5, + "logps_train/ref_rejected": -34.0, + "logps_train/rejected": -39.00887680053711, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.36749064922332764, + "rewards_train/margins": 0.1373031735420227, + "rewards_train/rejected": -0.5047938227653503, + "step": 428 + }, + { + "epoch": 0.12, + "logps_train/chosen": -54.68303680419922, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -67.43299865722656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.039557721465826035, + "rewards_train/margins": 0.3359828256070614, + "rewards_train/rejected": -0.29642510414123535, + "step": 429 + }, + { + "epoch": 0.12, + "learning_rate": 4.83406194731003e-07, + "loss": 0.6045, + "step": 430 + }, + { + "epoch": 0.12, + "logps_train/chosen": -79.35588073730469, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -95.75279235839844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4887131154537201, + "rewards_train/margins": 0.20609775185585022, + "rewards_train/rejected": -0.6948108673095703, + "step": 430 + }, + { + "epoch": 0.12, + "logps_train/chosen": -62.179195404052734, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -54.25, + "logps_train/rejected": -61.67304992675781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3796379864215851, + "rewards_train/margins": 0.36315545439720154, + "rewards_train/rejected": -0.7427934408187866, + "step": 431 + }, + { + "epoch": 0.12, + "learning_rate": 4.831837533341451e-07, + "loss": 0.5812, + "step": 432 + }, + { + "epoch": 0.12, + "logps_train/chosen": -60.57322692871094, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -76.04409790039062, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.10175628960132599, + "rewards_train/margins": 0.07169632613658905, + "rewards_train/rejected": -0.17345261573791504, + "step": 432 + }, + { + "epoch": 0.12, + "logps_train/chosen": -72.44471740722656, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -90.85264587402344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.21126829087734222, + "rewards_train/margins": 0.6040742546319962, + "rewards_train/rejected": -0.8153425455093384, + "step": 433 + }, + { + "epoch": 0.12, + "learning_rate": 4.829598827979682e-07, + "loss": 0.5872, + "step": 434 + }, + { + "epoch": 0.12, + "logps_train/chosen": -58.04673767089844, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -55.37931442260742, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.39217352867126465, + "rewards_train/margins": -0.028070569038391113, + "rewards_train/rejected": -0.36410295963287354, + "step": 434 + }, + { + "epoch": 0.12, + "logps_train/chosen": -93.18022155761719, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -92.59236145019531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.1730998456478119, + "rewards_train/margins": 0.17832356691360474, + "rewards_train/rejected": -0.3514234125614166, + "step": 435 + }, + { + "epoch": 0.12, + "learning_rate": 4.827345844945327e-07, + "loss": 0.6736, + "step": 436 + }, + { + "epoch": 0.12, + "logps_train/chosen": -63.54442596435547, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -51.33317565917969, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.38188374042510986, + "rewards_train/margins": 0.1117851734161377, + "rewards_train/rejected": -0.49366891384124756, + "step": 436 + }, + { + "epoch": 0.12, + "logps_train/chosen": -44.57952880859375, + "logps_train/ref_chosen": -44.0, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -54.54182052612305, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.06264039874076843, + "rewards_train/margins": 0.1345101147890091, + "rewards_train/rejected": -0.19715051352977753, + "step": 437 + }, + { + "epoch": 0.12, + "learning_rate": 4.825078598046497e-07, + "loss": 0.671, + "step": 438 + }, + { + "epoch": 0.12, + "logps_train/chosen": -78.75662231445312, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -98.87887573242188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.02726789191365242, + "rewards_train/margins": 0.9161319322884083, + "rewards_train/rejected": -0.8888640403747559, + "step": 438 + }, + { + "epoch": 0.12, + "logps_train/chosen": -61.60486602783203, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -70.74028778076172, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.11165854334831238, + "rewards_train/margins": 0.6619797050952911, + "rewards_train/rejected": -0.7736382484436035, + "step": 439 + }, + { + "epoch": 0.12, + "learning_rate": 4.822797101178718e-07, + "loss": 0.4382, + "step": 440 + }, + { + "epoch": 0.12, + "logps_train/chosen": -101.79241943359375, + "logps_train/ref_chosen": -98.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -78.33682250976562, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3050227761268616, + "rewards_train/margins": 0.16459760069847107, + "rewards_train/rejected": -0.46962037682533264, + "step": 440 + }, + { + "epoch": 0.12, + "logps_train/chosen": -69.30650329589844, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -88.3681640625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.34393131732940674, + "rewards_train/margins": 0.91710364818573, + "rewards_train/rejected": -1.2610349655151367, + "step": 441 + }, + { + "epoch": 0.12, + "learning_rate": 4.820501368324858e-07, + "loss": 0.5168, + "step": 442 + }, + { + "epoch": 0.12, + "logps_train/chosen": -32.35873031616211, + "logps_train/ref_chosen": -31.125, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -50.96668243408203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.12102940678596497, + "rewards_train/margins": 0.28559964895248413, + "rewards_train/rejected": -0.4066290557384491, + "step": 442 + }, + { + "epoch": 0.12, + "logps_train/chosen": -41.715675354003906, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -42.75, + "logps_train/rejected": -45.301876068115234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.08641133457422256, + "rewards_train/margins": 0.1713152751326561, + "rewards_train/rejected": -0.25772660970687866, + "step": 443 + }, + { + "epoch": 0.12, + "learning_rate": 4.818191413555029e-07, + "loss": 0.6137, + "step": 444 + }, + { + "epoch": 0.12, + "logps_train/chosen": -88.72758483886719, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -91.88250732421875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.47598087787628174, + "rewards_train/margins": 0.3681298494338989, + "rewards_train/rejected": -0.8441107273101807, + "step": 444 + }, + { + "epoch": 0.12, + "logps_train/chosen": -56.40264129638672, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -73.94459533691406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.004521720111370087, + "rewards_train/margins": 0.3090783432126045, + "rewards_train/rejected": -0.3136000633239746, + "step": 445 + }, + { + "epoch": 0.12, + "learning_rate": 4.81586725102651e-07, + "loss": 0.5719, + "step": 446 + }, + { + "epoch": 0.12, + "logps_train/chosen": -101.0855712890625, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -114.37125396728516, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.6935184001922607, + "rewards_train/margins": 0.3915567398071289, + "rewards_train/rejected": -1.0850751399993896, + "step": 446 + }, + { + "epoch": 0.12, + "logps_train/chosen": -69.82218170166016, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -96.4715576171875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2097572535276413, + "rewards_train/margins": 0.7131803780794144, + "rewards_train/rejected": -0.9229376316070557, + "step": 447 + }, + { + "epoch": 0.13, + "learning_rate": 4.813528894983653e-07, + "loss": 0.5464, + "step": 448 + }, + { + "epoch": 0.13, + "logps_train/chosen": -67.9070053100586, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -54.96982192993164, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.1430443972349167, + "rewards_train/margins": 0.3299145847558975, + "rewards_train/rejected": -0.4729589819908142, + "step": 448 + }, + { + "epoch": 0.13, + "logps_train/chosen": -64.23294830322266, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -74.89968872070312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.18677125871181488, + "rewards_train/margins": 0.6061270385980606, + "rewards_train/rejected": -0.7928982973098755, + "step": 449 + }, + { + "epoch": 0.13, + "learning_rate": 4.811176359757807e-07, + "loss": 0.5321, + "step": 450 + }, + { + "epoch": 0.13, + "logps_train/chosen": -71.74510955810547, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -83.18284606933594, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.5309804677963257, + "rewards_train/margins": 0.10624945163726807, + "rewards_train/rejected": -0.6372299194335938, + "step": 450 + }, + { + "epoch": 0.13, + "logps_train/chosen": -47.425254821777344, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -53.84074401855469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3173302412033081, + "rewards_train/margins": 0.19262266159057617, + "rewards_train/rejected": -0.5099529027938843, + "step": 451 + }, + { + "epoch": 0.13, + "learning_rate": 4.808809659767213e-07, + "loss": 0.6304, + "step": 452 + }, + { + "epoch": 0.13, + "logps_train/chosen": -53.49700164794922, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -56.15985870361328, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.16766878962516785, + "rewards_train/margins": 0.5490983426570892, + "rewards_train/rejected": -0.7167671322822571, + "step": 452 + }, + { + "epoch": 0.13, + "logps_train/chosen": -38.96504211425781, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -43.75, + "logps_train/rejected": -46.623050689697266, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.08240190148353577, + "rewards_train/margins": 0.36189451813697815, + "rewards_train/rejected": -0.2794926166534424, + "step": 453 + }, + { + "epoch": 0.13, + "learning_rate": 4.806428809516932e-07, + "loss": 0.5151, + "step": 454 + }, + { + "epoch": 0.13, + "logps_train/chosen": -69.56109619140625, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -75.6127700805664, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2150936722755432, + "rewards_train/margins": 0.26571446657180786, + "rewards_train/rejected": -0.4808081388473511, + "step": 454 + }, + { + "epoch": 0.13, + "logps_train/chosen": -50.74924087524414, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -59.40789031982422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.09523653984069824, + "rewards_train/margins": 0.5916464328765869, + "rewards_train/rejected": -0.6868829727172852, + "step": 455 + }, + { + "epoch": 0.13, + "learning_rate": 4.804033823598745e-07, + "loss": 0.5153, + "step": 456 + }, + { + "epoch": 0.13, + "logps_train/chosen": -90.73284912109375, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -100.54654693603516, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5598087310791016, + "rewards_train/margins": 0.6948459148406982, + "rewards_train/rejected": -1.2546546459197998, + "step": 456 + }, + { + "epoch": 0.13, + "logps_train/chosen": -72.41488647460938, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -57.239173889160156, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.2641451358795166, + "rewards_train/margins": 0.12110072374343872, + "rewards_train/rejected": -0.3852458596229553, + "step": 457 + }, + { + "epoch": 0.13, + "learning_rate": 4.801624716691072e-07, + "loss": 0.5505, + "step": 458 + }, + { + "epoch": 0.13, + "logps_train/chosen": -64.20964050292969, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -63.94740295410156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.26490911841392517, + "rewards_train/margins": 0.20014378428459167, + "rewards_train/rejected": -0.46505290269851685, + "step": 458 + }, + { + "epoch": 0.13, + "logps_train/chosen": -65.96568298339844, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -68.7130355834961, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.08563080430030823, + "rewards_train/margins": 0.24504747986793518, + "rewards_train/rejected": -0.3306782841682434, + "step": 459 + }, + { + "epoch": 0.13, + "learning_rate": 4.799201503558873e-07, + "loss": 0.6115, + "step": 460 + }, + { + "epoch": 0.13, + "logps_train/chosen": -101.25625610351562, + "logps_train/ref_chosen": -97.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -108.38130187988281, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.4350007176399231, + "rewards_train/margins": 0.5078170895576477, + "rewards_train/rejected": -0.9428178071975708, + "step": 460 + }, + { + "epoch": 0.13, + "logps_train/chosen": -45.537818908691406, + "logps_train/ref_chosen": -43.75, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -48.5035400390625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.17721939086914062, + "rewards_train/margins": 0.25516611337661743, + "rewards_train/rejected": -0.43238550424575806, + "step": 461 + }, + { + "epoch": 0.13, + "learning_rate": 4.796764199053568e-07, + "loss": 0.579, + "step": 462 + }, + { + "epoch": 0.13, + "logps_train/chosen": -55.23745346069336, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -80.50357055664062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3159329295158386, + "rewards_train/margins": 0.6236823201179504, + "rewards_train/rejected": -0.9396152496337891, + "step": 462 + }, + { + "epoch": 0.13, + "logps_train/chosen": -71.55741882324219, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -81.88558197021484, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3819139003753662, + "rewards_train/margins": 0.06435957551002502, + "rewards_train/rejected": -0.44627347588539124, + "step": 463 + }, + { + "epoch": 0.13, + "learning_rate": 4.794312818112935e-07, + "loss": 0.5781, + "step": 464 + }, + { + "epoch": 0.13, + "logps_train/chosen": -58.86534118652344, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -80.2977294921875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.15889745950698853, + "rewards_train/margins": 0.2706802785396576, + "rewards_train/rejected": -0.4295777380466461, + "step": 464 + }, + { + "epoch": 0.13, + "logps_train/chosen": -45.71202087402344, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -75.93025207519531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.13213971257209778, + "rewards_train/margins": 0.49877622723579407, + "rewards_train/rejected": -0.6309159398078918, + "step": 465 + }, + { + "epoch": 0.13, + "learning_rate": 4.791847375761027e-07, + "loss": 0.5414, + "step": 466 + }, + { + "epoch": 0.13, + "logps_train/chosen": -50.770538330078125, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -59.130760192871094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3323274254798889, + "rewards_train/margins": 0.16824871301651, + "rewards_train/rejected": -0.5005761384963989, + "step": 466 + }, + { + "epoch": 0.13, + "logps_train/chosen": -90.62741088867188, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -81.84719848632812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6760218143463135, + "rewards_train/margins": 0.2461986541748047, + "rewards_train/rejected": -0.9222204685211182, + "step": 467 + }, + { + "epoch": 0.13, + "learning_rate": 4.789367887108076e-07, + "loss": 0.6064, + "step": 468 + }, + { + "epoch": 0.13, + "logps_train/chosen": -105.81695556640625, + "logps_train/ref_chosen": -102.0, + "logps_train/ref_rejected": -123.5, + "logps_train/rejected": -136.29693603515625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3746643364429474, + "rewards_train/margins": 0.882177084684372, + "rewards_train/rejected": -1.2568414211273193, + "step": 468 + }, + { + "epoch": 0.13, + "logps_train/chosen": -41.73709487915039, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -19.5, + "logps_train/rejected": -21.092626571655273, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.10593586415052414, + "rewards_train/margins": 0.05879557877779007, + "rewards_train/rejected": -0.1647314429283142, + "step": 469 + }, + { + "epoch": 0.13, + "learning_rate": 4.786874367350402e-07, + "loss": 0.5404, + "step": 470 + }, + { + "epoch": 0.13, + "logps_train/chosen": -42.11448669433594, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -53.18017578125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4178939759731293, + "rewards_train/margins": 0.11828795075416565, + "rewards_train/rejected": -0.5361819267272949, + "step": 470 + }, + { + "epoch": 0.13, + "logps_train/chosen": -71.83967590332031, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -100.61727905273438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3722483515739441, + "rewards_train/margins": 1.105105221271515, + "rewards_train/rejected": -1.477353572845459, + "step": 471 + }, + { + "epoch": 0.13, + "learning_rate": 4.784366831770317e-07, + "loss": 0.4904, + "step": 472 + }, + { + "epoch": 0.13, + "logps_train/chosen": -58.41155242919922, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -64.99591064453125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.286467969417572, + "rewards_train/margins": 0.14593565464019775, + "rewards_train/rejected": -0.4324036240577698, + "step": 472 + }, + { + "epoch": 0.13, + "logps_train/chosen": -81.48255157470703, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -97.74531555175781, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.39356729388237, + "rewards_train/margins": 0.349714070558548, + "rewards_train/rejected": -0.743281364440918, + "step": 473 + }, + { + "epoch": 0.13, + "learning_rate": 4.781845295736033e-07, + "loss": 0.6057, + "step": 474 + }, + { + "epoch": 0.13, + "logps_train/chosen": -24.26443099975586, + "logps_train/ref_chosen": -22.875, + "logps_train/ref_rejected": -34.25, + "logps_train/rejected": -38.41557693481445, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.14040803909301758, + "rewards_train/margins": 0.26535844802856445, + "rewards_train/rejected": -0.40576648712158203, + "step": 474 + }, + { + "epoch": 0.13, + "logps_train/chosen": -62.27745819091797, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -54.25, + "logps_train/rejected": -56.919986724853516, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.20977705717086792, + "rewards_train/margins": 0.05683097243309021, + "rewards_train/rejected": -0.26660802960395813, + "step": 475 + }, + { + "epoch": 0.13, + "learning_rate": 4.779309774701573e-07, + "loss": 0.6405, + "step": 476 + }, + { + "epoch": 0.13, + "logps_train/chosen": -54.82136535644531, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -71.8387451171875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.2172931730747223, + "rewards_train/margins": 0.26462796330451965, + "rewards_train/rejected": -0.48192113637924194, + "step": 476 + }, + { + "epoch": 0.13, + "logps_train/chosen": -48.08940887451172, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -36.25, + "logps_train/rejected": -41.057167053222656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.27642133831977844, + "rewards_train/margins": 0.2077132761478424, + "rewards_train/rejected": -0.48413461446762085, + "step": 477 + }, + { + "epoch": 0.13, + "learning_rate": 4.776760284206667e-07, + "loss": 0.6099, + "step": 478 + }, + { + "epoch": 0.13, + "logps_train/chosen": -30.757614135742188, + "logps_train/ref_chosen": -30.25, + "logps_train/ref_rejected": -45.75, + "logps_train/rejected": -52.735233306884766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.04529266804456711, + "rewards_train/margins": 0.6586994007229805, + "rewards_train/rejected": -0.7039920687675476, + "step": 478 + }, + { + "epoch": 0.13, + "logps_train/chosen": -87.45452117919922, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -73.78488159179688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.23685763776302338, + "rewards_train/margins": 0.2888965755701065, + "rewards_train/rejected": -0.5257542133331299, + "step": 479 + }, + { + "epoch": 0.13, + "learning_rate": 4.774196839876659e-07, + "loss": 0.5428, + "step": 480 + }, + { + "epoch": 0.13, + "logps_train/chosen": -66.86396789550781, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -84.89311218261719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6605181097984314, + "rewards_train/margins": 0.8312340378761292, + "rewards_train/rejected": -1.4917521476745605, + "step": 480 + }, + { + "epoch": 0.13, + "logps_train/chosen": -28.001731872558594, + "logps_train/ref_chosen": -26.125, + "logps_train/ref_rejected": -34.0, + "logps_train/rejected": -39.36369323730469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.18816140294075012, + "rewards_train/margins": 0.35348114371299744, + "rewards_train/rejected": -0.5416425466537476, + "step": 481 + }, + { + "epoch": 0.13, + "learning_rate": 4.771619457422421e-07, + "loss": 0.5028, + "step": 482 + }, + { + "epoch": 0.13, + "logps_train/chosen": -52.00128936767578, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -51.218685150146484, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.15382777154445648, + "rewards_train/margins": 0.1682240217924118, + "rewards_train/rejected": -0.3220517933368683, + "step": 482 + }, + { + "epoch": 0.13, + "logps_train/chosen": -60.037567138671875, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -60.619300842285156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.0021761134266853333, + "rewards_train/margins": 0.6805120185017586, + "rewards_train/rejected": -0.6783359050750732, + "step": 483 + }, + { + "epoch": 0.14, + "learning_rate": 4.769028152640243e-07, + "loss": 0.5578, + "step": 484 + }, + { + "epoch": 0.14, + "logps_train/chosen": -94.0927734375, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -93.60713195800781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.48740169405937195, + "rewards_train/margins": 0.5895219147205353, + "rewards_train/rejected": -1.0769236087799072, + "step": 484 + }, + { + "epoch": 0.14, + "logps_train/chosen": -70.69978332519531, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -92.28846740722656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6100178956985474, + "rewards_train/margins": 0.5551564693450928, + "rewards_train/rejected": -1.1651743650436401, + "step": 485 + }, + { + "epoch": 0.14, + "learning_rate": 4.7664229414117437e-07, + "loss": 0.4843, + "step": 486 + }, + { + "epoch": 0.14, + "logps_train/chosen": -53.37134552001953, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -66.5076904296875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2820568084716797, + "rewards_train/margins": 0.2972272038459778, + "rewards_train/rejected": -0.5792840123176575, + "step": 486 + }, + { + "epoch": 0.14, + "logps_train/chosen": -42.8087158203125, + "logps_train/ref_chosen": -43.75, + "logps_train/ref_rejected": -37.25, + "logps_train/rejected": -39.998809814453125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.09525149315595627, + "rewards_train/margins": 0.3722808286547661, + "rewards_train/rejected": -0.2770293354988098, + "step": 487 + }, + { + "epoch": 0.14, + "learning_rate": 4.7638038397037724e-07, + "loss": 0.5738, + "step": 488 + }, + { + "epoch": 0.14, + "logps_train/chosen": -34.80330276489258, + "logps_train/ref_chosen": -33.75, + "logps_train/ref_rejected": -39.0, + "logps_train/rejected": -43.25440979003906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.11118964850902557, + "rewards_train/margins": 0.30438844859600067, + "rewards_train/rejected": -0.41557809710502625, + "step": 488 + }, + { + "epoch": 0.14, + "logps_train/chosen": -50.295082092285156, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -68.56315612792969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2521645426750183, + "rewards_train/margins": 0.5594242811203003, + "rewards_train/rejected": -0.8115888237953186, + "step": 489 + }, + { + "epoch": 0.14, + "learning_rate": 4.76117086356831e-07, + "loss": 0.5276, + "step": 490 + }, + { + "epoch": 0.14, + "logps_train/chosen": -31.872812271118164, + "logps_train/ref_chosen": -31.375, + "logps_train/ref_rejected": -28.625, + "logps_train/rejected": -30.312978744506836, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.05349217355251312, + "rewards_train/margins": 0.10944627225399017, + "rewards_train/rejected": -0.1629384458065033, + "step": 490 + }, + { + "epoch": 0.14, + "logps_train/chosen": -57.79804229736328, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -49.39804458618164, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.2505076229572296, + "rewards_train/margins": 0.14193353056907654, + "rewards_train/rejected": -0.39244115352630615, + "step": 491 + }, + { + "epoch": 0.14, + "learning_rate": 4.758524029142372e-07, + "loss": 0.6623, + "step": 492 + }, + { + "epoch": 0.14, + "logps_train/chosen": -69.51055908203125, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -78.01091003417969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5385558605194092, + "rewards_train/margins": 0.6375353336334229, + "rewards_train/rejected": -1.176091194152832, + "step": 492 + }, + { + "epoch": 0.14, + "logps_train/chosen": -71.8697509765625, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -83.75015258789062, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.0095336437225342, + "rewards_train/margins": 0.502004861831665, + "rewards_train/rejected": -1.5115385055541992, + "step": 493 + }, + { + "epoch": 0.14, + "learning_rate": 4.7558633526479084e-07, + "loss": 0.5098, + "step": 494 + }, + { + "epoch": 0.14, + "logps_train/chosen": -22.331228256225586, + "logps_train/ref_chosen": -21.375, + "logps_train/ref_rejected": -28.875, + "logps_train/rejected": -30.32866096496582, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.09894312918186188, + "rewards_train/margins": 0.05130559206008911, + "rewards_train/rejected": -0.150248721241951, + "step": 494 + }, + { + "epoch": 0.14, + "logps_train/chosen": -62.72913360595703, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -40.75, + "logps_train/rejected": -45.517066955566406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.19234707951545715, + "rewards_train/margins": 0.2933442294597626, + "rewards_train/rejected": -0.4856913089752197, + "step": 495 + }, + { + "epoch": 0.14, + "learning_rate": 4.753188850391705e-07, + "loss": 0.6269, + "step": 496 + }, + { + "epoch": 0.14, + "logps_train/chosen": -29.741872787475586, + "logps_train/ref_chosen": -30.25, + "logps_train/ref_rejected": -41.5, + "logps_train/rejected": -46.2149658203125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.055304765701293945, + "rewards_train/margins": 0.5283637642860413, + "rewards_train/rejected": -0.4730589985847473, + "step": 496 + }, + { + "epoch": 0.14, + "logps_train/chosen": -64.40620422363281, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -66.39431762695312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.23051299154758453, + "rewards_train/margins": 0.45579348504543304, + "rewards_train/rejected": -0.6863064765930176, + "step": 497 + }, + { + "epoch": 0.14, + "learning_rate": 4.7505005387652805e-07, + "loss": 0.555, + "step": 498 + }, + { + "epoch": 0.14, + "logps_train/chosen": -72.48267364501953, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -82.56196594238281, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4054940938949585, + "rewards_train/margins": 0.5087108016014099, + "rewards_train/rejected": -0.9142048954963684, + "step": 498 + }, + { + "epoch": 0.14, + "logps_train/chosen": -86.4345703125, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -81.76953125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9524410367012024, + "rewards_train/margins": 0.2749027609825134, + "rewards_train/rejected": -1.2273437976837158, + "step": 499 + }, + { + "epoch": 0.14, + "learning_rate": 4.747798434244793e-07, + "loss": 0.6017, + "step": 500 + }, + { + "epoch": 0.14, + "logps_train/chosen": -73.94851684570312, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -74.02375793457031, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.4011017084121704, + "rewards_train/margins": 0.0686572790145874, + "rewards_train/rejected": -0.4697589874267578, + "step": 500 + }, + { + "epoch": 0.14, + "logps_train/chosen": -96.72221374511719, + "logps_train/ref_chosen": -95.5, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -117.32746887207031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.12222132831811905, + "rewards_train/margins": 0.6851353868842125, + "rewards_train/rejected": -0.8073567152023315, + "step": 501 + }, + { + "epoch": 0.14, + "learning_rate": 4.745082553390931e-07, + "loss": 0.5593, + "step": 502 + }, + { + "epoch": 0.14, + "logps_train/chosen": -55.377403259277344, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -89.51190185546875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.19125555455684662, + "rewards_train/margins": 0.5690164715051651, + "rewards_train/rejected": -0.7602720260620117, + "step": 502 + }, + { + "epoch": 0.14, + "logps_train/chosen": -63.07593536376953, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -80.22622680664062, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.029125303030014038, + "rewards_train/margins": 0.7242086231708527, + "rewards_train/rejected": -0.6950833201408386, + "step": 503 + }, + { + "epoch": 0.14, + "learning_rate": 4.742352912848817e-07, + "loss": 0.4981, + "step": 504 + }, + { + "epoch": 0.14, + "logps_train/chosen": -88.90757751464844, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -84.74685668945312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7782579660415649, + "rewards_train/margins": 0.027678251266479492, + "rewards_train/rejected": -0.8059362173080444, + "step": 504 + }, + { + "epoch": 0.14, + "logps_train/chosen": -70.9320068359375, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -68.92997741699219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.37430447340011597, + "rewards_train/margins": 0.5436931252479553, + "rewards_train/rejected": -0.9179975986480713, + "step": 505 + }, + { + "epoch": 0.14, + "learning_rate": 4.7396095293479043e-07, + "loss": 0.6605, + "step": 506 + }, + { + "epoch": 0.14, + "logps_train/chosen": -80.8028564453125, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -97.88168334960938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6273554563522339, + "rewards_train/margins": 0.7297576665878296, + "rewards_train/rejected": -1.3571131229400635, + "step": 506 + }, + { + "epoch": 0.14, + "logps_train/chosen": -91.94205474853516, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -86.92388916015625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.38170573115348816, + "rewards_train/margins": 0.22162029147148132, + "rewards_train/rejected": -0.6033260226249695, + "step": 507 + }, + { + "epoch": 0.14, + "learning_rate": 4.7368524197018735e-07, + "loss": 0.5651, + "step": 508 + }, + { + "epoch": 0.14, + "logps_train/chosen": -41.23701477050781, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -60.14656066894531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.026435818523168564, + "rewards_train/margins": 0.5856812335550785, + "rewards_train/rejected": -0.6121170520782471, + "step": 508 + }, + { + "epoch": 0.14, + "logps_train/chosen": -138.77102661132812, + "logps_train/ref_chosen": -129.0, + "logps_train/ref_rejected": -105.5, + "logps_train/rejected": -113.13682556152344, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.9345235228538513, + "rewards_train/margins": -0.14818471670150757, + "rewards_train/rejected": -0.7863388061523438, + "step": 509 + }, + { + "epoch": 0.14, + "learning_rate": 4.7340816008085305e-07, + "loss": 0.6675, + "step": 510 + }, + { + "epoch": 0.14, + "logps_train/chosen": -65.23442077636719, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -76.99596405029297, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3794972896575928, + "rewards_train/margins": -0.01701033115386963, + "rewards_train/rejected": -0.36248695850372314, + "step": 510 + }, + { + "epoch": 0.14, + "logps_train/chosen": -98.24939727783203, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -100.0, + "logps_train/rejected": -104.49810028076172, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.4968145787715912, + "rewards_train/margins": -0.05169233679771423, + "rewards_train/rejected": -0.44512224197387695, + "step": 511 + }, + { + "epoch": 0.14, + "learning_rate": 4.7312970896497027e-07, + "loss": 0.7327, + "step": 512 + }, + { + "epoch": 0.14, + "logps_train/chosen": -23.39773941040039, + "logps_train/ref_chosen": -22.125, + "logps_train/ref_rejected": -33.0, + "logps_train/rejected": -34.852760314941406, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.12141447514295578, + "rewards_train/margins": 0.05683001130819321, + "rewards_train/rejected": -0.178244486451149, + "step": 512 + }, + { + "epoch": 0.14, + "logps_train/chosen": -34.62717819213867, + "logps_train/ref_chosen": -32.0, + "logps_train/ref_rejected": -19.75, + "logps_train/rejected": -22.485401153564453, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.26056942343711853, + "rewards_train/margins": 0.011530131101608276, + "rewards_train/rejected": -0.2720995545387268, + "step": 513 + }, + { + "epoch": 0.14, + "learning_rate": 4.728498903291135e-07, + "loss": 0.6824, + "step": 514 + }, + { + "epoch": 0.14, + "logps_train/chosen": -75.23242950439453, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -93.727294921875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5572268962860107, + "rewards_train/margins": 0.08581513166427612, + "rewards_train/rejected": -0.6430420279502869, + "step": 514 + }, + { + "epoch": 0.14, + "logps_train/chosen": -31.645565032958984, + "logps_train/ref_chosen": -30.25, + "logps_train/ref_rejected": -34.5, + "logps_train/rejected": -40.616207122802734, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.13799408078193665, + "rewards_train/margins": 0.4856139123439789, + "rewards_train/rejected": -0.6236079931259155, + "step": 515 + }, + { + "epoch": 0.14, + "learning_rate": 4.7256870588823847e-07, + "loss": 0.6125, + "step": 516 + }, + { + "epoch": 0.14, + "logps_train/chosen": -81.1720199584961, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -94.9151611328125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.20392099022865295, + "rewards_train/margins": 0.5293920934200287, + "rewards_train/rejected": -0.7333130836486816, + "step": 516 + }, + { + "epoch": 0.14, + "logps_train/chosen": -62.583106994628906, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -79.09172058105469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3465920090675354, + "rewards_train/margins": 0.5137519240379333, + "rewards_train/rejected": -0.8603439331054688, + "step": 517 + }, + { + "epoch": 0.14, + "learning_rate": 4.722861573656716e-07, + "loss": 0.5278, + "step": 518 + }, + { + "epoch": 0.14, + "logps_train/chosen": -80.66468048095703, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -99.61990356445312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5630502104759216, + "rewards_train/margins": 0.6380017399787903, + "rewards_train/rejected": -1.201051950454712, + "step": 518 + }, + { + "epoch": 0.15, + "logps_train/chosen": -76.64894104003906, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -83.43063354492188, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.2889173924922943, + "rewards_train/margins": 0.2760217487812042, + "rewards_train/rejected": -0.5649391412734985, + "step": 519 + }, + { + "epoch": 0.15, + "learning_rate": 4.7200224649309974e-07, + "loss": 0.5385, + "step": 520 + }, + { + "epoch": 0.15, + "logps_train/chosen": -42.11444854736328, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -48.564491271972656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.08040061593055725, + "rewards_train/margins": 0.3436858654022217, + "rewards_train/rejected": -0.26328524947166443, + "step": 520 + }, + { + "epoch": 0.15, + "logps_train/chosen": -20.484411239624023, + "logps_train/ref_chosen": -20.125, + "logps_train/ref_rejected": -27.0, + "logps_train/rejected": -28.30255889892578, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.036624811589717865, + "rewards_train/margins": 0.09734208136796951, + "rewards_train/rejected": -0.13396689295768738, + "step": 521 + }, + { + "epoch": 0.15, + "learning_rate": 4.71716975010559e-07, + "loss": 0.6026, + "step": 522 + }, + { + "epoch": 0.15, + "logps_train/chosen": -107.2323226928711, + "logps_train/ref_chosen": -100.5, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -124.77798461914062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6540920734405518, + "rewards_train/margins": 0.6166746616363525, + "rewards_train/rejected": -1.2707667350769043, + "step": 522 + }, + { + "epoch": 0.15, + "logps_train/chosen": -72.55206298828125, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -104.24154663085938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.24426917731761932, + "rewards_train/margins": 0.5662143379449844, + "rewards_train/rejected": -0.8104835152626038, + "step": 523 + }, + { + "epoch": 0.15, + "learning_rate": 4.714303446664246e-07, + "loss": 0.4907, + "step": 524 + }, + { + "epoch": 0.15, + "logps_train/chosen": -110.03675079345703, + "logps_train/ref_chosen": -101.5, + "logps_train/ref_rejected": -118.0, + "logps_train/rejected": -134.18072509765625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8478155136108398, + "rewards_train/margins": 0.7593201398849487, + "rewards_train/rejected": -1.6071356534957886, + "step": 524 + }, + { + "epoch": 0.15, + "logps_train/chosen": -58.35194396972656, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -61.030277252197266, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3433973789215088, + "rewards_train/margins": 0.3086537718772888, + "rewards_train/rejected": -0.6520511507987976, + "step": 525 + }, + { + "epoch": 0.15, + "learning_rate": 4.7114235721740005e-07, + "loss": 0.5154, + "step": 526 + }, + { + "epoch": 0.15, + "logps_train/chosen": -54.793128967285156, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -68.19047546386719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3890784680843353, + "rewards_train/margins": 0.5676640570163727, + "rewards_train/rejected": -0.956742525100708, + "step": 526 + }, + { + "epoch": 0.15, + "logps_train/chosen": -66.0406494140625, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -69.14704132080078, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.24078363180160522, + "rewards_train/margins": 0.40204566717147827, + "rewards_train/rejected": -0.6428292989730835, + "step": 527 + }, + { + "epoch": 0.15, + "learning_rate": 4.708530144285062e-07, + "loss": 0.5388, + "step": 528 + }, + { + "epoch": 0.15, + "logps_train/chosen": -52.726234436035156, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -76.1851806640625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.37876343727111816, + "rewards_train/margins": 0.5359342694282532, + "rewards_train/rejected": -0.9146977066993713, + "step": 528 + }, + { + "epoch": 0.15, + "logps_train/chosen": -43.520347595214844, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -54.7512321472168, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2442224621772766, + "rewards_train/margins": 0.49515873193740845, + "rewards_train/rejected": -0.7393811941146851, + "step": 529 + }, + { + "epoch": 0.15, + "learning_rate": 4.7056231807307045e-07, + "loss": 0.5458, + "step": 530 + }, + { + "epoch": 0.15, + "logps_train/chosen": -82.77078247070312, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -73.08013916015625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.32512497901916504, + "rewards_train/margins": 0.46667778491973877, + "rewards_train/rejected": -0.7918027639389038, + "step": 530 + }, + { + "epoch": 0.15, + "logps_train/chosen": -75.02953338623047, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -77.51634979248047, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.06154738366603851, + "rewards_train/margins": 0.8650875836610794, + "rewards_train/rejected": -0.9266349673271179, + "step": 531 + }, + { + "epoch": 0.15, + "learning_rate": 4.70270269932716e-07, + "loss": 0.499, + "step": 532 + }, + { + "epoch": 0.15, + "logps_train/chosen": -23.459375381469727, + "logps_train/ref_chosen": -20.75, + "logps_train/ref_rejected": -32.25, + "logps_train/rejected": -36.59151077270508, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.26537108421325684, + "rewards_train/margins": 0.15960031747817993, + "rewards_train/rejected": -0.42497140169143677, + "step": 532 + }, + { + "epoch": 0.15, + "logps_train/chosen": -51.84281921386719, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -57.507625579833984, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2215864062309265, + "rewards_train/margins": 0.09128543734550476, + "rewards_train/rejected": -0.3128718435764313, + "step": 533 + }, + { + "epoch": 0.15, + "learning_rate": 4.699768717973511e-07, + "loss": 0.6438, + "step": 534 + }, + { + "epoch": 0.15, + "logps_train/chosen": -76.173583984375, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -99.55367279052734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5900144577026367, + "rewards_train/margins": 0.49660229682922363, + "rewards_train/rejected": -1.0866167545318604, + "step": 534 + }, + { + "epoch": 0.15, + "logps_train/chosen": -70.67461395263672, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -81.53765869140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2904106378555298, + "rewards_train/margins": 0.7003673315048218, + "rewards_train/rejected": -0.9907779693603516, + "step": 535 + }, + { + "epoch": 0.15, + "learning_rate": 4.696821254651574e-07, + "loss": 0.5112, + "step": 536 + }, + { + "epoch": 0.15, + "logps_train/chosen": -76.12255859375, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -80.60302734375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5864747166633606, + "rewards_train/margins": 0.7764647603034973, + "rewards_train/rejected": -1.362939476966858, + "step": 536 + }, + { + "epoch": 0.15, + "logps_train/chosen": -104.69821166992188, + "logps_train/ref_chosen": -99.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -114.86045837402344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5728488564491272, + "rewards_train/margins": 1.0352677702903748, + "rewards_train/rejected": -1.608116626739502, + "step": 537 + }, + { + "epoch": 0.15, + "learning_rate": 4.693860327425799e-07, + "loss": 0.4259, + "step": 538 + }, + { + "epoch": 0.15, + "logps_train/chosen": -38.83859634399414, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -30.375, + "logps_train/rejected": -32.0008659362793, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.08053939044475555, + "rewards_train/margins": 0.07911746203899384, + "rewards_train/rejected": -0.1596568524837494, + "step": 538 + }, + { + "epoch": 0.15, + "logps_train/chosen": -130.12071228027344, + "logps_train/ref_chosen": -124.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -126.61973571777344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5995713472366333, + "rewards_train/margins": 0.9575196504592896, + "rewards_train/rejected": -1.5570909976959229, + "step": 539 + }, + { + "epoch": 0.15, + "learning_rate": 4.690885954443151e-07, + "loss": 0.5288, + "step": 540 + }, + { + "epoch": 0.15, + "logps_train/chosen": -89.41333770751953, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -105.00894165039062, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.9659432768821716, + "rewards_train/margins": 0.3537004590034485, + "rewards_train/rejected": -1.3196437358856201, + "step": 540 + }, + { + "epoch": 0.15, + "logps_train/chosen": -15.983552932739258, + "logps_train/ref_chosen": -16.125, + "logps_train/ref_rejected": -23.75, + "logps_train/rejected": -24.74352264404297, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.015365377068519592, + "rewards_train/margins": 0.11022551357746124, + "rewards_train/rejected": -0.09486013650894165, + "step": 541 + }, + { + "epoch": 0.15, + "learning_rate": 4.687898153933001e-07, + "loss": 0.6695, + "step": 542 + }, + { + "epoch": 0.15, + "logps_train/chosen": -67.36192321777344, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -68.09326171875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.2351670265197754, + "rewards_train/margins": 0.49930596351623535, + "rewards_train/rejected": -0.7344729900360107, + "step": 542 + }, + { + "epoch": 0.15, + "logps_train/chosen": -69.2124252319336, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -106.5, + "logps_train/rejected": -114.23719024658203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.302492618560791, + "rewards_train/margins": 0.47435152530670166, + "rewards_train/rejected": -0.7768441438674927, + "step": 543 + }, + { + "epoch": 0.15, + "learning_rate": 4.6848969442070177e-07, + "loss": 0.5516, + "step": 544 + }, + { + "epoch": 0.15, + "logps_train/chosen": -34.0140495300293, + "logps_train/ref_chosen": -34.5, + "logps_train/ref_rejected": -30.5, + "logps_train/rejected": -34.26715087890625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.04742322117090225, + "rewards_train/margins": 0.4264822378754616, + "rewards_train/rejected": -0.3790590167045593, + "step": 544 + }, + { + "epoch": 0.15, + "logps_train/chosen": -57.643898010253906, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -74.52793884277344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.2847025394439697, + "rewards_train/margins": 0.4878177046775818, + "rewards_train/rejected": -0.7725202441215515, + "step": 545 + }, + { + "epoch": 0.15, + "learning_rate": 4.6818823436590475e-07, + "loss": 0.5354, + "step": 546 + }, + { + "epoch": 0.15, + "logps_train/chosen": -76.83927917480469, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -94.51193237304688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.28275570273399353, + "rewards_train/margins": 0.28855451941490173, + "rewards_train/rejected": -0.5713102221488953, + "step": 546 + }, + { + "epoch": 0.15, + "logps_train/chosen": -111.79127502441406, + "logps_train/ref_chosen": -102.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -82.41812133789062, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9564964771270752, + "rewards_train/margins": 0.010339975357055664, + "rewards_train/rejected": -0.9668364524841309, + "step": 547 + }, + { + "epoch": 0.15, + "learning_rate": 4.6788543707650116e-07, + "loss": 0.6551, + "step": 548 + }, + { + "epoch": 0.15, + "logps_train/chosen": -69.09648132324219, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -72.76083374023438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4018356502056122, + "rewards_train/margins": 0.3637010157108307, + "rewards_train/rejected": -0.7655366659164429, + "step": 548 + }, + { + "epoch": 0.15, + "logps_train/chosen": -64.34158325195312, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -54.25, + "logps_train/rejected": -61.416481018066406, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.7240022420883179, + "rewards_train/margins": -0.015166401863098145, + "rewards_train/rejected": -0.7088358402252197, + "step": 549 + }, + { + "epoch": 0.15, + "learning_rate": 4.6758130440827864e-07, + "loss": 0.6297, + "step": 550 + }, + { + "epoch": 0.15, + "logps_train/chosen": -54.109127044677734, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -57.748172760009766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.0011966973543167114, + "rewards_train/margins": 0.6152716428041458, + "rewards_train/rejected": -0.6140749454498291, + "step": 550 + }, + { + "epoch": 0.15, + "logps_train/chosen": -63.19514465332031, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -108.71592712402344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.15935802459716797, + "rewards_train/margins": 0.9356725215911865, + "rewards_train/rejected": -1.0950305461883545, + "step": 551 + }, + { + "epoch": 0.15, + "learning_rate": 4.672758382252089e-07, + "loss": 0.445, + "step": 552 + }, + { + "epoch": 0.15, + "logps_train/chosen": -50.334468841552734, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -47.449493408203125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.17289999127388, + "rewards_train/margins": 0.23220571875572205, + "rewards_train/rejected": -0.40510571002960205, + "step": 552 + }, + { + "epoch": 0.15, + "logps_train/chosen": -50.62821578979492, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -56.75, + "logps_train/rejected": -62.52609634399414, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2872355282306671, + "rewards_train/margins": 0.28178027272224426, + "rewards_train/rejected": -0.5690158009529114, + "step": 553 + }, + { + "epoch": 0.15, + "learning_rate": 4.669690403994366e-07, + "loss": 0.6167, + "step": 554 + }, + { + "epoch": 0.15, + "logps_train/chosen": -68.87958526611328, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -87.34165954589844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3912666440010071, + "rewards_train/margins": 0.6028479337692261, + "rewards_train/rejected": -0.9941145777702332, + "step": 554 + }, + { + "epoch": 0.16, + "logps_train/chosen": -43.510765075683594, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -53.08168411254883, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.11672595888376236, + "rewards_train/margins": 0.6015984639525414, + "rewards_train/rejected": -0.7183244228363037, + "step": 555 + }, + { + "epoch": 0.16, + "learning_rate": 4.666609128112681e-07, + "loss": 0.4856, + "step": 556 + }, + { + "epoch": 0.16, + "logps_train/chosen": -45.0446662902832, + "logps_train/ref_chosen": -44.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -74.44892883300781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.10915383696556091, + "rewards_train/margins": 0.6821256577968597, + "rewards_train/rejected": -0.7912794947624207, + "step": 556 + }, + { + "epoch": 0.16, + "logps_train/chosen": -35.762298583984375, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -39.75, + "logps_train/rejected": -42.06949996948242, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.0168546661734581, + "rewards_train/margins": 0.2217358574271202, + "rewards_train/rejected": -0.2385905236005783, + "step": 557 + }, + { + "epoch": 0.16, + "learning_rate": 4.6635145734915914e-07, + "loss": 0.5506, + "step": 558 + }, + { + "epoch": 0.16, + "logps_train/chosen": -61.52577209472656, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -72.86044311523438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3555067777633667, + "rewards_train/margins": 0.3422565460205078, + "rewards_train/rejected": -0.6977633237838745, + "step": 558 + }, + { + "epoch": 0.16, + "logps_train/chosen": -60.89529037475586, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -69.43919372558594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2145289033651352, + "rewards_train/margins": 0.7424758821725845, + "rewards_train/rejected": -0.9570047855377197, + "step": 559 + }, + { + "epoch": 0.16, + "learning_rate": 4.6604067590970414e-07, + "loss": 0.5379, + "step": 560 + }, + { + "epoch": 0.16, + "logps_train/chosen": -42.53410339355469, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -41.25, + "logps_train/rejected": -48.164886474609375, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.034953486174345016, + "rewards_train/margins": 0.6631755717098713, + "rewards_train/rejected": -0.6981290578842163, + "step": 560 + }, + { + "epoch": 0.16, + "logps_train/chosen": -56.45689392089844, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -62.107749938964844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.07049411535263062, + "rewards_train/margins": 0.7445778250694275, + "rewards_train/rejected": -0.8150719404220581, + "step": 561 + }, + { + "epoch": 0.16, + "learning_rate": 4.657285703976239e-07, + "loss": 0.5179, + "step": 562 + }, + { + "epoch": 0.16, + "logps_train/chosen": -46.0301399230957, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -64.97852325439453, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5315294861793518, + "rewards_train/margins": 0.5905417799949646, + "rewards_train/rejected": -1.1220712661743164, + "step": 562 + }, + { + "epoch": 0.16, + "logps_train/chosen": -29.97800064086914, + "logps_train/ref_chosen": -28.75, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -48.09043502807617, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.1265111118555069, + "rewards_train/margins": 0.3512822836637497, + "rewards_train/rejected": -0.4777933955192566, + "step": 563 + }, + { + "epoch": 0.16, + "learning_rate": 4.6541514272575445e-07, + "loss": 0.5393, + "step": 564 + }, + { + "epoch": 0.16, + "logps_train/chosen": -59.70259094238281, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -66.1999282836914, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.22768065333366394, + "rewards_train/margins": 0.1819603443145752, + "rewards_train/rejected": -0.40964099764823914, + "step": 564 + }, + { + "epoch": 0.16, + "logps_train/chosen": -71.78914642333984, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -78.57498168945312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.23555545508861542, + "rewards_train/margins": 0.622724249958992, + "rewards_train/rejected": -0.8582797050476074, + "step": 565 + }, + { + "epoch": 0.16, + "learning_rate": 4.6510039481503485e-07, + "loss": 0.5683, + "step": 566 + }, + { + "epoch": 0.16, + "logps_train/chosen": -40.68168640136719, + "logps_train/ref_chosen": -38.75, + "logps_train/ref_rejected": -38.25, + "logps_train/rejected": -43.14057159423828, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.18135224282741547, + "rewards_train/margins": 0.31434543430805206, + "rewards_train/rejected": -0.49569767713546753, + "step": 566 + }, + { + "epoch": 0.16, + "logps_train/chosen": -82.29490661621094, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -87.00576782226562, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.9814441204071045, + "rewards_train/margins": -0.19434422254562378, + "rewards_train/rejected": -0.7870998978614807, + "step": 567 + }, + { + "epoch": 0.16, + "learning_rate": 4.6478432859449583e-07, + "loss": 0.6966, + "step": 568 + }, + { + "epoch": 0.16, + "logps_train/chosen": -48.82793045043945, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -43.75, + "logps_train/rejected": -49.01304626464844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.1437305361032486, + "rewards_train/margins": 0.3829648345708847, + "rewards_train/rejected": -0.5266953706741333, + "step": 568 + }, + { + "epoch": 0.16, + "logps_train/chosen": -19.0291748046875, + "logps_train/ref_chosen": -19.875, + "logps_train/ref_rejected": -23.125, + "logps_train/rejected": -24.782997131347656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.07878405600786209, + "rewards_train/margins": 0.25057750195264816, + "rewards_train/rejected": -0.17179344594478607, + "step": 569 + }, + { + "epoch": 0.16, + "learning_rate": 4.644669460012478e-07, + "loss": 0.5729, + "step": 570 + }, + { + "epoch": 0.16, + "logps_train/chosen": -72.10989379882812, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -66.34757995605469, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.19704875349998474, + "rewards_train/margins": 0.16400322318077087, + "rewards_train/rejected": -0.3610519766807556, + "step": 570 + }, + { + "epoch": 0.16, + "logps_train/chosen": -62.749656677246094, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -90.15699768066406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.12464368343353271, + "rewards_train/margins": 0.8118283152580261, + "rewards_train/rejected": -0.6871846318244934, + "step": 571 + }, + { + "epoch": 0.16, + "learning_rate": 4.641482489804689e-07, + "loss": 0.5792, + "step": 572 + }, + { + "epoch": 0.16, + "logps_train/chosen": -87.15548706054688, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -105.77081298828125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.08429862558841705, + "rewards_train/margins": 0.4150484651327133, + "rewards_train/rejected": -0.49934709072113037, + "step": 572 + }, + { + "epoch": 0.16, + "logps_train/chosen": -70.7585678100586, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -90.46743774414062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.512184739112854, + "rewards_train/margins": 0.8380746841430664, + "rewards_train/rejected": -1.3502594232559204, + "step": 573 + }, + { + "epoch": 0.16, + "learning_rate": 4.638282394853932e-07, + "loss": 0.4728, + "step": 574 + }, + { + "epoch": 0.16, + "logps_train/chosen": -68.72225189208984, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -88.58625793457031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6972252130508423, + "rewards_train/margins": 0.637474775314331, + "rewards_train/rejected": -1.3346999883651733, + "step": 574 + }, + { + "epoch": 0.16, + "logps_train/chosen": -90.45333099365234, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -98.67056274414062, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3683798611164093, + "rewards_train/margins": 0.24164548516273499, + "rewards_train/rejected": -0.6100253462791443, + "step": 575 + }, + { + "epoch": 0.16, + "learning_rate": 4.6350691947729845e-07, + "loss": 0.5621, + "step": 576 + }, + { + "epoch": 0.16, + "logps_train/chosen": -77.0277099609375, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -77.37614440917969, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6768916249275208, + "rewards_train/margins": 0.6372849345207214, + "rewards_train/rejected": -1.3141765594482422, + "step": 576 + }, + { + "epoch": 0.16, + "logps_train/chosen": -37.29064178466797, + "logps_train/ref_chosen": -33.75, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -58.67890930175781, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.35047563910484314, + "rewards_train/margins": 0.5439777672290802, + "rewards_train/rejected": -0.8944534063339233, + "step": 577 + }, + { + "epoch": 0.16, + "learning_rate": 4.631842909254947e-07, + "loss": 0.5355, + "step": 578 + }, + { + "epoch": 0.16, + "logps_train/chosen": -84.76373291015625, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -108.00592041015625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5107486844062805, + "rewards_train/margins": 1.342577874660492, + "rewards_train/rejected": -1.8533265590667725, + "step": 578 + }, + { + "epoch": 0.16, + "logps_train/chosen": -66.60935974121094, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -92.59552001953125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2109362781047821, + "rewards_train/margins": 1.2681472599506378, + "rewards_train/rejected": -1.47908353805542, + "step": 579 + }, + { + "epoch": 0.16, + "learning_rate": 4.628603558073115e-07, + "loss": 0.3661, + "step": 580 + }, + { + "epoch": 0.16, + "logps_train/chosen": -81.98976135253906, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -95.69758605957031, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.35717883706092834, + "rewards_train/margins": 0.37820491194725037, + "rewards_train/rejected": -0.7353837490081787, + "step": 580 + }, + { + "epoch": 0.16, + "logps_train/chosen": -80.45840454101562, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -93.5, + "logps_train/rejected": -107.05191040039062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3048250675201416, + "rewards_train/margins": 1.0464599132537842, + "rewards_train/rejected": -1.3512849807739258, + "step": 581 + }, + { + "epoch": 0.16, + "learning_rate": 4.6253511610808625e-07, + "loss": 0.4926, + "step": 582 + }, + { + "epoch": 0.16, + "logps_train/chosen": -87.01325988769531, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -77.6513671875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4786696434020996, + "rewards_train/margins": 0.3825607895851135, + "rewards_train/rejected": -0.8612304329872131, + "step": 582 + }, + { + "epoch": 0.16, + "logps_train/chosen": -49.4372444152832, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -75.7767105102539, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.023411860689520836, + "rewards_train/margins": 0.7585559580475092, + "rewards_train/rejected": -0.78196781873703, + "step": 583 + }, + { + "epoch": 0.16, + "learning_rate": 4.622085738211518e-07, + "loss": 0.5155, + "step": 584 + }, + { + "epoch": 0.16, + "logps_train/chosen": -44.25600051879883, + "logps_train/ref_chosen": -38.75, + "logps_train/ref_rejected": -44.75, + "logps_train/rejected": -52.15533447265625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5583515763282776, + "rewards_train/margins": 0.17606604099273682, + "rewards_train/rejected": -0.7344176173210144, + "step": 584 + }, + { + "epoch": 0.16, + "logps_train/chosen": -35.850746154785156, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -63.023162841796875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.03819965571165085, + "rewards_train/margins": 0.5672416165471077, + "rewards_train/rejected": -0.6054412722587585, + "step": 585 + }, + { + "epoch": 0.16, + "learning_rate": 4.618807309478243e-07, + "loss": 0.5876, + "step": 586 + }, + { + "epoch": 0.16, + "logps_train/chosen": -73.76558685302734, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -79.83191680908203, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7403284311294556, + "rewards_train/margins": 0.5956953763961792, + "rewards_train/rejected": -1.3360238075256348, + "step": 586 + }, + { + "epoch": 0.16, + "logps_train/chosen": -91.15377044677734, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -92.85140991210938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.592720627784729, + "rewards_train/margins": 0.6736701726913452, + "rewards_train/rejected": -1.2663908004760742, + "step": 587 + }, + { + "epoch": 0.16, + "learning_rate": 4.6155158949739103e-07, + "loss": 0.4957, + "step": 588 + }, + { + "epoch": 0.16, + "logps_train/chosen": -53.04076385498047, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -53.293479919433594, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.4956779479980469, + "rewards_train/margins": 0.08484196662902832, + "rewards_train/rejected": -0.5805199146270752, + "step": 588 + }, + { + "epoch": 0.16, + "logps_train/chosen": -52.326904296875, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -48.143741607666016, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3766355514526367, + "rewards_train/margins": 0.007074713706970215, + "rewards_train/rejected": -0.38371026515960693, + "step": 589 + }, + { + "epoch": 0.16, + "learning_rate": 4.6122115148709793e-07, + "loss": 0.683, + "step": 590 + }, + { + "epoch": 0.16, + "logps_train/chosen": -59.204689025878906, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -65.17899322509766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.11109380424022675, + "rewards_train/margins": 0.21383674442768097, + "rewards_train/rejected": -0.3249305486679077, + "step": 590 + }, + { + "epoch": 0.17, + "logps_train/chosen": -60.41273498535156, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -82.80255126953125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.24361774325370789, + "rewards_train/margins": 1.0311680138111115, + "rewards_train/rejected": -1.2747857570648193, + "step": 591 + }, + { + "epoch": 0.17, + "learning_rate": 4.608894189421374e-07, + "loss": 0.5469, + "step": 592 + }, + { + "epoch": 0.17, + "logps_train/chosen": -43.5977668762207, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -51.89591598510742, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.15645642578601837, + "rewards_train/margins": 0.42454157769680023, + "rewards_train/rejected": -0.5809980034828186, + "step": 592 + }, + { + "epoch": 0.17, + "logps_train/chosen": -97.82725524902344, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -89.68708801269531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6387799978256226, + "rewards_train/margins": 0.6754366159439087, + "rewards_train/rejected": -1.3142166137695312, + "step": 593 + }, + { + "epoch": 0.17, + "learning_rate": 4.6055639389563573e-07, + "loss": 0.5269, + "step": 594 + }, + { + "epoch": 0.17, + "logps_train/chosen": -78.71722412109375, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -87.13782501220703, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8431581258773804, + "rewards_train/margins": 0.5284371376037598, + "rewards_train/rejected": -1.3715952634811401, + "step": 594 + }, + { + "epoch": 0.17, + "logps_train/chosen": -98.85196685791016, + "logps_train/ref_chosen": -94.5, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -107.1817626953125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4551188051700592, + "rewards_train/margins": 0.703193873167038, + "rewards_train/rejected": -1.1583126783370972, + "step": 595 + }, + { + "epoch": 0.17, + "learning_rate": 4.6022207838864073e-07, + "loss": 0.4972, + "step": 596 + }, + { + "epoch": 0.17, + "logps_train/chosen": -104.48693084716797, + "logps_train/ref_chosen": -100.0, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -122.48839569091797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.46881014108657837, + "rewards_train/margins": 0.839794933795929, + "rewards_train/rejected": -1.3086050748825073, + "step": 596 + }, + { + "epoch": 0.17, + "logps_train/chosen": -72.1766357421875, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -88.05975341796875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.34539756178855896, + "rewards_train/margins": 0.8301087915897369, + "rewards_train/rejected": -1.175506353378296, + "step": 597 + }, + { + "epoch": 0.17, + "learning_rate": 4.598864744701092e-07, + "loss": 0.4649, + "step": 598 + }, + { + "epoch": 0.17, + "logps_train/chosen": -57.29808807373047, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -78.87712097167969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2516838014125824, + "rewards_train/margins": 1.1243096888065338, + "rewards_train/rejected": -1.3759934902191162, + "step": 598 + }, + { + "epoch": 0.17, + "logps_train/chosen": -71.5537109375, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -84.06674194335938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.1652102768421173, + "rewards_train/margins": 0.9629485309123993, + "rewards_train/rejected": -1.1281588077545166, + "step": 599 + }, + { + "epoch": 0.17, + "learning_rate": 4.595495841968944e-07, + "loss": 0.4377, + "step": 600 + }, + { + "epoch": 0.17, + "logps_train/chosen": -35.55915832519531, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -53.199073791503906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.011466770432889462, + "rewards_train/margins": 0.4583270838484168, + "rewards_train/rejected": -0.44686031341552734, + "step": 600 + }, + { + "epoch": 0.17, + "logps_train/chosen": -69.23992919921875, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -76.4552230834961, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.0794619768857956, + "rewards_train/margins": 0.830122783780098, + "rewards_train/rejected": -0.9095847606658936, + "step": 601 + }, + { + "epoch": 0.17, + "learning_rate": 4.592114096337333e-07, + "loss": 0.487, + "step": 602 + }, + { + "epoch": 0.17, + "logps_train/chosen": -61.107452392578125, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -54.82007598876953, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6993560791015625, + "rewards_train/margins": -0.008034110069274902, + "rewards_train/rejected": -0.6913219690322876, + "step": 602 + }, + { + "epoch": 0.17, + "logps_train/chosen": -48.074554443359375, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -72.86968231201172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3742521107196808, + "rewards_train/margins": 0.7513880431652069, + "rewards_train/rejected": -1.1256401538848877, + "step": 603 + }, + { + "epoch": 0.17, + "learning_rate": 4.588719528532341e-07, + "loss": 0.6121, + "step": 604 + }, + { + "epoch": 0.17, + "logps_train/chosen": -55.13451385498047, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -60.32548522949219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3062373399734497, + "rewards_train/margins": 0.35835492610931396, + "rewards_train/rejected": -0.6645922660827637, + "step": 604 + }, + { + "epoch": 0.17, + "logps_train/chosen": -79.34104919433594, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -90.52935028076172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3741443455219269, + "rewards_train/margins": 0.7194158732891083, + "rewards_train/rejected": -1.0935602188110352, + "step": 605 + }, + { + "epoch": 0.17, + "learning_rate": 4.5853121593586365e-07, + "loss": 0.5288, + "step": 606 + }, + { + "epoch": 0.17, + "logps_train/chosen": -37.593963623046875, + "logps_train/ref_chosen": -35.0, + "logps_train/ref_rejected": -39.75, + "logps_train/rejected": -47.17842102050781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.25549042224884033, + "rewards_train/margins": 0.49770355224609375, + "rewards_train/rejected": -0.7531939744949341, + "step": 606 + }, + { + "epoch": 0.17, + "logps_train/chosen": -91.64691162109375, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -120.29931640625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.741253674030304, + "rewards_train/margins": 0.7820373177528381, + "rewards_train/rejected": -1.523290991783142, + "step": 607 + }, + { + "epoch": 0.17, + "learning_rate": 4.581892009699342e-07, + "loss": 0.5044, + "step": 608 + }, + { + "epoch": 0.17, + "logps_train/chosen": -69.13575744628906, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -83.65777587890625, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.34111425280570984, + "rewards_train/margins": 0.2746632397174835, + "rewards_train/rejected": -0.6157774925231934, + "step": 608 + }, + { + "epoch": 0.17, + "logps_train/chosen": -73.95113372802734, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -81.9327392578125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5802695751190186, + "rewards_train/margins": 0.517691969871521, + "rewards_train/rejected": -1.0979615449905396, + "step": 609 + }, + { + "epoch": 0.17, + "learning_rate": 4.578459100515911e-07, + "loss": 0.558, + "step": 610 + }, + { + "epoch": 0.17, + "logps_train/chosen": -103.4654541015625, + "logps_train/ref_chosen": -96.5, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -131.02651977539062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6910765171051025, + "rewards_train/margins": 0.8644089698791504, + "rewards_train/rejected": -1.555485486984253, + "step": 610 + }, + { + "epoch": 0.17, + "logps_train/chosen": -90.72823333740234, + "logps_train/ref_chosen": -90.5, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -118.1752700805664, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.022041872143745422, + "rewards_train/margins": 1.3849381655454636, + "rewards_train/rejected": -1.406980037689209, + "step": 611 + }, + { + "epoch": 0.17, + "learning_rate": 4.5750134528479987e-07, + "loss": 0.4026, + "step": 612 + }, + { + "epoch": 0.17, + "logps_train/chosen": -38.14342498779297, + "logps_train/ref_chosen": -38.75, + "logps_train/ref_rejected": -40.0, + "logps_train/rejected": -42.50956726074219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.0489390604197979, + "rewards_train/margins": 0.29442719742655754, + "rewards_train/rejected": -0.24548813700675964, + "step": 612 + }, + { + "epoch": 0.17, + "logps_train/chosen": -71.17725372314453, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -94.51168823242188, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.2556159794330597, + "rewards_train/margins": 0.49086567759513855, + "rewards_train/rejected": -0.7464816570281982, + "step": 613 + }, + { + "epoch": 0.17, + "learning_rate": 4.57155508781333e-07, + "loss": 0.5611, + "step": 614 + }, + { + "epoch": 0.17, + "logps_train/chosen": -75.19429016113281, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -105.45350646972656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.47997623682022095, + "rewards_train/margins": 1.1724061369895935, + "rewards_train/rejected": -1.6523823738098145, + "step": 614 + }, + { + "epoch": 0.17, + "logps_train/chosen": -81.6805648803711, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -78.3815689086914, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.77650386095047, + "rewards_train/margins": 0.5130689740180969, + "rewards_train/rejected": -1.289572834968567, + "step": 615 + }, + { + "epoch": 0.17, + "learning_rate": 4.568084026607574e-07, + "loss": 0.4385, + "step": 616 + }, + { + "epoch": 0.17, + "logps_train/chosen": -83.96728515625, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -89.79374694824219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.44086921215057373, + "rewards_train/margins": 0.8107714653015137, + "rewards_train/rejected": -1.2516406774520874, + "step": 616 + }, + { + "epoch": 0.17, + "logps_train/chosen": -39.587135314941406, + "logps_train/ref_chosen": -36.5, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -49.2750244140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.310862272977829, + "rewards_train/margins": 0.37484338879585266, + "rewards_train/rejected": -0.6857056617736816, + "step": 617 + }, + { + "epoch": 0.17, + "learning_rate": 4.5646002905042096e-07, + "loss": 0.5272, + "step": 618 + }, + { + "epoch": 0.17, + "logps_train/chosen": -44.052940368652344, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -68.78353118896484, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2131064385175705, + "rewards_train/margins": 0.9093870669603348, + "rewards_train/rejected": -1.1224935054779053, + "step": 618 + }, + { + "epoch": 0.17, + "logps_train/chosen": -102.40580749511719, + "logps_train/ref_chosen": -100.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -88.42894744873047, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.22124440968036652, + "rewards_train/margins": 0.8700881451368332, + "rewards_train/rejected": -1.0913325548171997, + "step": 619 + }, + { + "epoch": 0.17, + "learning_rate": 4.5611039008544007e-07, + "loss": 0.4397, + "step": 620 + }, + { + "epoch": 0.17, + "logps_train/chosen": -97.11043548583984, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -96.13004302978516, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6344809532165527, + "rewards_train/margins": 0.3949296474456787, + "rewards_train/rejected": -1.0294106006622314, + "step": 620 + }, + { + "epoch": 0.17, + "logps_train/chosen": -61.36271286010742, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -63.25, + "logps_train/rejected": -75.26327514648438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.13861505687236786, + "rewards_train/margins": 1.0523604899644852, + "rewards_train/rejected": -1.190975546836853, + "step": 621 + }, + { + "epoch": 0.17, + "learning_rate": 4.5575948790868603e-07, + "loss": 0.4873, + "step": 622 + }, + { + "epoch": 0.17, + "logps_train/chosen": -93.0059814453125, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -103.60557556152344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3269651532173157, + "rewards_train/margins": 1.222850501537323, + "rewards_train/rejected": -1.5498156547546387, + "step": 622 + }, + { + "epoch": 0.17, + "logps_train/chosen": -82.55561828613281, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -94.36223602294922, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3866158127784729, + "rewards_train/margins": 0.893748939037323, + "rewards_train/rejected": -1.280364751815796, + "step": 623 + }, + { + "epoch": 0.17, + "learning_rate": 4.5540732467077233e-07, + "loss": 0.4044, + "step": 624 + }, + { + "epoch": 0.17, + "logps_train/chosen": -41.70579528808594, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -58.41779708862305, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.019016893580555916, + "rewards_train/margins": 0.7829191368073225, + "rewards_train/rejected": -0.8019360303878784, + "step": 624 + }, + { + "epoch": 0.17, + "logps_train/chosen": -37.41344451904297, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -52.190956115722656, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.18987981975078583, + "rewards_train/margins": -0.01346006989479065, + "rewards_train/rejected": -0.17641974985599518, + "step": 625 + }, + { + "epoch": 0.17, + "learning_rate": 4.5505390253004104e-07, + "loss": 0.5797, + "step": 626 + }, + { + "epoch": 0.17, + "logps_train/chosen": -75.43762969970703, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -86.45477294921875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3323371410369873, + "rewards_train/margins": 0.812847375869751, + "rewards_train/rejected": -1.1451845169067383, + "step": 626 + }, + { + "epoch": 0.18, + "logps_train/chosen": -85.30070495605469, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -118.30488586425781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9474532604217529, + "rewards_train/margins": 1.0707910060882568, + "rewards_train/rejected": -2.0182442665100098, + "step": 627 + }, + { + "epoch": 0.18, + "learning_rate": 4.5469922365254995e-07, + "loss": 0.4124, + "step": 628 + }, + { + "epoch": 0.18, + "logps_train/chosen": -68.51701354980469, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -91.02525329589844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3190349042415619, + "rewards_train/margins": 1.1977479755878448, + "rewards_train/rejected": -1.5167828798294067, + "step": 628 + }, + { + "epoch": 0.18, + "logps_train/chosen": -64.99640655517578, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -86.93624877929688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5605781674385071, + "rewards_train/margins": 1.1076564192771912, + "rewards_train/rejected": -1.6682345867156982, + "step": 629 + }, + { + "epoch": 0.18, + "learning_rate": 4.543432902120591e-07, + "loss": 0.4286, + "step": 630 + }, + { + "epoch": 0.18, + "logps_train/chosen": -70.08255004882812, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -76.27694702148438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8255029320716858, + "rewards_train/margins": 0.43936246633529663, + "rewards_train/rejected": -1.2648653984069824, + "step": 630 + }, + { + "epoch": 0.18, + "logps_train/chosen": -32.9892463684082, + "logps_train/ref_chosen": -28.375, + "logps_train/ref_rejected": -37.75, + "logps_train/rejected": -42.423362731933594, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.45908087491989136, + "rewards_train/margins": -0.003072798252105713, + "rewards_train/rejected": -0.45600807666778564, + "step": 631 + }, + { + "epoch": 0.18, + "learning_rate": 4.5398610439001754e-07, + "loss": 0.6323, + "step": 632 + }, + { + "epoch": 0.18, + "logps_train/chosen": -70.62435150146484, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -88.1588134765625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2561851739883423, + "rewards_train/margins": 1.2421189546585083, + "rewards_train/rejected": -1.4983041286468506, + "step": 632 + }, + { + "epoch": 0.18, + "logps_train/chosen": -37.48558044433594, + "logps_train/ref_chosen": -37.25, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -52.9913330078125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.018186796456575394, + "rewards_train/margins": 0.5228411667048931, + "rewards_train/rejected": -0.5410279631614685, + "step": 633 + }, + { + "epoch": 0.18, + "learning_rate": 4.5362766837555e-07, + "loss": 0.462, + "step": 634 + }, + { + "epoch": 0.18, + "logps_train/chosen": -59.0457878112793, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -66.14567565917969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3864145874977112, + "rewards_train/margins": 0.9375272393226624, + "rewards_train/rejected": -1.3239418268203735, + "step": 634 + }, + { + "epoch": 0.18, + "logps_train/chosen": -50.50392150878906, + "logps_train/ref_chosen": -47.25, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -54.16680145263672, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.31679850816726685, + "rewards_train/margins": 0.23738187551498413, + "rewards_train/rejected": -0.554180383682251, + "step": 635 + }, + { + "epoch": 0.18, + "learning_rate": 4.5326798436544323e-07, + "loss": 0.5731, + "step": 636 + }, + { + "epoch": 0.18, + "logps_train/chosen": -61.84890365600586, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -76.36966705322266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.02512475848197937, + "rewards_train/margins": 0.9423111975193024, + "rewards_train/rejected": -0.9674359560012817, + "step": 636 + }, + { + "epoch": 0.18, + "logps_train/chosen": -74.06765747070312, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -85.84880065917969, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.15209071338176727, + "rewards_train/margins": 0.5753799229860306, + "rewards_train/rejected": -0.7274706363677979, + "step": 637 + }, + { + "epoch": 0.18, + "learning_rate": 4.5290705456413274e-07, + "loss": 0.4851, + "step": 638 + }, + { + "epoch": 0.18, + "logps_train/chosen": -71.73297882080078, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -71.759033203125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.34693074226379395, + "rewards_train/margins": 0.4447438716888428, + "rewards_train/rejected": -0.7916746139526367, + "step": 638 + }, + { + "epoch": 0.18, + "logps_train/chosen": -90.84843444824219, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -103.9638900756836, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8617967367172241, + "rewards_train/margins": 0.9881085157394409, + "rewards_train/rejected": -1.849905252456665, + "step": 639 + }, + { + "epoch": 0.18, + "learning_rate": 4.525448811836895e-07, + "loss": 0.5341, + "step": 640 + }, + { + "epoch": 0.18, + "logps_train/chosen": -96.2398910522461, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -110.59107971191406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.605202317237854, + "rewards_train/margins": 0.5812125205993652, + "rewards_train/rejected": -1.1864148378372192, + "step": 640 + }, + { + "epoch": 0.18, + "logps_train/chosen": -40.226585388183594, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -58.57625198364258, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.06034912168979645, + "rewards_train/margins": 0.5128963440656662, + "rewards_train/rejected": -0.45254722237586975, + "step": 641 + }, + { + "epoch": 0.18, + "learning_rate": 4.521814664438058e-07, + "loss": 0.5262, + "step": 642 + }, + { + "epoch": 0.18, + "logps_train/chosen": -40.263187408447266, + "logps_train/ref_chosen": -36.0, + "logps_train/ref_rejected": -24.5, + "logps_train/rejected": -30.442285537719727, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.4275878965854645, + "rewards_train/margins": 0.1609766185283661, + "rewards_train/rejected": -0.5885645151138306, + "step": 642 + }, + { + "epoch": 0.18, + "logps_train/chosen": -33.801544189453125, + "logps_train/ref_chosen": -29.5, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -48.0215950012207, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.43054524064064026, + "rewards_train/margins": 0.026692569255828857, + "rewards_train/rejected": -0.4572378098964691, + "step": 643 + }, + { + "epoch": 0.18, + "learning_rate": 4.5181681257178235e-07, + "loss": 0.6861, + "step": 644 + }, + { + "epoch": 0.18, + "logps_train/chosen": -73.14881896972656, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -79.70219421386719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7141008377075195, + "rewards_train/margins": 0.7440091371536255, + "rewards_train/rejected": -1.458109974861145, + "step": 644 + }, + { + "epoch": 0.18, + "logps_train/chosen": -63.07398223876953, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -72.7054672241211, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.425659716129303, + "rewards_train/margins": 0.616371214389801, + "rewards_train/rejected": -1.042030930519104, + "step": 645 + }, + { + "epoch": 0.18, + "learning_rate": 4.514509218025139e-07, + "loss": 0.4893, + "step": 646 + }, + { + "epoch": 0.18, + "logps_train/chosen": -32.77337646484375, + "logps_train/ref_chosen": -32.75, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -51.40776824951172, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.001751527190208435, + "rewards_train/margins": 0.24722842872142792, + "rewards_train/rejected": -0.24897995591163635, + "step": 646 + }, + { + "epoch": 0.18, + "logps_train/chosen": -95.13325500488281, + "logps_train/ref_chosen": -90.5, + "logps_train/ref_rejected": -102.5, + "logps_train/rejected": -115.75078582763672, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4730914235115051, + "rewards_train/margins": 0.8523772358894348, + "rewards_train/rejected": -1.32546865940094, + "step": 647 + }, + { + "epoch": 0.18, + "learning_rate": 4.510837963784762e-07, + "loss": 0.5427, + "step": 648 + }, + { + "epoch": 0.18, + "logps_train/chosen": -61.911338806152344, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -90.363525390625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.12453193217515945, + "rewards_train/margins": 1.2088908478617668, + "rewards_train/rejected": -1.3334227800369263, + "step": 648 + }, + { + "epoch": 0.18, + "logps_train/chosen": -88.54911041259766, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -115.21802520751953, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.36233267188072205, + "rewards_train/margins": 1.0243140757083893, + "rewards_train/rejected": -1.3866467475891113, + "step": 649 + }, + { + "epoch": 0.18, + "learning_rate": 4.507154385497117e-07, + "loss": 0.4207, + "step": 650 + }, + { + "epoch": 0.18, + "logps_train/chosen": -81.17900085449219, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -102.59858703613281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.345048725605011, + "rewards_train/margins": 1.1343411803245544, + "rewards_train/rejected": -1.4793899059295654, + "step": 650 + }, + { + "epoch": 0.18, + "logps_train/chosen": -72.46735382080078, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -83.43834686279297, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6561101078987122, + "rewards_train/margins": 0.20139652490615845, + "rewards_train/rejected": -0.8575066328048706, + "step": 651 + }, + { + "epoch": 0.18, + "learning_rate": 4.5034585057381626e-07, + "loss": 0.5609, + "step": 652 + }, + { + "epoch": 0.18, + "logps_train/chosen": -65.99787902832031, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -60.746192932128906, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.363298624753952, + "rewards_train/margins": 0.71584752202034, + "rewards_train/rejected": -0.35254889726638794, + "step": 652 + }, + { + "epoch": 0.18, + "logps_train/chosen": -64.7233657836914, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -82.37864685058594, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3309304118156433, + "rewards_train/margins": 0.6815438866615295, + "rewards_train/rejected": -1.0124742984771729, + "step": 653 + }, + { + "epoch": 0.18, + "learning_rate": 4.49975034715925e-07, + "loss": 0.5347, + "step": 654 + }, + { + "epoch": 0.18, + "logps_train/chosen": -35.867515563964844, + "logps_train/ref_chosen": -34.75, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -55.71794128417969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.10979853570461273, + "rewards_train/margins": 0.4460289031267166, + "rewards_train/rejected": -0.5558274388313293, + "step": 654 + }, + { + "epoch": 0.18, + "logps_train/chosen": -96.11976623535156, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -111.7199935913086, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8938125371932983, + "rewards_train/margins": 0.9885386228561401, + "rewards_train/rejected": -1.8823511600494385, + "step": 655 + }, + { + "epoch": 0.18, + "learning_rate": 4.4960299324869857e-07, + "loss": 0.4862, + "step": 656 + }, + { + "epoch": 0.18, + "logps_train/chosen": -72.49343872070312, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -91.55741882324219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7552034258842468, + "rewards_train/margins": 1.1499527096748352, + "rewards_train/rejected": -1.905156135559082, + "step": 656 + }, + { + "epoch": 0.18, + "logps_train/chosen": -71.75846099853516, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -95.741455078125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4320959150791168, + "rewards_train/margins": 0.6506434381008148, + "rewards_train/rejected": -1.0827393531799316, + "step": 657 + }, + { + "epoch": 0.18, + "learning_rate": 4.4922972845230895e-07, + "loss": 0.4493, + "step": 658 + }, + { + "epoch": 0.18, + "logps_train/chosen": -85.15675354003906, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -89.21485137939453, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.24184703826904297, + "rewards_train/margins": 0.3972163796424866, + "rewards_train/rejected": -0.6390634179115295, + "step": 658 + }, + { + "epoch": 0.18, + "logps_train/chosen": -98.7637939453125, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -108.19346618652344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7701294422149658, + "rewards_train/margins": 1.2867178916931152, + "rewards_train/rejected": -2.056847333908081, + "step": 659 + }, + { + "epoch": 0.18, + "learning_rate": 4.4885524261442585e-07, + "loss": 0.4751, + "step": 660 + }, + { + "epoch": 0.18, + "logps_train/chosen": -79.21875, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -93.99169158935547, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0007820129394531, + "rewards_train/margins": 0.6814925670623779, + "rewards_train/rejected": -1.682274580001831, + "step": 660 + }, + { + "epoch": 0.18, + "logps_train/chosen": -68.07029724121094, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -94.90338897705078, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6750957369804382, + "rewards_train/margins": 0.9785241484642029, + "rewards_train/rejected": -1.6536198854446411, + "step": 661 + }, + { + "epoch": 0.19, + "learning_rate": 4.4847953803020246e-07, + "loss": 0.4814, + "step": 662 + }, + { + "epoch": 0.19, + "logps_train/chosen": -57.656044006347656, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -78.78140258789062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.1460731029510498, + "rewards_train/margins": 1.0043323040008545, + "rewards_train/rejected": -1.1504054069519043, + "step": 662 + }, + { + "epoch": 0.19, + "logps_train/chosen": -86.06736755371094, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -92.02853393554688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4536122679710388, + "rewards_train/margins": 0.43361592292785645, + "rewards_train/rejected": -0.8872281908988953, + "step": 663 + }, + { + "epoch": 0.19, + "learning_rate": 4.481026170022614e-07, + "loss": 0.4835, + "step": 664 + }, + { + "epoch": 0.19, + "logps_train/chosen": -75.21578216552734, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -88.8048095703125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7039028406143188, + "rewards_train/margins": 1.113687515258789, + "rewards_train/rejected": -1.817590355873108, + "step": 664 + }, + { + "epoch": 0.19, + "logps_train/chosen": -89.71745300292969, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -92.02498626708984, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.843620240688324, + "rewards_train/margins": 0.8002843260765076, + "rewards_train/rejected": -1.6439045667648315, + "step": 665 + }, + { + "epoch": 0.19, + "learning_rate": 4.4772448184068067e-07, + "loss": 0.5129, + "step": 666 + }, + { + "epoch": 0.19, + "logps_train/chosen": -85.15534210205078, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -94.31193542480469, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8545966148376465, + "rewards_train/margins": 0.23831546306610107, + "rewards_train/rejected": -1.0929120779037476, + "step": 666 + }, + { + "epoch": 0.19, + "logps_train/chosen": -23.884380340576172, + "logps_train/ref_chosen": -23.75, + "logps_train/ref_rejected": -18.5, + "logps_train/rejected": -23.69770050048828, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.014219170436263084, + "rewards_train/margins": 0.49969165958464146, + "rewards_train/rejected": -0.5139108300209045, + "step": 667 + }, + { + "epoch": 0.19, + "learning_rate": 4.4734513486297964e-07, + "loss": 0.5766, + "step": 668 + }, + { + "epoch": 0.19, + "logps_train/chosen": -56.38758850097656, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -69.25550079345703, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2778211534023285, + "rewards_train/margins": 0.9207759201526642, + "rewards_train/rejected": -1.1985970735549927, + "step": 668 + }, + { + "epoch": 0.19, + "logps_train/chosen": -59.53071212768555, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -39.25, + "logps_train/rejected": -45.45075225830078, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.21596190333366394, + "rewards_train/margins": 0.3982537090778351, + "rewards_train/rejected": -0.614215612411499, + "step": 669 + }, + { + "epoch": 0.19, + "learning_rate": 4.4696457839410427e-07, + "loss": 0.499, + "step": 670 + }, + { + "epoch": 0.19, + "logps_train/chosen": -61.48435592651367, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -52.75, + "logps_train/rejected": -67.98274230957031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7351545095443726, + "rewards_train/margins": 0.7779637575149536, + "rewards_train/rejected": -1.5131182670593262, + "step": 670 + }, + { + "epoch": 0.19, + "logps_train/chosen": -78.70622253417969, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -94.24607849121094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.7309736609458923, + "rewards_train/margins": 0.7086732983589172, + "rewards_train/rejected": -1.4396469593048096, + "step": 671 + }, + { + "epoch": 0.19, + "learning_rate": 4.465828147664137e-07, + "loss": 0.5492, + "step": 672 + }, + { + "epoch": 0.19, + "logps_train/chosen": -62.592857360839844, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -67.83057403564453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.34668827056884766, + "rewards_train/margins": 0.641251802444458, + "rewards_train/rejected": -0.9879400730133057, + "step": 672 + }, + { + "epoch": 0.19, + "logps_train/chosen": -73.33329010009766, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -77.51690673828125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6296181082725525, + "rewards_train/margins": 0.3064476251602173, + "rewards_train/rejected": -0.9360657334327698, + "step": 673 + }, + { + "epoch": 0.19, + "learning_rate": 4.4619984631966524e-07, + "loss": 0.5517, + "step": 674 + }, + { + "epoch": 0.19, + "logps_train/chosen": -50.96469497680664, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -58.390281677246094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.23562970757484436, + "rewards_train/margins": 0.494804710149765, + "rewards_train/rejected": -0.7304344177246094, + "step": 674 + }, + { + "epoch": 0.19, + "logps_train/chosen": -81.35869598388672, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -81.34394836425781, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.0551568940281868, + "rewards_train/margins": 1.0149314925074577, + "rewards_train/rejected": -1.0700883865356445, + "step": 675 + }, + { + "epoch": 0.19, + "learning_rate": 4.458156754010004e-07, + "loss": 0.4865, + "step": 676 + }, + { + "epoch": 0.19, + "logps_train/chosen": -56.81840515136719, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -73.55111694335938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7370163798332214, + "rewards_train/margins": 0.7875295281410217, + "rewards_train/rejected": -1.5245459079742432, + "step": 676 + }, + { + "epoch": 0.19, + "logps_train/chosen": -52.559120178222656, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -51.95066833496094, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.7412638664245605, + "rewards_train/margins": 0.03309953212738037, + "rewards_train/rejected": -0.7743633985519409, + "step": 677 + }, + { + "epoch": 0.19, + "learning_rate": 4.4543030436493036e-07, + "loss": 0.6319, + "step": 678 + }, + { + "epoch": 0.19, + "logps_train/chosen": -51.42641830444336, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -68.86459350585938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.34811049699783325, + "rewards_train/margins": 1.0535833239555359, + "rewards_train/rejected": -1.4016938209533691, + "step": 678 + }, + { + "epoch": 0.19, + "logps_train/chosen": -98.58399963378906, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -115.09819030761719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1162129640579224, + "rewards_train/margins": 0.6939965486526489, + "rewards_train/rejected": -1.8102095127105713, + "step": 679 + }, + { + "epoch": 0.19, + "learning_rate": 4.450437355733217e-07, + "loss": 0.4698, + "step": 680 + }, + { + "epoch": 0.19, + "logps_train/chosen": -54.211544036865234, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -60.974143981933594, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.5453733205795288, + "rewards_train/margins": 0.1684473156929016, + "rewards_train/rejected": -0.7138206362724304, + "step": 680 + }, + { + "epoch": 0.19, + "logps_train/chosen": -73.12992095947266, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -94.11784362792969, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6758828163146973, + "rewards_train/margins": 0.38355767726898193, + "rewards_train/rejected": -1.0594404935836792, + "step": 681 + }, + { + "epoch": 0.19, + "learning_rate": 4.4465597139538175e-07, + "loss": 0.6193, + "step": 682 + }, + { + "epoch": 0.19, + "logps_train/chosen": -62.34566116333008, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -85.4367904663086, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9302570223808289, + "rewards_train/margins": 0.7649847865104675, + "rewards_train/rejected": -1.6952418088912964, + "step": 682 + }, + { + "epoch": 0.19, + "logps_train/chosen": -80.58963775634766, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -67.1247329711914, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.2303017377853394, + "rewards_train/margins": 0.2347104549407959, + "rewards_train/rejected": -1.4650121927261353, + "step": 683 + }, + { + "epoch": 0.19, + "learning_rate": 4.442670142076441e-07, + "loss": 0.5485, + "step": 684 + }, + { + "epoch": 0.19, + "logps_train/chosen": -46.678382873535156, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -54.25, + "logps_train/rejected": -60.481929779052734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.01731795445084572, + "rewards_train/margins": 0.6487140096724033, + "rewards_train/rejected": -0.6313960552215576, + "step": 684 + }, + { + "epoch": 0.19, + "logps_train/chosen": -75.64036560058594, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -90.77500915527344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3628641664981842, + "rewards_train/margins": 0.8718629777431488, + "rewards_train/rejected": -1.234727144241333, + "step": 685 + }, + { + "epoch": 0.19, + "learning_rate": 4.4387686639395427e-07, + "loss": 0.4704, + "step": 686 + }, + { + "epoch": 0.19, + "logps_train/chosen": -97.7982177734375, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -110.5, + "logps_train/rejected": -132.18392944335938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4970096945762634, + "rewards_train/margins": 1.6504855751991272, + "rewards_train/rejected": -2.1474952697753906, + "step": 686 + }, + { + "epoch": 0.19, + "logps_train/chosen": -35.54115295410156, + "logps_train/ref_chosen": -32.0, + "logps_train/ref_rejected": -28.75, + "logps_train/rejected": -35.30659484863281, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.34591197967529297, + "rewards_train/margins": 0.30901503562927246, + "rewards_train/rejected": -0.6549270153045654, + "step": 687 + }, + { + "epoch": 0.19, + "learning_rate": 4.4348553034545455e-07, + "loss": 0.4681, + "step": 688 + }, + { + "epoch": 0.19, + "logps_train/chosen": -75.43375396728516, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -111.872802734375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.575553297996521, + "rewards_train/margins": 1.1656333208084106, + "rewards_train/rejected": -1.7411866188049316, + "step": 688 + }, + { + "epoch": 0.19, + "logps_train/chosen": -54.86640930175781, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -44.75, + "logps_train/rejected": -51.671295166015625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3280473053455353, + "rewards_train/margins": 0.3699416220188141, + "rewards_train/rejected": -0.6979889273643494, + "step": 689 + }, + { + "epoch": 0.19, + "learning_rate": 4.4309300846056997e-07, + "loss": 0.4749, + "step": 690 + }, + { + "epoch": 0.19, + "logps_train/chosen": -65.31968688964844, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -115.38182067871094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.12357078492641449, + "rewards_train/margins": 1.2216432243585587, + "rewards_train/rejected": -1.3452140092849731, + "step": 690 + }, + { + "epoch": 0.19, + "logps_train/chosen": -54.80620574951172, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -85.59770965576172, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.27944865822792053, + "rewards_train/margins": 0.6799319684505463, + "rewards_train/rejected": -0.9593806266784668, + "step": 691 + }, + { + "epoch": 0.19, + "learning_rate": 4.426993031449934e-07, + "loss": 0.4428, + "step": 692 + }, + { + "epoch": 0.19, + "logps_train/chosen": -81.28584289550781, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -98.84505462646484, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6164754629135132, + "rewards_train/margins": 1.0586549043655396, + "rewards_train/rejected": -1.6751303672790527, + "step": 692 + }, + { + "epoch": 0.19, + "logps_train/chosen": -56.008544921875, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -62.47737121582031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.12531700730323792, + "rewards_train/margins": 1.0977133214473724, + "rewards_train/rejected": -1.2230303287506104, + "step": 693 + }, + { + "epoch": 0.19, + "learning_rate": 4.423044168116703e-07, + "loss": 0.4061, + "step": 694 + }, + { + "epoch": 0.19, + "logps_train/chosen": -50.23446273803711, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -55.06452941894531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.27032139897346497, + "rewards_train/margins": 0.43417826294898987, + "rewards_train/rejected": -0.7044996619224548, + "step": 694 + }, + { + "epoch": 0.19, + "logps_train/chosen": -67.40570068359375, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -74.43222045898438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5548282861709595, + "rewards_train/margins": 0.3792140483856201, + "rewards_train/rejected": -0.9340423345565796, + "step": 695 + }, + { + "epoch": 0.19, + "learning_rate": 4.419083518807849e-07, + "loss": 0.5375, + "step": 696 + }, + { + "epoch": 0.19, + "logps_train/chosen": -66.59686279296875, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -87.62117767333984, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.528436005115509, + "rewards_train/margins": 0.6325101256370544, + "rewards_train/rejected": -1.1609461307525635, + "step": 696 + }, + { + "epoch": 0.19, + "logps_train/chosen": -96.261962890625, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -88.99104309082031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7295166254043579, + "rewards_train/margins": 0.39590609073638916, + "rewards_train/rejected": -1.125422716140747, + "step": 697 + }, + { + "epoch": 0.2, + "learning_rate": 4.415111107797445e-07, + "loss": 0.5404, + "step": 698 + }, + { + "epoch": 0.2, + "logps_train/chosen": -38.09441375732422, + "logps_train/ref_chosen": -38.75, + "logps_train/ref_rejected": -39.5, + "logps_train/rejected": -42.03752899169922, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.0683908611536026, + "rewards_train/margins": 0.31149922311306, + "rewards_train/rejected": -0.2431083619594574, + "step": 698 + }, + { + "epoch": 0.2, + "logps_train/chosen": -74.51948547363281, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -99.397216796875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6858353614807129, + "rewards_train/margins": 1.3628711700439453, + "rewards_train/rejected": -2.048706531524658, + "step": 699 + }, + { + "epoch": 0.2, + "learning_rate": 4.4111269594316504e-07, + "loss": 0.4587, + "step": 700 + }, + { + "epoch": 0.2, + "logps_train/chosen": -68.04469299316406, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -63.25, + "logps_train/rejected": -80.16493225097656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1466572284698486, + "rewards_train/margins": 0.534679651260376, + "rewards_train/rejected": -1.6813368797302246, + "step": 700 + }, + { + "epoch": 0.2, + "logps_train/chosen": -65.927490234375, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -88.21231079101562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.49196773767471313, + "rewards_train/margins": 0.9068029522895813, + "rewards_train/rejected": -1.3987706899642944, + "step": 701 + }, + { + "epoch": 0.2, + "learning_rate": 4.40713109812856e-07, + "loss": 0.4802, + "step": 702 + }, + { + "epoch": 0.2, + "logps_train/chosen": -43.46561050415039, + "logps_train/ref_chosen": -41.25, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -64.47077941894531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2250766158103943, + "rewards_train/margins": 1.2809858918190002, + "rewards_train/rejected": -1.5060625076293945, + "step": 702 + }, + { + "epoch": 0.2, + "logps_train/chosen": -93.71458435058594, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -76.2461929321289, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5913801193237305, + "rewards_train/margins": 0.34808313846588135, + "rewards_train/rejected": -0.9394632577896118, + "step": 703 + }, + { + "epoch": 0.2, + "learning_rate": 4.403123548378055e-07, + "loss": 0.496, + "step": 704 + }, + { + "epoch": 0.2, + "logps_train/chosen": -14.555025100708008, + "logps_train/ref_chosen": -13.875, + "logps_train/ref_rejected": -21.75, + "logps_train/rejected": -24.022647857666016, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.06907662749290466, + "rewards_train/margins": 0.1593601405620575, + "rewards_train/rejected": -0.22843676805496216, + "step": 704 + }, + { + "epoch": 0.2, + "logps_train/chosen": -55.99735641479492, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -88.1343002319336, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5546185374259949, + "rewards_train/margins": 1.595529854297638, + "rewards_train/rejected": -2.150148391723633, + "step": 705 + }, + { + "epoch": 0.2, + "learning_rate": 4.3991043347416545e-07, + "loss": 0.5, + "step": 706 + }, + { + "epoch": 0.2, + "logps_train/chosen": -15.79543685913086, + "logps_train/ref_chosen": -16.375, + "logps_train/ref_rejected": -23.375, + "logps_train/rejected": -24.89569091796875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.055417317897081375, + "rewards_train/margins": 0.21061145141720772, + "rewards_train/rejected": -0.15519413352012634, + "step": 706 + }, + { + "epoch": 0.2, + "logps_train/chosen": -93.42644500732422, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -99.34465026855469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.335613489151001, + "rewards_train/margins": 0.7207262516021729, + "rewards_train/rejected": -2.056339740753174, + "step": 707 + }, + { + "epoch": 0.2, + "learning_rate": 4.3950734818523606e-07, + "loss": 0.6011, + "step": 708 + }, + { + "epoch": 0.2, + "logps_train/chosen": -53.259437561035156, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -57.41320037841797, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4588903784751892, + "rewards_train/margins": 0.8788536190986633, + "rewards_train/rejected": -1.3377439975738525, + "step": 708 + }, + { + "epoch": 0.2, + "logps_train/chosen": -67.23185729980469, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -80.57467651367188, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.08325980603694916, + "rewards_train/margins": 0.32588331401348114, + "rewards_train/rejected": -0.24262350797653198, + "step": 709 + }, + { + "epoch": 0.2, + "learning_rate": 4.3910310144145137e-07, + "loss": 0.5841, + "step": 710 + }, + { + "epoch": 0.2, + "logps_train/chosen": -50.67464828491211, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -71.0008773803711, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.04099975898861885, + "rewards_train/margins": 1.0672911517322063, + "rewards_train/rejected": -1.1082909107208252, + "step": 710 + }, + { + "epoch": 0.2, + "logps_train/chosen": -48.37436294555664, + "logps_train/ref_chosen": -42.5, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -58.73229217529297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5963231325149536, + "rewards_train/margins": 0.5827653408050537, + "rewards_train/rejected": -1.1790884733200073, + "step": 711 + }, + { + "epoch": 0.2, + "learning_rate": 4.386976957203633e-07, + "loss": 0.4646, + "step": 712 + }, + { + "epoch": 0.2, + "logps_train/chosen": -65.23060607910156, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -74.45390319824219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3915172517299652, + "rewards_train/margins": 0.3273109495639801, + "rewards_train/rejected": -0.7188282012939453, + "step": 712 + }, + { + "epoch": 0.2, + "logps_train/chosen": -24.60601043701172, + "logps_train/ref_chosen": -24.625, + "logps_train/ref_rejected": -32.75, + "logps_train/rejected": -38.18189239501953, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.0063912393525242805, + "rewards_train/margins": 0.5437214495614171, + "rewards_train/rejected": -0.5373302102088928, + "step": 713 + }, + { + "epoch": 0.2, + "learning_rate": 4.3829113350662737e-07, + "loss": 0.5398, + "step": 714 + }, + { + "epoch": 0.2, + "logps_train/chosen": -81.86094665527344, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -108.54100799560547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4165637791156769, + "rewards_train/margins": 1.674256056547165, + "rewards_train/rejected": -2.090819835662842, + "step": 714 + }, + { + "epoch": 0.2, + "logps_train/chosen": -75.81840515136719, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -92.84339904785156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.519242525100708, + "rewards_train/margins": 1.4959566593170166, + "rewards_train/rejected": -2.0151991844177246, + "step": 715 + }, + { + "epoch": 0.2, + "learning_rate": 4.378834172919869e-07, + "loss": 0.3572, + "step": 716 + }, + { + "epoch": 0.2, + "logps_train/chosen": -28.508033752441406, + "logps_train/ref_chosen": -26.875, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -47.858375549316406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1649634838104248, + "rewards_train/margins": 0.5304440855979919, + "rewards_train/rejected": -0.6954075694084167, + "step": 716 + }, + { + "epoch": 0.2, + "logps_train/chosen": -85.81369018554688, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -103.82503509521484, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5036348104476929, + "rewards_train/margins": 0.19332194328308105, + "rewards_train/rejected": -1.696956753730774, + "step": 717 + }, + { + "epoch": 0.2, + "learning_rate": 4.3747454957525755e-07, + "loss": 0.5688, + "step": 718 + }, + { + "epoch": 0.2, + "logps_train/chosen": -47.84799575805664, + "logps_train/ref_chosen": -44.0, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -45.9123649597168, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3778659999370575, + "rewards_train/margins": 0.10194471478462219, + "rewards_train/rejected": -0.4798107147216797, + "step": 718 + }, + { + "epoch": 0.2, + "logps_train/chosen": -53.432533264160156, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -75.42215728759766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.431144118309021, + "rewards_train/margins": 1.0542360544204712, + "rewards_train/rejected": -1.4853801727294922, + "step": 719 + }, + { + "epoch": 0.2, + "learning_rate": 4.3706453286231273e-07, + "loss": 0.5331, + "step": 720 + }, + { + "epoch": 0.2, + "logps_train/chosen": -43.604515075683594, + "logps_train/ref_chosen": -36.5, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -55.57349395751953, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6987329125404358, + "rewards_train/margins": 0.7650619149208069, + "rewards_train/rejected": -1.4637948274612427, + "step": 720 + }, + { + "epoch": 0.2, + "logps_train/chosen": -62.73842239379883, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -93.53396606445312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5853657126426697, + "rewards_train/margins": 1.5977187752723694, + "rewards_train/rejected": -2.183084487915039, + "step": 721 + }, + { + "epoch": 0.2, + "learning_rate": 4.366533696660677e-07, + "loss": 0.4224, + "step": 722 + }, + { + "epoch": 0.2, + "logps_train/chosen": -36.93931579589844, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -55.6656379699707, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.06766213476657867, + "rewards_train/margins": 0.5907962769269943, + "rewards_train/rejected": -0.658458411693573, + "step": 722 + }, + { + "epoch": 0.2, + "logps_train/chosen": -99.7580795288086, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -102.43901824951172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3067646026611328, + "rewards_train/margins": 0.8185834884643555, + "rewards_train/rejected": -2.1253480911254883, + "step": 723 + }, + { + "epoch": 0.2, + "learning_rate": 4.362410625064642e-07, + "loss": 0.5024, + "step": 724 + }, + { + "epoch": 0.2, + "logps_train/chosen": -65.61380004882812, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -81.13764190673828, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4079624116420746, + "rewards_train/margins": 0.931974321603775, + "rewards_train/rejected": -1.3399367332458496, + "step": 724 + }, + { + "epoch": 0.2, + "logps_train/chosen": -54.373531341552734, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -67.60997772216797, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.2898923456668854, + "rewards_train/margins": 0.7055784165859222, + "rewards_train/rejected": -0.9954707622528076, + "step": 725 + }, + { + "epoch": 0.2, + "learning_rate": 4.3582761391045526e-07, + "loss": 0.4936, + "step": 726 + }, + { + "epoch": 0.2, + "logps_train/chosen": -54.74101638793945, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -77.01251220703125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8436573147773743, + "rewards_train/margins": 0.6876965165138245, + "rewards_train/rejected": -1.5313538312911987, + "step": 726 + }, + { + "epoch": 0.2, + "logps_train/chosen": -28.0190486907959, + "logps_train/ref_chosen": -23.75, + "logps_train/ref_rejected": -34.25, + "logps_train/rejected": -44.53799819946289, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4283209443092346, + "rewards_train/margins": 0.6121000647544861, + "rewards_train/rejected": -1.0404210090637207, + "step": 727 + }, + { + "epoch": 0.2, + "learning_rate": 4.354130264119894e-07, + "loss": 0.4942, + "step": 728 + }, + { + "epoch": 0.2, + "logps_train/chosen": -26.15768051147461, + "logps_train/ref_chosen": -23.75, + "logps_train/ref_rejected": -29.75, + "logps_train/rejected": -37.0157470703125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.23744767904281616, + "rewards_train/margins": 0.48912733793258667, + "rewards_train/rejected": -0.7265750169754028, + "step": 728 + }, + { + "epoch": 0.2, + "logps_train/chosen": -74.88507080078125, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -88.21916198730469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8620414733886719, + "rewards_train/margins": 0.8075311183929443, + "rewards_train/rejected": -1.6695725917816162, + "step": 729 + }, + { + "epoch": 0.2, + "learning_rate": 4.349973025519953e-07, + "loss": 0.4937, + "step": 730 + }, + { + "epoch": 0.2, + "logps_train/chosen": -73.86646270751953, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -82.05445098876953, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4632086753845215, + "rewards_train/margins": 0.384227991104126, + "rewards_train/rejected": -0.8474366664886475, + "step": 730 + }, + { + "epoch": 0.2, + "logps_train/chosen": -85.669189453125, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -99.95613098144531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.26691943407058716, + "rewards_train/margins": 1.3876779675483704, + "rewards_train/rejected": -1.6545974016189575, + "step": 731 + }, + { + "epoch": 0.2, + "learning_rate": 4.345804448783664e-07, + "loss": 0.4943, + "step": 732 + }, + { + "epoch": 0.2, + "logps_train/chosen": -104.18721008300781, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -101.2660140991211, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1251177787780762, + "rewards_train/margins": 0.44445228576660156, + "rewards_train/rejected": -1.5695700645446777, + "step": 732 + }, + { + "epoch": 0.2, + "logps_train/chosen": -61.69408416748047, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -63.5, + "logps_train/rejected": -73.74052429199219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3416737914085388, + "rewards_train/margins": 0.692925751209259, + "rewards_train/rejected": -1.0345995426177979, + "step": 733 + }, + { + "epoch": 0.21, + "learning_rate": 4.341624559459447e-07, + "loss": 0.4912, + "step": 734 + }, + { + "epoch": 0.21, + "logps_train/chosen": -19.932621002197266, + "logps_train/ref_chosen": -17.0, + "logps_train/ref_rejected": -8.375, + "logps_train/rejected": -12.725445747375488, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.2985355257987976, + "rewards_train/margins": 0.13816922903060913, + "rewards_train/rejected": -0.43670475482940674, + "step": 734 + }, + { + "epoch": 0.21, + "logps_train/chosen": -50.78586959838867, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -65.61875915527344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5696026682853699, + "rewards_train/margins": 0.8292847275733948, + "rewards_train/rejected": -1.3988873958587646, + "step": 735 + }, + { + "epoch": 0.21, + "learning_rate": 4.337433383165058e-07, + "loss": 0.5719, + "step": 736 + }, + { + "epoch": 0.21, + "logps_train/chosen": -57.67706298828125, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -49.850616455078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6176084280014038, + "rewards_train/margins": 0.34167182445526123, + "rewards_train/rejected": -0.959280252456665, + "step": 736 + }, + { + "epoch": 0.21, + "logps_train/chosen": -137.65756225585938, + "logps_train/ref_chosen": -122.0, + "logps_train/ref_rejected": -136.0, + "logps_train/rejected": -160.66299438476562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.550912618637085, + "rewards_train/margins": 0.8966367244720459, + "rewards_train/rejected": -2.447549343109131, + "step": 737 + }, + { + "epoch": 0.21, + "learning_rate": 4.333230945587426e-07, + "loss": 0.5527, + "step": 738 + }, + { + "epoch": 0.21, + "logps_train/chosen": -81.39942169189453, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -89.80335998535156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8987309336662292, + "rewards_train/margins": 1.0257458090782166, + "rewards_train/rejected": -1.9244767427444458, + "step": 738 + }, + { + "epoch": 0.21, + "logps_train/chosen": -60.17104721069336, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -85.5159912109375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5997220277786255, + "rewards_train/margins": 0.7862527370452881, + "rewards_train/rejected": -1.3859747648239136, + "step": 739 + }, + { + "epoch": 0.21, + "learning_rate": 4.3290172724825015e-07, + "loss": 0.4614, + "step": 740 + }, + { + "epoch": 0.21, + "logps_train/chosen": -84.48946380615234, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -80.91348266601562, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1631065607070923, + "rewards_train/margins": 0.3619331121444702, + "rewards_train/rejected": -1.5250396728515625, + "step": 740 + }, + { + "epoch": 0.21, + "logps_train/chosen": -45.46261978149414, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -20.25, + "logps_train/rejected": -23.03445053100586, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.12907451391220093, + "rewards_train/margins": 0.15093308687210083, + "rewards_train/rejected": -0.28000760078430176, + "step": 741 + }, + { + "epoch": 0.21, + "learning_rate": 4.3247923896750915e-07, + "loss": 0.6586, + "step": 742 + }, + { + "epoch": 0.21, + "logps_train/chosen": -64.82351684570312, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -61.36857986450195, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.16809424757957458, + "rewards_train/margins": 0.582044929265976, + "rewards_train/rejected": -0.7501391768455505, + "step": 742 + }, + { + "epoch": 0.21, + "logps_train/chosen": -33.12548065185547, + "logps_train/ref_chosen": -31.0, + "logps_train/ref_rejected": -36.0, + "logps_train/rejected": -42.422828674316406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.21118101477622986, + "rewards_train/margins": 0.42172685265541077, + "rewards_train/rejected": -0.6329078674316406, + "step": 743 + }, + { + "epoch": 0.21, + "learning_rate": 4.320556323058709e-07, + "loss": 0.5268, + "step": 744 + }, + { + "epoch": 0.21, + "logps_train/chosen": -22.184978485107422, + "logps_train/ref_chosen": -17.625, + "logps_train/ref_rejected": -19.0, + "logps_train/rejected": -21.502098083496094, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.45785337686538696, + "rewards_train/margins": -0.21067100763320923, + "rewards_train/rejected": -0.24718236923217773, + "step": 744 + }, + { + "epoch": 0.21, + "logps_train/chosen": -66.39347076416016, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -94.58641052246094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0261726379394531, + "rewards_train/margins": 1.144577980041504, + "rewards_train/rejected": -2.170750617980957, + "step": 745 + }, + { + "epoch": 0.21, + "learning_rate": 4.3163090985954074e-07, + "loss": 0.7109, + "step": 746 + }, + { + "epoch": 0.21, + "logps_train/chosen": -34.51667785644531, + "logps_train/ref_chosen": -30.625, + "logps_train/ref_rejected": -34.5, + "logps_train/rejected": -38.35007858276367, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.39158475399017334, + "rewards_train/margins": -0.011191070079803467, + "rewards_train/rejected": -0.3803936839103699, + "step": 746 + }, + { + "epoch": 0.21, + "logps_train/chosen": -28.390628814697266, + "logps_train/ref_chosen": -26.75, + "logps_train/ref_rejected": -33.75, + "logps_train/rejected": -39.71607971191406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.16225619614124298, + "rewards_train/margins": 0.44489888846874237, + "rewards_train/rejected": -0.6071550846099854, + "step": 747 + }, + { + "epoch": 0.21, + "learning_rate": 4.312050742315627e-07, + "loss": 0.6253, + "step": 748 + }, + { + "epoch": 0.21, + "logps_train/chosen": -69.1902084350586, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -75.73802185058594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5235127210617065, + "rewards_train/margins": 0.9332972764968872, + "rewards_train/rejected": -1.4568099975585938, + "step": 748 + }, + { + "epoch": 0.21, + "logps_train/chosen": -49.8427734375, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -66.03128051757812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.1014648973941803, + "rewards_train/margins": 1.4969750344753265, + "rewards_train/rejected": -1.5984399318695068, + "step": 749 + }, + { + "epoch": 0.21, + "learning_rate": 4.307781280318031e-07, + "loss": 0.4297, + "step": 750 + }, + { + "epoch": 0.21, + "logps_train/chosen": -61.00275802612305, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -77.95832824707031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2415599822998047, + "rewards_train/margins": 0.9275884628295898, + "rewards_train/rejected": -1.1691484451293945, + "step": 750 + }, + { + "epoch": 0.21, + "logps_train/chosen": -69.67094421386719, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -73.00233459472656, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.7874069809913635, + "rewards_train/margins": 0.7291356921195984, + "rewards_train/rejected": -1.516542673110962, + "step": 751 + }, + { + "epoch": 0.21, + "learning_rate": 4.303500738769348e-07, + "loss": 0.4868, + "step": 752 + }, + { + "epoch": 0.21, + "logps_train/chosen": -79.32991027832031, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -102.75628662109375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5596757531166077, + "rewards_train/margins": 1.2167338728904724, + "rewards_train/rejected": -1.77640962600708, + "step": 752 + }, + { + "epoch": 0.21, + "logps_train/chosen": -82.9136962890625, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -101.89189147949219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7511347532272339, + "rewards_train/margins": 1.1444997787475586, + "rewards_train/rejected": -1.8956345319747925, + "step": 753 + }, + { + "epoch": 0.21, + "learning_rate": 4.299209143904211e-07, + "loss": 0.3593, + "step": 754 + }, + { + "epoch": 0.21, + "logps_train/chosen": -78.71809387207031, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -87.73480224609375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9702469110488892, + "rewards_train/margins": 0.9237418174743652, + "rewards_train/rejected": -1.8939887285232544, + "step": 754 + }, + { + "epoch": 0.21, + "logps_train/chosen": -70.54363250732422, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -91.7065658569336, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.7981129884719849, + "rewards_train/margins": 0.89559006690979, + "rewards_train/rejected": -1.693703055381775, + "step": 755 + }, + { + "epoch": 0.21, + "learning_rate": 4.294906522024997e-07, + "loss": 0.4506, + "step": 756 + }, + { + "epoch": 0.21, + "logps_train/chosen": -66.64875030517578, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -64.6475830078125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6670231819152832, + "rewards_train/margins": 0.498028039932251, + "rewards_train/rejected": -1.1650512218475342, + "step": 756 + }, + { + "epoch": 0.21, + "logps_train/chosen": -56.22367477416992, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -63.79145812988281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8731487989425659, + "rewards_train/margins": 1.0929113626480103, + "rewards_train/rejected": -1.9660601615905762, + "step": 757 + }, + { + "epoch": 0.21, + "learning_rate": 4.290592899501666e-07, + "loss": 0.4703, + "step": 758 + }, + { + "epoch": 0.21, + "logps_train/chosen": -75.13713073730469, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -101.6120834350586, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0801197290420532, + "rewards_train/margins": 1.2982760667800903, + "rewards_train/rejected": -2.3783957958221436, + "step": 758 + }, + { + "epoch": 0.21, + "logps_train/chosen": -66.06676483154297, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -72.21924591064453, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5070674419403076, + "rewards_train/margins": 0.6445446014404297, + "rewards_train/rejected": -1.1516120433807373, + "step": 759 + }, + { + "epoch": 0.21, + "learning_rate": 4.2862683027715975e-07, + "loss": 0.4188, + "step": 760 + }, + { + "epoch": 0.21, + "logps_train/chosen": -47.103973388671875, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -81.80034637451172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2663542628288269, + "rewards_train/margins": 0.7855551838874817, + "rewards_train/rejected": -1.0519094467163086, + "step": 760 + }, + { + "epoch": 0.21, + "logps_train/chosen": -57.30543518066406, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -79.66067504882812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.62077796459198, + "rewards_train/margins": 0.48337578773498535, + "rewards_train/rejected": -1.1041537523269653, + "step": 761 + }, + { + "epoch": 0.21, + "learning_rate": 4.281932758339431e-07, + "loss": 0.4906, + "step": 762 + }, + { + "epoch": 0.21, + "logps_train/chosen": -59.006019592285156, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -107.13850402832031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9209145307540894, + "rewards_train/margins": 1.6273113489151, + "rewards_train/rejected": -2.5482258796691895, + "step": 762 + }, + { + "epoch": 0.21, + "logps_train/chosen": -53.86160659790039, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -61.93323516845703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5422154068946838, + "rewards_train/margins": 0.6380223631858826, + "rewards_train/rejected": -1.1802377700805664, + "step": 763 + }, + { + "epoch": 0.21, + "learning_rate": 4.277586292776902e-07, + "loss": 0.407, + "step": 764 + }, + { + "epoch": 0.21, + "logps_train/chosen": -78.84117126464844, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -92.62709045410156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9454452991485596, + "rewards_train/margins": 0.49343574047088623, + "rewards_train/rejected": -1.4388810396194458, + "step": 764 + }, + { + "epoch": 0.21, + "logps_train/chosen": -44.09888458251953, + "logps_train/ref_chosen": -42.5, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -56.846553802490234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.16955626010894775, + "rewards_train/margins": 0.5073843598365784, + "rewards_train/rejected": -0.6769406199455261, + "step": 765 + }, + { + "epoch": 0.21, + "learning_rate": 4.273228932722679e-07, + "loss": 0.5323, + "step": 766 + }, + { + "epoch": 0.21, + "logps_train/chosen": -70.78803253173828, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -87.11528015136719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6672797799110413, + "rewards_train/margins": 1.2649632096290588, + "rewards_train/rejected": -1.9322429895401, + "step": 766 + }, + { + "epoch": 0.21, + "logps_train/chosen": -65.51760864257812, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -81.43682098388672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4484404921531677, + "rewards_train/margins": 1.420436680316925, + "rewards_train/rejected": -1.8688771724700928, + "step": 767 + }, + { + "epoch": 0.21, + "learning_rate": 4.268860704882202e-07, + "loss": 0.3768, + "step": 768 + }, + { + "epoch": 0.21, + "logps_train/chosen": -49.655696868896484, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -42.75, + "logps_train/rejected": -48.817283630371094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1602962613105774, + "rewards_train/margins": 0.4538540244102478, + "rewards_train/rejected": -0.6141502857208252, + "step": 768 + }, + { + "epoch": 0.21, + "logps_train/chosen": -74.95917510986328, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -99.34453582763672, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.600800633430481, + "rewards_train/margins": 0.9467389583587646, + "rewards_train/rejected": -1.5475395917892456, + "step": 769 + }, + { + "epoch": 0.22, + "learning_rate": 4.2644816360275173e-07, + "loss": 0.4975, + "step": 770 + }, + { + "epoch": 0.22, + "logps_train/chosen": -98.62361145019531, + "logps_train/ref_chosen": -95.5, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -102.39410400390625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3014233708381653, + "rewards_train/margins": 0.9379865527153015, + "rewards_train/rejected": -1.2394099235534668, + "step": 770 + }, + { + "epoch": 0.22, + "logps_train/chosen": -86.83627319335938, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -76.5495834350586, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3202731609344482, + "rewards_train/margins": -0.07351791858673096, + "rewards_train/rejected": -1.2467552423477173, + "step": 771 + }, + { + "epoch": 0.22, + "learning_rate": 4.2600917529971124e-07, + "loss": 0.6152, + "step": 772 + }, + { + "epoch": 0.22, + "logps_train/chosen": -72.23640441894531, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -64.30915069580078, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.7181720733642578, + "rewards_train/margins": 0.31196165084838867, + "rewards_train/rejected": -1.0301337242126465, + "step": 772 + }, + { + "epoch": 0.22, + "logps_train/chosen": -99.03294372558594, + "logps_train/ref_chosen": -94.5, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -115.25489807128906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4572010040283203, + "rewards_train/margins": 1.3969022035598755, + "rewards_train/rejected": -1.8541032075881958, + "step": 773 + }, + { + "epoch": 0.22, + "learning_rate": 4.255691082695754e-07, + "loss": 0.4459, + "step": 774 + }, + { + "epoch": 0.22, + "logps_train/chosen": -39.96922302246094, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -36.0, + "logps_train/rejected": -40.81270980834961, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.4469220042228699, + "rewards_train/margins": 0.03202971816062927, + "rewards_train/rejected": -0.47895172238349915, + "step": 774 + }, + { + "epoch": 0.22, + "logps_train/chosen": -72.17256164550781, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -85.79423522949219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.12487365305423737, + "rewards_train/margins": 0.9592375606298447, + "rewards_train/rejected": -1.084111213684082, + "step": 775 + }, + { + "epoch": 0.22, + "learning_rate": 4.2512796520943215e-07, + "loss": 0.5324, + "step": 776 + }, + { + "epoch": 0.22, + "logps_train/chosen": -131.396728515625, + "logps_train/ref_chosen": -115.5, + "logps_train/ref_rejected": -128.0, + "logps_train/rejected": -155.43008422851562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.614282488822937, + "rewards_train/margins": 1.1396623849868774, + "rewards_train/rejected": -2.7539448738098145, + "step": 776 + }, + { + "epoch": 0.22, + "logps_train/chosen": -68.80056762695312, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -74.79603576660156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5984159708023071, + "rewards_train/margins": 0.9292343854904175, + "rewards_train/rejected": -1.5276503562927246, + "step": 777 + }, + { + "epoch": 0.22, + "learning_rate": 4.246857488229644e-07, + "loss": 0.4663, + "step": 778 + }, + { + "epoch": 0.22, + "logps_train/chosen": -77.71932220458984, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -91.0114974975586, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.9127525091171265, + "rewards_train/margins": 0.10480368137359619, + "rewards_train/rejected": -1.0175561904907227, + "step": 778 + }, + { + "epoch": 0.22, + "logps_train/chosen": -55.49123764038086, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -65.74839782714844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2718776762485504, + "rewards_train/margins": 0.650716632604599, + "rewards_train/rejected": -0.9225943088531494, + "step": 779 + }, + { + "epoch": 0.22, + "learning_rate": 4.24242461820433e-07, + "loss": 0.6503, + "step": 780 + }, + { + "epoch": 0.22, + "logps_train/chosen": -35.94996643066406, + "logps_train/ref_chosen": -27.125, + "logps_train/ref_rejected": -36.0, + "logps_train/rejected": -50.71809005737305, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8792738914489746, + "rewards_train/margins": 0.6047424077987671, + "rewards_train/rejected": -1.4840162992477417, + "step": 780 + }, + { + "epoch": 0.22, + "logps_train/chosen": -45.22307586669922, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -53.12763977050781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.09750300645828247, + "rewards_train/margins": 0.6219016909599304, + "rewards_train/rejected": -0.7194046974182129, + "step": 781 + }, + { + "epoch": 0.22, + "learning_rate": 4.237981069186606e-07, + "loss": 0.544, + "step": 782 + }, + { + "epoch": 0.22, + "logps_train/chosen": -81.47496795654297, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -93.6407241821289, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.057995080947876, + "rewards_train/margins": 0.8180397748947144, + "rewards_train/rejected": -1.8760348558425903, + "step": 782 + }, + { + "epoch": 0.22, + "logps_train/chosen": -107.39361572265625, + "logps_train/ref_chosen": -96.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -114.52546691894531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.122174620628357, + "rewards_train/margins": 0.6584969758987427, + "rewards_train/rejected": -1.7806715965270996, + "step": 783 + }, + { + "epoch": 0.22, + "learning_rate": 4.2335268684101456e-07, + "loss": 0.5798, + "step": 784 + }, + { + "epoch": 0.22, + "logps_train/chosen": -55.175079345703125, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -65.32829284667969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6636016964912415, + "rewards_train/margins": 0.670985996723175, + "rewards_train/rejected": -1.3345876932144165, + "step": 784 + }, + { + "epoch": 0.22, + "logps_train/chosen": -69.07587432861328, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -69.96524047851562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5310250520706177, + "rewards_train/margins": 0.8592489957809448, + "rewards_train/rejected": -1.3902740478515625, + "step": 785 + }, + { + "epoch": 0.22, + "learning_rate": 4.2290620431739083e-07, + "loss": 0.4823, + "step": 786 + }, + { + "epoch": 0.22, + "logps_train/chosen": -81.48248291015625, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -83.1921615600586, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.068756341934204, + "rewards_train/margins": -0.012821078300476074, + "rewards_train/rejected": -1.055935263633728, + "step": 786 + }, + { + "epoch": 0.22, + "logps_train/chosen": -52.133052825927734, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -76.95704650878906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5356687307357788, + "rewards_train/margins": 1.3787859678268433, + "rewards_train/rejected": -1.914454698562622, + "step": 787 + }, + { + "epoch": 0.22, + "learning_rate": 4.2245866208419667e-07, + "loss": 0.5452, + "step": 788 + }, + { + "epoch": 0.22, + "logps_train/chosen": -36.936492919921875, + "logps_train/ref_chosen": -32.0, + "logps_train/ref_rejected": -33.75, + "logps_train/rejected": -38.59773254394531, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.483004629611969, + "rewards_train/margins": 0.009190499782562256, + "rewards_train/rejected": -0.49219512939453125, + "step": 788 + }, + { + "epoch": 0.22, + "logps_train/chosen": -52.72639846801758, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -59.34904479980469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.04763978719711304, + "rewards_train/margins": 0.8146083354949951, + "rewards_train/rejected": -0.8622481226921082, + "step": 789 + }, + { + "epoch": 0.22, + "learning_rate": 4.220100628843342e-07, + "loss": 0.606, + "step": 790 + }, + { + "epoch": 0.22, + "logps_train/chosen": -43.210609436035156, + "logps_train/ref_chosen": -37.5, + "logps_train/ref_rejected": -39.5, + "logps_train/rejected": -49.56851577758789, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5629796385765076, + "rewards_train/margins": 0.4407469630241394, + "rewards_train/rejected": -1.003726601600647, + "step": 790 + }, + { + "epoch": 0.22, + "logps_train/chosen": -40.289878845214844, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -39.0, + "logps_train/rejected": -43.93583679199219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.18123379349708557, + "rewards_train/margins": 0.3130333423614502, + "rewards_train/rejected": -0.49426713585853577, + "step": 791 + }, + { + "epoch": 0.22, + "learning_rate": 4.2156040946718343e-07, + "loss": 0.5674, + "step": 792 + }, + { + "epoch": 0.22, + "logps_train/chosen": -79.20447540283203, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -89.39700317382812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7209354043006897, + "rewards_train/margins": 0.36651867628097534, + "rewards_train/rejected": -1.087454080581665, + "step": 792 + }, + { + "epoch": 0.22, + "logps_train/chosen": -55.22673034667969, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -34.5, + "logps_train/rejected": -44.716453552246094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5199382901191711, + "rewards_train/margins": 0.4990217089653015, + "rewards_train/rejected": -1.0189599990844727, + "step": 793 + }, + { + "epoch": 0.22, + "learning_rate": 4.2110970458858544e-07, + "loss": 0.5269, + "step": 794 + }, + { + "epoch": 0.22, + "logps_train/chosen": -34.29668426513672, + "logps_train/ref_chosen": -33.0, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -58.57682418823242, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.1391409933567047, + "rewards_train/margins": 1.3052602708339691, + "rewards_train/rejected": -1.4444012641906738, + "step": 794 + }, + { + "epoch": 0.22, + "logps_train/chosen": -48.54473114013672, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -50.31939697265625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.35613349080085754, + "rewards_train/margins": 0.5680917799472809, + "rewards_train/rejected": -0.9242252707481384, + "step": 795 + }, + { + "epoch": 0.22, + "learning_rate": 4.206579510108256e-07, + "loss": 0.5159, + "step": 796 + }, + { + "epoch": 0.22, + "logps_train/chosen": -42.91671371459961, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -30.0, + "logps_train/rejected": -41.97731018066406, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8901088237762451, + "rewards_train/margins": 0.3125048875808716, + "rewards_train/rejected": -1.2026137113571167, + "step": 796 + }, + { + "epoch": 0.22, + "logps_train/chosen": -67.01020050048828, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -72.64645385742188, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.6365671157836914, + "rewards_train/margins": 0.46772706508636475, + "rewards_train/rejected": -1.1042941808700562, + "step": 797 + }, + { + "epoch": 0.22, + "learning_rate": 4.202051515026166e-07, + "loss": 0.5596, + "step": 798 + }, + { + "epoch": 0.22, + "logps_train/chosen": -53.409149169921875, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -55.79767608642578, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.2479463517665863, + "rewards_train/margins": 0.8273290693759918, + "rewards_train/rejected": -1.0752754211425781, + "step": 798 + }, + { + "epoch": 0.22, + "logps_train/chosen": -96.3235855102539, + "logps_train/ref_chosen": -90.5, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -124.00653076171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5710307955741882, + "rewards_train/margins": 1.655402958393097, + "rewards_train/rejected": -2.226433753967285, + "step": 799 + }, + { + "epoch": 0.22, + "learning_rate": 4.197513088390813e-07, + "loss": 0.431, + "step": 800 + }, + { + "epoch": 0.22, + "logps_train/chosen": -65.17707061767578, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -92.5411376953125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.0185375213623047, + "rewards_train/margins": 0.8723433017730713, + "rewards_train/rejected": -1.890880823135376, + "step": 800 + }, + { + "epoch": 0.22, + "logps_train/chosen": -44.5245361328125, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -60.77937316894531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6116332411766052, + "rewards_train/margins": 0.7637651562690735, + "rewards_train/rejected": -1.3753983974456787, + "step": 801 + }, + { + "epoch": 0.22, + "learning_rate": 4.1929642580173585e-07, + "loss": 0.4697, + "step": 802 + }, + { + "epoch": 0.22, + "logps_train/chosen": -98.2421875, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -130.2311553955078, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2947750091552734, + "rewards_train/margins": 0.6127151250839233, + "rewards_train/rejected": -1.9074901342391968, + "step": 802 + }, + { + "epoch": 0.22, + "logps_train/chosen": -57.04893493652344, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -75.33086395263672, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5850875973701477, + "rewards_train/margins": 0.9184271693229675, + "rewards_train/rejected": -1.5035147666931152, + "step": 803 + }, + { + "epoch": 0.22, + "learning_rate": 4.188405051784729e-07, + "loss": 0.5162, + "step": 804 + }, + { + "epoch": 0.22, + "logps_train/chosen": -82.97969055175781, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -120.67314147949219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.1600789725780487, + "rewards_train/margins": 1.8509850800037384, + "rewards_train/rejected": -2.011064052581787, + "step": 804 + }, + { + "epoch": 0.22, + "logps_train/chosen": -86.50658416748047, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -86.41155242919922, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8424553871154785, + "rewards_train/margins": 0.2572929859161377, + "rewards_train/rejected": -1.0997483730316162, + "step": 805 + }, + { + "epoch": 0.23, + "learning_rate": 4.1838354976354406e-07, + "loss": 0.4673, + "step": 806 + }, + { + "epoch": 0.23, + "logps_train/chosen": -62.000694274902344, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -73.03533935546875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.21764707565307617, + "rewards_train/margins": 1.3284653425216675, + "rewards_train/rejected": -1.5461124181747437, + "step": 806 + }, + { + "epoch": 0.23, + "logps_train/chosen": -65.02070617675781, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -87.37649536132812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.46691471338272095, + "rewards_train/margins": 1.0633131861686707, + "rewards_train/rejected": -1.5302278995513916, + "step": 807 + }, + { + "epoch": 0.23, + "learning_rate": 4.17925562357543e-07, + "loss": 0.4781, + "step": 808 + }, + { + "epoch": 0.23, + "logps_train/chosen": -42.0662841796875, + "logps_train/ref_chosen": -32.75, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -54.25373840332031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9398560523986816, + "rewards_train/margins": 0.15233898162841797, + "rewards_train/rejected": -1.0921950340270996, + "step": 808 + }, + { + "epoch": 0.23, + "logps_train/chosen": -88.19157409667969, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -76.43254089355469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2472336292266846, + "rewards_train/margins": 0.39992737770080566, + "rewards_train/rejected": -1.6471610069274902, + "step": 809 + }, + { + "epoch": 0.23, + "learning_rate": 4.1746654576738824e-07, + "loss": 0.5823, + "step": 810 + }, + { + "epoch": 0.23, + "logps_train/chosen": -69.62837219238281, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -73.64430236816406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6413524746894836, + "rewards_train/margins": 0.9695623517036438, + "rewards_train/rejected": -1.6109148263931274, + "step": 810 + }, + { + "epoch": 0.23, + "logps_train/chosen": -104.60528564453125, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -109.66112518310547, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.5290825366973877, + "rewards_train/margins": 0.7444517612457275, + "rewards_train/rejected": -2.2735342979431152, + "step": 811 + }, + { + "epoch": 0.23, + "learning_rate": 4.1700650280630624e-07, + "loss": 0.5085, + "step": 812 + }, + { + "epoch": 0.23, + "logps_train/chosen": -78.03692626953125, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -75.56971740722656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5325984954833984, + "rewards_train/margins": 0.5513255596160889, + "rewards_train/rejected": -2.0839240550994873, + "step": 812 + }, + { + "epoch": 0.23, + "logps_train/chosen": -42.48480987548828, + "logps_train/ref_chosen": -36.75, + "logps_train/ref_rejected": -43.75, + "logps_train/rejected": -57.76921463012695, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5846140384674072, + "rewards_train/margins": 0.813010573387146, + "rewards_train/rejected": -1.3976246118545532, + "step": 813 + }, + { + "epoch": 0.23, + "learning_rate": 4.1654543629381346e-07, + "loss": 0.4606, + "step": 814 + }, + { + "epoch": 0.23, + "logps_train/chosen": -62.89506912231445, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -75.18867492675781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7123585939407349, + "rewards_train/margins": 0.9132469892501831, + "rewards_train/rejected": -1.625605583190918, + "step": 814 + }, + { + "epoch": 0.23, + "logps_train/chosen": -48.81223678588867, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -79.15168762207031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.17887991666793823, + "rewards_train/margins": 1.2269136309623718, + "rewards_train/rejected": -1.40579354763031, + "step": 815 + }, + { + "epoch": 0.23, + "learning_rate": 4.1608334905569986e-07, + "loss": 0.4137, + "step": 816 + }, + { + "epoch": 0.23, + "logps_train/chosen": -38.1473503112793, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -29.75, + "logps_train/rejected": -32.343116760253906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.19852405786514282, + "rewards_train/margins": 0.05922490358352661, + "rewards_train/rejected": -0.25774896144866943, + "step": 816 + }, + { + "epoch": 0.23, + "logps_train/chosen": -91.79652404785156, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -98.53657531738281, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5296523571014404, + "rewards_train/margins": 0.8661925792694092, + "rewards_train/rejected": -2.3958449363708496, + "step": 817 + }, + { + "epoch": 0.23, + "learning_rate": 4.156202439240111e-07, + "loss": 0.5826, + "step": 818 + }, + { + "epoch": 0.23, + "logps_train/chosen": -62.88359451293945, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -64.70132446289062, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.14617177844047546, + "rewards_train/margins": 0.24075812101364136, + "rewards_train/rejected": -0.3869298994541168, + "step": 818 + }, + { + "epoch": 0.23, + "logps_train/chosen": -66.91609191894531, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -68.0125732421875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5916092395782471, + "rewards_train/margins": 0.845194935798645, + "rewards_train/rejected": -1.436804175376892, + "step": 819 + }, + { + "epoch": 0.23, + "learning_rate": 4.1515612373703125e-07, + "loss": 0.5709, + "step": 820 + }, + { + "epoch": 0.23, + "logps_train/chosen": -72.45796203613281, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -93.32817077636719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5786080360412598, + "rewards_train/margins": 0.9327239990234375, + "rewards_train/rejected": -1.5113320350646973, + "step": 820 + }, + { + "epoch": 0.23, + "logps_train/chosen": -77.12188720703125, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -94.6893310546875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8117375373840332, + "rewards_train/margins": 1.0581116676330566, + "rewards_train/rejected": -1.8698492050170898, + "step": 821 + }, + { + "epoch": 0.23, + "learning_rate": 4.1469099133926566e-07, + "loss": 0.4374, + "step": 822 + }, + { + "epoch": 0.23, + "logps_train/chosen": -97.09497833251953, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -132.5654296875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1946545839309692, + "rewards_train/margins": 1.7118874788284302, + "rewards_train/rejected": -2.9065420627593994, + "step": 822 + }, + { + "epoch": 0.23, + "logps_train/chosen": -70.23595428466797, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -97.10716247558594, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.47086095809936523, + "rewards_train/margins": 0.4336055517196655, + "rewards_train/rejected": -0.9044665098190308, + "step": 823 + }, + { + "epoch": 0.23, + "learning_rate": 4.1422484958142324e-07, + "loss": 0.3872, + "step": 824 + }, + { + "epoch": 0.23, + "logps_train/chosen": -61.643978118896484, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -80.45086669921875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5362118482589722, + "rewards_train/margins": 0.7791265249252319, + "rewards_train/rejected": -1.315338373184204, + "step": 824 + }, + { + "epoch": 0.23, + "logps_train/chosen": -71.22943115234375, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -95.36404418945312, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.9088804721832275, + "rewards_train/margins": 1.4546713829040527, + "rewards_train/rejected": -2.3635518550872803, + "step": 825 + }, + { + "epoch": 0.23, + "learning_rate": 4.137577013203989e-07, + "loss": 0.4482, + "step": 826 + }, + { + "epoch": 0.23, + "logps_train/chosen": -63.636497497558594, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -54.24509811401367, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5723414421081543, + "rewards_train/margins": 0.21090877056121826, + "rewards_train/rejected": -0.7832502126693726, + "step": 826 + }, + { + "epoch": 0.23, + "logps_train/chosen": -95.05361938476562, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -111.03193664550781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4521886110305786, + "rewards_train/margins": 1.0213178396224976, + "rewards_train/rejected": -2.473506450653076, + "step": 827 + }, + { + "epoch": 0.23, + "learning_rate": 4.1328954941925656e-07, + "loss": 0.5355, + "step": 828 + }, + { + "epoch": 0.23, + "logps_train/chosen": -63.327484130859375, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -69.67037963867188, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1571626663208008, + "rewards_train/margins": 0.6307734251022339, + "rewards_train/rejected": -1.7879360914230347, + "step": 828 + }, + { + "epoch": 0.23, + "logps_train/chosen": -73.36369323730469, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -88.95587921142578, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7222095727920532, + "rewards_train/margins": 1.1878312826156616, + "rewards_train/rejected": -2.910040855407715, + "step": 829 + }, + { + "epoch": 0.23, + "learning_rate": 4.1282039674721093e-07, + "loss": 0.6553, + "step": 830 + }, + { + "epoch": 0.23, + "logps_train/chosen": -100.62915802001953, + "logps_train/ref_chosen": -90.5, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -134.68023681640625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0336189270019531, + "rewards_train/margins": 1.7113580703735352, + "rewards_train/rejected": -2.7449769973754883, + "step": 830 + }, + { + "epoch": 0.23, + "logps_train/chosen": -44.26068115234375, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -61.75473403930664, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.1421934962272644, + "rewards_train/margins": 0.8658111691474915, + "rewards_train/rejected": -0.723617672920227, + "step": 831 + }, + { + "epoch": 0.23, + "learning_rate": 4.123502461796105e-07, + "loss": 0.3498, + "step": 832 + }, + { + "epoch": 0.23, + "logps_train/chosen": -68.79542541503906, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -99.24661254882812, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.7224140763282776, + "rewards_train/margins": 1.008497178554535, + "rewards_train/rejected": -1.7309112548828125, + "step": 832 + }, + { + "epoch": 0.23, + "logps_train/chosen": -53.21661376953125, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -54.25, + "logps_train/rejected": -72.57496643066406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7533018589019775, + "rewards_train/margins": 1.0877885818481445, + "rewards_train/rejected": -1.841090440750122, + "step": 833 + }, + { + "epoch": 0.23, + "learning_rate": 4.118791005979195e-07, + "loss": 0.4874, + "step": 834 + }, + { + "epoch": 0.23, + "logps_train/chosen": -74.54422760009766, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -84.56739807128906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.37571173906326294, + "rewards_train/margins": 1.3034891486167908, + "rewards_train/rejected": -1.6792008876800537, + "step": 834 + }, + { + "epoch": 0.23, + "logps_train/chosen": -63.8644905090332, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -77.23162841796875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6001206636428833, + "rewards_train/margins": 1.1230422258377075, + "rewards_train/rejected": -1.7231628894805908, + "step": 835 + }, + { + "epoch": 0.23, + "learning_rate": 4.114069628897006e-07, + "loss": 0.4857, + "step": 836 + }, + { + "epoch": 0.23, + "logps_train/chosen": -63.14500427246094, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -61.190818786621094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8691879510879517, + "rewards_train/margins": 0.39442527294158936, + "rewards_train/rejected": -1.263613224029541, + "step": 836 + }, + { + "epoch": 0.23, + "logps_train/chosen": -34.73402786254883, + "logps_train/ref_chosen": -30.25, + "logps_train/ref_rejected": -31.625, + "logps_train/rejected": -39.241756439208984, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.45207691192626953, + "rewards_train/margins": 0.30350762605667114, + "rewards_train/rejected": -0.7555845379829407, + "step": 837 + }, + { + "epoch": 0.23, + "learning_rate": 4.109338359485968e-07, + "loss": 0.5491, + "step": 838 + }, + { + "epoch": 0.23, + "logps_train/chosen": -55.93766784667969, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -74.05572509765625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.45892825722694397, + "rewards_train/margins": 0.9501595199108124, + "rewards_train/rejected": -1.4090877771377563, + "step": 838 + }, + { + "epoch": 0.23, + "logps_train/chosen": -41.26186752319336, + "logps_train/ref_chosen": -36.75, + "logps_train/ref_rejected": -37.25, + "logps_train/rejected": -48.672325134277344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.45548343658447266, + "rewards_train/margins": 0.6961238384246826, + "rewards_train/rejected": -1.1516072750091553, + "step": 839 + }, + { + "epoch": 0.23, + "learning_rate": 4.1045972267431407e-07, + "loss": 0.4481, + "step": 840 + }, + { + "epoch": 0.23, + "logps_train/chosen": -39.77976989746094, + "logps_train/ref_chosen": -37.5, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -55.97582244873047, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.23769403994083405, + "rewards_train/margins": 0.8598878234624863, + "rewards_train/rejected": -1.0975818634033203, + "step": 840 + }, + { + "epoch": 0.24, + "logps_train/chosen": -62.116844177246094, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -67.9813232421875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.22418440878391266, + "rewards_train/margins": 0.8791235536336899, + "rewards_train/rejected": -1.1033079624176025, + "step": 841 + }, + { + "epoch": 0.24, + "learning_rate": 4.0998462597260343e-07, + "loss": 0.4588, + "step": 842 + }, + { + "epoch": 0.24, + "logps_train/chosen": -42.325218200683594, + "logps_train/ref_chosen": -34.75, + "logps_train/ref_rejected": -39.75, + "logps_train/rejected": -51.84090042114258, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.7645533084869385, + "rewards_train/margins": 0.44268131256103516, + "rewards_train/rejected": -1.2072346210479736, + "step": 842 + }, + { + "epoch": 0.24, + "logps_train/chosen": -68.06748962402344, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -103.82437133789062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8422955274581909, + "rewards_train/margins": 1.2043992280960083, + "rewards_train/rejected": -2.046694755554199, + "step": 843 + }, + { + "epoch": 0.24, + "learning_rate": 4.0950854875524305e-07, + "loss": 0.48, + "step": 844 + }, + { + "epoch": 0.24, + "logps_train/chosen": -72.60628509521484, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -78.16154479980469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9348475933074951, + "rewards_train/margins": 0.702791690826416, + "rewards_train/rejected": -1.6376392841339111, + "step": 844 + }, + { + "epoch": 0.24, + "logps_train/chosen": -43.7841796875, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -68.80323791503906, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.4030269980430603, + "rewards_train/margins": 0.6636252999305725, + "rewards_train/rejected": -1.0666522979736328, + "step": 845 + }, + { + "epoch": 0.24, + "learning_rate": 4.090314939400206e-07, + "loss": 0.5491, + "step": 846 + }, + { + "epoch": 0.24, + "logps_train/chosen": -62.69173049926758, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -78.07003784179688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6222983598709106, + "rewards_train/margins": 1.0536508560180664, + "rewards_train/rejected": -1.675949215888977, + "step": 846 + }, + { + "epoch": 0.24, + "logps_train/chosen": -99.3228759765625, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -129.64483642578125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1174439191818237, + "rewards_train/margins": 1.796258807182312, + "rewards_train/rejected": -2.9137027263641357, + "step": 847 + }, + { + "epoch": 0.24, + "learning_rate": 4.0855346445071524e-07, + "loss": 0.3345, + "step": 848 + }, + { + "epoch": 0.24, + "logps_train/chosen": -65.88160705566406, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -71.70245361328125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2537854015827179, + "rewards_train/margins": 0.8770063817501068, + "rewards_train/rejected": -1.1307917833328247, + "step": 848 + }, + { + "epoch": 0.24, + "logps_train/chosen": -62.95964050292969, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -73.73855590820312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.23619846999645233, + "rewards_train/margins": 0.7333600074052811, + "rewards_train/rejected": -0.9695584774017334, + "step": 849 + }, + { + "epoch": 0.24, + "learning_rate": 4.0807446321707974e-07, + "loss": 0.4208, + "step": 850 + }, + { + "epoch": 0.24, + "logps_train/chosen": -69.36096954345703, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -101.65882873535156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1899546384811401, + "rewards_train/margins": 1.8454598188400269, + "rewards_train/rejected": -3.035414457321167, + "step": 850 + }, + { + "epoch": 0.24, + "logps_train/chosen": -73.83930969238281, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -96.34136199951172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.49164584279060364, + "rewards_train/margins": 1.4840918481349945, + "rewards_train/rejected": -1.9757376909255981, + "step": 851 + }, + { + "epoch": 0.24, + "learning_rate": 4.075944931748223e-07, + "loss": 0.3147, + "step": 852 + }, + { + "epoch": 0.24, + "logps_train/chosen": -61.73343276977539, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -58.92315673828125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7012729644775391, + "rewards_train/margins": 0.31762969493865967, + "rewards_train/rejected": -1.0189026594161987, + "step": 852 + }, + { + "epoch": 0.24, + "logps_train/chosen": -41.882659912109375, + "logps_train/ref_chosen": -38.75, + "logps_train/ref_rejected": -43.75, + "logps_train/rejected": -55.42266845703125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3237150311470032, + "rewards_train/margins": 0.8373019099235535, + "rewards_train/rejected": -1.1610169410705566, + "step": 853 + }, + { + "epoch": 0.24, + "learning_rate": 4.071135572655892e-07, + "loss": 0.5335, + "step": 854 + }, + { + "epoch": 0.24, + "logps_train/chosen": -102.44183349609375, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -109.13877868652344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1290467977523804, + "rewards_train/margins": 1.2723315954208374, + "rewards_train/rejected": -2.4013783931732178, + "step": 854 + }, + { + "epoch": 0.24, + "logps_train/chosen": -94.62855529785156, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -114.26194763183594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6460587978363037, + "rewards_train/margins": 1.2277929782867432, + "rewards_train/rejected": -2.873851776123047, + "step": 855 + }, + { + "epoch": 0.24, + "learning_rate": 4.066316584369458e-07, + "loss": 0.4149, + "step": 856 + }, + { + "epoch": 0.24, + "logps_train/chosen": -37.71443557739258, + "logps_train/ref_chosen": -28.75, + "logps_train/ref_rejected": -39.5, + "logps_train/rejected": -51.58930206298828, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8912678956985474, + "rewards_train/margins": 0.30984973907470703, + "rewards_train/rejected": -1.2011176347732544, + "step": 856 + }, + { + "epoch": 0.24, + "logps_train/chosen": -82.29135131835938, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -136.8566131591797, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9064784049987793, + "rewards_train/margins": 1.6909008026123047, + "rewards_train/rejected": -2.597379207611084, + "step": 857 + }, + { + "epoch": 0.24, + "learning_rate": 4.061487996423594e-07, + "loss": 0.4545, + "step": 858 + }, + { + "epoch": 0.24, + "logps_train/chosen": -66.29974365234375, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -64.24313354492188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5932061076164246, + "rewards_train/margins": 0.669388473033905, + "rewards_train/rejected": -1.2625945806503296, + "step": 858 + }, + { + "epoch": 0.24, + "logps_train/chosen": -75.85748291015625, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -89.54031372070312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9605528116226196, + "rewards_train/margins": 1.065353274345398, + "rewards_train/rejected": -2.0259060859680176, + "step": 859 + }, + { + "epoch": 0.24, + "learning_rate": 4.056649838411807e-07, + "loss": 0.4745, + "step": 860 + }, + { + "epoch": 0.24, + "logps_train/chosen": -76.42633056640625, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -56.75, + "logps_train/rejected": -81.47369384765625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3918523788452148, + "rewards_train/margins": 1.0730953216552734, + "rewards_train/rejected": -2.4649477005004883, + "step": 860 + }, + { + "epoch": 0.24, + "logps_train/chosen": -77.15873718261719, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -130.86778259277344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8947802186012268, + "rewards_train/margins": 2.401763617992401, + "rewards_train/rejected": -3.296543836593628, + "step": 861 + }, + { + "epoch": 0.24, + "learning_rate": 4.0518021399862554e-07, + "loss": 0.384, + "step": 862 + }, + { + "epoch": 0.24, + "logps_train/chosen": -64.38674926757812, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -65.65277862548828, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9566435813903809, + "rewards_train/margins": 0.3637123107910156, + "rewards_train/rejected": -1.3203558921813965, + "step": 862 + }, + { + "epoch": 0.24, + "logps_train/chosen": -57.803504943847656, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -62.15605545043945, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4194130301475525, + "rewards_train/margins": 0.9262707829475403, + "rewards_train/rejected": -1.3456838130950928, + "step": 863 + }, + { + "epoch": 0.24, + "learning_rate": 4.0469449308575716e-07, + "loss": 0.5045, + "step": 864 + }, + { + "epoch": 0.24, + "logps_train/chosen": -58.545047760009766, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -87.83868408203125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8935671448707581, + "rewards_train/margins": 0.9514340758323669, + "rewards_train/rejected": -1.845001220703125, + "step": 864 + }, + { + "epoch": 0.24, + "logps_train/chosen": -59.745479583740234, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -78.23158264160156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7815794348716736, + "rewards_train/margins": 1.1259544491767883, + "rewards_train/rejected": -1.907533884048462, + "step": 865 + }, + { + "epoch": 0.24, + "learning_rate": 4.042078240794674e-07, + "loss": 0.4875, + "step": 866 + }, + { + "epoch": 0.24, + "logps_train/chosen": -87.96759033203125, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -115.6199951171875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9797187447547913, + "rewards_train/margins": 1.2900447249412537, + "rewards_train/rejected": -2.269763469696045, + "step": 866 + }, + { + "epoch": 0.24, + "logps_train/chosen": -97.64161682128906, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -92.29362487792969, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.0157241821289062, + "rewards_train/margins": 0.2773101329803467, + "rewards_train/rejected": -2.293034315109253, + "step": 867 + }, + { + "epoch": 0.24, + "learning_rate": 4.0372020996245917e-07, + "loss": 0.469, + "step": 868 + }, + { + "epoch": 0.24, + "logps_train/chosen": -49.51487350463867, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -64.07353973388672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.660178542137146, + "rewards_train/margins": 0.3977614641189575, + "rewards_train/rejected": -1.0579400062561035, + "step": 868 + }, + { + "epoch": 0.24, + "logps_train/chosen": -100.80717468261719, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -108.25254821777344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.869194507598877, + "rewards_train/margins": 0.7628970146179199, + "rewards_train/rejected": -2.632091522216797, + "step": 869 + }, + { + "epoch": 0.24, + "learning_rate": 4.032316537232274e-07, + "loss": 0.4791, + "step": 870 + }, + { + "epoch": 0.24, + "logps_train/chosen": -108.68254852294922, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -133.51974487304688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.027629852294922, + "rewards_train/margins": 0.8053998947143555, + "rewards_train/rejected": -2.8330297470092773, + "step": 870 + }, + { + "epoch": 0.24, + "logps_train/chosen": -81.01268768310547, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -80.3427505493164, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5993157625198364, + "rewards_train/margins": 0.3802715539932251, + "rewards_train/rejected": -0.9795873165130615, + "step": 871 + }, + { + "epoch": 0.24, + "learning_rate": 4.027421583560413e-07, + "loss": 0.5457, + "step": 872 + }, + { + "epoch": 0.24, + "logps_train/chosen": -72.78117370605469, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -90.53843688964844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6754804849624634, + "rewards_train/margins": 0.45180022716522217, + "rewards_train/rejected": -1.1272807121276855, + "step": 872 + }, + { + "epoch": 0.24, + "logps_train/chosen": -76.83738708496094, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -79.88653564453125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6415513753890991, + "rewards_train/margins": 0.40628230571746826, + "rewards_train/rejected": -1.0478336811065674, + "step": 873 + }, + { + "epoch": 0.24, + "learning_rate": 4.0225172686092594e-07, + "loss": 0.5542, + "step": 874 + }, + { + "epoch": 0.24, + "logps_train/chosen": -39.06175994873047, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -33.75, + "logps_train/rejected": -49.59917449951172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5120354890823364, + "rewards_train/margins": 1.0713683366775513, + "rewards_train/rejected": -1.5834038257598877, + "step": 874 + }, + { + "epoch": 0.24, + "logps_train/chosen": -57.961402893066406, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -61.920379638671875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3348119854927063, + "rewards_train/margins": 0.7919917702674866, + "rewards_train/rejected": -1.1268037557601929, + "step": 875 + }, + { + "epoch": 0.24, + "learning_rate": 4.0176036224364353e-07, + "loss": 0.4595, + "step": 876 + }, + { + "epoch": 0.24, + "logps_train/chosen": -47.12226867675781, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -40.75, + "logps_train/rejected": -48.73451232910156, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.6239946484565735, + "rewards_train/margins": 0.17172223329544067, + "rewards_train/rejected": -0.7957168817520142, + "step": 876 + }, + { + "epoch": 0.25, + "logps_train/chosen": -63.09483337402344, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -72.131103515625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9297962188720703, + "rewards_train/margins": 1.1639783382415771, + "rewards_train/rejected": -2.0937745571136475, + "step": 877 + }, + { + "epoch": 0.25, + "learning_rate": 4.0126806751567527e-07, + "loss": 0.5059, + "step": 878 + }, + { + "epoch": 0.25, + "logps_train/chosen": -83.72850036621094, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -94.05934143066406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2841782569885254, + "rewards_train/margins": 1.6248812675476074, + "rewards_train/rejected": -2.909059524536133, + "step": 878 + }, + { + "epoch": 0.25, + "logps_train/chosen": -78.61212158203125, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -83.38127136230469, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.42073363065719604, + "rewards_train/margins": 0.3151957392692566, + "rewards_train/rejected": -0.7359293699264526, + "step": 879 + }, + { + "epoch": 0.25, + "learning_rate": 4.007748456942029e-07, + "loss": 0.5034, + "step": 880 + }, + { + "epoch": 0.25, + "logps_train/chosen": -62.1357536315918, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -62.99882507324219, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.5403333902359009, + "rewards_train/margins": 0.20095562934875488, + "rewards_train/rejected": -1.7412890195846558, + "step": 880 + }, + { + "epoch": 0.25, + "logps_train/chosen": -84.62681579589844, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -85.95426177978516, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.5182240009307861, + "rewards_train/margins": 0.0038379430770874023, + "rewards_train/rejected": -1.5220619440078735, + "step": 881 + }, + { + "epoch": 0.25, + "learning_rate": 4.002806998020901e-07, + "loss": 0.7178, + "step": 882 + }, + { + "epoch": 0.25, + "logps_train/chosen": -80.11326599121094, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -80.46920013427734, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2980449199676514, + "rewards_train/margins": 1.2316877841949463, + "rewards_train/rejected": -2.5297327041625977, + "step": 882 + }, + { + "epoch": 0.25, + "logps_train/chosen": -92.61192321777344, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -104.4124755859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2393176555633545, + "rewards_train/margins": 1.0152103900909424, + "rewards_train/rejected": -2.254528045654297, + "step": 883 + }, + { + "epoch": 0.25, + "learning_rate": 3.997856328678639e-07, + "loss": 0.4796, + "step": 884 + }, + { + "epoch": 0.25, + "logps_train/chosen": -64.76846313476562, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -81.062255859375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5537751317024231, + "rewards_train/margins": 1.4395355582237244, + "rewards_train/rejected": -1.9933106899261475, + "step": 884 + }, + { + "epoch": 0.25, + "logps_train/chosen": -49.63147735595703, + "logps_train/ref_chosen": -41.25, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -64.95520782470703, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8460581302642822, + "rewards_train/margins": 0.7018063068389893, + "rewards_train/rejected": -1.5478644371032715, + "step": 885 + }, + { + "epoch": 0.25, + "learning_rate": 3.9928964792569654e-07, + "loss": 0.4331, + "step": 886 + }, + { + "epoch": 0.25, + "logps_train/chosen": -52.839752197265625, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -60.27191925048828, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.8090975284576416, + "rewards_train/margins": 0.3689000606536865, + "rewards_train/rejected": -1.1779975891113281, + "step": 886 + }, + { + "epoch": 0.25, + "logps_train/chosen": -40.76837158203125, + "logps_train/ref_chosen": -31.5, + "logps_train/ref_rejected": -33.0, + "logps_train/rejected": -42.9201545715332, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.9231261610984802, + "rewards_train/margins": 0.07709234952926636, + "rewards_train/rejected": -1.0002185106277466, + "step": 887 + }, + { + "epoch": 0.25, + "learning_rate": 3.9879274801538614e-07, + "loss": 0.6346, + "step": 888 + }, + { + "epoch": 0.25, + "logps_train/chosen": -73.1782455444336, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -75.08341979980469, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9022000432014465, + "rewards_train/margins": 0.1959860920906067, + "rewards_train/rejected": -1.0981861352920532, + "step": 888 + }, + { + "epoch": 0.25, + "logps_train/chosen": -76.214599609375, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -87.58216094970703, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.1128662824630737, + "rewards_train/margins": 1.0871468782424927, + "rewards_train/rejected": -2.2000131607055664, + "step": 889 + }, + { + "epoch": 0.25, + "learning_rate": 3.982949361823388e-07, + "loss": 0.6556, + "step": 890 + }, + { + "epoch": 0.25, + "logps_train/chosen": -64.17122650146484, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -76.07058715820312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.46184954047203064, + "rewards_train/margins": 1.1381775438785553, + "rewards_train/rejected": -1.600027084350586, + "step": 890 + }, + { + "epoch": 0.25, + "logps_train/chosen": -52.07596206665039, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -66.04190826416016, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6796664595603943, + "rewards_train/margins": 1.0745243430137634, + "rewards_train/rejected": -1.7541908025741577, + "step": 891 + }, + { + "epoch": 0.25, + "learning_rate": 3.977962154775495e-07, + "loss": 0.4402, + "step": 892 + }, + { + "epoch": 0.25, + "logps_train/chosen": -81.47840118408203, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -98.81383514404297, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8094611167907715, + "rewards_train/margins": 1.1202619075775146, + "rewards_train/rejected": -1.9297230243682861, + "step": 892 + }, + { + "epoch": 0.25, + "logps_train/chosen": -124.6190414428711, + "logps_train/ref_chosen": -112.0, + "logps_train/ref_rejected": -102.5, + "logps_train/rejected": -122.74934387207031, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.2790918350219727, + "rewards_train/margins": 0.726311206817627, + "rewards_train/rejected": -2.0054030418395996, + "step": 893 + }, + { + "epoch": 0.25, + "learning_rate": 3.9729658895758345e-07, + "loss": 0.485, + "step": 894 + }, + { + "epoch": 0.25, + "logps_train/chosen": -36.10619354248047, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -62.65257263183594, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.20097553730010986, + "rewards_train/margins": 1.416821002960205, + "rewards_train/rejected": -1.617796540260315, + "step": 894 + }, + { + "epoch": 0.25, + "logps_train/chosen": -83.04344177246094, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -100.74031829833984, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.4949696063995361, + "rewards_train/margins": 0.9103126525878906, + "rewards_train/rejected": -2.4052822589874268, + "step": 895 + }, + { + "epoch": 0.25, + "learning_rate": 3.967960596845576e-07, + "loss": 0.4898, + "step": 896 + }, + { + "epoch": 0.25, + "logps_train/chosen": -84.88798522949219, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -119.5206298828125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.436795711517334, + "rewards_train/margins": 1.4168298244476318, + "rewards_train/rejected": -2.853625535964966, + "step": 896 + }, + { + "epoch": 0.25, + "logps_train/chosen": -36.703125, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -38.25, + "logps_train/rejected": -44.81077194213867, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.1004394143819809, + "rewards_train/margins": 0.5486068874597549, + "rewards_train/rejected": -0.6490463018417358, + "step": 897 + }, + { + "epoch": 0.25, + "learning_rate": 3.962946307261216e-07, + "loss": 0.3956, + "step": 898 + }, + { + "epoch": 0.25, + "logps_train/chosen": -98.51810455322266, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -106.60423278808594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1647982597351074, + "rewards_train/margins": 1.142890453338623, + "rewards_train/rejected": -2.3076887130737305, + "step": 898 + }, + { + "epoch": 0.25, + "logps_train/chosen": -104.0866470336914, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -106.87762451171875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3336645364761353, + "rewards_train/margins": 0.8150357007980347, + "rewards_train/rejected": -2.14870023727417, + "step": 899 + }, + { + "epoch": 0.25, + "learning_rate": 3.9579230515543914e-07, + "loss": 0.4522, + "step": 900 + }, + { + "epoch": 0.25, + "logps_train/chosen": -86.26780700683594, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -123.68040466308594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1877177953720093, + "rewards_train/margins": 1.0637205839157104, + "rewards_train/rejected": -2.2514383792877197, + "step": 900 + }, + { + "epoch": 0.25, + "logps_train/chosen": -56.91307067871094, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -60.705528259277344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8772448301315308, + "rewards_train/margins": 0.783737301826477, + "rewards_train/rejected": -1.6609821319580078, + "step": 901 + }, + { + "epoch": 0.25, + "learning_rate": 3.952890860511691e-07, + "loss": 0.4604, + "step": 902 + }, + { + "epoch": 0.25, + "logps_train/chosen": -74.55409240722656, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -87.43728637695312, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.4456429481506348, + "rewards_train/margins": 0.5340230464935303, + "rewards_train/rejected": -1.979665994644165, + "step": 902 + }, + { + "epoch": 0.25, + "logps_train/chosen": -63.08944320678711, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -81.43690490722656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7562098503112793, + "rewards_train/margins": 1.2976367473602295, + "rewards_train/rejected": -2.053846597671509, + "step": 903 + }, + { + "epoch": 0.25, + "learning_rate": 3.9478497649744656e-07, + "loss": 0.4786, + "step": 904 + }, + { + "epoch": 0.25, + "logps_train/chosen": -67.21929931640625, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -90.53805541992188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8326723575592041, + "rewards_train/margins": 1.0820711851119995, + "rewards_train/rejected": -1.9147435426712036, + "step": 904 + }, + { + "epoch": 0.25, + "logps_train/chosen": -72.20622253417969, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -80.93544006347656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2575366497039795, + "rewards_train/margins": 0.5289757251739502, + "rewards_train/rejected": -1.7865123748779297, + "step": 905 + }, + { + "epoch": 0.25, + "learning_rate": 3.9427997958386403e-07, + "loss": 0.5094, + "step": 906 + }, + { + "epoch": 0.25, + "logps_train/chosen": -61.616329193115234, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -77.89895629882812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6491086483001709, + "rewards_train/margins": 1.3169593811035156, + "rewards_train/rejected": -1.9660680294036865, + "step": 906 + }, + { + "epoch": 0.25, + "logps_train/chosen": -74.62712097167969, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -76.43041229248047, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.3951337337493896, + "rewards_train/margins": 0.012946128845214844, + "rewards_train/rejected": -1.4080798625946045, + "step": 907 + }, + { + "epoch": 0.25, + "learning_rate": 3.9377409840545254e-07, + "loss": 0.6108, + "step": 908 + }, + { + "epoch": 0.25, + "logps_train/chosen": -75.5404281616211, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -73.61381530761719, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.308144211769104, + "rewards_train/margins": 0.5717917084693909, + "rewards_train/rejected": -0.8799359202384949, + "step": 908 + }, + { + "epoch": 0.25, + "logps_train/chosen": -77.3523941040039, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -79.1645736694336, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.293442726135254, + "rewards_train/margins": 1.035905361175537, + "rewards_train/rejected": -2.329348087310791, + "step": 909 + }, + { + "epoch": 0.25, + "learning_rate": 3.932673360626627e-07, + "loss": 0.4687, + "step": 910 + }, + { + "epoch": 0.25, + "logps_train/chosen": -89.87672424316406, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -90.80999755859375, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.7661880850791931, + "rewards_train/margins": 0.5757489800453186, + "rewards_train/rejected": -1.3419370651245117, + "step": 910 + }, + { + "epoch": 0.25, + "logps_train/chosen": -42.630550384521484, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -54.55341339111328, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4053402245044708, + "rewards_train/margins": 0.9484386146068573, + "rewards_train/rejected": -1.3537788391113281, + "step": 911 + }, + { + "epoch": 0.25, + "learning_rate": 3.9275969566134526e-07, + "loss": 0.5068, + "step": 912 + }, + { + "epoch": 0.25, + "logps_train/chosen": -57.77507400512695, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -66.93160247802734, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.840886116027832, + "rewards_train/margins": 0.19485235214233398, + "rewards_train/rejected": -1.035738468170166, + "step": 912 + }, + { + "epoch": 0.26, + "logps_train/chosen": -46.246376037597656, + "logps_train/ref_chosen": -41.25, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -59.70482635498047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5017860531806946, + "rewards_train/margins": 1.3604934811592102, + "rewards_train/rejected": -1.8622795343399048, + "step": 913 + }, + { + "epoch": 0.26, + "learning_rate": 3.9225118031273285e-07, + "loss": 0.4865, + "step": 914 + }, + { + "epoch": 0.26, + "logps_train/chosen": -74.92893981933594, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -98.67294311523438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.033128261566162, + "rewards_train/margins": 1.7943224906921387, + "rewards_train/rejected": -2.827450752258301, + "step": 914 + }, + { + "epoch": 0.26, + "logps_train/chosen": -79.56307983398438, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -82.70267486572266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1656832695007324, + "rewards_train/margins": 0.8362245559692383, + "rewards_train/rejected": -2.0019078254699707, + "step": 915 + }, + { + "epoch": 0.26, + "learning_rate": 3.917417931334202e-07, + "loss": 0.3316, + "step": 916 + }, + { + "epoch": 0.26, + "logps_train/chosen": -82.14381408691406, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -92.40758514404297, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7503196597099304, + "rewards_train/margins": 1.2701264023780823, + "rewards_train/rejected": -2.0204460620880127, + "step": 916 + }, + { + "epoch": 0.26, + "logps_train/chosen": -60.04490280151367, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -68.78225708007812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3076155483722687, + "rewards_train/margins": 1.4287887513637543, + "rewards_train/rejected": -1.736404299736023, + "step": 917 + }, + { + "epoch": 0.26, + "learning_rate": 3.912315372453455e-07, + "loss": 0.3723, + "step": 918 + }, + { + "epoch": 0.26, + "logps_train/chosen": -96.38775634765625, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -117.23223876953125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.479400396347046, + "rewards_train/margins": 1.7149176597595215, + "rewards_train/rejected": -3.1943180561065674, + "step": 918 + }, + { + "epoch": 0.26, + "logps_train/chosen": -43.03814697265625, + "logps_train/ref_chosen": -38.25, + "logps_train/ref_rejected": -43.75, + "logps_train/rejected": -61.24681854248047, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.48052364587783813, + "rewards_train/margins": 1.2604665160179138, + "rewards_train/rejected": -1.740990161895752, + "step": 919 + }, + { + "epoch": 0.26, + "learning_rate": 3.9072041577577086e-07, + "loss": 0.4571, + "step": 920 + }, + { + "epoch": 0.26, + "logps_train/chosen": -88.52217864990234, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -122.61509704589844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8412805795669556, + "rewards_train/margins": 2.4546045064926147, + "rewards_train/rejected": -3.2958850860595703, + "step": 920 + }, + { + "epoch": 0.26, + "logps_train/chosen": -87.45503997802734, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -82.5659408569336, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8392540216445923, + "rewards_train/margins": 0.7376526594161987, + "rewards_train/rejected": -1.576906681060791, + "step": 921 + }, + { + "epoch": 0.26, + "learning_rate": 3.9020843185726375e-07, + "loss": 0.3199, + "step": 922 + }, + { + "epoch": 0.26, + "logps_train/chosen": -59.98433303833008, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -75.5238265991211, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5667929649353027, + "rewards_train/margins": 0.9804141521453857, + "rewards_train/rejected": -1.5472071170806885, + "step": 922 + }, + { + "epoch": 0.26, + "logps_train/chosen": -101.81692504882812, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -92.60254669189453, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.4176301956176758, + "rewards_train/margins": 0.9957494735717773, + "rewards_train/rejected": -2.413379669189453, + "step": 923 + }, + { + "epoch": 0.26, + "learning_rate": 3.89695588627677e-07, + "loss": 0.4387, + "step": 924 + }, + { + "epoch": 0.26, + "logps_train/chosen": -66.48164367675781, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -59.29444122314453, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6981643438339233, + "rewards_train/margins": 0.5075982809066772, + "rewards_train/rejected": -1.2057626247406006, + "step": 924 + }, + { + "epoch": 0.26, + "logps_train/chosen": -78.38574981689453, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -79.59117126464844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0385751724243164, + "rewards_train/margins": 0.5092141628265381, + "rewards_train/rejected": -1.5477893352508545, + "step": 925 + }, + { + "epoch": 0.26, + "learning_rate": 3.891818892301304e-07, + "loss": 0.549, + "step": 926 + }, + { + "epoch": 0.26, + "logps_train/chosen": -58.79720687866211, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -70.23634338378906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1250334978103638, + "rewards_train/margins": 0.9478198289871216, + "rewards_train/rejected": -2.0728533267974854, + "step": 926 + }, + { + "epoch": 0.26, + "logps_train/chosen": -31.278968811035156, + "logps_train/ref_chosen": -31.0, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -41.0980224609375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.022013187408447266, + "rewards_train/margins": 0.3496786952018738, + "rewards_train/rejected": -0.37169188261032104, + "step": 927 + }, + { + "epoch": 0.26, + "learning_rate": 3.8866733681299066e-07, + "loss": 0.5181, + "step": 928 + }, + { + "epoch": 0.26, + "logps_train/chosen": -92.73963928222656, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -112.15428161621094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3704484701156616, + "rewards_train/margins": 1.499667763710022, + "rewards_train/rejected": -2.8701162338256836, + "step": 928 + }, + { + "epoch": 0.26, + "logps_train/chosen": -69.67546081542969, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -82.57536315917969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.932390034198761, + "rewards_train/margins": 1.0333487391471863, + "rewards_train/rejected": -1.9657387733459473, + "step": 929 + }, + { + "epoch": 0.26, + "learning_rate": 3.8815193452985274e-07, + "loss": 0.351, + "step": 930 + }, + { + "epoch": 0.26, + "logps_train/chosen": -52.05970764160156, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -60.10272216796875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.21065837144851685, + "rewards_train/margins": 1.4384812712669373, + "rewards_train/rejected": -1.649139642715454, + "step": 930 + }, + { + "epoch": 0.26, + "logps_train/chosen": -33.79679489135742, + "logps_train/ref_chosen": -32.25, + "logps_train/ref_rejected": -38.25, + "logps_train/rejected": -49.92681884765625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.14764821529388428, + "rewards_train/margins": 1.0192526578903198, + "rewards_train/rejected": -1.166900873184204, + "step": 931 + }, + { + "epoch": 0.26, + "learning_rate": 3.876356855395202e-07, + "loss": 0.4436, + "step": 932 + }, + { + "epoch": 0.26, + "logps_train/chosen": -57.312294006347656, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -75.57594299316406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1866980493068695, + "rewards_train/margins": 1.2412082254886627, + "rewards_train/rejected": -1.4279062747955322, + "step": 932 + }, + { + "epoch": 0.26, + "logps_train/chosen": -82.04449462890625, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -79.35250091552734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2163639068603516, + "rewards_train/margins": 0.8449602127075195, + "rewards_train/rejected": -2.061324119567871, + "step": 933 + }, + { + "epoch": 0.26, + "learning_rate": 3.8711859300598584e-07, + "loss": 0.4442, + "step": 934 + }, + { + "epoch": 0.26, + "logps_train/chosen": -37.849609375, + "logps_train/ref_chosen": -29.875, + "logps_train/ref_rejected": -39.75, + "logps_train/rejected": -52.62455749511719, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.8034486770629883, + "rewards_train/margins": 0.4742414951324463, + "rewards_train/rejected": -1.2776901721954346, + "step": 934 + }, + { + "epoch": 0.26, + "logps_train/chosen": -77.09475708007812, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -101.5009536743164, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9555695056915283, + "rewards_train/margins": 1.3156194686889648, + "rewards_train/rejected": -2.271188974380493, + "step": 935 + }, + { + "epoch": 0.26, + "learning_rate": 3.866006600984125e-07, + "loss": 0.5286, + "step": 936 + }, + { + "epoch": 0.26, + "logps_train/chosen": -66.92366790771484, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -81.85852813720703, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8860437273979187, + "rewards_train/margins": 1.6044233441352844, + "rewards_train/rejected": -2.490467071533203, + "step": 936 + }, + { + "epoch": 0.26, + "logps_train/chosen": -58.88975524902344, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -63.51271057128906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.958702027797699, + "rewards_train/margins": 0.7064364552497864, + "rewards_train/rejected": -1.6651384830474854, + "step": 937 + }, + { + "epoch": 0.26, + "learning_rate": 3.860818899911134e-07, + "loss": 0.4249, + "step": 938 + }, + { + "epoch": 0.26, + "logps_train/chosen": -75.26493835449219, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -63.5, + "logps_train/rejected": -83.61276245117188, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.7279584407806396, + "rewards_train/margins": 1.2928874492645264, + "rewards_train/rejected": -2.020845890045166, + "step": 938 + }, + { + "epoch": 0.26, + "logps_train/chosen": -80.58453369140625, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -91.38980102539062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2749571800231934, + "rewards_train/margins": 0.7747652530670166, + "rewards_train/rejected": -2.04972243309021, + "step": 939 + }, + { + "epoch": 0.26, + "learning_rate": 3.855622858635329e-07, + "loss": 0.5245, + "step": 940 + }, + { + "epoch": 0.26, + "logps_train/chosen": -68.03158569335938, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -71.84562683105469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.38909628987312317, + "rewards_train/margins": 0.714607447385788, + "rewards_train/rejected": -1.1037037372589111, + "step": 940 + }, + { + "epoch": 0.26, + "logps_train/chosen": -74.56521606445312, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -69.20830535888672, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.1147241592407227, + "rewards_train/margins": -0.026706457138061523, + "rewards_train/rejected": -1.0880177021026611, + "step": 941 + }, + { + "epoch": 0.26, + "learning_rate": 3.850418509002269e-07, + "loss": 0.6317, + "step": 942 + }, + { + "epoch": 0.26, + "logps_train/chosen": -93.73075866699219, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -92.96345520019531, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.697880744934082, + "rewards_train/margins": 0.30159056186676025, + "rewards_train/rejected": -1.9994713068008423, + "step": 942 + }, + { + "epoch": 0.26, + "logps_train/chosen": -60.12846374511719, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -73.21841430664062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.28999507427215576, + "rewards_train/margins": 0.5353621244430542, + "rewards_train/rejected": -0.82535719871521, + "step": 943 + }, + { + "epoch": 0.26, + "learning_rate": 3.845205882908432e-07, + "loss": 0.6042, + "step": 944 + }, + { + "epoch": 0.26, + "logps_train/chosen": -76.77639770507812, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -73.18373107910156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9510769248008728, + "rewards_train/margins": 0.36924880743026733, + "rewards_train/rejected": -1.3203257322311401, + "step": 944 + }, + { + "epoch": 0.26, + "logps_train/chosen": -59.21940994262695, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -86.09684753417969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8250659704208374, + "rewards_train/margins": 1.2748533487319946, + "rewards_train/rejected": -2.099919319152832, + "step": 945 + }, + { + "epoch": 0.26, + "learning_rate": 3.839985012301021e-07, + "loss": 0.4803, + "step": 946 + }, + { + "epoch": 0.26, + "logps_train/chosen": -77.0558853149414, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -75.96937561035156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2387912273406982, + "rewards_train/margins": 0.46146631240844727, + "rewards_train/rejected": -1.7002575397491455, + "step": 946 + }, + { + "epoch": 0.26, + "logps_train/chosen": -39.2894287109375, + "logps_train/ref_chosen": -34.25, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -53.667877197265625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.4969112277030945, + "rewards_train/margins": 0.2210482954978943, + "rewards_train/rejected": -0.7179595232009888, + "step": 947 + }, + { + "epoch": 0.26, + "learning_rate": 3.834755929177772e-07, + "loss": 0.6018, + "step": 948 + }, + { + "epoch": 0.26, + "logps_train/chosen": -82.52078247070312, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -75.34527587890625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6145778894424438, + "rewards_train/margins": 1.3945590257644653, + "rewards_train/rejected": -2.009136915206909, + "step": 948 + }, + { + "epoch": 0.27, + "logps_train/chosen": -117.22262573242188, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -127.93438720703125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.590231418609619, + "rewards_train/margins": 1.379183292388916, + "rewards_train/rejected": -3.969414710998535, + "step": 949 + }, + { + "epoch": 0.27, + "learning_rate": 3.8295186655867484e-07, + "loss": 0.4674, + "step": 950 + }, + { + "epoch": 0.27, + "logps_train/chosen": -107.68785095214844, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -121.63465881347656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.365269899368286, + "rewards_train/margins": 1.8106958866119385, + "rewards_train/rejected": -4.175965785980225, + "step": 950 + }, + { + "epoch": 0.27, + "logps_train/chosen": -76.14000701904297, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -120.9788589477539, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3702507019042969, + "rewards_train/margins": 2.1659164428710938, + "rewards_train/rejected": -2.5361671447753906, + "step": 951 + }, + { + "epoch": 0.27, + "learning_rate": 3.8242732536261534e-07, + "loss": 0.3037, + "step": 952 + }, + { + "epoch": 0.27, + "logps_train/chosen": -114.98648834228516, + "logps_train/ref_chosen": -101.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -152.13429260253906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4033362865447998, + "rewards_train/margins": 2.6171247959136963, + "rewards_train/rejected": -4.020461082458496, + "step": 952 + }, + { + "epoch": 0.27, + "logps_train/chosen": -93.19520568847656, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -125.11563110351562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.676551342010498, + "rewards_train/margins": 2.2264187335968018, + "rewards_train/rejected": -2.9029700756073, + "step": 953 + }, + { + "epoch": 0.27, + "learning_rate": 3.819019725444129e-07, + "loss": 0.2898, + "step": 954 + }, + { + "epoch": 0.27, + "logps_train/chosen": -93.14697265625, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -114.73471069335938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6443849802017212, + "rewards_train/margins": 1.9978359937667847, + "rewards_train/rejected": -2.642220973968506, + "step": 954 + }, + { + "epoch": 0.27, + "logps_train/chosen": -34.24162673950195, + "logps_train/ref_chosen": -28.875, + "logps_train/ref_rejected": -42.25, + "logps_train/rejected": -52.270198822021484, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5331470966339111, + "rewards_train/margins": 0.47202205657958984, + "rewards_train/rejected": -1.005169153213501, + "step": 955 + }, + { + "epoch": 0.27, + "learning_rate": 3.813758113238561e-07, + "loss": 0.4821, + "step": 956 + }, + { + "epoch": 0.27, + "logps_train/chosen": -57.884178161621094, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -73.18476104736328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7455227375030518, + "rewards_train/margins": 1.4659221172332764, + "rewards_train/rejected": -2.211444854736328, + "step": 956 + }, + { + "epoch": 0.27, + "logps_train/chosen": -80.27413177490234, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -107.8955307006836, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2862021923065186, + "rewards_train/margins": 0.8603823184967041, + "rewards_train/rejected": -2.1465845108032227, + "step": 957 + }, + { + "epoch": 0.27, + "learning_rate": 3.808488449256879e-07, + "loss": 0.4038, + "step": 958 + }, + { + "epoch": 0.27, + "logps_train/chosen": -55.29138946533203, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -76.25001525878906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.17679546773433685, + "rewards_train/margins": 0.6376593858003616, + "rewards_train/rejected": -0.8144548535346985, + "step": 958 + }, + { + "epoch": 0.27, + "logps_train/chosen": -99.0465087890625, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -56.75, + "logps_train/rejected": -74.69607543945312, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.4119257926940918, + "rewards_train/margins": 0.3733067512512207, + "rewards_train/rejected": -1.7852325439453125, + "step": 959 + }, + { + "epoch": 0.27, + "learning_rate": 3.8032107657958626e-07, + "loss": 0.6727, + "step": 960 + }, + { + "epoch": 0.27, + "logps_train/chosen": -28.30646514892578, + "logps_train/ref_chosen": -27.375, + "logps_train/ref_rejected": -28.375, + "logps_train/rejected": -37.51717758178711, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.08845892548561096, + "rewards_train/margins": 0.8301533758640289, + "rewards_train/rejected": -0.9186123013496399, + "step": 960 + }, + { + "epoch": 0.27, + "logps_train/chosen": -31.715511322021484, + "logps_train/ref_chosen": -28.0, + "logps_train/ref_rejected": -28.125, + "logps_train/rejected": -35.91783142089844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.37037909030914307, + "rewards_train/margins": 0.40538859367370605, + "rewards_train/rejected": -0.7757676839828491, + "step": 961 + }, + { + "epoch": 0.27, + "learning_rate": 3.797925095201438e-07, + "loss": 0.5759, + "step": 962 + }, + { + "epoch": 0.27, + "logps_train/chosen": -66.50740814208984, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -103.23048400878906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7607013583183289, + "rewards_train/margins": 2.427582085132599, + "rewards_train/rejected": -3.1882834434509277, + "step": 962 + }, + { + "epoch": 0.27, + "logps_train/chosen": -62.93327331542969, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -76.07221984863281, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7359054088592529, + "rewards_train/margins": 1.2978789806365967, + "rewards_train/rejected": -2.0337843894958496, + "step": 963 + }, + { + "epoch": 0.27, + "learning_rate": 3.792631469868487e-07, + "loss": 0.3608, + "step": 964 + }, + { + "epoch": 0.27, + "logps_train/chosen": -69.26075744628906, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -78.39433288574219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.73271644115448, + "rewards_train/margins": 0.6305446624755859, + "rewards_train/rejected": -1.363261103630066, + "step": 964 + }, + { + "epoch": 0.27, + "logps_train/chosen": -85.05624389648438, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -91.3147964477539, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2844817042350769, + "rewards_train/margins": 1.776245892047882, + "rewards_train/rejected": -2.060727596282959, + "step": 965 + }, + { + "epoch": 0.27, + "learning_rate": 3.787329922240642e-07, + "loss": 0.4784, + "step": 966 + }, + { + "epoch": 0.27, + "logps_train/chosen": -59.374000549316406, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -70.02639770507812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.36415785551071167, + "rewards_train/margins": 1.7120173573493958, + "rewards_train/rejected": -2.0761752128601074, + "step": 966 + }, + { + "epoch": 0.27, + "logps_train/chosen": -73.93157958984375, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -78.58828735351562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7268491983413696, + "rewards_train/margins": 1.136960506439209, + "rewards_train/rejected": -1.8638097047805786, + "step": 967 + }, + { + "epoch": 0.27, + "learning_rate": 3.782020484810089e-07, + "loss": 0.3588, + "step": 968 + }, + { + "epoch": 0.27, + "logps_train/chosen": -75.43313598632812, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -83.15252685546875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.362356424331665, + "rewards_train/margins": 0.17281866073608398, + "rewards_train/rejected": -1.535175085067749, + "step": 968 + }, + { + "epoch": 0.27, + "logps_train/chosen": -99.70472717285156, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -113.51841735839844, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.9423480033874512, + "rewards_train/margins": 1.1915254592895508, + "rewards_train/rejected": -3.133873462677002, + "step": 969 + }, + { + "epoch": 0.27, + "learning_rate": 3.776703190117372e-07, + "loss": 0.5644, + "step": 970 + }, + { + "epoch": 0.27, + "logps_train/chosen": -52.91246032714844, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -58.44598388671875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.7568711042404175, + "rewards_train/margins": 0.3406568765640259, + "rewards_train/rejected": -1.0975279808044434, + "step": 970 + }, + { + "epoch": 0.27, + "logps_train/chosen": -36.84766387939453, + "logps_train/ref_chosen": -35.0, + "logps_train/ref_rejected": -27.625, + "logps_train/rejected": -32.782142639160156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.17265711724758148, + "rewards_train/margins": 0.3489164263010025, + "rewards_train/rejected": -0.521573543548584, + "step": 971 + }, + { + "epoch": 0.27, + "learning_rate": 3.771378070751188e-07, + "loss": 0.5983, + "step": 972 + }, + { + "epoch": 0.27, + "logps_train/chosen": -61.879356384277344, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -71.01873779296875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.14887315034866333, + "rewards_train/margins": 0.886203944683075, + "rewards_train/rejected": -1.0350770950317383, + "step": 972 + }, + { + "epoch": 0.27, + "logps_train/chosen": -60.30873107910156, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -45.75, + "logps_train/rejected": -57.47590637207031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7766494154930115, + "rewards_train/margins": 0.3982852101325989, + "rewards_train/rejected": -1.1749346256256104, + "step": 973 + }, + { + "epoch": 0.27, + "learning_rate": 3.7660451593481906e-07, + "loss": 0.4696, + "step": 974 + }, + { + "epoch": 0.27, + "logps_train/chosen": -84.56649017333984, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -82.98353576660156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.775667428970337, + "rewards_train/margins": 0.5963926315307617, + "rewards_train/rejected": -2.3720600605010986, + "step": 974 + }, + { + "epoch": 0.27, + "logps_train/chosen": -57.22075653076172, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -57.888938903808594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.24507342278957367, + "rewards_train/margins": 0.9418674856424332, + "rewards_train/rejected": -1.1869409084320068, + "step": 975 + }, + { + "epoch": 0.27, + "learning_rate": 3.7607044885927913e-07, + "loss": 0.5744, + "step": 976 + }, + { + "epoch": 0.27, + "logps_train/chosen": -116.5517578125, + "logps_train/ref_chosen": -106.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -149.39083862304688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0426762104034424, + "rewards_train/margins": 2.0589070320129395, + "rewards_train/rejected": -3.101583242416382, + "step": 976 + }, + { + "epoch": 0.27, + "logps_train/chosen": -100.63493347167969, + "logps_train/ref_chosen": -86.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -96.67007446289062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4766771793365479, + "rewards_train/margins": 0.6221663951873779, + "rewards_train/rejected": -2.098843574523926, + "step": 977 + }, + { + "epoch": 0.27, + "learning_rate": 3.7553560912169533e-07, + "loss": 0.3579, + "step": 978 + }, + { + "epoch": 0.27, + "logps_train/chosen": -63.02811050415039, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -115.67050170898438, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8616006374359131, + "rewards_train/margins": 2.8093068599700928, + "rewards_train/rejected": -3.670907497406006, + "step": 978 + }, + { + "epoch": 0.27, + "logps_train/chosen": -63.73190689086914, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -85.74191284179688, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.8986788988113403, + "rewards_train/margins": 0.5228750705718994, + "rewards_train/rejected": -1.4215539693832397, + "step": 979 + }, + { + "epoch": 0.27, + "learning_rate": 3.75e-07, + "loss": 0.4244, + "step": 980 + }, + { + "epoch": 0.27, + "logps_train/chosen": -44.13560485839844, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -35.25, + "logps_train/rejected": -43.51254653930664, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.30501577258110046, + "rewards_train/margins": 0.5095200836658478, + "rewards_train/rejected": -0.8145358562469482, + "step": 980 + }, + { + "epoch": 0.27, + "logps_train/chosen": -38.95943069458008, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -45.373722076416016, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.35805249214172363, + "rewards_train/margins": 0.15978842973709106, + "rewards_train/rejected": -0.5178409218788147, + "step": 981 + }, + { + "epoch": 0.27, + "learning_rate": 3.744636247768404e-07, + "loss": 0.6043, + "step": 982 + }, + { + "epoch": 0.27, + "logps_train/chosen": -96.59376525878906, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -111.5, + "logps_train/rejected": -136.58489990234375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7636739611625671, + "rewards_train/margins": 1.7620039582252502, + "rewards_train/rejected": -2.5256779193878174, + "step": 982 + }, + { + "epoch": 0.27, + "logps_train/chosen": -69.85820007324219, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -67.28929901123047, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4448046684265137, + "rewards_train/margins": 0.3501410484313965, + "rewards_train/rejected": -1.7949457168579102, + "step": 983 + }, + { + "epoch": 0.28, + "learning_rate": 3.739264867395593e-07, + "loss": 0.4394, + "step": 984 + }, + { + "epoch": 0.28, + "logps_train/chosen": -96.44001007080078, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -117.32788848876953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6221262812614441, + "rewards_train/margins": 1.725213348865509, + "rewards_train/rejected": -2.347339630126953, + "step": 984 + }, + { + "epoch": 0.28, + "logps_train/chosen": -60.99164581298828, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -68.77722930908203, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7296335697174072, + "rewards_train/margins": 0.820159912109375, + "rewards_train/rejected": -1.5497934818267822, + "step": 985 + }, + { + "epoch": 0.28, + "learning_rate": 3.7338858918017455e-07, + "loss": 0.3727, + "step": 986 + }, + { + "epoch": 0.28, + "logps_train/chosen": -33.59450912475586, + "logps_train/ref_chosen": -24.625, + "logps_train/ref_rejected": -39.75, + "logps_train/rejected": -53.87938690185547, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8946071863174438, + "rewards_train/margins": 0.527706503868103, + "rewards_train/rejected": -1.4223136901855469, + "step": 986 + }, + { + "epoch": 0.28, + "logps_train/chosen": -73.20545196533203, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -78.76162719726562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4795297086238861, + "rewards_train/margins": 1.1833525002002716, + "rewards_train/rejected": -1.6628822088241577, + "step": 987 + }, + { + "epoch": 0.28, + "learning_rate": 3.728499353953591e-07, + "loss": 0.4296, + "step": 988 + }, + { + "epoch": 0.28, + "logps_train/chosen": -92.98452758789062, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -112.6695556640625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.30782726407051086, + "rewards_train/margins": 2.038816660642624, + "rewards_train/rejected": -2.3466439247131348, + "step": 988 + }, + { + "epoch": 0.28, + "logps_train/chosen": -41.62238311767578, + "logps_train/ref_chosen": -35.25, + "logps_train/ref_rejected": -32.75, + "logps_train/rejected": -45.17969512939453, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6384590864181519, + "rewards_train/margins": 0.6095397472381592, + "rewards_train/rejected": -1.247998833656311, + "step": 989 + }, + { + "epoch": 0.28, + "learning_rate": 3.7231052868642066e-07, + "loss": 0.4056, + "step": 990 + }, + { + "epoch": 0.28, + "logps_train/chosen": -77.09317016601562, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -92.37982940673828, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6284575462341309, + "rewards_train/margins": 0.4040563106536865, + "rewards_train/rejected": -2.0325138568878174, + "step": 990 + }, + { + "epoch": 0.28, + "logps_train/chosen": -76.18235778808594, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -104.20475769042969, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1693096160888672, + "rewards_train/margins": 1.309173583984375, + "rewards_train/rejected": -2.478483200073242, + "step": 991 + }, + { + "epoch": 0.28, + "learning_rate": 3.717703723592811e-07, + "loss": 0.5202, + "step": 992 + }, + { + "epoch": 0.28, + "logps_train/chosen": -78.13817596435547, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -107.66416931152344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7498531341552734, + "rewards_train/margins": 2.491173505783081, + "rewards_train/rejected": -3.2410266399383545, + "step": 992 + }, + { + "epoch": 0.28, + "logps_train/chosen": -33.91649627685547, + "logps_train/ref_chosen": -30.375, + "logps_train/ref_rejected": -15.375, + "logps_train/rejected": -22.65131378173828, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.34868061542510986, + "rewards_train/margins": 0.3814411163330078, + "rewards_train/rejected": -0.7301217317581177, + "step": 993 + }, + { + "epoch": 0.28, + "learning_rate": 3.7122946972445717e-07, + "loss": 0.4623, + "step": 994 + }, + { + "epoch": 0.28, + "logps_train/chosen": -58.274356842041016, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -92.54460906982422, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.961810827255249, + "rewards_train/margins": 1.0485093593597412, + "rewards_train/rejected": -2.0103201866149902, + "step": 994 + }, + { + "epoch": 0.28, + "logps_train/chosen": -89.96923828125, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -96.57150268554688, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8988767862319946, + "rewards_train/margins": 0.9582729339599609, + "rewards_train/rejected": -1.8571497201919556, + "step": 995 + }, + { + "epoch": 0.28, + "learning_rate": 3.706878240970391e-07, + "loss": 0.4345, + "step": 996 + }, + { + "epoch": 0.28, + "logps_train/chosen": -93.88801574707031, + "logps_train/ref_chosen": -86.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -103.30751037597656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8087237477302551, + "rewards_train/margins": 0.9267154335975647, + "rewards_train/rejected": -1.7354391813278198, + "step": 996 + }, + { + "epoch": 0.28, + "logps_train/chosen": -59.505218505859375, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -64.93518829345703, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.85823655128479, + "rewards_train/margins": 0.6938760280609131, + "rewards_train/rejected": -1.5521125793457031, + "step": 997 + }, + { + "epoch": 0.28, + "learning_rate": 3.7014543879667095e-07, + "loss": 0.4147, + "step": 998 + }, + { + "epoch": 0.28, + "logps_train/chosen": -128.09014892578125, + "logps_train/ref_chosen": -108.5, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -121.72225189208984, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.9648754596710205, + "rewards_train/margins": 0.9577405452728271, + "rewards_train/rejected": -2.9226160049438477, + "step": 998 + }, + { + "epoch": 0.28, + "logps_train/chosen": -88.49454498291016, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -89.01692199707031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7994546294212341, + "rewards_train/margins": 0.8420813679695129, + "rewards_train/rejected": -1.641535997390747, + "step": 999 + }, + { + "epoch": 0.28, + "learning_rate": 3.696023171475301e-07, + "loss": 0.5059, + "step": 1000 + }, + { + "epoch": 0.28, + "logps_train/chosen": -92.29740905761719, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -124.15961456298828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.365678608417511, + "rewards_train/margins": 2.425282657146454, + "rewards_train/rejected": -2.790961265563965, + "step": 1000 + }, + { + "epoch": 0.28, + "logps_train/chosen": -39.370155334472656, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -50.7298698425293, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.35166412591934204, + "rewards_train/margins": 0.7014012932777405, + "rewards_train/rejected": -1.0530654191970825, + "step": 1001 + }, + { + "epoch": 0.28, + "learning_rate": 3.69058462478307e-07, + "loss": 0.3835, + "step": 1002 + }, + { + "epoch": 0.28, + "logps_train/chosen": -55.21368408203125, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -72.70394897460938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8161681890487671, + "rewards_train/margins": 1.2327419519424438, + "rewards_train/rejected": -2.048910140991211, + "step": 1002 + }, + { + "epoch": 0.28, + "logps_train/chosen": -73.49148559570312, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -100.387451171875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5975865721702576, + "rewards_train/margins": 1.8755331635475159, + "rewards_train/rejected": -2.4731197357177734, + "step": 1003 + }, + { + "epoch": 0.28, + "learning_rate": 3.685138781221844e-07, + "loss": 0.2971, + "step": 1004 + }, + { + "epoch": 0.28, + "logps_train/chosen": -76.83126068115234, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -78.23001098632812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6876183152198792, + "rewards_train/margins": 0.9588205218315125, + "rewards_train/rejected": -1.6464388370513916, + "step": 1004 + }, + { + "epoch": 0.28, + "logps_train/chosen": -92.24978637695312, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -117.96641540527344, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.6948999166488647, + "rewards_train/margins": 1.4544755220413208, + "rewards_train/rejected": -3.1493754386901855, + "step": 1005 + }, + { + "epoch": 0.28, + "learning_rate": 3.6796856741681726e-07, + "loss": 0.436, + "step": 1006 + }, + { + "epoch": 0.28, + "logps_train/chosen": -87.13131713867188, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -85.78050994873047, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.376413106918335, + "rewards_train/margins": 0.4496849775314331, + "rewards_train/rejected": -1.826098084449768, + "step": 1006 + }, + { + "epoch": 0.28, + "logps_train/chosen": -87.46511840820312, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -96.9076919555664, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0320582389831543, + "rewards_train/margins": 1.220038890838623, + "rewards_train/rejected": -2.2520971298217773, + "step": 1007 + }, + { + "epoch": 0.28, + "learning_rate": 3.674225337043123e-07, + "loss": 0.6127, + "step": 1008 + }, + { + "epoch": 0.28, + "logps_train/chosen": -84.76114654541016, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -96.82876586914062, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.4245526790618896, + "rewards_train/margins": 0.9680891036987305, + "rewards_train/rejected": -2.39264178276062, + "step": 1008 + }, + { + "epoch": 0.28, + "logps_train/chosen": -61.201988220214844, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -40.0, + "logps_train/rejected": -50.20012664794922, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8639487028121948, + "rewards_train/margins": 0.16407179832458496, + "rewards_train/rejected": -1.0280205011367798, + "step": 1009 + }, + { + "epoch": 0.28, + "learning_rate": 3.6687578033120736e-07, + "loss": 0.5959, + "step": 1010 + }, + { + "epoch": 0.28, + "logps_train/chosen": -73.82183074951172, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -112.18840026855469, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.4138238430023193, + "rewards_train/margins": 0.7862663269042969, + "rewards_train/rejected": -2.200090169906616, + "step": 1010 + }, + { + "epoch": 0.28, + "logps_train/chosen": -86.24446105957031, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -90.42868041992188, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9603840112686157, + "rewards_train/margins": 0.57701575756073, + "rewards_train/rejected": -1.5373997688293457, + "step": 1011 + }, + { + "epoch": 0.28, + "learning_rate": 3.6632831064845077e-07, + "loss": 0.4987, + "step": 1012 + }, + { + "epoch": 0.28, + "logps_train/chosen": -33.3088493347168, + "logps_train/ref_chosen": -29.75, + "logps_train/ref_rejected": -32.0, + "logps_train/rejected": -48.71315383911133, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3562754690647125, + "rewards_train/margins": 1.307129830121994, + "rewards_train/rejected": -1.6634052991867065, + "step": 1012 + }, + { + "epoch": 0.28, + "logps_train/chosen": -41.54804229736328, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -67.42402648925781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.26105427742004395, + "rewards_train/margins": 1.3075196743011475, + "rewards_train/rejected": -1.5685739517211914, + "step": 1013 + }, + { + "epoch": 0.28, + "learning_rate": 3.657801280113813e-07, + "loss": 0.4629, + "step": 1014 + }, + { + "epoch": 0.28, + "logps_train/chosen": -67.31124114990234, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -93.14631652832031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5881553292274475, + "rewards_train/margins": 1.8823363184928894, + "rewards_train/rejected": -2.470491647720337, + "step": 1014 + }, + { + "epoch": 0.28, + "logps_train/chosen": -56.12282943725586, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -61.75353240966797, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6810327768325806, + "rewards_train/margins": 0.5302577018737793, + "rewards_train/rejected": -1.2112904787063599, + "step": 1015 + }, + { + "epoch": 0.28, + "learning_rate": 3.6523123577970693e-07, + "loss": 0.4064, + "step": 1016 + }, + { + "epoch": 0.28, + "logps_train/chosen": -127.6142807006836, + "logps_train/ref_chosen": -104.5, + "logps_train/ref_rejected": -107.5, + "logps_train/rejected": -140.6337890625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.3192403316497803, + "rewards_train/margins": 0.981637716293335, + "rewards_train/rejected": -3.3008780479431152, + "step": 1016 + }, + { + "epoch": 0.28, + "logps_train/chosen": -44.224998474121094, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -61.081878662109375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4836325943470001, + "rewards_train/margins": 1.0846136510372162, + "rewards_train/rejected": -1.5682462453842163, + "step": 1017 + }, + { + "epoch": 0.28, + "learning_rate": 3.64681637317485e-07, + "loss": 0.4366, + "step": 1018 + }, + { + "epoch": 0.28, + "logps_train/chosen": -100.96199798583984, + "logps_train/ref_chosen": -83.5, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -116.30669403076172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7578212022781372, + "rewards_train/margins": 1.118648886680603, + "rewards_train/rejected": -2.8764700889587402, + "step": 1018 + }, + { + "epoch": 0.28, + "logps_train/chosen": -66.99490356445312, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -78.38606262207031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3624570071697235, + "rewards_train/margins": 1.403178483247757, + "rewards_train/rejected": -1.7656354904174805, + "step": 1019 + }, + { + "epoch": 0.29, + "learning_rate": 3.6413133599310096e-07, + "loss": 0.5204, + "step": 1020 + }, + { + "epoch": 0.29, + "logps_train/chosen": -83.38843536376953, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -85.05964660644531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5329840183258057, + "rewards_train/margins": 0.5751283168792725, + "rewards_train/rejected": -2.108112335205078, + "step": 1020 + }, + { + "epoch": 0.29, + "logps_train/chosen": -53.12415313720703, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -53.799285888671875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7831181883811951, + "rewards_train/margins": 0.5073574185371399, + "rewards_train/rejected": -1.290475606918335, + "step": 1021 + }, + { + "epoch": 0.29, + "learning_rate": 3.635803351792479e-07, + "loss": 0.5144, + "step": 1022 + }, + { + "epoch": 0.29, + "logps_train/chosen": -76.83980560302734, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -97.3507308959961, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4844684898853302, + "rewards_train/margins": 1.1534366309642792, + "rewards_train/rejected": -1.6379051208496094, + "step": 1022 + }, + { + "epoch": 0.29, + "logps_train/chosen": -33.135948181152344, + "logps_train/ref_chosen": -30.375, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -57.80500793457031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2725791931152344, + "rewards_train/margins": 0.8645621538162231, + "rewards_train/rejected": -1.1371413469314575, + "step": 1023 + }, + { + "epoch": 0.29, + "learning_rate": 3.630286382529061e-07, + "loss": 0.4425, + "step": 1024 + }, + { + "epoch": 0.29, + "logps_train/chosen": -55.14508056640625, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -42.0, + "logps_train/rejected": -52.56376266479492, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.35552334785461426, + "rewards_train/margins": 0.6935285329818726, + "rewards_train/rejected": -1.0490518808364868, + "step": 1024 + }, + { + "epoch": 0.29, + "logps_train/chosen": -36.391231536865234, + "logps_train/ref_chosen": -31.125, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -59.95484924316406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5218378305435181, + "rewards_train/margins": 1.2287250757217407, + "rewards_train/rejected": -1.7505629062652588, + "step": 1025 + }, + { + "epoch": 0.29, + "learning_rate": 3.6247624859532223e-07, + "loss": 0.4426, + "step": 1026 + }, + { + "epoch": 0.29, + "logps_train/chosen": -44.920921325683594, + "logps_train/ref_chosen": -41.75, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -58.58968734741211, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3239278793334961, + "rewards_train/margins": 0.7397284507751465, + "rewards_train/rejected": -1.0636563301086426, + "step": 1026 + }, + { + "epoch": 0.29, + "logps_train/chosen": -22.355199813842773, + "logps_train/ref_chosen": -18.75, + "logps_train/ref_rejected": -21.625, + "logps_train/rejected": -28.59585952758789, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.36482909321784973, + "rewards_train/margins": 0.33109697699546814, + "rewards_train/rejected": -0.6959260702133179, + "step": 1027 + }, + { + "epoch": 0.29, + "learning_rate": 3.619231695919884e-07, + "loss": 0.5221, + "step": 1028 + }, + { + "epoch": 0.29, + "logps_train/chosen": -83.94831085205078, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -72.9941177368164, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.0690497159957886, + "rewards_train/margins": 0.6928616762161255, + "rewards_train/rejected": -1.761911392211914, + "step": 1028 + }, + { + "epoch": 0.29, + "logps_train/chosen": -50.01479721069336, + "logps_train/ref_chosen": -46.5, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -65.52947998046875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.33976075053215027, + "rewards_train/margins": 0.7421911060810089, + "rewards_train/rejected": -1.0819518566131592, + "step": 1029 + }, + { + "epoch": 0.29, + "learning_rate": 3.613694046326217e-07, + "loss": 0.5026, + "step": 1030 + }, + { + "epoch": 0.29, + "logps_train/chosen": -64.20675659179688, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -63.5363655090332, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9573944807052612, + "rewards_train/margins": 0.35444533824920654, + "rewards_train/rejected": -1.3118398189544678, + "step": 1030 + }, + { + "epoch": 0.29, + "logps_train/chosen": -64.35011291503906, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -94.18923950195312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.395460307598114, + "rewards_train/margins": 1.5453386902809143, + "rewards_train/rejected": -1.9407989978790283, + "step": 1031 + }, + { + "epoch": 0.29, + "learning_rate": 3.608149571111434e-07, + "loss": 0.4341, + "step": 1032 + }, + { + "epoch": 0.29, + "logps_train/chosen": -54.97735595703125, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -30.75, + "logps_train/rejected": -44.96813201904297, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6032044887542725, + "rewards_train/margins": 0.8182178735733032, + "rewards_train/rejected": -1.4214223623275757, + "step": 1032 + }, + { + "epoch": 0.29, + "logps_train/chosen": -37.03369140625, + "logps_train/ref_chosen": -34.75, + "logps_train/ref_rejected": -45.75, + "logps_train/rejected": -61.21590042114258, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.23774416744709015, + "rewards_train/margins": 1.2986898869276047, + "rewards_train/rejected": -1.5364340543746948, + "step": 1033 + }, + { + "epoch": 0.29, + "learning_rate": 3.6025983042565787e-07, + "loss": 0.3872, + "step": 1034 + }, + { + "epoch": 0.29, + "logps_train/chosen": -117.79395294189453, + "logps_train/ref_chosen": -97.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -131.65074157714844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.0825207233428955, + "rewards_train/margins": 0.6700534820556641, + "rewards_train/rejected": -2.7525742053985596, + "step": 1034 + }, + { + "epoch": 0.29, + "logps_train/chosen": -127.1763916015625, + "logps_train/ref_chosen": -111.5, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -129.92922973632812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5638302564620972, + "rewards_train/margins": 0.9345625638961792, + "rewards_train/rejected": -2.4983928203582764, + "step": 1035 + }, + { + "epoch": 0.29, + "learning_rate": 3.59704027978432e-07, + "loss": 0.5562, + "step": 1036 + }, + { + "epoch": 0.29, + "logps_train/chosen": -40.47643280029297, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -63.31345748901367, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6408072710037231, + "rewards_train/margins": 0.6225696802139282, + "rewards_train/rejected": -1.2633769512176514, + "step": 1036 + }, + { + "epoch": 0.29, + "logps_train/chosen": -59.064483642578125, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -85.43781280517578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4763703942298889, + "rewards_train/margins": 1.6791290640830994, + "rewards_train/rejected": -2.1554994583129883, + "step": 1037 + }, + { + "epoch": 0.29, + "learning_rate": 3.591475531758745e-07, + "loss": 0.4342, + "step": 1038 + }, + { + "epoch": 0.29, + "logps_train/chosen": -45.847835540771484, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -40.75, + "logps_train/rejected": -51.582645416259766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5097836852073669, + "rewards_train/margins": 0.5814889073371887, + "rewards_train/rejected": -1.0912725925445557, + "step": 1038 + }, + { + "epoch": 0.29, + "logps_train/chosen": -40.81827926635742, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -57.85715866088867, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.23973798751831055, + "rewards_train/margins": 0.45515745878219604, + "rewards_train/rejected": -0.6948954463005066, + "step": 1039 + }, + { + "epoch": 0.29, + "learning_rate": 3.585904094285145e-07, + "loss": 0.5034, + "step": 1040 + }, + { + "epoch": 0.29, + "logps_train/chosen": -106.05583190917969, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -100.0, + "logps_train/rejected": -137.294921875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8173024654388428, + "rewards_train/margins": 1.9246892929077148, + "rewards_train/rejected": -3.7419917583465576, + "step": 1040 + }, + { + "epoch": 0.29, + "logps_train/chosen": -84.23391723632812, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -113.50904846191406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9944851994514465, + "rewards_train/margins": 1.298607051372528, + "rewards_train/rejected": -2.2930922508239746, + "step": 1041 + }, + { + "epoch": 0.29, + "learning_rate": 3.5803260015098113e-07, + "loss": 0.4149, + "step": 1042 + }, + { + "epoch": 0.29, + "logps_train/chosen": -89.83704376220703, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -101.71491241455078, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6047983169555664, + "rewards_train/margins": 1.0354430675506592, + "rewards_train/rejected": -1.6402413845062256, + "step": 1042 + }, + { + "epoch": 0.29, + "logps_train/chosen": -74.22798919677734, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -79.30785369873047, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.019283413887024, + "rewards_train/margins": 1.2669705152511597, + "rewards_train/rejected": -2.2862539291381836, + "step": 1043 + }, + { + "epoch": 0.29, + "learning_rate": 3.5747412876198256e-07, + "loss": 0.4699, + "step": 1044 + }, + { + "epoch": 0.29, + "logps_train/chosen": -39.050621032714844, + "logps_train/ref_chosen": -29.25, + "logps_train/ref_rejected": -26.0, + "logps_train/rejected": -37.39555740356445, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.985042929649353, + "rewards_train/margins": 0.15246200561523438, + "rewards_train/rejected": -1.1375049352645874, + "step": 1044 + }, + { + "epoch": 0.29, + "logps_train/chosen": -69.05142974853516, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -101.85133361816406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5389320850372314, + "rewards_train/margins": 1.8883883953094482, + "rewards_train/rejected": -3.4273204803466797, + "step": 1045 + }, + { + "epoch": 0.29, + "learning_rate": 3.5691499868428463e-07, + "loss": 0.491, + "step": 1046 + }, + { + "epoch": 0.29, + "logps_train/chosen": -45.52775192260742, + "logps_train/ref_chosen": -41.25, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -61.117061614990234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4158613085746765, + "rewards_train/margins": 1.1839309334754944, + "rewards_train/rejected": -1.599792242050171, + "step": 1046 + }, + { + "epoch": 0.29, + "logps_train/chosen": -73.66189575195312, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -76.9146728515625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7325963973999023, + "rewards_train/margins": 0.7260591983795166, + "rewards_train/rejected": -1.458655595779419, + "step": 1047 + }, + { + "epoch": 0.29, + "learning_rate": 3.563552133446904e-07, + "loss": 0.4242, + "step": 1048 + }, + { + "epoch": 0.29, + "logps_train/chosen": -73.01969909667969, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -92.4240493774414, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0511879920959473, + "rewards_train/margins": 1.2873108386993408, + "rewards_train/rejected": -2.338498830795288, + "step": 1048 + }, + { + "epoch": 0.29, + "logps_train/chosen": -81.02203369140625, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -83.94853973388672, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.40083646774292, + "rewards_train/margins": 0.8600332736968994, + "rewards_train/rejected": -2.2608697414398193, + "step": 1049 + }, + { + "epoch": 0.29, + "learning_rate": 3.557947761740188e-07, + "loss": 0.415, + "step": 1050 + }, + { + "epoch": 0.29, + "logps_train/chosen": -99.14959716796875, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -111.18470764160156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.9208005666732788, + "rewards_train/margins": 1.4121047258377075, + "rewards_train/rejected": -3.3329052925109863, + "step": 1050 + }, + { + "epoch": 0.29, + "logps_train/chosen": -46.27050018310547, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -65.02919006347656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5790029764175415, + "rewards_train/margins": 0.8339753150939941, + "rewards_train/rejected": -1.4129782915115356, + "step": 1051 + }, + { + "epoch": 0.29, + "learning_rate": 3.5523369060708374e-07, + "loss": 0.4856, + "step": 1052 + }, + { + "epoch": 0.29, + "logps_train/chosen": -53.709083557128906, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -66.9501724243164, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4507913291454315, + "rewards_train/margins": 0.4293822944164276, + "rewards_train/rejected": -0.8801736235618591, + "step": 1052 + }, + { + "epoch": 0.29, + "logps_train/chosen": -60.62303924560547, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -89.16844177246094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4779288172721863, + "rewards_train/margins": 2.288133919239044, + "rewards_train/rejected": -2.7660627365112305, + "step": 1053 + }, + { + "epoch": 0.29, + "learning_rate": 3.546719600826729e-07, + "loss": 0.4181, + "step": 1054 + }, + { + "epoch": 0.29, + "logps_train/chosen": -62.254615783691406, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -68.5887451171875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.232297658920288, + "rewards_train/margins": 0.6871232986450195, + "rewards_train/rejected": -1.9194209575653076, + "step": 1054 + }, + { + "epoch": 0.29, + "logps_train/chosen": -52.8306770324707, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -70.62107849121094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.44537240266799927, + "rewards_train/margins": 0.9804078936576843, + "rewards_train/rejected": -1.4257802963256836, + "step": 1055 + }, + { + "epoch": 0.3, + "learning_rate": 3.541095880435271e-07, + "loss": 0.4846, + "step": 1056 + }, + { + "epoch": 0.3, + "logps_train/chosen": -80.06373596191406, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -97.99337005615234, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7899674773216248, + "rewards_train/margins": 1.5953076481819153, + "rewards_train/rejected": -2.38527512550354, + "step": 1056 + }, + { + "epoch": 0.3, + "logps_train/chosen": -36.4183464050293, + "logps_train/ref_chosen": -31.125, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -55.211761474609375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5305064916610718, + "rewards_train/margins": 0.9630330801010132, + "rewards_train/rejected": -1.493539571762085, + "step": 1057 + }, + { + "epoch": 0.3, + "learning_rate": 3.535465779363186e-07, + "loss": 0.478, + "step": 1058 + }, + { + "epoch": 0.3, + "logps_train/chosen": -71.41215515136719, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -92.251953125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4980020821094513, + "rewards_train/margins": 1.9035124480724335, + "rewards_train/rejected": -2.4015145301818848, + "step": 1058 + }, + { + "epoch": 0.3, + "logps_train/chosen": -119.8857421875, + "logps_train/ref_chosen": -101.5, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -139.56463623046875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8229491710662842, + "rewards_train/margins": 0.7442560195922852, + "rewards_train/rejected": -2.5672051906585693, + "step": 1059 + }, + { + "epoch": 0.3, + "learning_rate": 3.529829332116302e-07, + "loss": 0.4137, + "step": 1060 + }, + { + "epoch": 0.3, + "logps_train/chosen": -90.95877075195312, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -83.42403411865234, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.2107205390930176, + "rewards_train/margins": 1.610588788986206, + "rewards_train/rejected": -2.8213093280792236, + "step": 1060 + }, + { + "epoch": 0.3, + "logps_train/chosen": -58.11225891113281, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -69.92088317871094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2807567119598389, + "rewards_train/margins": 0.35898780822753906, + "rewards_train/rejected": -1.639744520187378, + "step": 1061 + }, + { + "epoch": 0.3, + "learning_rate": 3.524186573239345e-07, + "loss": 0.5618, + "step": 1062 + }, + { + "epoch": 0.3, + "logps_train/chosen": -91.04060363769531, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -110.70835876464844, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.5214428901672363, + "rewards_train/margins": 1.2108182907104492, + "rewards_train/rejected": -2.7322611808776855, + "step": 1062 + }, + { + "epoch": 0.3, + "logps_train/chosen": -110.97198486328125, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -116.05752563476562, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.957244634628296, + "rewards_train/margins": 0.9339449405670166, + "rewards_train/rejected": -2.8911895751953125, + "step": 1063 + }, + { + "epoch": 0.3, + "learning_rate": 3.51853753731572e-07, + "loss": 0.5831, + "step": 1064 + }, + { + "epoch": 0.3, + "logps_train/chosen": -96.94522094726562, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -131.8717498779297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.007804036140442, + "rewards_train/margins": 2.5246838331222534, + "rewards_train/rejected": -3.5324878692626953, + "step": 1064 + }, + { + "epoch": 0.3, + "logps_train/chosen": -37.12311935424805, + "logps_train/ref_chosen": -32.25, + "logps_train/ref_rejected": -42.25, + "logps_train/rejected": -57.761512756347656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4888744354248047, + "rewards_train/margins": 1.0573943853378296, + "rewards_train/rejected": -1.5462688207626343, + "step": 1065 + }, + { + "epoch": 0.3, + "learning_rate": 3.512882258967306e-07, + "loss": 0.3271, + "step": 1066 + }, + { + "epoch": 0.3, + "logps_train/chosen": -87.00187683105469, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -98.45104217529297, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.48651570081710815, + "rewards_train/margins": 1.7613230347633362, + "rewards_train/rejected": -2.2478387355804443, + "step": 1066 + }, + { + "epoch": 0.3, + "logps_train/chosen": -79.45100402832031, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -90.55461120605469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.145100474357605, + "rewards_train/margins": 0.9455169439315796, + "rewards_train/rejected": -2.0906174182891846, + "step": 1067 + }, + { + "epoch": 0.3, + "learning_rate": 3.507220772854238e-07, + "loss": 0.4016, + "step": 1068 + }, + { + "epoch": 0.3, + "logps_train/chosen": -58.23585510253906, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -91.26455688476562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6845228672027588, + "rewards_train/margins": 1.9044318199157715, + "rewards_train/rejected": -2.5889546871185303, + "step": 1068 + }, + { + "epoch": 0.3, + "logps_train/chosen": -54.60529327392578, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -74.64722442626953, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.41863471269607544, + "rewards_train/margins": 1.590228259563446, + "rewards_train/rejected": -2.0088629722595215, + "step": 1069 + }, + { + "epoch": 0.3, + "learning_rate": 3.501553113674699e-07, + "loss": 0.4185, + "step": 1070 + }, + { + "epoch": 0.3, + "logps_train/chosen": -53.26490783691406, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -67.52047729492188, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5455335378646851, + "rewards_train/margins": 1.1348342895507812, + "rewards_train/rejected": -1.6803678274154663, + "step": 1070 + }, + { + "epoch": 0.3, + "logps_train/chosen": -88.54481506347656, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -113.20479583740234, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.270106315612793, + "rewards_train/margins": 1.7519359588623047, + "rewards_train/rejected": -3.0220422744750977, + "step": 1071 + }, + { + "epoch": 0.3, + "learning_rate": 3.495879316164705e-07, + "loss": 0.3781, + "step": 1072 + }, + { + "epoch": 0.3, + "logps_train/chosen": -79.0526351928711, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -102.6068115234375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3712785243988037, + "rewards_train/margins": 1.157273530960083, + "rewards_train/rejected": -2.5285520553588867, + "step": 1072 + }, + { + "epoch": 0.3, + "logps_train/chosen": -111.29562377929688, + "logps_train/ref_chosen": -92.5, + "logps_train/ref_rejected": -107.5, + "logps_train/rejected": -142.13092041015625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8853240013122559, + "rewards_train/margins": 1.5762062072753906, + "rewards_train/rejected": -3.4615302085876465, + "step": 1073 + }, + { + "epoch": 0.3, + "learning_rate": 3.490199415097892e-07, + "loss": 0.374, + "step": 1074 + }, + { + "epoch": 0.3, + "logps_train/chosen": -61.015018463134766, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -65.94660949707031, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.20950940251350403, + "rewards_train/margins": 0.4718700349330902, + "rewards_train/rejected": -0.6813794374465942, + "step": 1074 + }, + { + "epoch": 0.3, + "logps_train/chosen": -91.4825439453125, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -90.24166107177734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9312621355056763, + "rewards_train/margins": 0.6930994987487793, + "rewards_train/rejected": -1.6243616342544556, + "step": 1075 + }, + { + "epoch": 0.3, + "learning_rate": 3.4845134452853054e-07, + "loss": 0.5903, + "step": 1076 + }, + { + "epoch": 0.3, + "logps_train/chosen": -51.69927978515625, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -82.89276885986328, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5726622343063354, + "rewards_train/margins": 1.9298962354660034, + "rewards_train/rejected": -2.502558469772339, + "step": 1076 + }, + { + "epoch": 0.3, + "logps_train/chosen": -52.848289489746094, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -66.4503402709961, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0582666397094727, + "rewards_train/margins": 0.9441893100738525, + "rewards_train/rejected": -2.002455949783325, + "step": 1077 + }, + { + "epoch": 0.3, + "learning_rate": 3.4788214415751823e-07, + "loss": 0.4098, + "step": 1078 + }, + { + "epoch": 0.3, + "logps_train/chosen": -45.74258804321289, + "logps_train/ref_chosen": -40.25, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -58.0853271484375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5560947060585022, + "rewards_train/margins": 0.8438445925712585, + "rewards_train/rejected": -1.3999392986297607, + "step": 1078 + }, + { + "epoch": 0.3, + "logps_train/chosen": -22.18498992919922, + "logps_train/ref_chosen": -17.125, + "logps_train/ref_rejected": -33.75, + "logps_train/rejected": -47.595497131347656, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.5052910447120667, + "rewards_train/margins": 0.8816022276878357, + "rewards_train/rejected": -1.3868932723999023, + "step": 1079 + }, + { + "epoch": 0.3, + "learning_rate": 3.4731234388527424e-07, + "loss": 0.647, + "step": 1080 + }, + { + "epoch": 0.3, + "logps_train/chosen": -48.87290573120117, + "logps_train/ref_chosen": -46.5, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -70.92533111572266, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2337748259305954, + "rewards_train/margins": 1.3786800652742386, + "rewards_train/rejected": -1.612454891204834, + "step": 1080 + }, + { + "epoch": 0.3, + "logps_train/chosen": -66.94660186767578, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -98.40412902832031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5646798014640808, + "rewards_train/margins": 1.3968260884284973, + "rewards_train/rejected": -1.9615058898925781, + "step": 1081 + }, + { + "epoch": 0.3, + "learning_rate": 3.4674194720399715e-07, + "loss": 0.4132, + "step": 1082 + }, + { + "epoch": 0.3, + "logps_train/chosen": -45.16423797607422, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -42.25, + "logps_train/rejected": -52.1833610534668, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.36388489603996277, + "rewards_train/margins": 0.6267167627811432, + "rewards_train/rejected": -0.990601658821106, + "step": 1082 + }, + { + "epoch": 0.3, + "logps_train/chosen": -36.05809783935547, + "logps_train/ref_chosen": -32.75, + "logps_train/ref_rejected": -36.0, + "logps_train/rejected": -50.100059509277344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3359364867210388, + "rewards_train/margins": 1.0795872807502747, + "rewards_train/rejected": -1.4155237674713135, + "step": 1083 + }, + { + "epoch": 0.3, + "learning_rate": 3.4617095760954086e-07, + "loss": 0.52, + "step": 1084 + }, + { + "epoch": 0.3, + "logps_train/chosen": -94.58909606933594, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -102.871337890625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9440658688545227, + "rewards_train/margins": 1.8233415484428406, + "rewards_train/rejected": -2.7674074172973633, + "step": 1084 + }, + { + "epoch": 0.3, + "logps_train/chosen": -67.44469451904297, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -96.22750091552734, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1706414222717285, + "rewards_train/margins": 1.7423431873321533, + "rewards_train/rejected": -2.912984609603882, + "step": 1085 + }, + { + "epoch": 0.3, + "learning_rate": 3.45599378601393e-07, + "loss": 0.3278, + "step": 1086 + }, + { + "epoch": 0.3, + "logps_train/chosen": -87.40137481689453, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -99.51847839355469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4643561840057373, + "rewards_train/margins": 0.538663387298584, + "rewards_train/rejected": -1.0030195713043213, + "step": 1086 + }, + { + "epoch": 0.3, + "logps_train/chosen": -61.284934997558594, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -39.5, + "logps_train/rejected": -57.48148727416992, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.3559348583221436, + "rewards_train/margins": 0.4422140121459961, + "rewards_train/rejected": -1.7981488704681396, + "step": 1087 + }, + { + "epoch": 0.3, + "learning_rate": 3.4502721368265367e-07, + "loss": 0.6218, + "step": 1088 + }, + { + "epoch": 0.3, + "logps_train/chosen": -73.07564544677734, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -105.82978057861328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7022911310195923, + "rewards_train/margins": 2.396312117576599, + "rewards_train/rejected": -3.0986032485961914, + "step": 1088 + }, + { + "epoch": 0.3, + "logps_train/chosen": -75.86065673828125, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -64.31005096435547, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.2260072231292725, + "rewards_train/margins": 0.46945154666900635, + "rewards_train/rejected": -1.6954587697982788, + "step": 1089 + }, + { + "epoch": 0.3, + "learning_rate": 3.444544663600141e-07, + "loss": 0.5199, + "step": 1090 + }, + { + "epoch": 0.3, + "logps_train/chosen": -15.774518013000488, + "logps_train/ref_chosen": -12.4375, + "logps_train/ref_rejected": -11.25, + "logps_train/rejected": -17.932443618774414, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.33580145239830017, + "rewards_train/margins": 0.3344937264919281, + "rewards_train/rejected": -0.6702951788902283, + "step": 1090 + }, + { + "epoch": 0.3, + "logps_train/chosen": -60.13362121582031, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -63.003387451171875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9002760648727417, + "rewards_train/margins": 0.30728912353515625, + "rewards_train/rejected": -1.207565188407898, + "step": 1091 + }, + { + "epoch": 0.31, + "learning_rate": 3.438811401437346e-07, + "loss": 0.5702, + "step": 1092 + }, + { + "epoch": 0.31, + "logps_train/chosen": -58.83032989501953, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -88.28053283691406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6442146897315979, + "rewards_train/margins": 1.63247150182724, + "rewards_train/rejected": -2.276686191558838, + "step": 1092 + }, + { + "epoch": 0.31, + "logps_train/chosen": -54.11398696899414, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -79.3961181640625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8170624375343323, + "rewards_train/margins": 1.267861783504486, + "rewards_train/rejected": -2.0849242210388184, + "step": 1093 + }, + { + "epoch": 0.31, + "learning_rate": 3.4330723854762364e-07, + "loss": 0.3817, + "step": 1094 + }, + { + "epoch": 0.31, + "logps_train/chosen": -36.65195083618164, + "logps_train/ref_chosen": -30.375, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -48.813690185546875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6269139647483826, + "rewards_train/margins": 0.6720332503318787, + "rewards_train/rejected": -1.2989472150802612, + "step": 1094 + }, + { + "epoch": 0.31, + "logps_train/chosen": -116.93565368652344, + "logps_train/ref_chosen": -96.5, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -140.33056640625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.0654401779174805, + "rewards_train/margins": 2.1191797256469727, + "rewards_train/rejected": -4.184619903564453, + "step": 1095 + }, + { + "epoch": 0.31, + "learning_rate": 3.4273276508901615e-07, + "loss": 0.3646, + "step": 1096 + }, + { + "epoch": 0.31, + "logps_train/chosen": -44.57806396484375, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -64.02806091308594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.40770870447158813, + "rewards_train/margins": 1.463066279888153, + "rewards_train/rejected": -1.8707749843597412, + "step": 1096 + }, + { + "epoch": 0.31, + "logps_train/chosen": -49.8616943359375, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -55.13411331176758, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2953982651233673, + "rewards_train/margins": 0.9715287387371063, + "rewards_train/rejected": -1.2669270038604736, + "step": 1097 + }, + { + "epoch": 0.31, + "learning_rate": 3.4215772328875177e-07, + "loss": 0.3748, + "step": 1098 + }, + { + "epoch": 0.31, + "logps_train/chosen": -78.97357177734375, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -92.58221435546875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0360286235809326, + "rewards_train/margins": 1.2143800258636475, + "rewards_train/rejected": -2.25040864944458, + "step": 1098 + }, + { + "epoch": 0.31, + "logps_train/chosen": -55.08749008178711, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -69.11585998535156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5506435036659241, + "rewards_train/margins": 1.0691455006599426, + "rewards_train/rejected": -1.6197890043258667, + "step": 1099 + }, + { + "epoch": 0.31, + "learning_rate": 3.4158211667115335e-07, + "loss": 0.4407, + "step": 1100 + }, + { + "epoch": 0.31, + "logps_train/chosen": -120.96487426757812, + "logps_train/ref_chosen": -103.5, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -122.01763916015625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.7710965871810913, + "rewards_train/margins": 0.7810579538345337, + "rewards_train/rejected": -2.552154541015625, + "step": 1100 + }, + { + "epoch": 0.31, + "logps_train/chosen": -61.33692169189453, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -65.42922973632812, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.817481279373169, + "rewards_train/margins": 0.36567628383636475, + "rewards_train/rejected": -1.1831575632095337, + "step": 1101 + }, + { + "epoch": 0.31, + "learning_rate": 3.4100594876400543e-07, + "loss": 0.569, + "step": 1102 + }, + { + "epoch": 0.31, + "logps_train/chosen": -45.08638381958008, + "logps_train/ref_chosen": -33.5, + "logps_train/ref_rejected": -34.0, + "logps_train/rejected": -46.0760383605957, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1685993671417236, + "rewards_train/margins": 0.03919970989227295, + "rewards_train/rejected": -1.2077990770339966, + "step": 1102 + }, + { + "epoch": 0.31, + "logps_train/chosen": -121.77224731445312, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -131.49594116210938, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -3.256521463394165, + "rewards_train/margins": 0.7868220806121826, + "rewards_train/rejected": -4.043343544006348, + "step": 1103 + }, + { + "epoch": 0.31, + "learning_rate": 3.404292230985327e-07, + "loss": 0.6479, + "step": 1104 + }, + { + "epoch": 0.31, + "logps_train/chosen": -39.853607177734375, + "logps_train/ref_chosen": -38.25, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -56.08747863769531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.16036087274551392, + "rewards_train/margins": 1.1894025206565857, + "rewards_train/rejected": -1.3497633934020996, + "step": 1104 + }, + { + "epoch": 0.31, + "logps_train/chosen": -68.73233032226562, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -77.93067932128906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8608310222625732, + "rewards_train/margins": 1.051377534866333, + "rewards_train/rejected": -1.9122085571289062, + "step": 1105 + }, + { + "epoch": 0.31, + "learning_rate": 3.3985194320937815e-07, + "loss": 0.3638, + "step": 1106 + }, + { + "epoch": 0.31, + "logps_train/chosen": -73.71665954589844, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -83.02525329589844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.192368507385254, + "rewards_train/margins": 0.8316411972045898, + "rewards_train/rejected": -2.0240097045898438, + "step": 1106 + }, + { + "epoch": 0.31, + "logps_train/chosen": -69.09898376464844, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -78.67167663574219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1285510063171387, + "rewards_train/margins": 0.9606866836547852, + "rewards_train/rejected": -2.089237689971924, + "step": 1107 + }, + { + "epoch": 0.31, + "learning_rate": 3.3927411263458166e-07, + "loss": 0.4816, + "step": 1108 + }, + { + "epoch": 0.31, + "logps_train/chosen": -68.6715316772461, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -103.76207733154297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7993796467781067, + "rewards_train/margins": 1.4291722178459167, + "rewards_train/rejected": -2.2285518646240234, + "step": 1108 + }, + { + "epoch": 0.31, + "logps_train/chosen": -79.95457458496094, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -68.17952728271484, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.091942548751831, + "rewards_train/margins": 0.08147883415222168, + "rewards_train/rejected": -1.1734213829040527, + "step": 1109 + }, + { + "epoch": 0.31, + "learning_rate": 3.386957349155578e-07, + "loss": 0.4955, + "step": 1110 + }, + { + "epoch": 0.31, + "logps_train/chosen": -57.168636322021484, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -70.13655853271484, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0338555574417114, + "rewards_train/margins": 1.1188629865646362, + "rewards_train/rejected": -2.1527185440063477, + "step": 1110 + }, + { + "epoch": 0.31, + "logps_train/chosen": -56.17914581298828, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -73.0317611694336, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1761176586151123, + "rewards_train/margins": 1.321882963180542, + "rewards_train/rejected": -2.4980006217956543, + "step": 1111 + }, + { + "epoch": 0.31, + "learning_rate": 3.381168135970749e-07, + "loss": 0.4141, + "step": 1112 + }, + { + "epoch": 0.31, + "logps_train/chosen": -41.319061279296875, + "logps_train/ref_chosen": -37.0, + "logps_train/ref_rejected": -28.5, + "logps_train/rejected": -36.65044021606445, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4227263927459717, + "rewards_train/margins": 0.3862628936767578, + "rewards_train/rejected": -0.8089892864227295, + "step": 1112 + }, + { + "epoch": 0.31, + "logps_train/chosen": -88.0911865234375, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -92.42652130126953, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.0791380405426025, + "rewards_train/margins": 0.5197646617889404, + "rewards_train/rejected": -2.598902702331543, + "step": 1113 + }, + { + "epoch": 0.31, + "learning_rate": 3.375373522272326e-07, + "loss": 0.5766, + "step": 1114 + }, + { + "epoch": 0.31, + "logps_train/chosen": -74.1852035522461, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -80.33769226074219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6786766052246094, + "rewards_train/margins": 0.35704636573791504, + "rewards_train/rejected": -2.0357229709625244, + "step": 1114 + }, + { + "epoch": 0.31, + "logps_train/chosen": -63.75776672363281, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -92.407958984375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7430132031440735, + "rewards_train/margins": 2.0325481295585632, + "rewards_train/rejected": -2.7755613327026367, + "step": 1115 + }, + { + "epoch": 0.31, + "learning_rate": 3.3695735435744055e-07, + "loss": 0.5116, + "step": 1116 + }, + { + "epoch": 0.31, + "logps_train/chosen": -36.19145202636719, + "logps_train/ref_chosen": -31.375, + "logps_train/ref_rejected": -31.875, + "logps_train/rejected": -43.52620315551758, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.48542898893356323, + "rewards_train/margins": 0.6796916127204895, + "rewards_train/rejected": -1.1651206016540527, + "step": 1116 + }, + { + "epoch": 0.31, + "logps_train/chosen": -56.11375427246094, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -63.73368835449219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6451154947280884, + "rewards_train/margins": 0.7055970430374146, + "rewards_train/rejected": -1.350712537765503, + "step": 1117 + }, + { + "epoch": 0.31, + "learning_rate": 3.3637682354239656e-07, + "loss": 0.5013, + "step": 1118 + }, + { + "epoch": 0.31, + "logps_train/chosen": -89.7490005493164, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -87.31269836425781, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.10009503364563, + "rewards_train/margins": 0.40578365325927734, + "rewards_train/rejected": -2.5058786869049072, + "step": 1118 + }, + { + "epoch": 0.31, + "logps_train/chosen": -65.16108703613281, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -85.17172241210938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.46166563034057617, + "rewards_train/margins": 1.0532604455947876, + "rewards_train/rejected": -1.5149260759353638, + "step": 1119 + }, + { + "epoch": 0.31, + "learning_rate": 3.357957633400645e-07, + "loss": 0.5347, + "step": 1120 + }, + { + "epoch": 0.31, + "logps_train/chosen": -45.67375564575195, + "logps_train/ref_chosen": -37.75, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -72.63983154296875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7869070172309875, + "rewards_train/margins": 1.2682868838310242, + "rewards_train/rejected": -2.0551939010620117, + "step": 1120 + }, + { + "epoch": 0.31, + "logps_train/chosen": -69.65164184570312, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -82.64480590820312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7079370021820068, + "rewards_train/margins": 1.0424809455871582, + "rewards_train/rejected": -2.750417947769165, + "step": 1121 + }, + { + "epoch": 0.31, + "learning_rate": 3.3521417731165323e-07, + "loss": 0.4252, + "step": 1122 + }, + { + "epoch": 0.31, + "logps_train/chosen": -43.70858383178711, + "logps_train/ref_chosen": -35.25, + "logps_train/ref_rejected": -46.25, + "logps_train/rejected": -57.502525329589844, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.8567959070205688, + "rewards_train/margins": 0.27470648288726807, + "rewards_train/rejected": -1.131502389907837, + "step": 1122 + }, + { + "epoch": 0.31, + "logps_train/chosen": -52.49120330810547, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -70.92811584472656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5112298727035522, + "rewards_train/margins": 0.816542387008667, + "rewards_train/rejected": -1.3277722597122192, + "step": 1123 + }, + { + "epoch": 0.31, + "learning_rate": 3.346320690215939e-07, + "loss": 0.5869, + "step": 1124 + }, + { + "epoch": 0.31, + "logps_train/chosen": -85.27842712402344, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -90.1741943359375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5864367485046387, + "rewards_train/margins": 0.9448494911193848, + "rewards_train/rejected": -1.5312862396240234, + "step": 1124 + }, + { + "epoch": 0.31, + "logps_train/chosen": -88.07566833496094, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -105.86996459960938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7584952116012573, + "rewards_train/margins": 1.5497905015945435, + "rewards_train/rejected": -3.308285713195801, + "step": 1125 + }, + { + "epoch": 0.31, + "learning_rate": 3.3404944203751847e-07, + "loss": 0.4124, + "step": 1126 + }, + { + "epoch": 0.31, + "logps_train/chosen": -73.59733581542969, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -103.78475189208984, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8066086769104004, + "rewards_train/margins": 1.584366798400879, + "rewards_train/rejected": -2.3909754753112793, + "step": 1126 + }, + { + "epoch": 0.31, + "logps_train/chosen": -44.5773811340332, + "logps_train/ref_chosen": -38.75, + "logps_train/ref_rejected": -20.875, + "logps_train/rejected": -29.997364044189453, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.5758774280548096, + "rewards_train/margins": 0.330987811088562, + "rewards_train/rejected": -0.9068652391433716, + "step": 1127 + }, + { + "epoch": 0.32, + "learning_rate": 3.334662999302382e-07, + "loss": 0.4612, + "step": 1128 + }, + { + "epoch": 0.32, + "logps_train/chosen": -69.8879165649414, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -88.37731170654297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7446507215499878, + "rewards_train/margins": 1.3508929014205933, + "rewards_train/rejected": -2.095543622970581, + "step": 1128 + }, + { + "epoch": 0.32, + "logps_train/chosen": -105.51724243164062, + "logps_train/ref_chosen": -99.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -122.52033996582031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6696932911872864, + "rewards_train/margins": 1.9749180674552917, + "rewards_train/rejected": -2.644611358642578, + "step": 1129 + }, + { + "epoch": 0.32, + "learning_rate": 3.3288264627372115e-07, + "loss": 0.3273, + "step": 1130 + }, + { + "epoch": 0.32, + "logps_train/chosen": -86.48792266845703, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -88.79074096679688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.600745439529419, + "rewards_train/margins": 0.9377037286758423, + "rewards_train/rejected": -1.5384491682052612, + "step": 1130 + }, + { + "epoch": 0.32, + "logps_train/chosen": -47.698570251464844, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -46.25, + "logps_train/rejected": -61.304847717285156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.7073571085929871, + "rewards_train/margins": 0.8093579411506653, + "rewards_train/rejected": -1.5167150497436523, + "step": 1131 + }, + { + "epoch": 0.32, + "learning_rate": 3.322984846450708e-07, + "loss": 0.5058, + "step": 1132 + }, + { + "epoch": 0.32, + "logps_train/chosen": -69.10832214355469, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -92.10992431640625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4506756067276, + "rewards_train/margins": 1.310707449913025, + "rewards_train/rejected": -2.761383056640625, + "step": 1132 + }, + { + "epoch": 0.32, + "logps_train/chosen": -95.66616821289062, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -110.3429183959961, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0639801025390625, + "rewards_train/margins": 2.1142570972442627, + "rewards_train/rejected": -3.178237199783325, + "step": 1133 + }, + { + "epoch": 0.32, + "learning_rate": 3.3171381862450366e-07, + "loss": 0.3162, + "step": 1134 + }, + { + "epoch": 0.32, + "logps_train/chosen": -74.5470962524414, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -76.4083251953125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.7293190360069275, + "rewards_train/margins": 0.8237200379371643, + "rewards_train/rejected": -1.5530390739440918, + "step": 1134 + }, + { + "epoch": 0.32, + "logps_train/chosen": -57.64684295654297, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -81.79562377929688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.486656665802002, + "rewards_train/margins": 0.7013044357299805, + "rewards_train/rejected": -2.1879611015319824, + "step": 1135 + }, + { + "epoch": 0.32, + "learning_rate": 3.311286517953278e-07, + "loss": 0.5588, + "step": 1136 + }, + { + "epoch": 0.32, + "logps_train/chosen": -62.12207794189453, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -74.61312103271484, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.278662919998169, + "rewards_train/margins": 0.4844069480895996, + "rewards_train/rejected": -1.7630698680877686, + "step": 1136 + }, + { + "epoch": 0.32, + "logps_train/chosen": -60.08687210083008, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -64.04116821289062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.39862850308418274, + "rewards_train/margins": 1.336737722158432, + "rewards_train/rejected": -1.7353662252426147, + "step": 1137 + }, + { + "epoch": 0.32, + "learning_rate": 3.305429877439205e-07, + "loss": 0.4804, + "step": 1138 + }, + { + "epoch": 0.32, + "logps_train/chosen": -46.473594665527344, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -60.557891845703125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.447359561920166, + "rewards_train/margins": 0.47600793838500977, + "rewards_train/rejected": -0.9233675003051758, + "step": 1138 + }, + { + "epoch": 0.32, + "logps_train/chosen": -72.01480865478516, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -89.46124267578125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5598795413970947, + "rewards_train/margins": 0.8055808544158936, + "rewards_train/rejected": -1.3654603958129883, + "step": 1139 + }, + { + "epoch": 0.32, + "learning_rate": 3.2995683005970636e-07, + "loss": 0.4624, + "step": 1140 + }, + { + "epoch": 0.32, + "logps_train/chosen": -76.2835922241211, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -91.91067504882812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.303091049194336, + "rewards_train/margins": 0.8092658519744873, + "rewards_train/rejected": -3.1123569011688232, + "step": 1140 + }, + { + "epoch": 0.32, + "logps_train/chosen": -53.02901840209961, + "logps_train/ref_chosen": -41.75, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -81.51502990722656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1332728862762451, + "rewards_train/margins": 1.7668633460998535, + "rewards_train/rejected": -2.9001362323760986, + "step": 1141 + }, + { + "epoch": 0.32, + "learning_rate": 3.2937018233513564e-07, + "loss": 0.4394, + "step": 1142 + }, + { + "epoch": 0.32, + "logps_train/chosen": -78.84823608398438, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -81.2984619140625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.217635989189148, + "rewards_train/margins": 0.7903352975845337, + "rewards_train/rejected": -2.0079712867736816, + "step": 1142 + }, + { + "epoch": 0.32, + "logps_train/chosen": -74.09119415283203, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -89.66231536865234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.596228837966919, + "rewards_train/margins": 0.7967607975006104, + "rewards_train/rejected": -1.3929896354675293, + "step": 1143 + }, + { + "epoch": 0.32, + "learning_rate": 3.287830481656616e-07, + "loss": 0.4635, + "step": 1144 + }, + { + "epoch": 0.32, + "logps_train/chosen": -69.08924865722656, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -97.29635620117188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5905653238296509, + "rewards_train/margins": 1.4461008310317993, + "rewards_train/rejected": -2.03666615486145, + "step": 1144 + }, + { + "epoch": 0.32, + "logps_train/chosen": -48.96424865722656, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -59.011512756347656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.09945185482501984, + "rewards_train/margins": 1.0329498499631882, + "rewards_train/rejected": -1.132401704788208, + "step": 1145 + }, + { + "epoch": 0.32, + "learning_rate": 3.281954311497192e-07, + "loss": 0.3515, + "step": 1146 + }, + { + "epoch": 0.32, + "logps_train/chosen": -54.37567138671875, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -31.625, + "logps_train/rejected": -41.38093566894531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9230404496192932, + "rewards_train/margins": 0.051674067974090576, + "rewards_train/rejected": -0.9747145175933838, + "step": 1146 + }, + { + "epoch": 0.32, + "logps_train/chosen": -32.55580139160156, + "logps_train/ref_chosen": -30.125, + "logps_train/ref_rejected": -27.75, + "logps_train/rejected": -35.703125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.23829485476016998, + "rewards_train/margins": 0.5552596896886826, + "rewards_train/rejected": -0.7935545444488525, + "step": 1147 + }, + { + "epoch": 0.32, + "learning_rate": 3.276073348887024e-07, + "loss": 0.6186, + "step": 1148 + }, + { + "epoch": 0.32, + "logps_train/chosen": -47.1552734375, + "logps_train/ref_chosen": -35.25, + "logps_train/ref_rejected": -37.75, + "logps_train/rejected": -50.57854461669922, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1946287155151367, + "rewards_train/margins": 0.08939743041992188, + "rewards_train/rejected": -1.2840261459350586, + "step": 1148 + }, + { + "epoch": 0.32, + "logps_train/chosen": -100.0604019165039, + "logps_train/ref_chosen": -86.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -114.34955596923828, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4126805067062378, + "rewards_train/margins": 1.0964940786361694, + "rewards_train/rejected": -2.5091745853424072, + "step": 1149 + }, + { + "epoch": 0.32, + "learning_rate": 3.2701876298694244e-07, + "loss": 0.6195, + "step": 1150 + }, + { + "epoch": 0.32, + "logps_train/chosen": -58.04315948486328, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -77.4443588256836, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.484394371509552, + "rewards_train/margins": 2.067854344844818, + "rewards_train/rejected": -2.55224871635437, + "step": 1150 + }, + { + "epoch": 0.32, + "logps_train/chosen": -87.42581939697266, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -109.06840515136719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.020316243171692, + "rewards_train/margins": 1.6830090284347534, + "rewards_train/rejected": -2.7033252716064453, + "step": 1151 + }, + { + "epoch": 0.32, + "learning_rate": 3.2642971905168566e-07, + "loss": 0.303, + "step": 1152 + }, + { + "epoch": 0.32, + "logps_train/chosen": -41.9207763671875, + "logps_train/ref_chosen": -35.0, + "logps_train/ref_rejected": -30.5, + "logps_train/rejected": -42.09564208984375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6830934882164001, + "rewards_train/margins": 0.47998613119125366, + "rewards_train/rejected": -1.1630796194076538, + "step": 1152 + }, + { + "epoch": 0.32, + "logps_train/chosen": -71.35394287109375, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -76.39705657958984, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6361757516860962, + "rewards_train/margins": 0.7066670656204224, + "rewards_train/rejected": -2.3428428173065186, + "step": 1153 + }, + { + "epoch": 0.32, + "learning_rate": 3.2584020669307144e-07, + "loss": 0.5285, + "step": 1154 + }, + { + "epoch": 0.32, + "logps_train/chosen": -71.41012573242188, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -80.9860610961914, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4444795846939087, + "rewards_train/margins": 1.4273685216903687, + "rewards_train/rejected": -2.8718481063842773, + "step": 1154 + }, + { + "epoch": 0.32, + "logps_train/chosen": -82.23304748535156, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -105.46199035644531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3001964092254639, + "rewards_train/margins": 2.2279725074768066, + "rewards_train/rejected": -3.5281689167022705, + "step": 1155 + }, + { + "epoch": 0.32, + "learning_rate": 3.252502295241101e-07, + "loss": 0.3612, + "step": 1156 + }, + { + "epoch": 0.32, + "logps_train/chosen": -94.26513671875, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -103.13916015625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5120606422424316, + "rewards_train/margins": 1.803417682647705, + "rewards_train/rejected": -3.3154783248901367, + "step": 1156 + }, + { + "epoch": 0.32, + "logps_train/chosen": -42.25543975830078, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -30.375, + "logps_train/rejected": -43.421539306640625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8304266929626465, + "rewards_train/margins": 0.471493124961853, + "rewards_train/rejected": -1.3019198179244995, + "step": 1157 + }, + { + "epoch": 0.32, + "learning_rate": 3.2465979116066053e-07, + "loss": 0.4138, + "step": 1158 + }, + { + "epoch": 0.32, + "logps_train/chosen": -36.60765838623047, + "logps_train/ref_chosen": -32.5, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -61.4089241027832, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.41408610343933105, + "rewards_train/margins": 0.7150387763977051, + "rewards_train/rejected": -1.1291248798370361, + "step": 1158 + }, + { + "epoch": 0.32, + "logps_train/chosen": -88.83758544921875, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -126.98834991455078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3618834018707275, + "rewards_train/margins": 2.334998846054077, + "rewards_train/rejected": -3.6968822479248047, + "step": 1159 + }, + { + "epoch": 0.32, + "learning_rate": 3.240688952214085e-07, + "loss": 0.3957, + "step": 1160 + }, + { + "epoch": 0.32, + "logps_train/chosen": -64.8724365234375, + "logps_train/ref_chosen": -56.75, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -83.40242767333984, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8106813430786133, + "rewards_train/margins": 1.5135459899902344, + "rewards_train/rejected": -2.3242273330688477, + "step": 1160 + }, + { + "epoch": 0.32, + "logps_train/chosen": -89.62482452392578, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -88.18783569335938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8790841102600098, + "rewards_train/margins": 0.5377459526062012, + "rewards_train/rejected": -1.416830062866211, + "step": 1161 + }, + { + "epoch": 0.32, + "learning_rate": 3.2347754532784365e-07, + "loss": 0.487, + "step": 1162 + }, + { + "epoch": 0.32, + "logps_train/chosen": -67.27902221679688, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -81.1677017211914, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7918180227279663, + "rewards_train/margins": 2.1025396585464478, + "rewards_train/rejected": -2.894357681274414, + "step": 1162 + }, + { + "epoch": 0.33, + "logps_train/chosen": -77.49712371826172, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -92.51890563964844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8801814913749695, + "rewards_train/margins": 0.7646780610084534, + "rewards_train/rejected": -1.6448595523834229, + "step": 1163 + }, + { + "epoch": 0.33, + "learning_rate": 3.228857451042384e-07, + "loss": 0.4235, + "step": 1164 + }, + { + "epoch": 0.33, + "logps_train/chosen": -68.49908447265625, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -91.06550598144531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9470763206481934, + "rewards_train/margins": 0.8711934089660645, + "rewards_train/rejected": -1.8182697296142578, + "step": 1164 + }, + { + "epoch": 0.33, + "logps_train/chosen": -62.16388702392578, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -77.70111846923828, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.4075016975402832, + "rewards_train/margins": 1.6556766033172607, + "rewards_train/rejected": -2.063178300857544, + "step": 1165 + }, + { + "epoch": 0.33, + "learning_rate": 3.2229349817762476e-07, + "loss": 0.5344, + "step": 1166 + }, + { + "epoch": 0.33, + "logps_train/chosen": -82.88621520996094, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -99.08432006835938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7245585322380066, + "rewards_train/margins": 1.861997902393341, + "rewards_train/rejected": -2.5865564346313477, + "step": 1166 + }, + { + "epoch": 0.33, + "logps_train/chosen": -84.58604431152344, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -100.53765869140625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.134873628616333, + "rewards_train/margins": 0.7903773784637451, + "rewards_train/rejected": -2.925251007080078, + "step": 1167 + }, + { + "epoch": 0.33, + "learning_rate": 3.2170080817777257e-07, + "loss": 0.3993, + "step": 1168 + }, + { + "epoch": 0.33, + "logps_train/chosen": -78.18352508544922, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -98.81004333496094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6421806812286377, + "rewards_train/margins": 1.1499561071395874, + "rewards_train/rejected": -1.792136788368225, + "step": 1168 + }, + { + "epoch": 0.33, + "logps_train/chosen": -66.99746704101562, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -92.34423828125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8132960796356201, + "rewards_train/margins": 2.4062840938568115, + "rewards_train/rejected": -3.2195801734924316, + "step": 1169 + }, + { + "epoch": 0.33, + "learning_rate": 3.2110767873716736e-07, + "loss": 0.3673, + "step": 1170 + }, + { + "epoch": 0.33, + "logps_train/chosen": -101.83489990234375, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -129.85865783691406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4677672386169434, + "rewards_train/margins": 1.4091145992279053, + "rewards_train/rejected": -2.8768818378448486, + "step": 1170 + }, + { + "epoch": 0.33, + "logps_train/chosen": -53.57844924926758, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -84.144287109375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5094074010848999, + "rewards_train/margins": 1.4049242734909058, + "rewards_train/rejected": -1.9143316745758057, + "step": 1171 + }, + { + "epoch": 0.33, + "learning_rate": 3.205141134909878e-07, + "loss": 0.3803, + "step": 1172 + }, + { + "epoch": 0.33, + "logps_train/chosen": -70.63436889648438, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -75.62820434570312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1985933780670166, + "rewards_train/margins": 0.633758544921875, + "rewards_train/rejected": -1.8323519229888916, + "step": 1172 + }, + { + "epoch": 0.33, + "logps_train/chosen": -82.42655944824219, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -104.75534057617188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2934373617172241, + "rewards_train/margins": 1.3875659704208374, + "rewards_train/rejected": -2.6810033321380615, + "step": 1173 + }, + { + "epoch": 0.33, + "learning_rate": 3.1992011607708346e-07, + "loss": 0.4462, + "step": 1174 + }, + { + "epoch": 0.33, + "logps_train/chosen": -71.51427459716797, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -40.0, + "logps_train/rejected": -51.11572265625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5781850814819336, + "rewards_train/margins": -0.47012877464294434, + "rewards_train/rejected": -1.1080563068389893, + "step": 1174 + }, + { + "epoch": 0.33, + "logps_train/chosen": -104.39380645751953, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -127.56053161621094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2862557172775269, + "rewards_train/margins": 2.9838610887527466, + "rewards_train/rejected": -4.270116806030273, + "step": 1175 + }, + { + "epoch": 0.33, + "learning_rate": 3.193256901359526e-07, + "loss": 0.847, + "step": 1176 + }, + { + "epoch": 0.33, + "logps_train/chosen": -75.4478988647461, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -91.39605712890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.531899333000183, + "rewards_train/margins": 1.1452065706253052, + "rewards_train/rejected": -2.6771059036254883, + "step": 1176 + }, + { + "epoch": 0.33, + "logps_train/chosen": -74.81266784667969, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -87.51103973388672, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.4264819622039795, + "rewards_train/margins": 0.781848669052124, + "rewards_train/rejected": -2.2083306312561035, + "step": 1177 + }, + { + "epoch": 0.33, + "learning_rate": 3.187308393107201e-07, + "loss": 0.4431, + "step": 1178 + }, + { + "epoch": 0.33, + "logps_train/chosen": -82.31769561767578, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -110.784423828125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0415349006652832, + "rewards_train/margins": 1.4191339015960693, + "rewards_train/rejected": -2.4606688022613525, + "step": 1178 + }, + { + "epoch": 0.33, + "logps_train/chosen": -62.3345832824707, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -90.6983642578125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.65494304895401, + "rewards_train/margins": 1.8934091925621033, + "rewards_train/rejected": -2.5483522415161133, + "step": 1179 + }, + { + "epoch": 0.33, + "learning_rate": 3.181355672471144e-07, + "loss": 0.3626, + "step": 1180 + }, + { + "epoch": 0.33, + "logps_train/chosen": -80.82015228271484, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -87.15404510498047, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1870933771133423, + "rewards_train/margins": 0.5337800979614258, + "rewards_train/rejected": -1.720873475074768, + "step": 1180 + }, + { + "epoch": 0.33, + "logps_train/chosen": -69.87452697753906, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -89.21676635742188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5815937519073486, + "rewards_train/margins": 0.7410590648651123, + "rewards_train/rejected": -2.322652816772461, + "step": 1181 + }, + { + "epoch": 0.33, + "learning_rate": 3.1753987759344616e-07, + "loss": 0.5324, + "step": 1182 + }, + { + "epoch": 0.33, + "logps_train/chosen": -92.84492492675781, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -119.51774597167969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9559764862060547, + "rewards_train/margins": 2.623922348022461, + "rewards_train/rejected": -4.579898834228516, + "step": 1182 + }, + { + "epoch": 0.33, + "logps_train/chosen": -62.7845458984375, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -90.22932434082031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3030641078948975, + "rewards_train/margins": 1.1229934692382812, + "rewards_train/rejected": -2.4260575771331787, + "step": 1183 + }, + { + "epoch": 0.33, + "learning_rate": 3.169437740005849e-07, + "loss": 0.3255, + "step": 1184 + }, + { + "epoch": 0.33, + "logps_train/chosen": -112.349365234375, + "logps_train/ref_chosen": -100.5, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -142.35031127929688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2036864757537842, + "rewards_train/margins": 1.8297817707061768, + "rewards_train/rejected": -3.033468246459961, + "step": 1184 + }, + { + "epoch": 0.33, + "logps_train/chosen": -75.7343521118164, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -102.13626098632812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.205039381980896, + "rewards_train/margins": 0.9179619550704956, + "rewards_train/rejected": -2.1230013370513916, + "step": 1185 + }, + { + "epoch": 0.33, + "learning_rate": 3.1634726012193734e-07, + "loss": 0.3849, + "step": 1186 + }, + { + "epoch": 0.33, + "logps_train/chosen": -58.36687088012695, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -67.93152618408203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6109057068824768, + "rewards_train/margins": 1.3588091731071472, + "rewards_train/rejected": -1.969714879989624, + "step": 1186 + }, + { + "epoch": 0.33, + "logps_train/chosen": -64.16983032226562, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -78.98739624023438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0888584852218628, + "rewards_train/margins": 1.0196467638015747, + "rewards_train/rejected": -2.1085052490234375, + "step": 1187 + }, + { + "epoch": 0.33, + "learning_rate": 3.1575033961342477e-07, + "loss": 0.421, + "step": 1188 + }, + { + "epoch": 0.33, + "logps_train/chosen": -89.76327514648438, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -105.0319595336914, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.5091396570205688, + "rewards_train/margins": 1.062806248664856, + "rewards_train/rejected": -2.571945905685425, + "step": 1188 + }, + { + "epoch": 0.33, + "logps_train/chosen": -98.1567153930664, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -91.50418853759766, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.6533666849136353, + "rewards_train/margins": 0.43455207347869873, + "rewards_train/rejected": -2.087918758392334, + "step": 1189 + }, + { + "epoch": 0.33, + "learning_rate": 3.151530161334607e-07, + "loss": 0.5724, + "step": 1190 + }, + { + "epoch": 0.33, + "logps_train/chosen": -109.42497253417969, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -121.7960205078125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9292153120040894, + "rewards_train/margins": 1.0121055841445923, + "rewards_train/rejected": -2.9413208961486816, + "step": 1190 + }, + { + "epoch": 0.33, + "logps_train/chosen": -92.81179809570312, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -98.54808044433594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8897742629051208, + "rewards_train/margins": 1.693745195865631, + "rewards_train/rejected": -2.583519458770752, + "step": 1191 + }, + { + "epoch": 0.33, + "learning_rate": 3.14555293342928e-07, + "loss": 0.3727, + "step": 1192 + }, + { + "epoch": 0.33, + "logps_train/chosen": -69.55101776123047, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -70.6552963256836, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.644848108291626, + "rewards_train/margins": 0.5960235595703125, + "rewards_train/rejected": -2.2408716678619385, + "step": 1192 + }, + { + "epoch": 0.33, + "logps_train/chosen": -50.99018096923828, + "logps_train/ref_chosen": -38.75, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -67.72573852539062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2188911437988281, + "rewards_train/margins": 0.9634487628936768, + "rewards_train/rejected": -2.182339906692505, + "step": 1193 + }, + { + "epoch": 0.33, + "learning_rate": 3.1395717490515736e-07, + "loss": 0.5125, + "step": 1194 + }, + { + "epoch": 0.33, + "logps_train/chosen": -90.19172668457031, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -117.96939086914062, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.4976885318756104, + "rewards_train/margins": 0.965266227722168, + "rewards_train/rejected": -3.4629547595977783, + "step": 1194 + }, + { + "epoch": 0.33, + "logps_train/chosen": -107.63124084472656, + "logps_train/ref_chosen": -97.0, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -132.37387084960938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.058436632156372, + "rewards_train/margins": 2.381587028503418, + "rewards_train/rejected": -3.44002366065979, + "step": 1195 + }, + { + "epoch": 0.33, + "learning_rate": 3.133586644859039e-07, + "loss": 0.4764, + "step": 1196 + }, + { + "epoch": 0.33, + "logps_train/chosen": -45.805442810058594, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -54.42270278930664, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5825948119163513, + "rewards_train/margins": 0.3073318600654602, + "rewards_train/rejected": -0.8899266719818115, + "step": 1196 + }, + { + "epoch": 0.33, + "logps_train/chosen": -86.712158203125, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -88.41496276855469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7137949466705322, + "rewards_train/margins": 1.6230137348175049, + "rewards_train/rejected": -2.336808681488037, + "step": 1197 + }, + { + "epoch": 0.33, + "learning_rate": 3.127597657533255e-07, + "loss": 0.4552, + "step": 1198 + }, + { + "epoch": 0.33, + "logps_train/chosen": -85.20030212402344, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -91.2405776977539, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6713238954544067, + "rewards_train/margins": 0.6649652719497681, + "rewards_train/rejected": -2.336289167404175, + "step": 1198 + }, + { + "epoch": 0.34, + "logps_train/chosen": -52.18776321411133, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -41.75, + "logps_train/rejected": -63.601783752441406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0654374361038208, + "rewards_train/margins": 1.1307576894760132, + "rewards_train/rejected": -2.196195125579834, + "step": 1199 + }, + { + "epoch": 0.34, + "learning_rate": 3.1216048237795945e-07, + "loss": 0.4597, + "step": 1200 + }, + { + "epoch": 0.34, + "logps_train/chosen": -71.60494232177734, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -89.45936584472656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1646933555603027, + "rewards_train/margins": 1.4083914756774902, + "rewards_train/rejected": -2.573084831237793, + "step": 1200 + }, + { + "epoch": 0.34, + "logps_train/chosen": -91.63750457763672, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -94.1178970336914, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.5677721500396729, + "rewards_train/margins": 1.1195242404937744, + "rewards_train/rejected": -2.6872963905334473, + "step": 1201 + }, + { + "epoch": 0.34, + "learning_rate": 3.1156081803270095e-07, + "loss": 0.4558, + "step": 1202 + }, + { + "epoch": 0.34, + "logps_train/chosen": -25.83116912841797, + "logps_train/ref_chosen": -21.375, + "logps_train/ref_rejected": -35.0, + "logps_train/rejected": -43.94371795654297, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4452263116836548, + "rewards_train/margins": 0.4569580554962158, + "rewards_train/rejected": -0.9021843671798706, + "step": 1202 + }, + { + "epoch": 0.34, + "logps_train/chosen": -56.673805236816406, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -60.42707443237305, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5927220582962036, + "rewards_train/margins": 1.0473486185073853, + "rewards_train/rejected": -1.6400706768035889, + "step": 1203 + }, + { + "epoch": 0.34, + "learning_rate": 3.109607763927798e-07, + "loss": 0.4603, + "step": 1204 + }, + { + "epoch": 0.34, + "logps_train/chosen": -59.55609893798828, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -75.75169372558594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.47221139073371887, + "rewards_train/margins": 0.9004187285900116, + "rewards_train/rejected": -1.3726301193237305, + "step": 1204 + }, + { + "epoch": 0.34, + "logps_train/chosen": -71.41329193115234, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -93.61686706542969, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9730676412582397, + "rewards_train/margins": 0.9899858236312866, + "rewards_train/rejected": -1.9630534648895264, + "step": 1205 + }, + { + "epoch": 0.34, + "learning_rate": 3.103603611357381e-07, + "loss": 0.4422, + "step": 1206 + }, + { + "epoch": 0.34, + "logps_train/chosen": -97.32916259765625, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -110.93840789794922, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.1220767498016357, + "rewards_train/margins": 1.1252796649932861, + "rewards_train/rejected": -3.247356414794922, + "step": 1206 + }, + { + "epoch": 0.34, + "logps_train/chosen": -73.59026336669922, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -92.25763702392578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0117605924606323, + "rewards_train/margins": 1.4169319868087769, + "rewards_train/rejected": -2.428692579269409, + "step": 1207 + }, + { + "epoch": 0.34, + "learning_rate": 3.097595759414081e-07, + "loss": 0.377, + "step": 1208 + }, + { + "epoch": 0.34, + "logps_train/chosen": -73.49578857421875, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -89.04098510742188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2222349643707275, + "rewards_train/margins": 1.3064734935760498, + "rewards_train/rejected": -2.5287084579467773, + "step": 1208 + }, + { + "epoch": 0.34, + "logps_train/chosen": -35.669700622558594, + "logps_train/ref_chosen": -32.25, + "logps_train/ref_rejected": -30.25, + "logps_train/rejected": -43.356666564941406, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.35395729541778564, + "rewards_train/margins": 0.9569779634475708, + "rewards_train/rejected": -1.3109352588653564, + "step": 1209 + }, + { + "epoch": 0.34, + "learning_rate": 3.09158424491889e-07, + "loss": 0.44, + "step": 1210 + }, + { + "epoch": 0.34, + "logps_train/chosen": -60.83733367919922, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -101.0129623413086, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6280696392059326, + "rewards_train/margins": 2.691976308822632, + "rewards_train/rejected": -3.3200459480285645, + "step": 1210 + }, + { + "epoch": 0.34, + "logps_train/chosen": -58.323280334472656, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -72.33810424804688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.005464881658554077, + "rewards_train/margins": 2.2822435796260834, + "rewards_train/rejected": -2.2767786979675293, + "step": 1211 + }, + { + "epoch": 0.34, + "learning_rate": 3.08556910471525e-07, + "loss": 0.3123, + "step": 1212 + }, + { + "epoch": 0.34, + "logps_train/chosen": -75.33900451660156, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -118.93092346191406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.710463285446167, + "rewards_train/margins": 1.4701282978057861, + "rewards_train/rejected": -2.180591583251953, + "step": 1212 + }, + { + "epoch": 0.34, + "logps_train/chosen": -86.78569793701172, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -108.21052551269531, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -2.197808027267456, + "rewards_train/margins": 1.1191422939300537, + "rewards_train/rejected": -3.3169503211975098, + "step": 1213 + }, + { + "epoch": 0.34, + "learning_rate": 3.0795503756688205e-07, + "loss": 0.5301, + "step": 1214 + }, + { + "epoch": 0.34, + "logps_train/chosen": -80.62371063232422, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -126.99827575683594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.921746015548706, + "rewards_train/margins": 2.1335508823394775, + "rewards_train/rejected": -4.055296897888184, + "step": 1214 + }, + { + "epoch": 0.34, + "logps_train/chosen": -64.48846435546875, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -103.56227111816406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.049627661705017, + "rewards_train/margins": 2.3608967065811157, + "rewards_train/rejected": -3.410524368286133, + "step": 1215 + }, + { + "epoch": 0.34, + "learning_rate": 3.0735280946672604e-07, + "loss": 0.3228, + "step": 1216 + }, + { + "epoch": 0.34, + "logps_train/chosen": -73.5542984008789, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -61.83903884887695, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2909764051437378, + "rewards_train/margins": -0.19144737720489502, + "rewards_train/rejected": -1.0995290279388428, + "step": 1216 + }, + { + "epoch": 0.34, + "logps_train/chosen": -47.58641815185547, + "logps_train/ref_chosen": -36.75, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -84.713134765625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0947747230529785, + "rewards_train/margins": 1.342163324356079, + "rewards_train/rejected": -2.4369380474090576, + "step": 1217 + }, + { + "epoch": 0.34, + "learning_rate": 3.067502298619996e-07, + "loss": 0.7511, + "step": 1218 + }, + { + "epoch": 0.34, + "logps_train/chosen": -108.0360107421875, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -113.9399185180664, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.850402355194092, + "rewards_train/margins": 0.8303086757659912, + "rewards_train/rejected": -3.680711030960083, + "step": 1218 + }, + { + "epoch": 0.34, + "logps_train/chosen": -94.44078063964844, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -108.80403900146484, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.619859218597412, + "rewards_train/margins": 0.8124978542327881, + "rewards_train/rejected": -2.4323570728302, + "step": 1219 + }, + { + "epoch": 0.34, + "learning_rate": 3.061473024457995e-07, + "loss": 0.4899, + "step": 1220 + }, + { + "epoch": 0.34, + "logps_train/chosen": -83.36814880371094, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -106.5103988647461, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0624502897262573, + "rewards_train/margins": 1.9005528688430786, + "rewards_train/rejected": -2.963003158569336, + "step": 1220 + }, + { + "epoch": 0.34, + "logps_train/chosen": -75.4331283569336, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -80.57693481445312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5780785083770752, + "rewards_train/margins": 0.5919194221496582, + "rewards_train/rejected": -2.1699979305267334, + "step": 1221 + }, + { + "epoch": 0.34, + "learning_rate": 3.0554403091335454e-07, + "loss": 0.416, + "step": 1222 + }, + { + "epoch": 0.34, + "logps_train/chosen": -110.41349792480469, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -126.67073822021484, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8757256269454956, + "rewards_train/margins": 1.1765047311782837, + "rewards_train/rejected": -3.0522303581237793, + "step": 1222 + }, + { + "epoch": 0.34, + "logps_train/chosen": -25.623764038085938, + "logps_train/ref_chosen": -19.25, + "logps_train/ref_rejected": -19.375, + "logps_train/rejected": -29.09286117553711, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6393295526504517, + "rewards_train/margins": 0.3276714086532593, + "rewards_train/rejected": -0.9670009613037109, + "step": 1223 + }, + { + "epoch": 0.34, + "learning_rate": 3.049404189620023e-07, + "loss": 0.4956, + "step": 1224 + }, + { + "epoch": 0.34, + "logps_train/chosen": -79.60652160644531, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -85.83671569824219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3190501928329468, + "rewards_train/margins": 0.8620820045471191, + "rewards_train/rejected": -1.181132197380066, + "step": 1224 + }, + { + "epoch": 0.34, + "logps_train/chosen": -84.14520263671875, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -106.1170425415039, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.40280088782310486, + "rewards_train/margins": 2.290543705224991, + "rewards_train/rejected": -2.6933445930480957, + "step": 1225 + }, + { + "epoch": 0.34, + "learning_rate": 3.043364702911667e-07, + "loss": 0.3709, + "step": 1226 + }, + { + "epoch": 0.34, + "logps_train/chosen": -129.52308654785156, + "logps_train/ref_chosen": -103.0, + "logps_train/ref_rejected": -117.0, + "logps_train/rejected": -174.3406524658203, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.6671528816223145, + "rewards_train/margins": 3.0770678520202637, + "rewards_train/rejected": -5.744220733642578, + "step": 1226 + }, + { + "epoch": 0.34, + "logps_train/chosen": -105.80215454101562, + "logps_train/ref_chosen": -100.5, + "logps_train/ref_rejected": -124.0, + "logps_train/rejected": -161.20736694335938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5141997933387756, + "rewards_train/margins": 3.201849400997162, + "rewards_train/rejected": -3.7160491943359375, + "step": 1227 + }, + { + "epoch": 0.34, + "learning_rate": 3.037321886023356e-07, + "loss": 0.1914, + "step": 1228 + }, + { + "epoch": 0.34, + "logps_train/chosen": -86.39804077148438, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -109.82169342041016, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3898032903671265, + "rewards_train/margins": 0.9011553525924683, + "rewards_train/rejected": -2.2909586429595947, + "step": 1228 + }, + { + "epoch": 0.34, + "logps_train/chosen": -44.62421417236328, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -57.36525344848633, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8378124237060547, + "rewards_train/margins": 0.5795726776123047, + "rewards_train/rejected": -1.4173851013183594, + "step": 1229 + }, + { + "epoch": 0.34, + "learning_rate": 3.0312757759903746e-07, + "loss": 0.4381, + "step": 1230 + }, + { + "epoch": 0.34, + "logps_train/chosen": -85.21878051757812, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -99.37905883789062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9085963368415833, + "rewards_train/margins": 1.5668098330497742, + "rewards_train/rejected": -2.4754061698913574, + "step": 1230 + }, + { + "epoch": 0.34, + "logps_train/chosen": -103.96012878417969, + "logps_train/ref_chosen": -83.5, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -122.29784393310547, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.054802179336548, + "rewards_train/margins": 1.137481927871704, + "rewards_train/rejected": -3.192284107208252, + "step": 1231 + }, + { + "epoch": 0.34, + "learning_rate": 3.0252264098681946e-07, + "loss": 0.3885, + "step": 1232 + }, + { + "epoch": 0.34, + "logps_train/chosen": -63.1392936706543, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -94.64093017578125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0686168670654297, + "rewards_train/margins": 1.7798504829406738, + "rewards_train/rejected": -2.8484673500061035, + "step": 1232 + }, + { + "epoch": 0.34, + "logps_train/chosen": -71.816650390625, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -73.51524353027344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2732181251049042, + "rewards_train/margins": 0.277133971452713, + "rewards_train/rejected": -0.5503520965576172, + "step": 1233 + }, + { + "epoch": 0.34, + "learning_rate": 3.0191738247322415e-07, + "loss": 0.4933, + "step": 1234 + }, + { + "epoch": 0.34, + "logps_train/chosen": -37.84785079956055, + "logps_train/ref_chosen": -32.5, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -54.35480880737305, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5383008718490601, + "rewards_train/margins": 0.6491333246231079, + "rewards_train/rejected": -1.187434196472168, + "step": 1234 + }, + { + "epoch": 0.35, + "logps_train/chosen": -64.3022232055664, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -71.05715942382812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.359714388847351, + "rewards_train/margins": 0.20303237438201904, + "rewards_train/rejected": -1.5627467632293701, + "step": 1235 + }, + { + "epoch": 0.35, + "learning_rate": 3.0131180576776694e-07, + "loss": 0.5652, + "step": 1236 + }, + { + "epoch": 0.35, + "logps_train/chosen": -36.36655807495117, + "logps_train/ref_chosen": -30.5, + "logps_train/ref_rejected": -37.75, + "logps_train/rejected": -48.76054000854492, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5925151109695435, + "rewards_train/margins": 0.5085386037826538, + "rewards_train/rejected": -1.1010537147521973, + "step": 1236 + }, + { + "epoch": 0.35, + "logps_train/chosen": -99.78321075439453, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -120.41878509521484, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8110358715057373, + "rewards_train/margins": 1.3636550903320312, + "rewards_train/rejected": -3.1746909618377686, + "step": 1237 + }, + { + "epoch": 0.35, + "learning_rate": 3.0070591458191354e-07, + "loss": 0.502, + "step": 1238 + }, + { + "epoch": 0.35, + "logps_train/chosen": -22.466035842895508, + "logps_train/ref_chosen": -19.5, + "logps_train/ref_rejected": -30.25, + "logps_train/rejected": -40.89089584350586, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3012911081314087, + "rewards_train/margins": 0.7671929597854614, + "rewards_train/rejected": -1.0684840679168701, + "step": 1238 + }, + { + "epoch": 0.35, + "logps_train/chosen": -60.26660919189453, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -68.46043395996094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3268800675868988, + "rewards_train/margins": 0.9654524624347687, + "rewards_train/rejected": -1.2923325300216675, + "step": 1239 + }, + { + "epoch": 0.35, + "learning_rate": 3.0009971262905686e-07, + "loss": 0.4518, + "step": 1240 + }, + { + "epoch": 0.35, + "logps_train/chosen": -83.158935546875, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -112.37612915039062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1037836074829102, + "rewards_train/margins": 1.5873448848724365, + "rewards_train/rejected": -2.6911284923553467, + "step": 1240 + }, + { + "epoch": 0.35, + "logps_train/chosen": -47.118743896484375, + "logps_train/ref_chosen": -42.5, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -60.689964294433594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.46206945180892944, + "rewards_train/margins": 0.8864193558692932, + "rewards_train/rejected": -1.3484888076782227, + "step": 1241 + }, + { + "epoch": 0.35, + "learning_rate": 2.9949320362449454e-07, + "loss": 0.3206, + "step": 1242 + }, + { + "epoch": 0.35, + "logps_train/chosen": -60.50547790527344, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -45.75, + "logps_train/rejected": -60.14662551879883, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5290637016296387, + "rewards_train/margins": 0.906301736831665, + "rewards_train/rejected": -1.4353654384613037, + "step": 1242 + }, + { + "epoch": 0.35, + "logps_train/chosen": -71.1565933227539, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -85.62747192382812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2133152484893799, + "rewards_train/margins": 1.2104673385620117, + "rewards_train/rejected": -2.4237825870513916, + "step": 1243 + }, + { + "epoch": 0.35, + "learning_rate": 2.988863912854061e-07, + "loss": 0.4171, + "step": 1244 + }, + { + "epoch": 0.35, + "logps_train/chosen": -59.45911407470703, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -68.69921112060547, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7736943364143372, + "rewards_train/margins": 1.1976673007011414, + "rewards_train/rejected": -1.9713616371154785, + "step": 1244 + }, + { + "epoch": 0.35, + "logps_train/chosen": -68.25922393798828, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -72.01715850830078, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1981877088546753, + "rewards_train/margins": 0.9541140794754028, + "rewards_train/rejected": -2.152301788330078, + "step": 1245 + }, + { + "epoch": 0.35, + "learning_rate": 2.982792793308301e-07, + "loss": 0.4688, + "step": 1246 + }, + { + "epoch": 0.35, + "logps_train/chosen": -46.17108917236328, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -81.40778350830078, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5603708028793335, + "rewards_train/margins": 2.0001343488693237, + "rewards_train/rejected": -2.5605051517486572, + "step": 1246 + }, + { + "epoch": 0.35, + "logps_train/chosen": -41.5633544921875, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -26.75, + "logps_train/rejected": -40.036712646484375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3092648684978485, + "rewards_train/margins": 1.0149140655994415, + "rewards_train/rejected": -1.32417893409729, + "step": 1247 + }, + { + "epoch": 0.35, + "learning_rate": 2.976718714816414e-07, + "loss": 0.4138, + "step": 1248 + }, + { + "epoch": 0.35, + "logps_train/chosen": -47.901123046875, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -65.83989715576172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.36174318194389343, + "rewards_train/margins": 1.0100879967212677, + "rewards_train/rejected": -1.3718311786651611, + "step": 1248 + }, + { + "epoch": 0.35, + "logps_train/chosen": -44.63545608520508, + "logps_train/ref_chosen": -37.25, + "logps_train/ref_rejected": -44.75, + "logps_train/rejected": -57.441184997558594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7454789876937866, + "rewards_train/margins": 0.5191473960876465, + "rewards_train/rejected": -1.264626383781433, + "step": 1249 + }, + { + "epoch": 0.35, + "learning_rate": 2.9706417146052835e-07, + "loss": 0.4548, + "step": 1250 + }, + { + "epoch": 0.35, + "logps_train/chosen": -53.336307525634766, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -58.01958465576172, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.036365270614624, + "rewards_train/margins": 0.21832752227783203, + "rewards_train/rejected": -1.254692792892456, + "step": 1250 + }, + { + "epoch": 0.35, + "logps_train/chosen": -85.30303192138672, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -90.73812866210938, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.7932426929473877, + "rewards_train/margins": 0.6220254898071289, + "rewards_train/rejected": -2.4152681827545166, + "step": 1251 + }, + { + "epoch": 0.35, + "learning_rate": 2.9645618299196994e-07, + "loss": 0.626, + "step": 1252 + }, + { + "epoch": 0.35, + "logps_train/chosen": -62.463233947753906, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -37.0, + "logps_train/rejected": -52.76800537109375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1841163635253906, + "rewards_train/margins": 0.384188175201416, + "rewards_train/rejected": -1.5683045387268066, + "step": 1252 + }, + { + "epoch": 0.35, + "logps_train/chosen": -116.64669799804688, + "logps_train/ref_chosen": -107.5, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -141.56182861328125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.920920193195343, + "rewards_train/margins": 2.556355655193329, + "rewards_train/rejected": -3.477275848388672, + "step": 1253 + }, + { + "epoch": 0.35, + "learning_rate": 2.95847909802213e-07, + "loss": 0.5828, + "step": 1254 + }, + { + "epoch": 0.35, + "logps_train/chosen": -94.45301818847656, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -98.36775207519531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9395405054092407, + "rewards_train/margins": 0.41823089122772217, + "rewards_train/rejected": -2.357771396636963, + "step": 1254 + }, + { + "epoch": 0.35, + "logps_train/chosen": -101.71021270751953, + "logps_train/ref_chosen": -83.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -100.24636840820312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8341076374053955, + "rewards_train/margins": 0.9092800617218018, + "rewards_train/rejected": -2.7433876991271973, + "step": 1255 + }, + { + "epoch": 0.35, + "learning_rate": 2.952393556192495e-07, + "loss": 0.5338, + "step": 1256 + }, + { + "epoch": 0.35, + "logps_train/chosen": -49.79737091064453, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -65.52215576171875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8700078129768372, + "rewards_train/margins": 0.5651177763938904, + "rewards_train/rejected": -1.4351255893707275, + "step": 1256 + }, + { + "epoch": 0.35, + "logps_train/chosen": -56.505470275878906, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -73.62117004394531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1196873188018799, + "rewards_train/margins": 0.22035980224609375, + "rewards_train/rejected": -1.3400471210479736, + "step": 1257 + }, + { + "epoch": 0.35, + "learning_rate": 2.946305241727933e-07, + "loss": 0.6657, + "step": 1258 + }, + { + "epoch": 0.35, + "logps_train/chosen": -52.27532958984375, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -40.75, + "logps_train/rejected": -54.501914978027344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8747987151145935, + "rewards_train/margins": 0.5121118426322937, + "rewards_train/rejected": -1.3869105577468872, + "step": 1258 + }, + { + "epoch": 0.35, + "logps_train/chosen": -76.8271255493164, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -123.54518127441406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5729472637176514, + "rewards_train/margins": 2.4991490840911865, + "rewards_train/rejected": -4.072096347808838, + "step": 1259 + }, + { + "epoch": 0.35, + "learning_rate": 2.9402141919425784e-07, + "loss": 0.5543, + "step": 1260 + }, + { + "epoch": 0.35, + "logps_train/chosen": -50.75778579711914, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -72.40275573730469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6005833148956299, + "rewards_train/margins": 1.2896926403045654, + "rewards_train/rejected": -1.8902759552001953, + "step": 1260 + }, + { + "epoch": 0.35, + "logps_train/chosen": -49.02370834350586, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -55.26200866699219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8599880337715149, + "rewards_train/margins": 0.6197280287742615, + "rewards_train/rejected": -1.4797160625457764, + "step": 1261 + }, + { + "epoch": 0.35, + "learning_rate": 2.934120444167326e-07, + "loss": 0.4021, + "step": 1262 + }, + { + "epoch": 0.35, + "logps_train/chosen": -67.32768249511719, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -65.77238464355469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.48794418573379517, + "rewards_train/margins": 0.9049198031425476, + "rewards_train/rejected": -1.3928639888763428, + "step": 1262 + }, + { + "epoch": 0.35, + "logps_train/chosen": -41.7213134765625, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -69.01918029785156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6291625499725342, + "rewards_train/margins": 1.0112313032150269, + "rewards_train/rejected": -1.640393853187561, + "step": 1263 + }, + { + "epoch": 0.35, + "learning_rate": 2.928024035749611e-07, + "loss": 0.4033, + "step": 1264 + }, + { + "epoch": 0.35, + "logps_train/chosen": -63.10002136230469, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -77.05767059326172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8517013788223267, + "rewards_train/margins": 1.2935189008712769, + "rewards_train/rejected": -2.1452202796936035, + "step": 1264 + }, + { + "epoch": 0.35, + "logps_train/chosen": -64.24785614013672, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -87.11654663085938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6533015370368958, + "rewards_train/margins": 1.6905805468559265, + "rewards_train/rejected": -2.3438820838928223, + "step": 1265 + }, + { + "epoch": 0.35, + "learning_rate": 2.9219250040531716e-07, + "loss": 0.3183, + "step": 1266 + }, + { + "epoch": 0.35, + "logps_train/chosen": -65.76615905761719, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -89.967041015625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6911664605140686, + "rewards_train/margins": 1.2617883086204529, + "rewards_train/rejected": -1.9529547691345215, + "step": 1266 + }, + { + "epoch": 0.35, + "logps_train/chosen": -94.05767822265625, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -99.5, + "logps_train/rejected": -140.08242797851562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8002994060516357, + "rewards_train/margins": 2.2704427242279053, + "rewards_train/rejected": -4.070742130279541, + "step": 1267 + }, + { + "epoch": 0.35, + "learning_rate": 2.915823386457825e-07, + "loss": 0.4391, + "step": 1268 + }, + { + "epoch": 0.35, + "logps_train/chosen": -69.37425231933594, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -90.4566421508789, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7557848691940308, + "rewards_train/margins": 1.8482776880264282, + "rewards_train/rejected": -2.604062557220459, + "step": 1268 + }, + { + "epoch": 0.35, + "logps_train/chosen": -73.65614318847656, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -88.40058135986328, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0031148195266724, + "rewards_train/margins": 2.053739905357361, + "rewards_train/rejected": -3.056854724884033, + "step": 1269 + }, + { + "epoch": 0.35, + "learning_rate": 2.9097192203592373e-07, + "loss": 0.4572, + "step": 1270 + }, + { + "epoch": 0.35, + "logps_train/chosen": -111.06425476074219, + "logps_train/ref_chosen": -90.5, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -131.41131591796875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.059159517288208, + "rewards_train/margins": 1.825331211090088, + "rewards_train/rejected": -3.884490728378296, + "step": 1270 + }, + { + "epoch": 0.36, + "logps_train/chosen": -44.51241683959961, + "logps_train/ref_chosen": -37.25, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -63.68146514892578, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7299526929855347, + "rewards_train/margins": 1.1296004056930542, + "rewards_train/rejected": -1.8595530986785889, + "step": 1271 + }, + { + "epoch": 0.36, + "learning_rate": 2.9036125431686916e-07, + "loss": 0.3528, + "step": 1272 + }, + { + "epoch": 0.36, + "logps_train/chosen": -63.27284240722656, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -66.57807922363281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8819718360900879, + "rewards_train/margins": 0.9352107048034668, + "rewards_train/rejected": -1.8171825408935547, + "step": 1272 + }, + { + "epoch": 0.36, + "logps_train/chosen": -80.39818572998047, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -106.00479125976562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8234123587608337, + "rewards_train/margins": 2.020621597766876, + "rewards_train/rejected": -2.84403395652771, + "step": 1273 + }, + { + "epoch": 0.36, + "learning_rate": 2.897503392312864e-07, + "loss": 0.3524, + "step": 1274 + }, + { + "epoch": 0.36, + "logps_train/chosen": -53.39342498779297, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -54.14075469970703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4791374206542969, + "rewards_train/margins": 0.9202895164489746, + "rewards_train/rejected": -1.3994269371032715, + "step": 1274 + }, + { + "epoch": 0.36, + "logps_train/chosen": -111.9495849609375, + "logps_train/ref_chosen": -92.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -122.84001159667969, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.9674193859100342, + "rewards_train/margins": 1.512382984161377, + "rewards_train/rejected": -3.479802370071411, + "step": 1275 + }, + { + "epoch": 0.36, + "learning_rate": 2.8913918052335884e-07, + "loss": 0.4267, + "step": 1276 + }, + { + "epoch": 0.36, + "logps_train/chosen": -73.24745178222656, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -93.18992614746094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1024799346923828, + "rewards_train/margins": 1.9378023147583008, + "rewards_train/rejected": -3.0402822494506836, + "step": 1276 + }, + { + "epoch": 0.36, + "logps_train/chosen": -95.64163970947266, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -115.248046875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9829139709472656, + "rewards_train/margins": 0.716890811920166, + "rewards_train/rejected": -2.6998047828674316, + "step": 1277 + }, + { + "epoch": 0.36, + "learning_rate": 2.8852778193876333e-07, + "loss": 0.418, + "step": 1278 + }, + { + "epoch": 0.36, + "logps_train/chosen": -68.90501403808594, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -93.73394775390625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9147204160690308, + "rewards_train/margins": 0.9772288799285889, + "rewards_train/rejected": -1.8919492959976196, + "step": 1278 + }, + { + "epoch": 0.36, + "logps_train/chosen": -51.51862716674805, + "logps_train/ref_chosen": -43.25, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -77.333984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8381906747817993, + "rewards_train/margins": 1.2866140604019165, + "rewards_train/rejected": -2.124804735183716, + "step": 1279 + }, + { + "epoch": 0.36, + "learning_rate": 2.879161472246465e-07, + "loss": 0.3712, + "step": 1280 + }, + { + "epoch": 0.36, + "logps_train/chosen": -97.43931579589844, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -134.86773681640625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7779157161712646, + "rewards_train/margins": 2.234835386276245, + "rewards_train/rejected": -4.01275110244751, + "step": 1280 + }, + { + "epoch": 0.36, + "logps_train/chosen": -68.90665435791016, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -86.76878356933594, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -1.2789463996887207, + "rewards_train/margins": 0.5182435512542725, + "rewards_train/rejected": -1.7971899509429932, + "step": 1281 + }, + { + "epoch": 0.36, + "learning_rate": 2.8730428012960245e-07, + "loss": 0.5033, + "step": 1282 + }, + { + "epoch": 0.36, + "logps_train/chosen": -115.7667236328125, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -121.2967300415039, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.5526981353759766, + "rewards_train/margins": 1.2461156845092773, + "rewards_train/rejected": -3.798813819885254, + "step": 1282 + }, + { + "epoch": 0.36, + "logps_train/chosen": -64.69181823730469, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -98.20594787597656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7613693475723267, + "rewards_train/margins": 2.1143044233322144, + "rewards_train/rejected": -2.875673770904541, + "step": 1283 + }, + { + "epoch": 0.36, + "learning_rate": 2.8669218440364933e-07, + "loss": 0.3413, + "step": 1284 + }, + { + "epoch": 0.36, + "logps_train/chosen": -73.53129577636719, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -93.92050170898438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6960985064506531, + "rewards_train/margins": 0.9115764498710632, + "rewards_train/rejected": -1.6076749563217163, + "step": 1284 + }, + { + "epoch": 0.36, + "logps_train/chosen": -112.06536865234375, + "logps_train/ref_chosen": -99.0, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -128.71908569335938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2846617698669434, + "rewards_train/margins": 2.4806065559387207, + "rewards_train/rejected": -3.765268325805664, + "step": 1285 + }, + { + "epoch": 0.36, + "learning_rate": 2.8607986379820664e-07, + "loss": 0.446, + "step": 1286 + }, + { + "epoch": 0.36, + "logps_train/chosen": -88.01637268066406, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -104.08353424072266, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6192153692245483, + "rewards_train/margins": 0.16101324558258057, + "rewards_train/rejected": -1.780228614807129, + "step": 1286 + }, + { + "epoch": 0.36, + "logps_train/chosen": -39.651729583740234, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -35.25, + "logps_train/rejected": -50.218223571777344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5624388456344604, + "rewards_train/margins": 0.934578537940979, + "rewards_train/rejected": -1.4970173835754395, + "step": 1287 + }, + { + "epoch": 0.36, + "learning_rate": 2.854673220660721e-07, + "loss": 0.5327, + "step": 1288 + }, + { + "epoch": 0.36, + "logps_train/chosen": -114.07559204101562, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -91.5, + "logps_train/rejected": -132.51544189453125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0622470378875732, + "rewards_train/margins": 2.03265643119812, + "rewards_train/rejected": -4.094903469085693, + "step": 1288 + }, + { + "epoch": 0.36, + "logps_train/chosen": -88.67436218261719, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -90.67655944824219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4979051351547241, + "rewards_train/margins": 0.8566652536392212, + "rewards_train/rejected": -2.3545703887939453, + "step": 1289 + }, + { + "epoch": 0.36, + "learning_rate": 2.848545629613986e-07, + "loss": 0.3232, + "step": 1290 + }, + { + "epoch": 0.36, + "logps_train/chosen": -49.19293975830078, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -44.5, + "logps_train/rejected": -60.60626220703125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.687457799911499, + "rewards_train/margins": 0.9311761856079102, + "rewards_train/rejected": -1.6186339855194092, + "step": 1290 + }, + { + "epoch": 0.36, + "logps_train/chosen": -72.29499816894531, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -95.83135223388672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.78213632106781, + "rewards_train/margins": 1.9825423955917358, + "rewards_train/rejected": -3.764678716659546, + "step": 1291 + }, + { + "epoch": 0.36, + "learning_rate": 2.842415902396713e-07, + "loss": 0.5225, + "step": 1292 + }, + { + "epoch": 0.36, + "logps_train/chosen": -56.585472106933594, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -68.98127746582031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.199172019958496, + "rewards_train/margins": 0.8024711608886719, + "rewards_train/rejected": -2.001643180847168, + "step": 1292 + }, + { + "epoch": 0.36, + "logps_train/chosen": -111.89640808105469, + "logps_train/ref_chosen": -95.0, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -127.68263244628906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6958913803100586, + "rewards_train/margins": 1.5458097457885742, + "rewards_train/rejected": -3.241701126098633, + "step": 1293 + }, + { + "epoch": 0.36, + "learning_rate": 2.8362840765768476e-07, + "loss": 0.4456, + "step": 1294 + }, + { + "epoch": 0.36, + "logps_train/chosen": -25.271020889282227, + "logps_train/ref_chosen": -21.25, + "logps_train/ref_rejected": -22.5, + "logps_train/rejected": -29.85843276977539, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.3960718512535095, + "rewards_train/margins": 0.33901458978652954, + "rewards_train/rejected": -0.7350864410400391, + "step": 1294 + }, + { + "epoch": 0.36, + "logps_train/chosen": -102.75201416015625, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -158.03952026367188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7902405261993408, + "rewards_train/margins": 3.8672268390655518, + "rewards_train/rejected": -5.657467365264893, + "step": 1295 + }, + { + "epoch": 0.36, + "learning_rate": 2.830150189735193e-07, + "loss": 0.3755, + "step": 1296 + }, + { + "epoch": 0.36, + "logps_train/chosen": -39.12625503540039, + "logps_train/ref_chosen": -36.0, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -53.60280227661133, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.30244502425193787, + "rewards_train/margins": 0.9511698782444, + "rewards_train/rejected": -1.253614902496338, + "step": 1296 + }, + { + "epoch": 0.36, + "logps_train/chosen": -106.22415161132812, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -112.7197494506836, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6665563583374023, + "rewards_train/margins": 1.257957935333252, + "rewards_train/rejected": -2.9245142936706543, + "step": 1297 + }, + { + "epoch": 0.36, + "learning_rate": 2.8240142794651895e-07, + "loss": 0.4655, + "step": 1298 + }, + { + "epoch": 0.36, + "logps_train/chosen": -105.55613708496094, + "logps_train/ref_chosen": -86.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -112.70767974853516, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.9675285816192627, + "rewards_train/margins": 0.6024830341339111, + "rewards_train/rejected": -2.570011615753174, + "step": 1298 + }, + { + "epoch": 0.36, + "logps_train/chosen": -96.76789855957031, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -139.1729736328125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8603839874267578, + "rewards_train/margins": 2.5506625175476074, + "rewards_train/rejected": -3.4110465049743652, + "step": 1299 + }, + { + "epoch": 0.36, + "learning_rate": 2.8178763833726734e-07, + "loss": 0.4321, + "step": 1300 + }, + { + "epoch": 0.36, + "logps_train/chosen": -69.91500854492188, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -105.3660659790039, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6147429943084717, + "rewards_train/margins": 1.8849496841430664, + "rewards_train/rejected": -2.499692678451538, + "step": 1300 + }, + { + "epoch": 0.36, + "logps_train/chosen": -148.17092895507812, + "logps_train/ref_chosen": -109.5, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -158.2682647705078, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -3.8811564445495605, + "rewards_train/margins": 1.742936134338379, + "rewards_train/rejected": -5.6240925788879395, + "step": 1301 + }, + { + "epoch": 0.36, + "learning_rate": 2.811736539075656e-07, + "loss": 0.4972, + "step": 1302 + }, + { + "epoch": 0.36, + "logps_train/chosen": -59.03349685668945, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -89.21664428710938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8964160084724426, + "rewards_train/margins": 1.5689980387687683, + "rewards_train/rejected": -2.465414047241211, + "step": 1302 + }, + { + "epoch": 0.36, + "logps_train/chosen": -84.39813232421875, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -86.77507019042969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2852232456207275, + "rewards_train/margins": 0.9193837642669678, + "rewards_train/rejected": -2.2046070098876953, + "step": 1303 + }, + { + "epoch": 0.36, + "learning_rate": 2.8055947842040863e-07, + "loss": 0.4232, + "step": 1304 + }, + { + "epoch": 0.36, + "logps_train/chosen": -72.73558807373047, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -104.67257690429688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1188716888427734, + "rewards_train/margins": 2.3550262451171875, + "rewards_train/rejected": -3.473897933959961, + "step": 1304 + }, + { + "epoch": 0.36, + "logps_train/chosen": -46.97467803955078, + "logps_train/ref_chosen": -40.25, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -93.93573760986328, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6736396551132202, + "rewards_train/margins": 1.2531369924545288, + "rewards_train/rejected": -1.926776647567749, + "step": 1305 + }, + { + "epoch": 0.37, + "learning_rate": 2.799451156399623e-07, + "loss": 0.3026, + "step": 1306 + }, + { + "epoch": 0.37, + "logps_train/chosen": -85.8043441772461, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -94.05465698242188, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.0390287637710571, + "rewards_train/margins": 1.0871402025222778, + "rewards_train/rejected": -2.126168966293335, + "step": 1306 + }, + { + "epoch": 0.37, + "logps_train/chosen": -37.19535827636719, + "logps_train/ref_chosen": -32.0, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -61.08477020263672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5227588415145874, + "rewards_train/margins": 1.153588891029358, + "rewards_train/rejected": -1.6763477325439453, + "step": 1307 + }, + { + "epoch": 0.37, + "learning_rate": 2.7933056933154055e-07, + "loss": 0.4183, + "step": 1308 + }, + { + "epoch": 0.37, + "logps_train/chosen": -50.97650909423828, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -39.5, + "logps_train/rejected": -56.72451400756836, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.683588445186615, + "rewards_train/margins": 1.0373005270957947, + "rewards_train/rejected": -1.7208889722824097, + "step": 1308 + }, + { + "epoch": 0.37, + "logps_train/chosen": -104.0382080078125, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -107.07575225830078, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.7108514308929443, + "rewards_train/margins": 1.9105427265167236, + "rewards_train/rejected": -3.621394157409668, + "step": 1309 + }, + { + "epoch": 0.37, + "learning_rate": 2.7871584326158183e-07, + "loss": 0.3898, + "step": 1310 + }, + { + "epoch": 0.37, + "logps_train/chosen": -26.640560150146484, + "logps_train/ref_chosen": -19.75, + "logps_train/ref_rejected": -14.1875, + "logps_train/rejected": -24.182024002075195, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6896420121192932, + "rewards_train/margins": 0.3077598214149475, + "rewards_train/rejected": -0.9974018335342407, + "step": 1310 + }, + { + "epoch": 0.37, + "logps_train/chosen": -95.34265899658203, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -100.6622314453125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6484259366989136, + "rewards_train/margins": 1.381079077720642, + "rewards_train/rejected": -3.0295050144195557, + "step": 1311 + }, + { + "epoch": 0.37, + "learning_rate": 2.7810094119762656e-07, + "loss": 0.5341, + "step": 1312 + }, + { + "epoch": 0.37, + "logps_train/chosen": -61.50445556640625, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -81.31883239746094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9082581996917725, + "rewards_train/margins": 1.175969123840332, + "rewards_train/rejected": -2.0842273235321045, + "step": 1312 + }, + { + "epoch": 0.37, + "logps_train/chosen": -92.20587921142578, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -112.29820251464844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8151195049285889, + "rewards_train/margins": 2.1342315673828125, + "rewards_train/rejected": -3.9493510723114014, + "step": 1313 + }, + { + "epoch": 0.37, + "learning_rate": 2.774858669082937e-07, + "loss": 0.3724, + "step": 1314 + }, + { + "epoch": 0.37, + "logps_train/chosen": -96.3318099975586, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -94.733154296875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.773806095123291, + "rewards_train/margins": 0.4018535614013672, + "rewards_train/rejected": -2.175659656524658, + "step": 1314 + }, + { + "epoch": 0.37, + "logps_train/chosen": -51.42466735839844, + "logps_train/ref_chosen": -43.75, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -81.90716552734375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7602400779724121, + "rewards_train/margins": 1.5957107543945312, + "rewards_train/rejected": -2.3559508323669434, + "step": 1315 + }, + { + "epoch": 0.37, + "learning_rate": 2.7687062416325777e-07, + "loss": 0.441, + "step": 1316 + }, + { + "epoch": 0.37, + "logps_train/chosen": -95.26358795166016, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -120.08601379394531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.937295913696289, + "rewards_train/margins": 1.0259923934936523, + "rewards_train/rejected": -2.9632883071899414, + "step": 1316 + }, + { + "epoch": 0.37, + "logps_train/chosen": -102.5262451171875, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -98.22103881835938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.0816283226013184, + "rewards_train/margins": 0.20883560180664062, + "rewards_train/rejected": -2.290463924407959, + "step": 1317 + }, + { + "epoch": 0.37, + "learning_rate": 2.7625521673322584e-07, + "loss": 0.6372, + "step": 1318 + }, + { + "epoch": 0.37, + "logps_train/chosen": -85.05008697509766, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -89.3863296508789, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.3417274951934814, + "rewards_train/margins": 0.4308891296386719, + "rewards_train/rejected": -2.7726166248321533, + "step": 1318 + }, + { + "epoch": 0.37, + "logps_train/chosen": -50.07927322387695, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -29.75, + "logps_train/rejected": -53.705081939697266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0952810049057007, + "rewards_train/margins": 1.2998853921890259, + "rewards_train/rejected": -2.3951663970947266, + "step": 1319 + }, + { + "epoch": 0.37, + "learning_rate": 2.756396483899139e-07, + "loss": 0.5068, + "step": 1320 + }, + { + "epoch": 0.37, + "logps_train/chosen": -43.69392013549805, + "logps_train/ref_chosen": -36.5, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -69.0543212890625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7180249094963074, + "rewards_train/margins": 0.6708055138587952, + "rewards_train/rejected": -1.3888304233551025, + "step": 1320 + }, + { + "epoch": 0.37, + "logps_train/chosen": -72.91769409179688, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -63.5, + "logps_train/rejected": -79.10346984863281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9517297744750977, + "rewards_train/margins": 0.6179918050765991, + "rewards_train/rejected": -1.5697215795516968, + "step": 1321 + }, + { + "epoch": 0.37, + "learning_rate": 2.7502392290602463e-07, + "loss": 0.5277, + "step": 1322 + }, + { + "epoch": 0.37, + "logps_train/chosen": -94.58790588378906, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -104.88409423828125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8958998918533325, + "rewards_train/margins": 2.1021775007247925, + "rewards_train/rejected": -3.998077392578125, + "step": 1322 + }, + { + "epoch": 0.37, + "logps_train/chosen": -47.807395935058594, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -39.25, + "logps_train/rejected": -59.41307830810547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5037863850593567, + "rewards_train/margins": 1.51603764295578, + "rewards_train/rejected": -2.0198240280151367, + "step": 1323 + }, + { + "epoch": 0.37, + "learning_rate": 2.7440804405522346e-07, + "loss": 0.355, + "step": 1324 + }, + { + "epoch": 0.37, + "logps_train/chosen": -73.08329772949219, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -95.97520446777344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.7374316453933716, + "rewards_train/margins": 1.0569642782211304, + "rewards_train/rejected": -2.794395923614502, + "step": 1324 + }, + { + "epoch": 0.37, + "logps_train/chosen": -85.72633361816406, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -82.99726867675781, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8273212909698486, + "rewards_train/margins": 0.2659609317779541, + "rewards_train/rejected": -2.0932822227478027, + "step": 1325 + }, + { + "epoch": 0.37, + "learning_rate": 2.73792015612116e-07, + "loss": 0.5358, + "step": 1326 + }, + { + "epoch": 0.37, + "logps_train/chosen": -43.96147918701172, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -67.97431945800781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.42471253871917725, + "rewards_train/margins": 1.359047770500183, + "rewards_train/rejected": -1.7837603092193604, + "step": 1326 + }, + { + "epoch": 0.37, + "logps_train/chosen": -58.512733459472656, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -75.8866195678711, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.48896825313568115, + "rewards_train/margins": 1.309849739074707, + "rewards_train/rejected": -1.7988179922103882, + "step": 1327 + }, + { + "epoch": 0.37, + "learning_rate": 2.7317584135222453e-07, + "loss": 0.4733, + "step": 1328 + }, + { + "epoch": 0.37, + "logps_train/chosen": -81.32911682128906, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -76.09212493896484, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8594746589660645, + "rewards_train/margins": 1.02180814743042, + "rewards_train/rejected": -1.8812828063964844, + "step": 1328 + }, + { + "epoch": 0.37, + "logps_train/chosen": -130.31271362304688, + "logps_train/ref_chosen": -105.5, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -140.99732971191406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.4781460762023926, + "rewards_train/margins": 1.3262743949890137, + "rewards_train/rejected": -3.8044204711914062, + "step": 1329 + }, + { + "epoch": 0.37, + "learning_rate": 2.7255952505196523e-07, + "loss": 0.4357, + "step": 1330 + }, + { + "epoch": 0.37, + "logps_train/chosen": -87.80403137207031, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -105.68490600585938, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.3147778511047363, + "rewards_train/margins": 0.8310563564300537, + "rewards_train/rejected": -2.14583420753479, + "step": 1330 + }, + { + "epoch": 0.37, + "logps_train/chosen": -36.144046783447266, + "logps_train/ref_chosen": -27.25, + "logps_train/ref_rejected": -38.75, + "logps_train/rejected": -54.56989288330078, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.8839359283447266, + "rewards_train/margins": 0.697174072265625, + "rewards_train/rejected": -1.5811100006103516, + "step": 1331 + }, + { + "epoch": 0.37, + "learning_rate": 2.719430704886244e-07, + "loss": 0.5688, + "step": 1332 + }, + { + "epoch": 0.37, + "logps_train/chosen": -57.416873931884766, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -81.05432891845703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8594609498977661, + "rewards_train/margins": 1.9582279920578003, + "rewards_train/rejected": -2.8176889419555664, + "step": 1332 + }, + { + "epoch": 0.37, + "logps_train/chosen": -86.974365234375, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -114.34844207763672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9052491784095764, + "rewards_train/margins": 1.794048011302948, + "rewards_train/rejected": -2.6992971897125244, + "step": 1333 + }, + { + "epoch": 0.37, + "learning_rate": 2.7132648144033614e-07, + "loss": 0.3223, + "step": 1334 + }, + { + "epoch": 0.37, + "logps_train/chosen": -46.227272033691406, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -67.61605072021484, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7809305787086487, + "rewards_train/margins": 1.0939556956291199, + "rewards_train/rejected": -1.8748862743377686, + "step": 1334 + }, + { + "epoch": 0.37, + "logps_train/chosen": -49.509666442871094, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -66.00403594970703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.34100592136383057, + "rewards_train/margins": 1.7270735502243042, + "rewards_train/rejected": -2.0680794715881348, + "step": 1335 + }, + { + "epoch": 0.37, + "learning_rate": 2.707097616860587e-07, + "loss": 0.3386, + "step": 1336 + }, + { + "epoch": 0.37, + "logps_train/chosen": -106.86275482177734, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -132.5000762939453, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -2.2087366580963135, + "rewards_train/margins": 1.1859004497528076, + "rewards_train/rejected": -3.394637107849121, + "step": 1336 + }, + { + "epoch": 0.37, + "logps_train/chosen": -74.65409088134766, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -82.0049819946289, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5857215523719788, + "rewards_train/margins": 0.8608705401420593, + "rewards_train/rejected": -1.446592092514038, + "step": 1337 + }, + { + "epoch": 0.37, + "learning_rate": 2.7009291500555113e-07, + "loss": 0.5754, + "step": 1338 + }, + { + "epoch": 0.37, + "logps_train/chosen": -69.99712371826172, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -76.87053680419922, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4236384630203247, + "rewards_train/margins": 0.9444705247879028, + "rewards_train/rejected": -2.3681089878082275, + "step": 1338 + }, + { + "epoch": 0.37, + "logps_train/chosen": -60.1782112121582, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -82.09654998779297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4322744607925415, + "rewards_train/margins": 1.5351929664611816, + "rewards_train/rejected": -1.9674674272537231, + "step": 1339 + }, + { + "epoch": 0.37, + "learning_rate": 2.694759451793508e-07, + "loss": 0.3491, + "step": 1340 + }, + { + "epoch": 0.37, + "logps_train/chosen": -67.44230651855469, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -82.63480377197266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5241507291793823, + "rewards_train/margins": 0.9656966924667358, + "rewards_train/rejected": -2.489847421646118, + "step": 1340 + }, + { + "epoch": 0.37, + "logps_train/chosen": -43.637535095214844, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -63.39043426513672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7274252772331238, + "rewards_train/margins": 0.7655245661735535, + "rewards_train/rejected": -1.4929498434066772, + "step": 1341 + }, + { + "epoch": 0.38, + "learning_rate": 2.6885885598874946e-07, + "loss": 0.3929, + "step": 1342 + }, + { + "epoch": 0.38, + "logps_train/chosen": -71.57133483886719, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -108.21395111083984, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7282274961471558, + "rewards_train/margins": 2.0763708353042603, + "rewards_train/rejected": -2.804598331451416, + "step": 1342 + }, + { + "epoch": 0.38, + "logps_train/chosen": -57.34663391113281, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -85.48606872558594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6021436452865601, + "rewards_train/margins": 2.0251747369766235, + "rewards_train/rejected": -2.6273183822631836, + "step": 1343 + }, + { + "epoch": 0.38, + "learning_rate": 2.682416512157707e-07, + "loss": 0.3162, + "step": 1344 + }, + { + "epoch": 0.38, + "logps_train/chosen": -79.02452087402344, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -98.95014190673828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1665629148483276, + "rewards_train/margins": 1.4204438924789429, + "rewards_train/rejected": -2.5870068073272705, + "step": 1344 + }, + { + "epoch": 0.38, + "logps_train/chosen": -57.89361572265625, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -84.1726303100586, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.48418545722961426, + "rewards_train/margins": 2.1544644832611084, + "rewards_train/rejected": -2.6386499404907227, + "step": 1345 + }, + { + "epoch": 0.38, + "learning_rate": 2.6762433464314625e-07, + "loss": 0.3745, + "step": 1346 + }, + { + "epoch": 0.38, + "logps_train/chosen": -60.114990234375, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -91.95317077636719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3827880322933197, + "rewards_train/margins": 1.4492479860782623, + "rewards_train/rejected": -1.832036018371582, + "step": 1346 + }, + { + "epoch": 0.38, + "logps_train/chosen": -88.88194274902344, + "logps_train/ref_chosen": -84.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -94.5876235961914, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4225689172744751, + "rewards_train/margins": 1.2158806324005127, + "rewards_train/rejected": -1.6384495496749878, + "step": 1347 + }, + { + "epoch": 0.38, + "learning_rate": 2.6700691005429314e-07, + "loss": 0.3202, + "step": 1348 + }, + { + "epoch": 0.38, + "logps_train/chosen": -87.83692932128906, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -99.7630844116211, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.4221696853637695, + "rewards_train/margins": 1.8639039993286133, + "rewards_train/rejected": -4.286073684692383, + "step": 1348 + }, + { + "epoch": 0.38, + "logps_train/chosen": -48.34173583984375, + "logps_train/ref_chosen": -41.25, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -76.79225158691406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7172788977622986, + "rewards_train/margins": 2.2607746720314026, + "rewards_train/rejected": -2.978053569793701, + "step": 1349 + }, + { + "epoch": 0.38, + "learning_rate": 2.663893812332905e-07, + "loss": 0.3444, + "step": 1350 + }, + { + "epoch": 0.38, + "logps_train/chosen": -60.93762969970703, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -38.5, + "logps_train/rejected": -58.11137390136719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0298957824707031, + "rewards_train/margins": 0.9316320419311523, + "rewards_train/rejected": -1.9615278244018555, + "step": 1350 + }, + { + "epoch": 0.38, + "logps_train/chosen": -110.66854858398438, + "logps_train/ref_chosen": -92.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -111.44385528564453, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8504478931427002, + "rewards_train/margins": 0.7445235252380371, + "rewards_train/rejected": -2.5949714183807373, + "step": 1351 + }, + { + "epoch": 0.38, + "learning_rate": 2.6577175196485616e-07, + "loss": 0.5223, + "step": 1352 + }, + { + "epoch": 0.38, + "logps_train/chosen": -51.827369689941406, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -70.49395751953125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7101783752441406, + "rewards_train/margins": 1.4089927673339844, + "rewards_train/rejected": -2.119171142578125, + "step": 1352 + }, + { + "epoch": 0.38, + "logps_train/chosen": -67.51397705078125, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -92.01215362548828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9197568893432617, + "rewards_train/margins": 1.8904430866241455, + "rewards_train/rejected": -2.8101999759674072, + "step": 1353 + }, + { + "epoch": 0.38, + "learning_rate": 2.651540260343237e-07, + "loss": 0.3153, + "step": 1354 + }, + { + "epoch": 0.38, + "logps_train/chosen": -82.34182739257812, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -91.17100524902344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.287113070487976, + "rewards_train/margins": 1.5663150548934937, + "rewards_train/rejected": -2.8534281253814697, + "step": 1354 + }, + { + "epoch": 0.38, + "logps_train/chosen": -86.42059326171875, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -113.45274353027344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.267059326171875, + "rewards_train/margins": 2.017570972442627, + "rewards_train/rejected": -3.284630298614502, + "step": 1355 + }, + { + "epoch": 0.38, + "learning_rate": 2.6453620722761895e-07, + "loss": 0.3597, + "step": 1356 + }, + { + "epoch": 0.38, + "logps_train/chosen": -59.536041259765625, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -79.4385986328125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6543853282928467, + "rewards_train/margins": 1.4230682849884033, + "rewards_train/rejected": -2.07745361328125, + "step": 1356 + }, + { + "epoch": 0.38, + "logps_train/chosen": -60.514652252197266, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -68.70114135742188, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1543948650360107, + "rewards_train/margins": 0.48681366443634033, + "rewards_train/rejected": -1.641208529472351, + "step": 1357 + }, + { + "epoch": 0.38, + "learning_rate": 2.639182993312371e-07, + "loss": 0.4524, + "step": 1358 + }, + { + "epoch": 0.38, + "logps_train/chosen": -77.8732681274414, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -103.27001190185547, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7883033752441406, + "rewards_train/margins": 1.3624773025512695, + "rewards_train/rejected": -2.15078067779541, + "step": 1358 + }, + { + "epoch": 0.38, + "logps_train/chosen": -80.97784423828125, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -83.80522155761719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9926083087921143, + "rewards_train/margins": 0.5324447154998779, + "rewards_train/rejected": -2.525053024291992, + "step": 1359 + }, + { + "epoch": 0.38, + "learning_rate": 2.6330030613221923e-07, + "loss": 0.4443, + "step": 1360 + }, + { + "epoch": 0.38, + "logps_train/chosen": -46.133155822753906, + "logps_train/ref_chosen": -40.25, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -63.55250930786133, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5797218680381775, + "rewards_train/margins": 1.1144937872886658, + "rewards_train/rejected": -1.6942156553268433, + "step": 1360 + }, + { + "epoch": 0.38, + "logps_train/chosen": -121.89741516113281, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -102.5, + "logps_train/rejected": -140.67169189453125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.856929302215576, + "rewards_train/margins": 0.9571146965026855, + "rewards_train/rejected": -3.8140439987182617, + "step": 1361 + }, + { + "epoch": 0.38, + "learning_rate": 2.626822314181293e-07, + "loss": 0.5153, + "step": 1362 + }, + { + "epoch": 0.38, + "logps_train/chosen": -68.66605377197266, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -92.5390396118164, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3077186346054077, + "rewards_train/margins": 1.0979429483413696, + "rewards_train/rejected": -2.4056615829467773, + "step": 1362 + }, + { + "epoch": 0.38, + "logps_train/chosen": -94.73461151123047, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -149.11090087890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.228539228439331, + "rewards_train/margins": 2.855987787246704, + "rewards_train/rejected": -5.084527015686035, + "step": 1363 + }, + { + "epoch": 0.38, + "learning_rate": 2.6206407897703093e-07, + "loss": 0.3182, + "step": 1364 + }, + { + "epoch": 0.38, + "logps_train/chosen": -82.84297943115234, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -119.45513153076172, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9452354907989502, + "rewards_train/margins": 2.7491061687469482, + "rewards_train/rejected": -3.6943416595458984, + "step": 1364 + }, + { + "epoch": 0.38, + "logps_train/chosen": -87.6287841796875, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -76.39540100097656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.0286750793457031, + "rewards_train/margins": 1.2200205326080322, + "rewards_train/rejected": -2.2486956119537354, + "step": 1365 + }, + { + "epoch": 0.38, + "learning_rate": 2.6144585259746394e-07, + "loss": 0.3367, + "step": 1366 + }, + { + "epoch": 0.38, + "logps_train/chosen": -76.80714416503906, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -121.25357055664062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0427266359329224, + "rewards_train/margins": 2.338098883628845, + "rewards_train/rejected": -3.3808255195617676, + "step": 1366 + }, + { + "epoch": 0.38, + "logps_train/chosen": -77.67304992675781, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -113.48094177246094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0262898206710815, + "rewards_train/margins": 1.9893819093704224, + "rewards_train/rejected": -3.015671730041504, + "step": 1367 + }, + { + "epoch": 0.38, + "learning_rate": 2.6082755606842154e-07, + "loss": 0.3611, + "step": 1368 + }, + { + "epoch": 0.38, + "logps_train/chosen": -77.24444580078125, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -89.52877807617188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.33577299118042, + "rewards_train/margins": 1.7199368476867676, + "rewards_train/rejected": -3.0557098388671875, + "step": 1368 + }, + { + "epoch": 0.38, + "logps_train/chosen": -60.85888671875, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -78.12150573730469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.511572241783142, + "rewards_train/margins": 1.453703761100769, + "rewards_train/rejected": -2.965276002883911, + "step": 1369 + }, + { + "epoch": 0.38, + "learning_rate": 2.602091931793267e-07, + "loss": 0.3925, + "step": 1370 + }, + { + "epoch": 0.38, + "logps_train/chosen": -49.612762451171875, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -48.86111068725586, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5028778314590454, + "rewards_train/margins": 0.6347957849502563, + "rewards_train/rejected": -1.1376736164093018, + "step": 1370 + }, + { + "epoch": 0.38, + "logps_train/chosen": -97.61985778808594, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -96.43189239501953, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.229172945022583, + "rewards_train/margins": 1.477590799331665, + "rewards_train/rejected": -3.706763744354248, + "step": 1371 + }, + { + "epoch": 0.38, + "learning_rate": 2.595907677200091e-07, + "loss": 0.3864, + "step": 1372 + }, + { + "epoch": 0.38, + "logps_train/chosen": -94.76100158691406, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -114.26131439208984, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8886008262634277, + "rewards_train/margins": 1.695343255996704, + "rewards_train/rejected": -3.583944082260132, + "step": 1372 + }, + { + "epoch": 0.38, + "logps_train/chosen": -75.87973022460938, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -90.60918426513672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3162927627563477, + "rewards_train/margins": 1.9688441753387451, + "rewards_train/rejected": -3.2851369380950928, + "step": 1373 + }, + { + "epoch": 0.38, + "learning_rate": 2.5897228348068195e-07, + "loss": 0.2367, + "step": 1374 + }, + { + "epoch": 0.38, + "logps_train/chosen": -73.16169738769531, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -104.44308471679688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.189802646636963, + "rewards_train/margins": 2.0533342361450195, + "rewards_train/rejected": -3.2431368827819824, + "step": 1374 + }, + { + "epoch": 0.38, + "logps_train/chosen": -54.54771423339844, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -71.60335540771484, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4755721688270569, + "rewards_train/margins": 1.3104471564292908, + "rewards_train/rejected": -1.7860193252563477, + "step": 1375 + }, + { + "epoch": 0.38, + "learning_rate": 2.583537442519186e-07, + "loss": 0.3255, + "step": 1376 + }, + { + "epoch": 0.38, + "logps_train/chosen": -128.61160278320312, + "logps_train/ref_chosen": -98.0, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -120.68549346923828, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -3.0693626403808594, + "rewards_train/margins": -0.3984694480895996, + "rewards_train/rejected": -2.6708931922912598, + "step": 1376 + }, + { + "epoch": 0.38, + "logps_train/chosen": -58.03902816772461, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -67.445556640625, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.8437464237213135, + "rewards_train/margins": 0.6888947486877441, + "rewards_train/rejected": -1.5326411724090576, + "step": 1377 + }, + { + "epoch": 0.39, + "learning_rate": 2.577351538246298e-07, + "loss": 0.8633, + "step": 1378 + }, + { + "epoch": 0.39, + "logps_train/chosen": -56.045284271240234, + "logps_train/ref_chosen": -41.25, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -87.34400177001953, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4777708053588867, + "rewards_train/margins": 1.1011600494384766, + "rewards_train/rejected": -2.5789308547973633, + "step": 1378 + }, + { + "epoch": 0.39, + "logps_train/chosen": -89.68913269042969, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -122.016357421875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.0821945667266846, + "rewards_train/margins": 1.4905357360839844, + "rewards_train/rejected": -3.572730302810669, + "step": 1379 + }, + { + "epoch": 0.39, + "learning_rate": 2.5711651599003945e-07, + "loss": 0.3479, + "step": 1380 + }, + { + "epoch": 0.39, + "logps_train/chosen": -63.23186492919922, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -75.99614715576172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.078655481338501, + "rewards_train/margins": 0.7104123830795288, + "rewards_train/rejected": -1.7890678644180298, + "step": 1380 + }, + { + "epoch": 0.39, + "logps_train/chosen": -65.56197357177734, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -63.32878494262695, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.5444786548614502, + "rewards_train/margins": 0.27199387550354004, + "rewards_train/rejected": -1.8164725303649902, + "step": 1381 + }, + { + "epoch": 0.39, + "learning_rate": 2.564978345396627e-07, + "loss": 0.568, + "step": 1382 + }, + { + "epoch": 0.39, + "logps_train/chosen": -43.072750091552734, + "logps_train/ref_chosen": -35.25, + "logps_train/ref_rejected": -46.25, + "logps_train/rejected": -67.01962280273438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7892085313796997, + "rewards_train/margins": 1.2805274724960327, + "rewards_train/rejected": -2.0697360038757324, + "step": 1382 + }, + { + "epoch": 0.39, + "logps_train/chosen": -108.58927917480469, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -108.79452514648438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.1761159896850586, + "rewards_train/margins": 1.159390926361084, + "rewards_train/rejected": -3.3355069160461426, + "step": 1383 + }, + { + "epoch": 0.39, + "learning_rate": 2.5587911326528145e-07, + "loss": 0.4475, + "step": 1384 + }, + { + "epoch": 0.39, + "logps_train/chosen": -96.11656188964844, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -133.13128662109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9585314989089966, + "rewards_train/margins": 3.8233460187911987, + "rewards_train/rejected": -4.781877517700195, + "step": 1384 + }, + { + "epoch": 0.39, + "logps_train/chosen": -138.98739624023438, + "logps_train/ref_chosen": -115.0, + "logps_train/ref_rejected": -190.0, + "logps_train/rejected": -235.3164520263672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.378427505493164, + "rewards_train/margins": 2.180171012878418, + "rewards_train/rejected": -4.558598518371582, + "step": 1385 + }, + { + "epoch": 0.39, + "learning_rate": 2.552603559589219e-07, + "loss": 0.2649, + "step": 1386 + }, + { + "epoch": 0.39, + "logps_train/chosen": -77.43486022949219, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -106.66267395019531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9927046298980713, + "rewards_train/margins": 1.9477818012237549, + "rewards_train/rejected": -2.940486431121826, + "step": 1386 + }, + { + "epoch": 0.39, + "logps_train/chosen": -103.97190856933594, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -118.98243713378906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7370350360870361, + "rewards_train/margins": 1.3971459865570068, + "rewards_train/rejected": -3.134181022644043, + "step": 1387 + }, + { + "epoch": 0.39, + "learning_rate": 2.5464156641283123e-07, + "loss": 0.2919, + "step": 1388 + }, + { + "epoch": 0.39, + "logps_train/chosen": -90.06669616699219, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -113.79029083251953, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.7799122333526611, + "rewards_train/margins": 1.795210599899292, + "rewards_train/rejected": -3.575122833251953, + "step": 1388 + }, + { + "epoch": 0.39, + "logps_train/chosen": -79.31021118164062, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -111.15463256835938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.244302749633789, + "rewards_train/margins": 1.87565279006958, + "rewards_train/rejected": -3.119955539703369, + "step": 1389 + }, + { + "epoch": 0.39, + "learning_rate": 2.5402274841945385e-07, + "loss": 0.3535, + "step": 1390 + }, + { + "epoch": 0.39, + "logps_train/chosen": -45.272926330566406, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -57.52449035644531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4745580554008484, + "rewards_train/margins": 0.5942969918251038, + "rewards_train/rejected": -1.0688550472259521, + "step": 1390 + }, + { + "epoch": 0.39, + "logps_train/chosen": -49.201202392578125, + "logps_train/ref_chosen": -41.75, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -64.12103271484375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7560582160949707, + "rewards_train/margins": 0.9017479419708252, + "rewards_train/rejected": -1.657806158065796, + "step": 1391 + }, + { + "epoch": 0.39, + "learning_rate": 2.534039057714089e-07, + "loss": 0.4316, + "step": 1392 + }, + { + "epoch": 0.39, + "logps_train/chosen": -110.06392669677734, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -144.35324096679688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.13139271736145, + "rewards_train/margins": 1.9023687839508057, + "rewards_train/rejected": -4.033761501312256, + "step": 1392 + }, + { + "epoch": 0.39, + "logps_train/chosen": -132.66116333007812, + "logps_train/ref_chosen": -100.5, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -147.11500549316406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -3.239943742752075, + "rewards_train/margins": 1.8199937343597412, + "rewards_train/rejected": -5.059937477111816, + "step": 1393 + }, + { + "epoch": 0.39, + "learning_rate": 2.5278504226146636e-07, + "loss": 0.2451, + "step": 1394 + }, + { + "epoch": 0.39, + "logps_train/chosen": -56.96629333496094, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -79.7743148803711, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.16718590259552, + "rewards_train/margins": 1.4344638586044312, + "rewards_train/rejected": -2.601649761199951, + "step": 1394 + }, + { + "epoch": 0.39, + "logps_train/chosen": -67.73016357421875, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -69.60502624511719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3655941486358643, + "rewards_train/margins": 1.069908857345581, + "rewards_train/rejected": -2.4355030059814453, + "step": 1395 + }, + { + "epoch": 0.39, + "learning_rate": 2.5216616168252423e-07, + "loss": 0.4427, + "step": 1396 + }, + { + "epoch": 0.39, + "logps_train/chosen": -62.675872802734375, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -84.93183898925781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7977144718170166, + "rewards_train/margins": 1.8155863285064697, + "rewards_train/rejected": -2.6133008003234863, + "step": 1396 + }, + { + "epoch": 0.39, + "logps_train/chosen": -68.78553009033203, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -85.96036529541016, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5394909381866455, + "rewards_train/margins": 1.480764627456665, + "rewards_train/rejected": -3.0202555656433105, + "step": 1397 + }, + { + "epoch": 0.39, + "learning_rate": 2.51547267827585e-07, + "loss": 0.3628, + "step": 1398 + }, + { + "epoch": 0.39, + "logps_train/chosen": -90.35094451904297, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -108.65039825439453, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9983751773834229, + "rewards_train/margins": 1.5662739276885986, + "rewards_train/rejected": -3.5646491050720215, + "step": 1398 + }, + { + "epoch": 0.39, + "logps_train/chosen": -83.69400024414062, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -91.17359161376953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0006496906280518, + "rewards_train/margins": 0.5372178554534912, + "rewards_train/rejected": -2.537867546081543, + "step": 1399 + }, + { + "epoch": 0.39, + "learning_rate": 2.509283644897325e-07, + "loss": 0.4923, + "step": 1400 + }, + { + "epoch": 0.39, + "logps_train/chosen": -110.51560974121094, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -106.5, + "logps_train/rejected": -152.65670776367188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.276365280151367, + "rewards_train/margins": 2.3584470748901367, + "rewards_train/rejected": -4.634812355041504, + "step": 1400 + }, + { + "epoch": 0.39, + "logps_train/chosen": -53.87019729614258, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -60.21210479736328, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.48213687539100647, + "rewards_train/margins": 0.6132923066616058, + "rewards_train/rejected": -1.0954291820526123, + "step": 1401 + }, + { + "epoch": 0.39, + "learning_rate": 2.5030945546210894e-07, + "loss": 0.361, + "step": 1402 + }, + { + "epoch": 0.39, + "logps_train/chosen": -69.6895980834961, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -127.36253356933594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9013574719429016, + "rewards_train/margins": 3.776277720928192, + "rewards_train/rejected": -4.677635192871094, + "step": 1402 + }, + { + "epoch": 0.39, + "logps_train/chosen": -113.97630310058594, + "logps_train/ref_chosen": -96.0, + "logps_train/ref_rejected": -109.5, + "logps_train/rejected": -152.47695922851562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8050529956817627, + "rewards_train/margins": 2.5016276836395264, + "rewards_train/rejected": -4.306680679321289, + "step": 1403 + }, + { + "epoch": 0.39, + "learning_rate": 2.4969054453789114e-07, + "loss": 0.2161, + "step": 1404 + }, + { + "epoch": 0.39, + "logps_train/chosen": -43.923423767089844, + "logps_train/ref_chosen": -34.5, + "logps_train/ref_rejected": -32.5, + "logps_train/rejected": -51.70403289794922, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9523031115531921, + "rewards_train/margins": 0.9770846962928772, + "rewards_train/rejected": -1.9293878078460693, + "step": 1404 + }, + { + "epoch": 0.39, + "logps_train/chosen": -73.5328140258789, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -83.11129760742188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8794531226158142, + "rewards_train/margins": 0.9228882193565369, + "rewards_train/rejected": -1.802341341972351, + "step": 1405 + }, + { + "epoch": 0.39, + "learning_rate": 2.490716355102675e-07, + "loss": 0.4436, + "step": 1406 + }, + { + "epoch": 0.39, + "logps_train/chosen": -88.65668487548828, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -98.30132293701172, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8327586650848389, + "rewards_train/margins": 1.605576753616333, + "rewards_train/rejected": -3.438335418701172, + "step": 1406 + }, + { + "epoch": 0.39, + "logps_train/chosen": -84.43414306640625, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -117.94034576416016, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7590389251708984, + "rewards_train/margins": 1.3990578651428223, + "rewards_train/rejected": -3.1580967903137207, + "step": 1407 + }, + { + "epoch": 0.39, + "learning_rate": 2.4845273217241503e-07, + "loss": 0.4724, + "step": 1408 + }, + { + "epoch": 0.39, + "logps_train/chosen": -93.84420776367188, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -112.18768310546875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9602023363113403, + "rewards_train/margins": 0.7858120203018188, + "rewards_train/rejected": -2.746014356613159, + "step": 1408 + }, + { + "epoch": 0.39, + "logps_train/chosen": -34.074806213378906, + "logps_train/ref_chosen": -28.625, + "logps_train/ref_rejected": -34.0, + "logps_train/rejected": -46.127540588378906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5441993474960327, + "rewards_train/margins": 0.6728512048721313, + "rewards_train/rejected": -1.217050552368164, + "step": 1409 + }, + { + "epoch": 0.39, + "learning_rate": 2.478338383174758e-07, + "loss": 0.4502, + "step": 1410 + }, + { + "epoch": 0.39, + "logps_train/chosen": -69.9249038696289, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -101.27232360839844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9871188998222351, + "rewards_train/margins": 2.603394567966461, + "rewards_train/rejected": -3.5905134677886963, + "step": 1410 + }, + { + "epoch": 0.39, + "logps_train/chosen": -64.23922729492188, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -70.24002838134766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7852510213851929, + "rewards_train/margins": 1.0569158792495728, + "rewards_train/rejected": -1.8421669006347656, + "step": 1411 + }, + { + "epoch": 0.39, + "learning_rate": 2.472149577385336e-07, + "loss": 0.3653, + "step": 1412 + }, + { + "epoch": 0.39, + "logps_train/chosen": -84.017333984375, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -91.52421569824219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.217358350753784, + "rewards_train/margins": 0.26475071907043457, + "rewards_train/rejected": -2.4821090698242188, + "step": 1412 + }, + { + "epoch": 0.39, + "logps_train/chosen": -57.075660705566406, + "logps_train/ref_chosen": -36.75, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -67.00348663330078, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.0278780460357666, + "rewards_train/margins": 0.3607523441314697, + "rewards_train/rejected": -2.3886303901672363, + "step": 1413 + }, + { + "epoch": 0.4, + "learning_rate": 2.4659609422859113e-07, + "loss": 0.6514, + "step": 1414 + }, + { + "epoch": 0.4, + "logps_train/chosen": -60.59565734863281, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -75.19113159179688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0376907587051392, + "rewards_train/margins": 1.0380626916885376, + "rewards_train/rejected": -2.0757534503936768, + "step": 1414 + }, + { + "epoch": 0.4, + "logps_train/chosen": -93.30435180664062, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -110.95439147949219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9898099899291992, + "rewards_train/margins": 1.185317039489746, + "rewards_train/rejected": -3.1751270294189453, + "step": 1415 + }, + { + "epoch": 0.4, + "learning_rate": 2.459772515805462e-07, + "loss": 0.4499, + "step": 1416 + }, + { + "epoch": 0.4, + "logps_train/chosen": -90.56442260742188, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -103.7119140625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6349579095840454, + "rewards_train/margins": 0.743850588798523, + "rewards_train/rejected": -2.3788084983825684, + "step": 1416 + }, + { + "epoch": 0.4, + "logps_train/chosen": -69.59407043457031, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -98.663818359375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1266918182373047, + "rewards_train/margins": 1.8064875602722168, + "rewards_train/rejected": -2.9331793785095215, + "step": 1417 + }, + { + "epoch": 0.4, + "learning_rate": 2.453584335871688e-07, + "loss": 0.3702, + "step": 1418 + }, + { + "epoch": 0.4, + "logps_train/chosen": -107.56165313720703, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -127.704345703125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.343665361404419, + "rewards_train/margins": 1.3588006496429443, + "rewards_train/rejected": -2.7024660110473633, + "step": 1418 + }, + { + "epoch": 0.4, + "logps_train/chosen": -90.00579833984375, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -127.27154541015625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1014583110809326, + "rewards_train/margins": 2.1901495456695557, + "rewards_train/rejected": -3.2916078567504883, + "step": 1419 + }, + { + "epoch": 0.4, + "learning_rate": 2.447396440410781e-07, + "loss": 0.3124, + "step": 1420 + }, + { + "epoch": 0.4, + "logps_train/chosen": -80.2513427734375, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -99.35195922851562, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.0134150981903076, + "rewards_train/margins": 2.0362343788146973, + "rewards_train/rejected": -3.049649477005005, + "step": 1420 + }, + { + "epoch": 0.4, + "logps_train/chosen": -56.434326171875, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -56.75, + "logps_train/rejected": -73.35064697265625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6969487071037292, + "rewards_train/margins": 0.965459406375885, + "rewards_train/rejected": -1.6624081134796143, + "step": 1421 + }, + { + "epoch": 0.4, + "learning_rate": 2.441208867347186e-07, + "loss": 0.4488, + "step": 1422 + }, + { + "epoch": 0.4, + "logps_train/chosen": -69.16523742675781, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -73.96730041503906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9069539308547974, + "rewards_train/margins": 1.4776662588119507, + "rewards_train/rejected": -2.384620189666748, + "step": 1422 + }, + { + "epoch": 0.4, + "logps_train/chosen": -19.333995819091797, + "logps_train/ref_chosen": -13.25, + "logps_train/ref_rejected": -19.875, + "logps_train/rejected": -30.237205505371094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6063489317893982, + "rewards_train/margins": 0.4295298457145691, + "rewards_train/rejected": -1.0358787775039673, + "step": 1423 + }, + { + "epoch": 0.4, + "learning_rate": 2.4350216546033736e-07, + "loss": 0.4241, + "step": 1424 + }, + { + "epoch": 0.4, + "logps_train/chosen": -85.90435028076172, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -110.66046905517578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.60361909866333, + "rewards_train/margins": 1.4823493957519531, + "rewards_train/rejected": -3.085968494415283, + "step": 1424 + }, + { + "epoch": 0.4, + "logps_train/chosen": -49.999610900878906, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -64.6334228515625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9207618236541748, + "rewards_train/margins": 1.0402367115020752, + "rewards_train/rejected": -1.96099853515625, + "step": 1425 + }, + { + "epoch": 0.4, + "learning_rate": 2.428834840099605e-07, + "loss": 0.4679, + "step": 1426 + }, + { + "epoch": 0.4, + "logps_train/chosen": -84.17804718017578, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -94.75330352783203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9131168127059937, + "rewards_train/margins": 2.0071359872817993, + "rewards_train/rejected": -2.920252799987793, + "step": 1426 + }, + { + "epoch": 0.4, + "logps_train/chosen": -54.0204963684082, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -41.25, + "logps_train/rejected": -58.89433288574219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9514635801315308, + "rewards_train/margins": 0.8146296739578247, + "rewards_train/rejected": -1.7660932540893555, + "step": 1427 + }, + { + "epoch": 0.4, + "learning_rate": 2.422648461753703e-07, + "loss": 0.3872, + "step": 1428 + }, + { + "epoch": 0.4, + "logps_train/chosen": -62.32866668701172, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -74.67293548583984, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0082577466964722, + "rewards_train/margins": 1.2414578199386597, + "rewards_train/rejected": -2.249715566635132, + "step": 1428 + }, + { + "epoch": 0.4, + "logps_train/chosen": -70.18084716796875, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -98.62975311279297, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4383976459503174, + "rewards_train/margins": 1.7653977870941162, + "rewards_train/rejected": -3.2037954330444336, + "step": 1429 + }, + { + "epoch": 0.4, + "learning_rate": 2.416462557480814e-07, + "loss": 0.3173, + "step": 1430 + }, + { + "epoch": 0.4, + "logps_train/chosen": -85.343017578125, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -91.13835144042969, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1393800973892212, + "rewards_train/margins": 0.9334388971328735, + "rewards_train/rejected": -2.0728189945220947, + "step": 1430 + }, + { + "epoch": 0.4, + "logps_train/chosen": -72.94872283935547, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -97.23341369628906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2464350461959839, + "rewards_train/margins": 1.0312029123306274, + "rewards_train/rejected": -2.2776379585266113, + "step": 1431 + }, + { + "epoch": 0.4, + "learning_rate": 2.4102771651931813e-07, + "loss": 0.4667, + "step": 1432 + }, + { + "epoch": 0.4, + "logps_train/chosen": -107.73786163330078, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -141.23602294921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.001227855682373, + "rewards_train/margins": 2.3296990394592285, + "rewards_train/rejected": -4.330926895141602, + "step": 1432 + }, + { + "epoch": 0.4, + "logps_train/chosen": -54.949005126953125, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -85.45057678222656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4242461919784546, + "rewards_train/margins": 1.7579203844070435, + "rewards_train/rejected": -3.182166576385498, + "step": 1433 + }, + { + "epoch": 0.4, + "learning_rate": 2.40409232279991e-07, + "loss": 0.3107, + "step": 1434 + }, + { + "epoch": 0.4, + "logps_train/chosen": -102.56112670898438, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -100.0, + "logps_train/rejected": -128.83555603027344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.332674503326416, + "rewards_train/margins": 1.5313501358032227, + "rewards_train/rejected": -2.8640246391296387, + "step": 1434 + }, + { + "epoch": 0.4, + "logps_train/chosen": -60.32343292236328, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -85.97602081298828, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.524237871170044, + "rewards_train/margins": 1.1214110851287842, + "rewards_train/rejected": -2.645648956298828, + "step": 1435 + }, + { + "epoch": 0.4, + "learning_rate": 2.397908068206733e-07, + "loss": 0.3252, + "step": 1436 + }, + { + "epoch": 0.4, + "logps_train/chosen": -88.18455505371094, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -92.77865600585938, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.1090805530548096, + "rewards_train/margins": 0.35159754753112793, + "rewards_train/rejected": -2.4606781005859375, + "step": 1436 + }, + { + "epoch": 0.4, + "logps_train/chosen": -68.40955352783203, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -114.69082641601562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6092169284820557, + "rewards_train/margins": 2.2852563858032227, + "rewards_train/rejected": -3.8944733142852783, + "step": 1437 + }, + { + "epoch": 0.4, + "learning_rate": 2.391724439315785e-07, + "loss": 0.4977, + "step": 1438 + }, + { + "epoch": 0.4, + "logps_train/chosen": -67.13638305664062, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -102.11262512207031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.28824782371521, + "rewards_train/margins": 1.2427418231964111, + "rewards_train/rejected": -2.530989646911621, + "step": 1438 + }, + { + "epoch": 0.4, + "logps_train/chosen": -61.952911376953125, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -35.5, + "logps_train/rejected": -52.560672760009766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.185037612915039, + "rewards_train/margins": 0.51741623878479, + "rewards_train/rejected": -1.702453851699829, + "step": 1439 + }, + { + "epoch": 0.4, + "learning_rate": 2.385541474025361e-07, + "loss": 0.5124, + "step": 1440 + }, + { + "epoch": 0.4, + "logps_train/chosen": -58.70077896118164, + "logps_train/ref_chosen": -46.5, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -67.01563262939453, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.221445083618164, + "rewards_train/margins": 0.7805085182189941, + "rewards_train/rejected": -2.001953601837158, + "step": 1440 + }, + { + "epoch": 0.4, + "logps_train/chosen": -56.96097946166992, + "logps_train/ref_chosen": -41.75, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -85.03782653808594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.528519868850708, + "rewards_train/margins": 1.402606725692749, + "rewards_train/rejected": -2.931126594543457, + "step": 1441 + }, + { + "epoch": 0.4, + "learning_rate": 2.3793592102296915e-07, + "loss": 0.4275, + "step": 1442 + }, + { + "epoch": 0.4, + "logps_train/chosen": -36.02882385253906, + "logps_train/ref_chosen": -30.75, + "logps_train/ref_rejected": -37.75, + "logps_train/rejected": -49.92866897583008, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.526124894618988, + "rewards_train/margins": 0.6936948895454407, + "rewards_train/rejected": -1.2198197841644287, + "step": 1442 + }, + { + "epoch": 0.4, + "logps_train/chosen": -54.827972412109375, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -41.5, + "logps_train/rejected": -64.4866943359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8992036581039429, + "rewards_train/margins": 1.4025903940200806, + "rewards_train/rejected": -2.3017940521240234, + "step": 1443 + }, + { + "epoch": 0.4, + "learning_rate": 2.3731776858187078e-07, + "loss": 0.3823, + "step": 1444 + }, + { + "epoch": 0.4, + "logps_train/chosen": -133.40647888183594, + "logps_train/ref_chosen": -108.5, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -153.29942321777344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.5126941204071045, + "rewards_train/margins": 1.604919195175171, + "rewards_train/rejected": -4.117613315582275, + "step": 1444 + }, + { + "epoch": 0.4, + "logps_train/chosen": -59.57331466674805, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -72.4448471069336, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9545971751213074, + "rewards_train/margins": 0.8010204434394836, + "rewards_train/rejected": -1.755617618560791, + "step": 1445 + }, + { + "epoch": 0.4, + "learning_rate": 2.3669969386778085e-07, + "loss": 0.4354, + "step": 1446 + }, + { + "epoch": 0.4, + "logps_train/chosen": -89.79341888427734, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -119.29327392578125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.681685447692871, + "rewards_train/margins": 1.515317440032959, + "rewards_train/rejected": -3.19700288772583, + "step": 1446 + }, + { + "epoch": 0.4, + "logps_train/chosen": -28.20357894897461, + "logps_train/ref_chosen": -19.5, + "logps_train/ref_rejected": -21.25, + "logps_train/rejected": -37.50383758544922, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8654752969741821, + "rewards_train/margins": 0.7658655643463135, + "rewards_train/rejected": -1.6313408613204956, + "step": 1447 + }, + { + "epoch": 0.4, + "learning_rate": 2.3608170066876298e-07, + "loss": 0.4243, + "step": 1448 + }, + { + "epoch": 0.4, + "logps_train/chosen": -54.646671295166016, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -58.266937255859375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8347843289375305, + "rewards_train/margins": 1.413784682750702, + "rewards_train/rejected": -2.2485690116882324, + "step": 1448 + }, + { + "epoch": 0.4, + "logps_train/chosen": -87.08670043945312, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -111.1590576171875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8739047050476074, + "rewards_train/margins": 1.529500961303711, + "rewards_train/rejected": -3.4034056663513184, + "step": 1449 + }, + { + "epoch": 0.41, + "learning_rate": 2.3546379277238103e-07, + "loss": 0.376, + "step": 1450 + }, + { + "epoch": 0.41, + "logps_train/chosen": -53.08202362060547, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -78.91864776611328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7962394952774048, + "rewards_train/margins": 0.821308970451355, + "rewards_train/rejected": -1.6175484657287598, + "step": 1450 + }, + { + "epoch": 0.41, + "logps_train/chosen": -61.199378967285156, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -92.09947204589844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6254065036773682, + "rewards_train/margins": 1.6849310398101807, + "rewards_train/rejected": -2.310337543487549, + "step": 1451 + }, + { + "epoch": 0.41, + "learning_rate": 2.348459739656763e-07, + "loss": 0.4027, + "step": 1452 + }, + { + "epoch": 0.41, + "logps_train/chosen": -49.50818634033203, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -36.5, + "logps_train/rejected": -53.81367492675781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.24110230803489685, + "rewards_train/margins": 1.484796792268753, + "rewards_train/rejected": -1.72589910030365, + "step": 1452 + }, + { + "epoch": 0.41, + "logps_train/chosen": -32.541934967041016, + "logps_train/ref_chosen": -31.75, + "logps_train/ref_rejected": -25.0, + "logps_train/rejected": -34.23435974121094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.08534583449363708, + "rewards_train/margins": 0.8364298045635223, + "rewards_train/rejected": -0.9217756390571594, + "step": 1453 + }, + { + "epoch": 0.41, + "learning_rate": 2.3422824803514382e-07, + "loss": 0.3802, + "step": 1454 + }, + { + "epoch": 0.41, + "logps_train/chosen": -55.98424530029297, + "logps_train/ref_chosen": -41.75, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -72.5770263671875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.4207881689071655, + "rewards_train/margins": 0.8268557786941528, + "rewards_train/rejected": -2.2476439476013184, + "step": 1454 + }, + { + "epoch": 0.41, + "logps_train/chosen": -131.9940948486328, + "logps_train/ref_chosen": -95.0, + "logps_train/ref_rejected": -118.5, + "logps_train/rejected": -163.99038696289062, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -3.7119102478027344, + "rewards_train/margins": 0.823847770690918, + "rewards_train/rejected": -4.535758018493652, + "step": 1455 + }, + { + "epoch": 0.41, + "learning_rate": 2.3361061876670945e-07, + "loss": 0.6821, + "step": 1456 + }, + { + "epoch": 0.41, + "logps_train/chosen": -90.28365325927734, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -77.54593658447266, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9314900636672974, + "rewards_train/margins": 0.515583872795105, + "rewards_train/rejected": -2.4470739364624023, + "step": 1456 + }, + { + "epoch": 0.41, + "logps_train/chosen": -52.99207305908203, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -67.475341796875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7374885082244873, + "rewards_train/margins": 1.3489134311676025, + "rewards_train/rejected": -2.08640193939209, + "step": 1457 + }, + { + "epoch": 0.41, + "learning_rate": 2.3299308994570687e-07, + "loss": 0.4125, + "step": 1458 + }, + { + "epoch": 0.41, + "logps_train/chosen": -50.42257308959961, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -43.75, + "logps_train/rejected": -58.40668869018555, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3668668866157532, + "rewards_train/margins": 1.0919663310050964, + "rewards_train/rejected": -1.4588332176208496, + "step": 1458 + }, + { + "epoch": 0.41, + "logps_train/chosen": -54.10042190551758, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -77.4073257446289, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.4252277910709381, + "rewards_train/margins": 1.105690211057663, + "rewards_train/rejected": -1.530918002128601, + "step": 1459 + }, + { + "epoch": 0.41, + "learning_rate": 2.3237566535685375e-07, + "loss": 0.4745, + "step": 1460 + }, + { + "epoch": 0.41, + "logps_train/chosen": -98.09123229980469, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -108.29418182373047, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8040449619293213, + "rewards_train/margins": 1.8378732204437256, + "rewards_train/rejected": -3.641918182373047, + "step": 1460 + }, + { + "epoch": 0.41, + "logps_train/chosen": -79.92696380615234, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -93.69712829589844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1676963567733765, + "rewards_train/margins": 1.066860318183899, + "rewards_train/rejected": -2.2345566749572754, + "step": 1461 + }, + { + "epoch": 0.41, + "learning_rate": 2.3175834878422931e-07, + "loss": 0.357, + "step": 1462 + }, + { + "epoch": 0.41, + "logps_train/chosen": -60.13316345214844, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -44.75, + "logps_train/rejected": -69.96977996826172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1663434505462646, + "rewards_train/margins": 1.3459665775299072, + "rewards_train/rejected": -2.512310028076172, + "step": 1462 + }, + { + "epoch": 0.41, + "logps_train/chosen": -63.537193298339844, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -77.98126220703125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7668053507804871, + "rewards_train/margins": 0.44968026876449585, + "rewards_train/rejected": -1.216485619544983, + "step": 1463 + }, + { + "epoch": 0.41, + "learning_rate": 2.3114114401125054e-07, + "loss": 0.5119, + "step": 1464 + }, + { + "epoch": 0.41, + "logps_train/chosen": -51.874088287353516, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -35.0, + "logps_train/rejected": -54.085289001464844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.2372134923934937, + "rewards_train/margins": 0.6703388690948486, + "rewards_train/rejected": -1.9075523614883423, + "step": 1464 + }, + { + "epoch": 0.41, + "logps_train/chosen": -69.12730407714844, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -73.90693664550781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1927599906921387, + "rewards_train/margins": 1.1556973457336426, + "rewards_train/rejected": -2.3484573364257812, + "step": 1465 + }, + { + "epoch": 0.41, + "learning_rate": 2.3052405482064919e-07, + "loss": 0.5114, + "step": 1466 + }, + { + "epoch": 0.41, + "logps_train/chosen": -44.410728454589844, + "logps_train/ref_chosen": -41.75, + "logps_train/ref_rejected": -38.25, + "logps_train/rejected": -51.31831359863281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.25765013694763184, + "rewards_train/margins": 1.0583362579345703, + "rewards_train/rejected": -1.3159863948822021, + "step": 1466 + }, + { + "epoch": 0.41, + "logps_train/chosen": -95.55567932128906, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -132.2721405029297, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.4610371589660645, + "rewards_train/margins": 1.3130521774291992, + "rewards_train/rejected": -2.7740893363952637, + "step": 1467 + }, + { + "epoch": 0.41, + "learning_rate": 2.2990708499444885e-07, + "loss": 0.5435, + "step": 1468 + }, + { + "epoch": 0.41, + "logps_train/chosen": -50.43743133544922, + "logps_train/ref_chosen": -41.25, + "logps_train/ref_rejected": -29.75, + "logps_train/rejected": -45.369049072265625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.919084906578064, + "rewards_train/margins": 0.6369608640670776, + "rewards_train/rejected": -1.5560457706451416, + "step": 1468 + }, + { + "epoch": 0.41, + "logps_train/chosen": -61.037750244140625, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -62.6798095703125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.742446780204773, + "rewards_train/margins": 0.9290498495101929, + "rewards_train/rejected": -1.6714966297149658, + "step": 1469 + }, + { + "epoch": 0.41, + "learning_rate": 2.2929023831394133e-07, + "loss": 0.4351, + "step": 1470 + }, + { + "epoch": 0.41, + "logps_train/chosen": -148.80490112304688, + "logps_train/ref_chosen": -123.5, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -158.92843627929688, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.543478012084961, + "rewards_train/margins": 1.7431154251098633, + "rewards_train/rejected": -4.286593437194824, + "step": 1470 + }, + { + "epoch": 0.41, + "logps_train/chosen": -83.69702911376953, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -116.85832214355469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0747811794281006, + "rewards_train/margins": 2.4360508918762207, + "rewards_train/rejected": -3.5108320713043213, + "step": 1471 + }, + { + "epoch": 0.41, + "learning_rate": 2.2867351855966384e-07, + "loss": 0.3182, + "step": 1472 + }, + { + "epoch": 0.41, + "logps_train/chosen": -38.83192443847656, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -45.75, + "logps_train/rejected": -55.64658737182617, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.32225489616394043, + "rewards_train/margins": 0.6670131087303162, + "rewards_train/rejected": -0.9892680048942566, + "step": 1472 + }, + { + "epoch": 0.41, + "logps_train/chosen": -58.36408615112305, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -78.29402923583984, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8770335912704468, + "rewards_train/margins": 1.590455174446106, + "rewards_train/rejected": -2.4674887657165527, + "step": 1473 + }, + { + "epoch": 0.41, + "learning_rate": 2.2805692951137557e-07, + "loss": 0.4299, + "step": 1474 + }, + { + "epoch": 0.41, + "logps_train/chosen": -61.07868957519531, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -61.661033630371094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8830645680427551, + "rewards_train/margins": 0.9844058156013489, + "rewards_train/rejected": -1.867470383644104, + "step": 1474 + }, + { + "epoch": 0.41, + "logps_train/chosen": -120.68931579589844, + "logps_train/ref_chosen": -94.5, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -131.37936401367188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.6204943656921387, + "rewards_train/margins": 0.9018170833587646, + "rewards_train/rejected": -3.5223114490509033, + "step": 1475 + }, + { + "epoch": 0.41, + "learning_rate": 2.274404749480348e-07, + "loss": 0.3896, + "step": 1476 + }, + { + "epoch": 0.41, + "logps_train/chosen": -80.03251647949219, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -87.73963928222656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6452434062957764, + "rewards_train/margins": 0.6898534297943115, + "rewards_train/rejected": -2.335096836090088, + "step": 1476 + }, + { + "epoch": 0.41, + "logps_train/chosen": -86.63934326171875, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -94.40544128417969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8170599937438965, + "rewards_train/margins": 1.0135233402252197, + "rewards_train/rejected": -2.830583333969116, + "step": 1477 + }, + { + "epoch": 0.41, + "learning_rate": 2.2682415864777547e-07, + "loss": 0.4884, + "step": 1478 + }, + { + "epoch": 0.41, + "logps_train/chosen": -74.99855041503906, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -95.32831573486328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.817237913608551, + "rewards_train/margins": 1.817546784877777, + "rewards_train/rejected": -2.634784698486328, + "step": 1478 + }, + { + "epoch": 0.41, + "logps_train/chosen": -67.54692840576172, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -90.11866760253906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7957087755203247, + "rewards_train/margins": 1.9872523546218872, + "rewards_train/rejected": -2.782961130142212, + "step": 1479 + }, + { + "epoch": 0.41, + "learning_rate": 2.26207984387884e-07, + "loss": 0.2718, + "step": 1480 + }, + { + "epoch": 0.41, + "logps_train/chosen": -86.44613647460938, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -113.71174621582031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1243016719818115, + "rewards_train/margins": 2.810936212539673, + "rewards_train/rejected": -3.9352378845214844, + "step": 1480 + }, + { + "epoch": 0.41, + "logps_train/chosen": -61.34617614746094, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -71.50247955322266, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0271226167678833, + "rewards_train/margins": 1.1606501340866089, + "rewards_train/rejected": -2.187772750854492, + "step": 1481 + }, + { + "epoch": 0.41, + "learning_rate": 2.2559195594477657e-07, + "loss": 0.3558, + "step": 1482 + }, + { + "epoch": 0.41, + "logps_train/chosen": -62.66151428222656, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -109.41070556640625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.189589023590088, + "rewards_train/margins": 2.595231533050537, + "rewards_train/rejected": -3.784820556640625, + "step": 1482 + }, + { + "epoch": 0.41, + "logps_train/chosen": -43.43029022216797, + "logps_train/ref_chosen": -37.75, + "logps_train/ref_rejected": -39.25, + "logps_train/rejected": -60.65336608886719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5588005185127258, + "rewards_train/margins": 1.5702082514762878, + "rewards_train/rejected": -2.1290087699890137, + "step": 1483 + }, + { + "epoch": 0.41, + "learning_rate": 2.249760770939754e-07, + "loss": 0.3329, + "step": 1484 + }, + { + "epoch": 0.41, + "logps_train/chosen": -63.7703742980957, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -79.15364074707031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5767444372177124, + "rewards_train/margins": 1.5843230485916138, + "rewards_train/rejected": -2.161067485809326, + "step": 1484 + }, + { + "epoch": 0.42, + "logps_train/chosen": -54.031463623046875, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -39.25, + "logps_train/rejected": -65.74440002441406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.241231918334961, + "rewards_train/margins": 1.3985400199890137, + "rewards_train/rejected": -2.6397719383239746, + "step": 1485 + }, + { + "epoch": 0.42, + "learning_rate": 2.2436035161008616e-07, + "loss": 0.3665, + "step": 1486 + }, + { + "epoch": 0.42, + "logps_train/chosen": -78.16743469238281, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -93.56329345703125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.2112748622894287, + "rewards_train/margins": 1.2382426261901855, + "rewards_train/rejected": -3.4495174884796143, + "step": 1486 + }, + { + "epoch": 0.42, + "logps_train/chosen": -90.05267333984375, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -123.19520568847656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7634705305099487, + "rewards_train/margins": 1.7564400434494019, + "rewards_train/rejected": -3.5199105739593506, + "step": 1487 + }, + { + "epoch": 0.42, + "learning_rate": 2.237447832667742e-07, + "loss": 0.3788, + "step": 1488 + }, + { + "epoch": 0.42, + "logps_train/chosen": -48.116065979003906, + "logps_train/ref_chosen": -36.5, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -90.1641616821289, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.165952205657959, + "rewards_train/margins": 1.8254642486572266, + "rewards_train/rejected": -2.9914164543151855, + "step": 1488 + }, + { + "epoch": 0.42, + "logps_train/chosen": -39.31227111816406, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -60.464569091796875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.38650065660476685, + "rewards_train/margins": 1.913423240184784, + "rewards_train/rejected": -2.299923896789551, + "step": 1489 + }, + { + "epoch": 0.42, + "learning_rate": 2.2312937583674218e-07, + "loss": 0.4177, + "step": 1490 + }, + { + "epoch": 0.42, + "logps_train/chosen": -43.495296478271484, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -54.47026062011719, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.7647639513015747, + "rewards_train/margins": 0.5806996822357178, + "rewards_train/rejected": -1.3454636335372925, + "step": 1490 + }, + { + "epoch": 0.42, + "logps_train/chosen": -34.33486557006836, + "logps_train/ref_chosen": -28.875, + "logps_train/ref_rejected": -32.25, + "logps_train/rejected": -41.52058410644531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.543252170085907, + "rewards_train/margins": 0.39259546995162964, + "rewards_train/rejected": -0.9358476400375366, + "step": 1491 + }, + { + "epoch": 0.42, + "learning_rate": 2.225141330917063e-07, + "loss": 0.5265, + "step": 1492 + }, + { + "epoch": 0.42, + "logps_train/chosen": -60.08317565917969, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -62.24585723876953, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1428883075714111, + "rewards_train/margins": 0.38697075843811035, + "rewards_train/rejected": -1.5298590660095215, + "step": 1492 + }, + { + "epoch": 0.42, + "logps_train/chosen": -33.77716064453125, + "logps_train/ref_chosen": -27.125, + "logps_train/ref_rejected": -35.0, + "logps_train/rejected": -50.05274200439453, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6660950779914856, + "rewards_train/margins": 0.8389837145805359, + "rewards_train/rejected": -1.5050787925720215, + "step": 1493 + }, + { + "epoch": 0.42, + "learning_rate": 2.2189905880237342e-07, + "loss": 0.5073, + "step": 1494 + }, + { + "epoch": 0.42, + "logps_train/chosen": -60.791893005371094, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -83.19351196289062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.502333641052246, + "rewards_train/margins": 1.4160408973693848, + "rewards_train/rejected": -2.918374538421631, + "step": 1494 + }, + { + "epoch": 0.42, + "logps_train/chosen": -60.85224914550781, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -56.961856842041016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2508498430252075, + "rewards_train/margins": 0.4314686059951782, + "rewards_train/rejected": -1.6823184490203857, + "step": 1495 + }, + { + "epoch": 0.42, + "learning_rate": 2.2128415673841822e-07, + "loss": 0.5841, + "step": 1496 + }, + { + "epoch": 0.42, + "logps_train/chosen": -78.28062438964844, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -103.302490234375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.1964218616485596, + "rewards_train/margins": 1.0543348789215088, + "rewards_train/rejected": -3.2507567405700684, + "step": 1496 + }, + { + "epoch": 0.42, + "logps_train/chosen": -48.63485336303711, + "logps_train/ref_chosen": -41.75, + "logps_train/ref_rejected": -52.75, + "logps_train/rejected": -80.08786010742188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6955165266990662, + "rewards_train/margins": 2.0489140152931213, + "rewards_train/rejected": -2.7444305419921875, + "step": 1497 + }, + { + "epoch": 0.42, + "learning_rate": 2.2066943066845948e-07, + "loss": 0.4681, + "step": 1498 + }, + { + "epoch": 0.42, + "logps_train/chosen": -89.67605590820312, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -102.9144515991211, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.70627760887146, + "rewards_train/margins": 1.2285270690917969, + "rewards_train/rejected": -2.934804677963257, + "step": 1498 + }, + { + "epoch": 0.42, + "logps_train/chosen": -71.27426147460938, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -98.13237762451172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.699691891670227, + "rewards_train/margins": 0.8092492818832397, + "rewards_train/rejected": -2.508941173553467, + "step": 1499 + }, + { + "epoch": 0.42, + "learning_rate": 2.2005488436003768e-07, + "loss": 0.4944, + "step": 1500 + }, + { + "epoch": 0.42, + "logps_train/chosen": -44.52510070800781, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -50.1291389465332, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8290092349052429, + "rewards_train/margins": 0.6119661927223206, + "rewards_train/rejected": -1.4409754276275635, + "step": 1500 + }, + { + "epoch": 0.42, + "logps_train/chosen": -53.74895477294922, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -62.9795036315918, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.3704276084899902, + "rewards_train/margins": 0.5708823204040527, + "rewards_train/rejected": -1.941309928894043, + "step": 1501 + }, + { + "epoch": 0.42, + "learning_rate": 2.1944052157959142e-07, + "loss": 0.552, + "step": 1502 + }, + { + "epoch": 0.42, + "logps_train/chosen": -74.89353942871094, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -93.01203918457031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.33759605884552, + "rewards_train/margins": 1.4645837545394897, + "rewards_train/rejected": -2.8021798133850098, + "step": 1502 + }, + { + "epoch": 0.42, + "logps_train/chosen": -37.61762237548828, + "logps_train/ref_chosen": -31.375, + "logps_train/ref_rejected": -44.75, + "logps_train/rejected": -70.26980590820312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6247933506965637, + "rewards_train/margins": 1.9302577376365662, + "rewards_train/rejected": -2.55505108833313, + "step": 1503 + }, + { + "epoch": 0.42, + "learning_rate": 2.188263460924344e-07, + "loss": 0.3061, + "step": 1504 + }, + { + "epoch": 0.42, + "logps_train/chosen": -86.88961791992188, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -106.73487854003906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.139157295227051, + "rewards_train/margins": 1.1720263957977295, + "rewards_train/rejected": -3.3111836910247803, + "step": 1504 + }, + { + "epoch": 0.42, + "logps_train/chosen": -88.45167541503906, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -123.12904357910156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8604015707969666, + "rewards_train/margins": 1.408558189868927, + "rewards_train/rejected": -2.2689597606658936, + "step": 1505 + }, + { + "epoch": 0.42, + "learning_rate": 2.1821236166273267e-07, + "loss": 0.4179, + "step": 1506 + }, + { + "epoch": 0.42, + "logps_train/chosen": -42.475379943847656, + "logps_train/ref_chosen": -32.5, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -55.643890380859375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9967566132545471, + "rewards_train/margins": 0.8287650942802429, + "rewards_train/rejected": -1.82552170753479, + "step": 1506 + }, + { + "epoch": 0.42, + "logps_train/chosen": -69.06477355957031, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -84.2336654663086, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7586259841918945, + "rewards_train/margins": 1.5608341693878174, + "rewards_train/rejected": -2.319460153579712, + "step": 1507 + }, + { + "epoch": 0.42, + "learning_rate": 2.1759857205348108e-07, + "loss": 0.3528, + "step": 1508 + }, + { + "epoch": 0.42, + "logps_train/chosen": -109.0347900390625, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -130.70358276367188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.129260540008545, + "rewards_train/margins": 1.237973690032959, + "rewards_train/rejected": -3.367234230041504, + "step": 1508 + }, + { + "epoch": 0.42, + "logps_train/chosen": -87.37171173095703, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -97.84712982177734, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.0922493934631348, + "rewards_train/margins": 1.633967399597168, + "rewards_train/rejected": -3.7262167930603027, + "step": 1509 + }, + { + "epoch": 0.42, + "learning_rate": 2.169849810264807e-07, + "loss": 0.4274, + "step": 1510 + }, + { + "epoch": 0.42, + "logps_train/chosen": -128.63365173339844, + "logps_train/ref_chosen": -110.5, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -131.63816833496094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8315294981002808, + "rewards_train/margins": 1.2322875261306763, + "rewards_train/rejected": -3.063817024230957, + "step": 1510 + }, + { + "epoch": 0.42, + "logps_train/chosen": -46.5494384765625, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -80.72059631347656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6025997996330261, + "rewards_train/margins": 1.8684833645820618, + "rewards_train/rejected": -2.471083164215088, + "step": 1511 + }, + { + "epoch": 0.42, + "learning_rate": 2.163715923423153e-07, + "loss": 0.3519, + "step": 1512 + }, + { + "epoch": 0.42, + "logps_train/chosen": -39.29479217529297, + "logps_train/ref_chosen": -33.75, + "logps_train/ref_rejected": -33.0, + "logps_train/rejected": -41.62604904174805, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5459343791007996, + "rewards_train/margins": 0.31251996755599976, + "rewards_train/rejected": -0.8584543466567993, + "step": 1512 + }, + { + "epoch": 0.42, + "logps_train/chosen": -68.48253631591797, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -79.24789428710938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6947380304336548, + "rewards_train/margins": 0.9049538373947144, + "rewards_train/rejected": -2.599691867828369, + "step": 1513 + }, + { + "epoch": 0.42, + "learning_rate": 2.1575840976032866e-07, + "loss": 0.5562, + "step": 1514 + }, + { + "epoch": 0.42, + "logps_train/chosen": -129.51622009277344, + "logps_train/ref_chosen": -107.5, + "logps_train/ref_rejected": -105.5, + "logps_train/rejected": -159.49038696289062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.2031846046447754, + "rewards_train/margins": 3.1849169731140137, + "rewards_train/rejected": -5.388101577758789, + "step": 1514 + }, + { + "epoch": 0.42, + "logps_train/chosen": -58.58176040649414, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -80.47071838378906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8816139698028564, + "rewards_train/margins": 1.0724897384643555, + "rewards_train/rejected": -1.954103708267212, + "step": 1515 + }, + { + "epoch": 0.42, + "learning_rate": 2.1514543703860144e-07, + "loss": 0.4387, + "step": 1516 + }, + { + "epoch": 0.42, + "logps_train/chosen": -112.3211669921875, + "logps_train/ref_chosen": -98.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -117.17977905273438, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.417858362197876, + "rewards_train/margins": 1.5387914180755615, + "rewards_train/rejected": -2.9566497802734375, + "step": 1516 + }, + { + "epoch": 0.42, + "logps_train/chosen": -76.02619934082031, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -118.46748352050781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1244951486587524, + "rewards_train/margins": 1.9710818529129028, + "rewards_train/rejected": -3.0955770015716553, + "step": 1517 + }, + { + "epoch": 0.42, + "learning_rate": 2.145326779339279e-07, + "loss": 0.2708, + "step": 1518 + }, + { + "epoch": 0.42, + "logps_train/chosen": -67.40851593017578, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -108.60820007324219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9724920988082886, + "rewards_train/margins": 2.044577717781067, + "rewards_train/rejected": -3.0170698165893555, + "step": 1518 + }, + { + "epoch": 0.42, + "logps_train/chosen": -45.17582702636719, + "logps_train/ref_chosen": -31.0, + "logps_train/ref_rejected": -34.75, + "logps_train/rejected": -52.9072265625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.4156420230865479, + "rewards_train/margins": 0.3946244716644287, + "rewards_train/rejected": -1.8102664947509766, + "step": 1519 + }, + { + "epoch": 0.42, + "learning_rate": 2.1392013620179336e-07, + "loss": 0.4075, + "step": 1520 + }, + { + "epoch": 0.42, + "logps_train/chosen": -79.74624633789062, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -100.21214294433594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7996245622634888, + "rewards_train/margins": 1.8547929525375366, + "rewards_train/rejected": -2.6544175148010254, + "step": 1520 + }, + { + "epoch": 0.43, + "logps_train/chosen": -117.86734008789062, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -99.5, + "logps_train/rejected": -138.39466857910156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.948812961578369, + "rewards_train/margins": 0.9601542949676514, + "rewards_train/rejected": -3.9089672565460205, + "step": 1521 + }, + { + "epoch": 0.43, + "learning_rate": 2.1330781559635065e-07, + "loss": 0.3667, + "step": 1522 + }, + { + "epoch": 0.43, + "logps_train/chosen": -80.56924438476562, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -91.5592041015625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6569247245788574, + "rewards_train/margins": 1.0097377300262451, + "rewards_train/rejected": -1.6666624546051025, + "step": 1522 + }, + { + "epoch": 0.43, + "logps_train/chosen": -64.69086456298828, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -94.71272277832031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8429144620895386, + "rewards_train/margins": 2.170544743537903, + "rewards_train/rejected": -3.0134592056274414, + "step": 1523 + }, + { + "epoch": 0.43, + "learning_rate": 2.1269571987039756e-07, + "loss": 0.3577, + "step": 1524 + }, + { + "epoch": 0.43, + "logps_train/chosen": -82.77047729492188, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -105.81295013427734, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2954072952270508, + "rewards_train/margins": 1.6944327354431152, + "rewards_train/rejected": -2.989840030670166, + "step": 1524 + }, + { + "epoch": 0.43, + "logps_train/chosen": -52.450042724609375, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -45.75, + "logps_train/rejected": -66.30683898925781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0827972888946533, + "rewards_train/margins": 0.9615585803985596, + "rewards_train/rejected": -2.044355869293213, + "step": 1525 + }, + { + "epoch": 0.43, + "learning_rate": 2.120838527753535e-07, + "loss": 0.355, + "step": 1526 + }, + { + "epoch": 0.43, + "logps_train/chosen": -61.018638610839844, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -98.53857421875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8206138014793396, + "rewards_train/margins": 1.8656659722328186, + "rewards_train/rejected": -2.686279773712158, + "step": 1526 + }, + { + "epoch": 0.43, + "logps_train/chosen": -36.256561279296875, + "logps_train/ref_chosen": -29.875, + "logps_train/ref_rejected": -40.75, + "logps_train/rejected": -60.390262603759766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.63527512550354, + "rewards_train/margins": 1.3281654119491577, + "rewards_train/rejected": -1.9634405374526978, + "step": 1527 + }, + { + "epoch": 0.43, + "learning_rate": 2.1147221806123667e-07, + "loss": 0.4442, + "step": 1528 + }, + { + "epoch": 0.43, + "logps_train/chosen": -68.06429290771484, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -96.5125961303711, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.286897897720337, + "rewards_train/margins": 1.5995185375213623, + "rewards_train/rejected": -2.886416435241699, + "step": 1528 + }, + { + "epoch": 0.43, + "logps_train/chosen": -74.76358032226562, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -54.25, + "logps_train/rejected": -72.84532928466797, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8099517226219177, + "rewards_train/margins": 1.0472373366355896, + "rewards_train/rejected": -1.8571890592575073, + "step": 1529 + }, + { + "epoch": 0.43, + "learning_rate": 2.1086081947664114e-07, + "loss": 0.3596, + "step": 1530 + }, + { + "epoch": 0.43, + "logps_train/chosen": -70.0419921875, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -97.6251449584961, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3801755905151367, + "rewards_train/margins": 1.469740867614746, + "rewards_train/rejected": -2.849916458129883, + "step": 1530 + }, + { + "epoch": 0.43, + "logps_train/chosen": -120.617919921875, + "logps_train/ref_chosen": -92.0, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -125.33355712890625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.850855588912964, + "rewards_train/margins": 0.5739061832427979, + "rewards_train/rejected": -3.4247617721557617, + "step": 1531 + }, + { + "epoch": 0.43, + "learning_rate": 2.102496607687136e-07, + "loss": 0.4903, + "step": 1532 + }, + { + "epoch": 0.43, + "logps_train/chosen": -79.79019165039062, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -82.10564422607422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3633946180343628, + "rewards_train/margins": 1.6943382024765015, + "rewards_train/rejected": -3.0577328205108643, + "step": 1532 + }, + { + "epoch": 0.43, + "logps_train/chosen": -69.94976043701172, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -107.3936767578125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9145069718360901, + "rewards_train/margins": 1.904352605342865, + "rewards_train/rejected": -2.818859577178955, + "step": 1533 + }, + { + "epoch": 0.43, + "learning_rate": 2.0963874568313087e-07, + "loss": 0.3106, + "step": 1534 + }, + { + "epoch": 0.43, + "logps_train/chosen": -82.80169677734375, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -109.240234375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0413024425506592, + "rewards_train/margins": 2.1533265113830566, + "rewards_train/rejected": -3.194628953933716, + "step": 1534 + }, + { + "epoch": 0.43, + "logps_train/chosen": -83.65385437011719, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -89.20149230957031, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.33724844455719, + "rewards_train/margins": 0.7130402326583862, + "rewards_train/rejected": -2.050288677215576, + "step": 1535 + }, + { + "epoch": 0.43, + "learning_rate": 2.0902807796407628e-07, + "loss": 0.4139, + "step": 1536 + }, + { + "epoch": 0.43, + "logps_train/chosen": -93.53363037109375, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -126.63748168945312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.31449556350708, + "rewards_train/margins": 2.2656588554382324, + "rewards_train/rejected": -4.5801544189453125, + "step": 1536 + }, + { + "epoch": 0.43, + "logps_train/chosen": -62.9566764831543, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -78.16910552978516, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6032848358154297, + "rewards_train/margins": 1.0370631217956543, + "rewards_train/rejected": -1.640347957611084, + "step": 1537 + }, + { + "epoch": 0.43, + "learning_rate": 2.0841766135421747e-07, + "loss": 0.3557, + "step": 1538 + }, + { + "epoch": 0.43, + "logps_train/chosen": -62.165611267089844, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -71.07669830322266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.111288070678711, + "rewards_train/margins": 1.066694736480713, + "rewards_train/rejected": -2.177982807159424, + "step": 1538 + }, + { + "epoch": 0.43, + "logps_train/chosen": -175.44473266601562, + "logps_train/ref_chosen": -130.0, + "logps_train/ref_rejected": -132.0, + "logps_train/rejected": -195.6644287109375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -4.542909145355225, + "rewards_train/margins": 1.7797837257385254, + "rewards_train/rejected": -6.32269287109375, + "step": 1539 + }, + { + "epoch": 0.43, + "learning_rate": 2.0780749959468287e-07, + "loss": 0.5776, + "step": 1540 + }, + { + "epoch": 0.43, + "logps_train/chosen": -62.14244079589844, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -86.73136901855469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7204940915107727, + "rewards_train/margins": 1.8340887427330017, + "rewards_train/rejected": -2.5545828342437744, + "step": 1540 + }, + { + "epoch": 0.43, + "logps_train/chosen": -38.095069885253906, + "logps_train/ref_chosen": -33.0, + "logps_train/ref_rejected": -31.0, + "logps_train/rejected": -48.69858169555664, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5007179379463196, + "rewards_train/margins": 1.2690913081169128, + "rewards_train/rejected": -1.7698092460632324, + "step": 1541 + }, + { + "epoch": 0.43, + "learning_rate": 2.071975964250389e-07, + "loss": 0.3767, + "step": 1542 + }, + { + "epoch": 0.43, + "logps_train/chosen": -123.3168716430664, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -106.5, + "logps_train/rejected": -157.4512481689453, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -3.049264907836914, + "rewards_train/margins": 2.0337510108947754, + "rewards_train/rejected": -5.0830159187316895, + "step": 1542 + }, + { + "epoch": 0.43, + "logps_train/chosen": -51.47052764892578, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -62.78307342529297, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0050606727600098, + "rewards_train/margins": 1.019536018371582, + "rewards_train/rejected": -2.024596691131592, + "step": 1543 + }, + { + "epoch": 0.43, + "learning_rate": 2.065879555832674e-07, + "loss": 0.3898, + "step": 1544 + }, + { + "epoch": 0.43, + "logps_train/chosen": -117.35748291015625, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -91.5, + "logps_train/rejected": -132.3830108642578, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.7892637252807617, + "rewards_train/margins": 1.3103656768798828, + "rewards_train/rejected": -4.0996294021606445, + "step": 1544 + }, + { + "epoch": 0.43, + "logps_train/chosen": -71.69520568847656, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -65.30738067626953, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.546473503112793, + "rewards_train/margins": 0.48553431034088135, + "rewards_train/rejected": -1.0320078134536743, + "step": 1545 + }, + { + "epoch": 0.43, + "learning_rate": 2.0597858080574221e-07, + "loss": 0.6783, + "step": 1546 + }, + { + "epoch": 0.43, + "logps_train/chosen": -94.24891662597656, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -98.80707550048828, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.5139542818069458, + "rewards_train/margins": 0.9077686071395874, + "rewards_train/rejected": -2.421722888946533, + "step": 1546 + }, + { + "epoch": 0.43, + "logps_train/chosen": -78.47566986083984, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -100.68916320800781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3827235698699951, + "rewards_train/margins": 2.2131459712982178, + "rewards_train/rejected": -3.595869541168213, + "step": 1547 + }, + { + "epoch": 0.43, + "learning_rate": 2.0536947582720668e-07, + "loss": 0.4052, + "step": 1548 + }, + { + "epoch": 0.43, + "logps_train/chosen": -88.38264465332031, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -120.88613891601562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9195142388343811, + "rewards_train/margins": 2.117536962032318, + "rewards_train/rejected": -3.037051200866699, + "step": 1548 + }, + { + "epoch": 0.43, + "logps_train/chosen": -74.20623779296875, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -79.50244903564453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9571474194526672, + "rewards_train/margins": 1.3395819067955017, + "rewards_train/rejected": -2.296729326248169, + "step": 1549 + }, + { + "epoch": 0.43, + "learning_rate": 2.047606443807505e-07, + "loss": 0.3008, + "step": 1550 + }, + { + "epoch": 0.43, + "logps_train/chosen": -65.69995880126953, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -88.05136108398438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.623120903968811, + "rewards_train/margins": 1.9284995794296265, + "rewards_train/rejected": -2.5516204833984375, + "step": 1550 + }, + { + "epoch": 0.43, + "logps_train/chosen": -67.62134552001953, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -92.56690979003906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0144782066345215, + "rewards_train/margins": 1.018775463104248, + "rewards_train/rejected": -2.0332536697387695, + "step": 1551 + }, + { + "epoch": 0.43, + "learning_rate": 2.0415209019778693e-07, + "loss": 0.3619, + "step": 1552 + }, + { + "epoch": 0.43, + "logps_train/chosen": -43.14768981933594, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -58.05607604980469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3968001902103424, + "rewards_train/margins": 1.0730652511119843, + "rewards_train/rejected": -1.4698654413223267, + "step": 1552 + }, + { + "epoch": 0.43, + "logps_train/chosen": -82.86640930175781, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -89.47486877441406, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.404609203338623, + "rewards_train/margins": 1.6585025787353516, + "rewards_train/rejected": -3.0631117820739746, + "step": 1553 + }, + { + "epoch": 0.43, + "learning_rate": 2.0354381700803002e-07, + "loss": 0.5328, + "step": 1554 + }, + { + "epoch": 0.43, + "logps_train/chosen": -75.02198791503906, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -94.145263671875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2661638259887695, + "rewards_train/margins": 1.4509997367858887, + "rewards_train/rejected": -2.717163562774658, + "step": 1554 + }, + { + "epoch": 0.43, + "logps_train/chosen": -95.974609375, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -129.45941162109375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.234570264816284, + "rewards_train/margins": 1.7551214694976807, + "rewards_train/rejected": -3.989691734313965, + "step": 1555 + }, + { + "epoch": 0.43, + "learning_rate": 2.029358285394716e-07, + "loss": 0.4075, + "step": 1556 + }, + { + "epoch": 0.43, + "logps_train/chosen": -92.38813018798828, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -132.50436401367188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9878367185592651, + "rewards_train/margins": 1.9760764837265015, + "rewards_train/rejected": -3.9639132022857666, + "step": 1556 + }, + { + "epoch": 0.44, + "logps_train/chosen": -91.25912475585938, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -106.64566040039062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.3690762519836426, + "rewards_train/margins": 0.9525213241577148, + "rewards_train/rejected": -3.3215975761413574, + "step": 1557 + }, + { + "epoch": 0.44, + "learning_rate": 2.0232812851835857e-07, + "loss": 0.3707, + "step": 1558 + }, + { + "epoch": 0.44, + "logps_train/chosen": -81.57861328125, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -88.1158676147461, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0727052688598633, + "rewards_train/margins": 1.0502095222473145, + "rewards_train/rejected": -2.1229147911071777, + "step": 1558 + }, + { + "epoch": 0.44, + "logps_train/chosen": -85.18222045898438, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -98.25654602050781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.636190414428711, + "rewards_train/margins": 0.7580184936523438, + "rewards_train/rejected": -2.3942089080810547, + "step": 1559 + }, + { + "epoch": 0.44, + "learning_rate": 2.0172072066916985e-07, + "loss": 0.4155, + "step": 1560 + }, + { + "epoch": 0.44, + "logps_train/chosen": -91.49235534667969, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -131.5889129638672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9978681802749634, + "rewards_train/margins": 2.384656310081482, + "rewards_train/rejected": -4.382524490356445, + "step": 1560 + }, + { + "epoch": 0.44, + "logps_train/chosen": -37.26866149902344, + "logps_train/ref_chosen": -32.0, + "logps_train/ref_rejected": -32.25, + "logps_train/rejected": -40.53761672973633, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.5304304361343384, + "rewards_train/margins": 0.2887610197067261, + "rewards_train/rejected": -0.8191914558410645, + "step": 1561 + }, + { + "epoch": 0.44, + "learning_rate": 2.0111360871459388e-07, + "loss": 0.39, + "step": 1562 + }, + { + "epoch": 0.44, + "logps_train/chosen": -131.1982879638672, + "logps_train/ref_chosen": -100.5, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -137.15306091308594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -3.0684123039245605, + "rewards_train/margins": 0.8005070686340332, + "rewards_train/rejected": -3.8689193725585938, + "step": 1562 + }, + { + "epoch": 0.44, + "logps_train/chosen": -93.68301391601562, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -107.22701263427734, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8948639631271362, + "rewards_train/margins": 1.4883843660354614, + "rewards_train/rejected": -3.3832483291625977, + "step": 1563 + }, + { + "epoch": 0.44, + "learning_rate": 2.0050679637550544e-07, + "loss": 0.4393, + "step": 1564 + }, + { + "epoch": 0.44, + "logps_train/chosen": -92.06768798828125, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -135.05026245117188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2126283645629883, + "rewards_train/margins": 2.6457176208496094, + "rewards_train/rejected": -4.858345985412598, + "step": 1564 + }, + { + "epoch": 0.44, + "logps_train/chosen": -85.63681030273438, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -105.09564208984375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8238368034362793, + "rewards_train/margins": 1.8904154300689697, + "rewards_train/rejected": -3.714252233505249, + "step": 1565 + }, + { + "epoch": 0.44, + "learning_rate": 1.9990028737094312e-07, + "loss": 0.313, + "step": 1566 + }, + { + "epoch": 0.44, + "logps_train/chosen": -33.341217041015625, + "logps_train/ref_chosen": -25.125, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -63.069393157958984, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8253324627876282, + "rewards_train/margins": 1.378872811794281, + "rewards_train/rejected": -2.204205274581909, + "step": 1566 + }, + { + "epoch": 0.44, + "logps_train/chosen": -29.861522674560547, + "logps_train/ref_chosen": -26.0, + "logps_train/ref_rejected": -33.0, + "logps_train/rejected": -41.419776916503906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.38209956884384155, + "rewards_train/margins": 0.4716942310333252, + "rewards_train/rejected": -0.8537937998771667, + "step": 1567 + }, + { + "epoch": 0.44, + "learning_rate": 1.9929408541808646e-07, + "loss": 0.4046, + "step": 1568 + }, + { + "epoch": 0.44, + "logps_train/chosen": -57.043373107910156, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -59.03089904785156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0348060131072998, + "rewards_train/margins": 0.48078417778015137, + "rewards_train/rejected": -1.5155901908874512, + "step": 1568 + }, + { + "epoch": 0.44, + "logps_train/chosen": -72.8663330078125, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -118.95845794677734, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1471807956695557, + "rewards_train/margins": 1.9658522605895996, + "rewards_train/rejected": -3.1130330562591553, + "step": 1569 + }, + { + "epoch": 0.44, + "learning_rate": 1.9868819423223298e-07, + "loss": 0.4329, + "step": 1570 + }, + { + "epoch": 0.44, + "logps_train/chosen": -77.43839263916016, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -96.0757827758789, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.661221981048584, + "rewards_train/margins": 1.6092469692230225, + "rewards_train/rejected": -2.2704689502716064, + "step": 1570 + }, + { + "epoch": 0.44, + "logps_train/chosen": -51.52234649658203, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -52.100154876708984, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6694222092628479, + "rewards_train/margins": 0.43668705224990845, + "rewards_train/rejected": -1.1061092615127563, + "step": 1571 + }, + { + "epoch": 0.44, + "learning_rate": 1.9808261752677583e-07, + "loss": 0.4694, + "step": 1572 + }, + { + "epoch": 0.44, + "logps_train/chosen": -70.57655334472656, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -79.20134735107422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5283582210540771, + "rewards_train/margins": 0.6954877376556396, + "rewards_train/rejected": -2.223845958709717, + "step": 1572 + }, + { + "epoch": 0.44, + "logps_train/chosen": -71.13848876953125, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -56.75, + "logps_train/rejected": -87.99737548828125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.511504888534546, + "rewards_train/margins": 1.6058104038238525, + "rewards_train/rejected": -3.1173152923583984, + "step": 1573 + }, + { + "epoch": 0.44, + "learning_rate": 1.9747735901318046e-07, + "loss": 0.4525, + "step": 1574 + }, + { + "epoch": 0.44, + "logps_train/chosen": -94.76878356933594, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -122.83468627929688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.165159225463867, + "rewards_train/margins": 1.7991690635681152, + "rewards_train/rejected": -3.9643282890319824, + "step": 1574 + }, + { + "epoch": 0.44, + "logps_train/chosen": -46.97719192504883, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -34.75, + "logps_train/rejected": -49.47740936279297, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8016255497932434, + "rewards_train/margins": 0.6672093272209167, + "rewards_train/rejected": -1.4688348770141602, + "step": 1575 + }, + { + "epoch": 0.44, + "learning_rate": 1.9687242240096246e-07, + "loss": 0.3539, + "step": 1576 + }, + { + "epoch": 0.44, + "logps_train/chosen": -113.40764617919922, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -102.5, + "logps_train/rejected": -149.0386962890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2259206771850586, + "rewards_train/margins": 2.448261260986328, + "rewards_train/rejected": -4.674181938171387, + "step": 1576 + }, + { + "epoch": 0.44, + "logps_train/chosen": -76.69839477539062, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -70.870361328125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8221832513809204, + "rewards_train/margins": 0.1711028814315796, + "rewards_train/rejected": -1.9932861328125, + "step": 1577 + }, + { + "epoch": 0.44, + "learning_rate": 1.9626781139766442e-07, + "loss": 0.5414, + "step": 1578 + }, + { + "epoch": 0.44, + "logps_train/chosen": -97.37671661376953, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -128.31997680664062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5454843044281006, + "rewards_train/margins": 2.8747947216033936, + "rewards_train/rejected": -4.420279026031494, + "step": 1578 + }, + { + "epoch": 0.44, + "logps_train/chosen": -86.0423583984375, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -97.94073486328125, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -2.609997510910034, + "rewards_train/margins": 0.583294153213501, + "rewards_train/rejected": -3.193291664123535, + "step": 1579 + }, + { + "epoch": 0.44, + "learning_rate": 1.956635297088332e-07, + "loss": 0.4043, + "step": 1580 + }, + { + "epoch": 0.44, + "logps_train/chosen": -91.88868713378906, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -114.54054260253906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.056056499481201, + "rewards_train/margins": 2.033935546875, + "rewards_train/rejected": -4.089992046356201, + "step": 1580 + }, + { + "epoch": 0.44, + "logps_train/chosen": -75.75950622558594, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -82.37225341796875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3267312049865723, + "rewards_train/margins": 1.5382280349731445, + "rewards_train/rejected": -2.864959239959717, + "step": 1581 + }, + { + "epoch": 0.44, + "learning_rate": 1.9505958103799768e-07, + "loss": 0.3429, + "step": 1582 + }, + { + "epoch": 0.44, + "logps_train/chosen": -97.96509552001953, + "logps_train/ref_chosen": -84.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -105.13853454589844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3652596473693848, + "rewards_train/margins": 2.265194892883301, + "rewards_train/rejected": -3.6304545402526855, + "step": 1582 + }, + { + "epoch": 0.44, + "logps_train/chosen": -82.00286102294922, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -88.98050689697266, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7584893107414246, + "rewards_train/margins": 0.9598739743232727, + "rewards_train/rejected": -1.7183632850646973, + "step": 1583 + }, + { + "epoch": 0.44, + "learning_rate": 1.9445596908664538e-07, + "loss": 0.4702, + "step": 1584 + }, + { + "epoch": 0.44, + "logps_train/chosen": -55.20734405517578, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -56.561405181884766, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.5101875066757202, + "rewards_train/margins": 0.38892197608947754, + "rewards_train/rejected": -1.8991094827651978, + "step": 1584 + }, + { + "epoch": 0.44, + "logps_train/chosen": -61.61973571777344, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -71.2435302734375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8783797025680542, + "rewards_train/margins": 0.592848539352417, + "rewards_train/rejected": -1.4712282419204712, + "step": 1585 + }, + { + "epoch": 0.44, + "learning_rate": 1.9385269755420044e-07, + "loss": 0.5605, + "step": 1586 + }, + { + "epoch": 0.44, + "logps_train/chosen": -58.24223709106445, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -76.9065933227539, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9369189739227295, + "rewards_train/margins": 1.8613574504852295, + "rewards_train/rejected": -2.798276424407959, + "step": 1586 + }, + { + "epoch": 0.44, + "logps_train/chosen": -74.80534362792969, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -40.0, + "logps_train/rejected": -60.732025146484375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8530930280685425, + "rewards_train/margins": 1.2278248071670532, + "rewards_train/rejected": -2.0809178352355957, + "step": 1587 + }, + { + "epoch": 0.44, + "learning_rate": 1.9324977013800043e-07, + "loss": 0.2951, + "step": 1588 + }, + { + "epoch": 0.44, + "logps_train/chosen": -71.7240219116211, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -63.767738342285156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.393227219581604, + "rewards_train/margins": 0.153273344039917, + "rewards_train/rejected": -1.546500563621521, + "step": 1588 + }, + { + "epoch": 0.44, + "logps_train/chosen": -78.40270233154297, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -94.98567199707031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1555049419403076, + "rewards_train/margins": 1.9446251392364502, + "rewards_train/rejected": -3.100130081176758, + "step": 1589 + }, + { + "epoch": 0.44, + "learning_rate": 1.926471905332739e-07, + "loss": 0.5442, + "step": 1590 + }, + { + "epoch": 0.44, + "logps_train/chosen": -81.09420776367188, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -97.06202697753906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8138642311096191, + "rewards_train/margins": 1.286088466644287, + "rewards_train/rejected": -2.0999526977539062, + "step": 1590 + }, + { + "epoch": 0.44, + "logps_train/chosen": -63.191864013671875, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -93.6617431640625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2117648124694824, + "rewards_train/margins": 2.119253635406494, + "rewards_train/rejected": -3.3310184478759766, + "step": 1591 + }, + { + "epoch": 0.44, + "learning_rate": 1.920449624331179e-07, + "loss": 0.3014, + "step": 1592 + }, + { + "epoch": 0.44, + "logps_train/chosen": -63.863407135009766, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -111.71652221679688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9037234783172607, + "rewards_train/margins": 1.6468348503112793, + "rewards_train/rejected": -2.55055832862854, + "step": 1592 + }, + { + "epoch": 0.45, + "logps_train/chosen": -47.715370178222656, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -72.30594635009766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5560826063156128, + "rewards_train/margins": 1.3600589036941528, + "rewards_train/rejected": -1.9161415100097656, + "step": 1593 + }, + { + "epoch": 0.45, + "learning_rate": 1.9144308952847498e-07, + "loss": 0.3609, + "step": 1594 + }, + { + "epoch": 0.45, + "logps_train/chosen": -72.75309753417969, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -96.64938354492188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3952317237854004, + "rewards_train/margins": 1.6150193214416504, + "rewards_train/rejected": -3.010251045227051, + "step": 1594 + }, + { + "epoch": 0.45, + "logps_train/chosen": -26.114727020263672, + "logps_train/ref_chosen": -21.125, + "logps_train/ref_rejected": -32.75, + "logps_train/rejected": -44.55803680419922, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4950665235519409, + "rewards_train/margins": 0.6759716272354126, + "rewards_train/rejected": -1.1710381507873535, + "step": 1595 + }, + { + "epoch": 0.45, + "learning_rate": 1.9084157550811095e-07, + "loss": 0.4269, + "step": 1596 + }, + { + "epoch": 0.45, + "logps_train/chosen": -57.08452606201172, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -60.630889892578125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0283747911453247, + "rewards_train/margins": 1.4489721059799194, + "rewards_train/rejected": -2.477346897125244, + "step": 1596 + }, + { + "epoch": 0.45, + "logps_train/chosen": -35.66205978393555, + "logps_train/ref_chosen": -29.875, + "logps_train/ref_rejected": -29.875, + "logps_train/rejected": -44.3210563659668, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.581831157207489, + "rewards_train/margins": 0.8682431578636169, + "rewards_train/rejected": -1.450074315071106, + "step": 1597 + }, + { + "epoch": 0.45, + "learning_rate": 1.9024042405859185e-07, + "loss": 0.4342, + "step": 1598 + }, + { + "epoch": 0.45, + "logps_train/chosen": -88.10818481445312, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -120.07489013671875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7823021411895752, + "rewards_train/margins": 2.396085023880005, + "rewards_train/rejected": -4.17838716506958, + "step": 1598 + }, + { + "epoch": 0.45, + "logps_train/chosen": -70.81685638427734, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -126.18684387207031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6357871294021606, + "rewards_train/margins": 3.0518428087234497, + "rewards_train/rejected": -3.6876299381256104, + "step": 1599 + }, + { + "epoch": 0.45, + "learning_rate": 1.8963963886426195e-07, + "loss": 0.185, + "step": 1600 + }, + { + "epoch": 0.45, + "logps_train/chosen": -81.81231689453125, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -108.00234985351562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1275206804275513, + "rewards_train/margins": 2.3703707456588745, + "rewards_train/rejected": -3.497891426086426, + "step": 1600 + }, + { + "epoch": 0.45, + "logps_train/chosen": -109.75550842285156, + "logps_train/ref_chosen": -92.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -117.73065185546875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.7650036811828613, + "rewards_train/margins": 0.33384275436401367, + "rewards_train/rejected": -2.098846435546875, + "step": 1601 + }, + { + "epoch": 0.45, + "learning_rate": 1.890392236072203e-07, + "loss": 0.5113, + "step": 1602 + }, + { + "epoch": 0.45, + "logps_train/chosen": -92.44400024414062, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -103.2388916015625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -3.005727767944336, + "rewards_train/margins": 0.7662084102630615, + "rewards_train/rejected": -3.7719361782073975, + "step": 1602 + }, + { + "epoch": 0.45, + "logps_train/chosen": -95.73787689208984, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -107.28487396240234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.738631248474121, + "rewards_train/margins": 0.9564576148986816, + "rewards_train/rejected": -3.6950888633728027, + "step": 1603 + }, + { + "epoch": 0.45, + "learning_rate": 1.884391819672991e-07, + "loss": 0.7016, + "step": 1604 + }, + { + "epoch": 0.45, + "logps_train/chosen": -36.527244567871094, + "logps_train/ref_chosen": -26.625, + "logps_train/ref_rejected": -23.375, + "logps_train/rejected": -34.958683013916016, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9954978227615356, + "rewards_train/margins": 0.15857350826263428, + "rewards_train/rejected": -1.15407133102417, + "step": 1604 + }, + { + "epoch": 0.45, + "logps_train/chosen": -55.56608200073242, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -46.25, + "logps_train/rejected": -62.14733123779297, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5710611939430237, + "rewards_train/margins": 1.0249220728874207, + "rewards_train/rejected": -1.5959832668304443, + "step": 1605 + }, + { + "epoch": 0.45, + "learning_rate": 1.8783951762204052e-07, + "loss": 0.5639, + "step": 1606 + }, + { + "epoch": 0.45, + "logps_train/chosen": -129.42323303222656, + "logps_train/ref_chosen": -103.0, + "logps_train/ref_rejected": -108.0, + "logps_train/rejected": -142.60104370117188, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -2.6395885944366455, + "rewards_train/margins": 0.8181729316711426, + "rewards_train/rejected": -3.457761526107788, + "step": 1606 + }, + { + "epoch": 0.45, + "logps_train/chosen": -71.31681060791016, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -83.39795684814453, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.2392005920410156, + "rewards_train/margins": 1.2903413772583008, + "rewards_train/rejected": -2.5295419692993164, + "step": 1607 + }, + { + "epoch": 0.45, + "learning_rate": 1.8724023424667458e-07, + "loss": 0.5695, + "step": 1608 + }, + { + "epoch": 0.45, + "logps_train/chosen": -68.46073913574219, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -81.42782592773438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.986014723777771, + "rewards_train/margins": 1.2684859037399292, + "rewards_train/rejected": -2.2545006275177, + "step": 1608 + }, + { + "epoch": 0.45, + "logps_train/chosen": -96.95525360107422, + "logps_train/ref_chosen": -83.5, + "logps_train/ref_rejected": -110.5, + "logps_train/rejected": -159.29974365234375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.351286768913269, + "rewards_train/margins": 3.5308371782302856, + "rewards_train/rejected": -4.882123947143555, + "step": 1609 + }, + { + "epoch": 0.45, + "learning_rate": 1.8664133551409612e-07, + "loss": 0.2235, + "step": 1610 + }, + { + "epoch": 0.45, + "logps_train/chosen": -69.56402587890625, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -72.66041564941406, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5195867419242859, + "rewards_train/margins": 1.407002031803131, + "rewards_train/rejected": -1.926588773727417, + "step": 1610 + }, + { + "epoch": 0.45, + "logps_train/chosen": -56.912479400634766, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -71.60416412353516, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.887536883354187, + "rewards_train/margins": 1.1965123414993286, + "rewards_train/rejected": -2.0840492248535156, + "step": 1611 + }, + { + "epoch": 0.45, + "learning_rate": 1.860428250948427e-07, + "loss": 0.4269, + "step": 1612 + }, + { + "epoch": 0.45, + "logps_train/chosen": -70.08004760742188, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -85.83873748779297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8978487253189087, + "rewards_train/margins": 2.6473532915115356, + "rewards_train/rejected": -3.5452020168304443, + "step": 1612 + }, + { + "epoch": 0.45, + "logps_train/chosen": -71.24488067626953, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -112.40217590332031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.221753716468811, + "rewards_train/margins": 1.962995171546936, + "rewards_train/rejected": -3.184748888015747, + "step": 1613 + }, + { + "epoch": 0.45, + "learning_rate": 1.8544470665707207e-07, + "loss": 0.2423, + "step": 1614 + }, + { + "epoch": 0.45, + "logps_train/chosen": -68.96395111083984, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -85.09397888183594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5140705108642578, + "rewards_train/margins": 1.222280502319336, + "rewards_train/rejected": -2.7363510131835938, + "step": 1614 + }, + { + "epoch": 0.45, + "logps_train/chosen": -89.24520874023438, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -105.38399505615234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6350679397583008, + "rewards_train/margins": 1.7178335189819336, + "rewards_train/rejected": -3.3529014587402344, + "step": 1615 + }, + { + "epoch": 0.45, + "learning_rate": 1.848469838665394e-07, + "loss": 0.3986, + "step": 1616 + }, + { + "epoch": 0.45, + "logps_train/chosen": -95.86478424072266, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -128.3060302734375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.703665852546692, + "rewards_train/margins": 2.2550617456436157, + "rewards_train/rejected": -3.9587275981903076, + "step": 1616 + }, + { + "epoch": 0.45, + "logps_train/chosen": -51.01324462890625, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -46.25, + "logps_train/rejected": -67.15058898925781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9450746178627014, + "rewards_train/margins": 1.134828507900238, + "rewards_train/rejected": -2.0799031257629395, + "step": 1617 + }, + { + "epoch": 0.45, + "learning_rate": 1.8424966038657523e-07, + "loss": 0.2926, + "step": 1618 + }, + { + "epoch": 0.45, + "logps_train/chosen": -65.51244354248047, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -74.79045104980469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1387443542480469, + "rewards_train/margins": 1.0923757553100586, + "rewards_train/rejected": -2.2311201095581055, + "step": 1618 + }, + { + "epoch": 0.45, + "logps_train/chosen": -77.77388000488281, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -112.19791412353516, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.4059033393859863, + "rewards_train/margins": 1.275606632232666, + "rewards_train/rejected": -3.6815099716186523, + "step": 1619 + }, + { + "epoch": 0.45, + "learning_rate": 1.836527398780627e-07, + "loss": 0.3525, + "step": 1620 + }, + { + "epoch": 0.45, + "logps_train/chosen": -77.53721618652344, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -116.349853515625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.230870008468628, + "rewards_train/margins": 2.0150530338287354, + "rewards_train/rejected": -3.2459230422973633, + "step": 1620 + }, + { + "epoch": 0.45, + "logps_train/chosen": -68.93390655517578, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -93.30572509765625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.704816460609436, + "rewards_train/margins": 2.114622950553894, + "rewards_train/rejected": -2.81943941116333, + "step": 1621 + }, + { + "epoch": 0.45, + "learning_rate": 1.8305622599941517e-07, + "loss": 0.3649, + "step": 1622 + }, + { + "epoch": 0.45, + "logps_train/chosen": -33.22088623046875, + "logps_train/ref_chosen": -25.625, + "logps_train/ref_rejected": -29.0, + "logps_train/rejected": -42.651756286621094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7562317848205566, + "rewards_train/margins": 0.6143757104873657, + "rewards_train/rejected": -1.3706074953079224, + "step": 1622 + }, + { + "epoch": 0.45, + "logps_train/chosen": -35.414363861083984, + "logps_train/ref_chosen": -30.125, + "logps_train/ref_rejected": -41.5, + "logps_train/rejected": -51.605384826660156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5338191986083984, + "rewards_train/margins": 0.48638737201690674, + "rewards_train/rejected": -1.0202065706253052, + "step": 1623 + }, + { + "epoch": 0.45, + "learning_rate": 1.8246012240655395e-07, + "loss": 0.5101, + "step": 1624 + }, + { + "epoch": 0.45, + "logps_train/chosen": -64.7273941040039, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -80.7210693359375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.942855954170227, + "rewards_train/margins": 1.150150179862976, + "rewards_train/rejected": -2.093006134033203, + "step": 1624 + }, + { + "epoch": 0.45, + "logps_train/chosen": -93.31907653808594, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -99.22573852539062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.049095630645752, + "rewards_train/margins": 0.7547280788421631, + "rewards_train/rejected": -2.803823709487915, + "step": 1625 + }, + { + "epoch": 0.45, + "learning_rate": 1.818644327528856e-07, + "loss": 0.4991, + "step": 1626 + }, + { + "epoch": 0.45, + "logps_train/chosen": -129.418212890625, + "logps_train/ref_chosen": -107.0, + "logps_train/ref_rejected": -134.0, + "logps_train/rejected": -190.81375122070312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2543210983276367, + "rewards_train/margins": 3.434868335723877, + "rewards_train/rejected": -5.689189434051514, + "step": 1626 + }, + { + "epoch": 0.45, + "logps_train/chosen": -67.3095703125, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -83.19931030273438, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3274415731430054, + "rewards_train/margins": 1.3395591974258423, + "rewards_train/rejected": -2.6670007705688477, + "step": 1627 + }, + { + "epoch": 0.46, + "learning_rate": 1.8126916068927997e-07, + "loss": 0.2331, + "step": 1628 + }, + { + "epoch": 0.46, + "logps_train/chosen": -68.84822082519531, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -94.31878662109375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7074776887893677, + "rewards_train/margins": 1.3550652265548706, + "rewards_train/rejected": -3.0625429153442383, + "step": 1628 + }, + { + "epoch": 0.46, + "logps_train/chosen": -47.78813171386719, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -79.74202728271484, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8522503972053528, + "rewards_train/margins": 2.2678512930870056, + "rewards_train/rejected": -3.1201016902923584, + "step": 1629 + }, + { + "epoch": 0.46, + "learning_rate": 1.8067430986404735e-07, + "loss": 0.33, + "step": 1630 + }, + { + "epoch": 0.46, + "logps_train/chosen": -73.79712677001953, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -93.31788635253906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.421118974685669, + "rewards_train/margins": 1.9993417263031006, + "rewards_train/rejected": -3.4204607009887695, + "step": 1630 + }, + { + "epoch": 0.46, + "logps_train/chosen": -78.96018981933594, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -112.003173828125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.5717514753341675, + "rewards_train/margins": 0.7545429468154907, + "rewards_train/rejected": -2.326294422149658, + "step": 1631 + }, + { + "epoch": 0.46, + "learning_rate": 1.8007988392291662e-07, + "loss": 0.6382, + "step": 1632 + }, + { + "epoch": 0.46, + "logps_train/chosen": -65.02816009521484, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -54.25, + "logps_train/rejected": -87.3437271118164, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7250816226005554, + "rewards_train/margins": 2.588588058948517, + "rewards_train/rejected": -3.3136696815490723, + "step": 1632 + }, + { + "epoch": 0.46, + "logps_train/chosen": -63.71031951904297, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -70.89663696289062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6694695949554443, + "rewards_train/margins": 0.4854285717010498, + "rewards_train/rejected": -2.154898166656494, + "step": 1633 + }, + { + "epoch": 0.46, + "learning_rate": 1.7948588650901225e-07, + "loss": 0.3873, + "step": 1634 + }, + { + "epoch": 0.46, + "logps_train/chosen": -113.53593444824219, + "logps_train/ref_chosen": -84.5, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -134.50201416015625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.909928321838379, + "rewards_train/margins": 1.2794208526611328, + "rewards_train/rejected": -4.189349174499512, + "step": 1634 + }, + { + "epoch": 0.46, + "logps_train/chosen": -76.97315979003906, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -100.32398986816406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1965343952178955, + "rewards_train/margins": 1.6050055027008057, + "rewards_train/rejected": -2.801539897918701, + "step": 1635 + }, + { + "epoch": 0.46, + "learning_rate": 1.7889232126283267e-07, + "loss": 0.3359, + "step": 1636 + }, + { + "epoch": 0.46, + "logps_train/chosen": -79.50006103515625, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -84.783447265625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5851625204086304, + "rewards_train/margins": 1.392400860786438, + "rewards_train/rejected": -2.9775633811950684, + "step": 1636 + }, + { + "epoch": 0.46, + "logps_train/chosen": -69.0367431640625, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -70.38298797607422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6337518692016602, + "rewards_train/margins": 0.7023983001708984, + "rewards_train/rejected": -2.3361501693725586, + "step": 1637 + }, + { + "epoch": 0.46, + "learning_rate": 1.782991918222275e-07, + "loss": 0.508, + "step": 1638 + }, + { + "epoch": 0.46, + "logps_train/chosen": -55.38524627685547, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -74.1991195678711, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0447261333465576, + "rewards_train/margins": 1.190420150756836, + "rewards_train/rejected": -2.2351462841033936, + "step": 1638 + }, + { + "epoch": 0.46, + "logps_train/chosen": -47.96641159057617, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -77.42793273925781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8376566171646118, + "rewards_train/margins": 1.2662690877914429, + "rewards_train/rejected": -2.1039257049560547, + "step": 1639 + }, + { + "epoch": 0.46, + "learning_rate": 1.7770650182237532e-07, + "loss": 0.3921, + "step": 1640 + }, + { + "epoch": 0.46, + "logps_train/chosen": -86.46134948730469, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -96.79161834716797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1559011936187744, + "rewards_train/margins": 1.272479772567749, + "rewards_train/rejected": -2.4283809661865234, + "step": 1640 + }, + { + "epoch": 0.46, + "logps_train/chosen": -73.64527893066406, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -96.40797424316406, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.004371166229248, + "rewards_train/margins": 2.070801019668579, + "rewards_train/rejected": -3.075172185897827, + "step": 1641 + }, + { + "epoch": 0.46, + "learning_rate": 1.7711425489576164e-07, + "loss": 0.3354, + "step": 1642 + }, + { + "epoch": 0.46, + "logps_train/chosen": -33.769466400146484, + "logps_train/ref_chosen": -27.625, + "logps_train/ref_rejected": -31.125, + "logps_train/rejected": -46.60411071777344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6140562295913696, + "rewards_train/margins": 0.9340499639511108, + "rewards_train/rejected": -1.5481061935424805, + "step": 1642 + }, + { + "epoch": 0.46, + "logps_train/chosen": -62.69888687133789, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -99.6690673828125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0552401542663574, + "rewards_train/margins": 1.9081511497497559, + "rewards_train/rejected": -2.9633913040161133, + "step": 1643 + }, + { + "epoch": 0.46, + "learning_rate": 1.7652245467215633e-07, + "loss": 0.3381, + "step": 1644 + }, + { + "epoch": 0.46, + "logps_train/chosen": -77.92562866210938, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -88.2549819946289, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.3132660388946533, + "rewards_train/margins": 1.3321540355682373, + "rewards_train/rejected": -2.6454200744628906, + "step": 1644 + }, + { + "epoch": 0.46, + "logps_train/chosen": -54.02538299560547, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -83.53839111328125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2481436729431152, + "rewards_train/margins": 1.958625078201294, + "rewards_train/rejected": -3.206768751144409, + "step": 1645 + }, + { + "epoch": 0.46, + "learning_rate": 1.7593110477859152e-07, + "loss": 0.4441, + "step": 1646 + }, + { + "epoch": 0.46, + "logps_train/chosen": -92.95575714111328, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -100.82368469238281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.848998546600342, + "rewards_train/margins": 0.4821047782897949, + "rewards_train/rejected": -3.3311033248901367, + "step": 1646 + }, + { + "epoch": 0.46, + "logps_train/chosen": -66.4138412475586, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -34.25, + "logps_train/rejected": -60.00886535644531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8527121543884277, + "rewards_train/margins": 0.7304012775421143, + "rewards_train/rejected": -2.583113431930542, + "step": 1647 + }, + { + "epoch": 0.46, + "learning_rate": 1.7534020883933942e-07, + "loss": 0.5099, + "step": 1648 + }, + { + "epoch": 0.46, + "logps_train/chosen": -117.94528198242188, + "logps_train/ref_chosen": -94.5, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -151.95156860351562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.3450403213500977, + "rewards_train/margins": 2.9756298065185547, + "rewards_train/rejected": -5.320670127868652, + "step": 1648 + }, + { + "epoch": 0.46, + "logps_train/chosen": -80.22037506103516, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -120.06130981445312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2251629829406738, + "rewards_train/margins": 2.590343952178955, + "rewards_train/rejected": -3.815506935119629, + "step": 1649 + }, + { + "epoch": 0.46, + "learning_rate": 1.747497704758899e-07, + "loss": 0.2625, + "step": 1650 + }, + { + "epoch": 0.46, + "logps_train/chosen": -61.823944091796875, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -73.3372573852539, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8113001585006714, + "rewards_train/margins": 1.7501598596572876, + "rewards_train/rejected": -2.561460018157959, + "step": 1650 + }, + { + "epoch": 0.46, + "logps_train/chosen": -55.210697174072266, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -86.23442077636719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5924079418182373, + "rewards_train/margins": 2.030057430267334, + "rewards_train/rejected": -3.6224653720855713, + "step": 1651 + }, + { + "epoch": 0.46, + "learning_rate": 1.7415979330692857e-07, + "loss": 0.3315, + "step": 1652 + }, + { + "epoch": 0.46, + "logps_train/chosen": -77.4622802734375, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -105.72571563720703, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.144274353981018, + "rewards_train/margins": 2.211891531944275, + "rewards_train/rejected": -3.356165885925293, + "step": 1652 + }, + { + "epoch": 0.46, + "logps_train/chosen": -137.295654296875, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -180.1324005126953, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -4.613160133361816, + "rewards_train/margins": 1.7364087104797363, + "rewards_train/rejected": -6.349568843841553, + "step": 1653 + }, + { + "epoch": 0.46, + "learning_rate": 1.7357028094831437e-07, + "loss": 0.4237, + "step": 1654 + }, + { + "epoch": 0.46, + "logps_train/chosen": -48.06719970703125, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -44.5, + "logps_train/rejected": -68.30729675292969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.49802833795547485, + "rewards_train/margins": 1.8776230216026306, + "rewards_train/rejected": -2.3756513595581055, + "step": 1654 + }, + { + "epoch": 0.46, + "logps_train/chosen": -51.079383850097656, + "logps_train/ref_chosen": -34.75, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -81.62925720214844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6306922435760498, + "rewards_train/margins": 0.698249340057373, + "rewards_train/rejected": -2.328941583633423, + "step": 1655 + }, + { + "epoch": 0.46, + "learning_rate": 1.7298123701305761e-07, + "loss": 0.432, + "step": 1656 + }, + { + "epoch": 0.46, + "logps_train/chosen": -89.21125030517578, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -110.03629302978516, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3027660846710205, + "rewards_train/margins": 1.8664886951446533, + "rewards_train/rejected": -3.169254779815674, + "step": 1656 + }, + { + "epoch": 0.46, + "logps_train/chosen": -27.07877540588379, + "logps_train/ref_chosen": -21.875, + "logps_train/ref_rejected": -23.625, + "logps_train/rejected": -31.229503631591797, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.5249674320220947, + "rewards_train/margins": 0.23509228229522705, + "rewards_train/rejected": -0.7600597143173218, + "step": 1657 + }, + { + "epoch": 0.46, + "learning_rate": 1.723926651112976e-07, + "loss": 0.4783, + "step": 1658 + }, + { + "epoch": 0.46, + "logps_train/chosen": -90.55311584472656, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -117.06472778320312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.685780644416809, + "rewards_train/margins": 2.1785048246383667, + "rewards_train/rejected": -3.864285469055176, + "step": 1658 + }, + { + "epoch": 0.46, + "logps_train/chosen": -131.74801635742188, + "logps_train/ref_chosen": -106.0, + "logps_train/ref_rejected": -114.5, + "logps_train/rejected": -150.03887939453125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.5576157569885254, + "rewards_train/margins": 1.001741886138916, + "rewards_train/rejected": -3.5593576431274414, + "step": 1659 + }, + { + "epoch": 0.46, + "learning_rate": 1.718045688502808e-07, + "loss": 0.3645, + "step": 1660 + }, + { + "epoch": 0.46, + "logps_train/chosen": -123.75086975097656, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -115.00794982910156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -4.132118225097656, + "rewards_train/margins": 0.23215341567993164, + "rewards_train/rejected": -4.364271640777588, + "step": 1660 + }, + { + "epoch": 0.46, + "logps_train/chosen": -84.78007507324219, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -117.05474090576172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7177529335021973, + "rewards_train/margins": 1.9377210140228271, + "rewards_train/rejected": -3.6554739475250244, + "step": 1661 + }, + { + "epoch": 0.46, + "learning_rate": 1.7121695183433838e-07, + "loss": 0.5338, + "step": 1662 + }, + { + "epoch": 0.46, + "logps_train/chosen": -68.51127624511719, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -90.26943969726562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9937058091163635, + "rewards_train/margins": 2.0795273184776306, + "rewards_train/rejected": -3.073233127593994, + "step": 1662 + }, + { + "epoch": 0.46, + "logps_train/chosen": -32.415794372558594, + "logps_train/ref_chosen": -25.625, + "logps_train/ref_rejected": -21.5, + "logps_train/rejected": -30.881206512451172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6844506859779358, + "rewards_train/margins": 0.25552552938461304, + "rewards_train/rejected": -0.9399762153625488, + "step": 1663 + }, + { + "epoch": 0.47, + "learning_rate": 1.7062981766486436e-07, + "loss": 0.3761, + "step": 1664 + }, + { + "epoch": 0.47, + "logps_train/chosen": -70.52616119384766, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -43.75, + "logps_train/rejected": -70.73539733886719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3366740942001343, + "rewards_train/margins": 1.369947075843811, + "rewards_train/rejected": -2.7066211700439453, + "step": 1664 + }, + { + "epoch": 0.47, + "logps_train/chosen": -94.04827880859375, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -90.67620849609375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.702630877494812, + "rewards_train/margins": 1.6329580545425415, + "rewards_train/rejected": -3.3355889320373535, + "step": 1665 + }, + { + "epoch": 0.47, + "learning_rate": 1.7004316994029364e-07, + "loss": 0.6045, + "step": 1666 + }, + { + "epoch": 0.47, + "logps_train/chosen": -84.98683166503906, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -80.54086303710938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.287354975938797, + "rewards_train/margins": 1.6485677659511566, + "rewards_train/rejected": -1.9359227418899536, + "step": 1666 + }, + { + "epoch": 0.47, + "logps_train/chosen": -48.174278259277344, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -36.25, + "logps_train/rejected": -51.08442687988281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8951619863510132, + "rewards_train/margins": 0.5955073833465576, + "rewards_train/rejected": -1.4906693696975708, + "step": 1667 + }, + { + "epoch": 0.47, + "learning_rate": 1.6945701225607952e-07, + "loss": 0.3778, + "step": 1668 + }, + { + "epoch": 0.47, + "logps_train/chosen": -92.9300308227539, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -125.04141235351562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6633155345916748, + "rewards_train/margins": 2.776664972305298, + "rewards_train/rejected": -4.439980506896973, + "step": 1668 + }, + { + "epoch": 0.47, + "logps_train/chosen": -52.9864616394043, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -81.70050048828125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6720833778381348, + "rewards_train/margins": 2.5932788848876953, + "rewards_train/rejected": -3.26536226272583, + "step": 1669 + }, + { + "epoch": 0.47, + "learning_rate": 1.6887134820467222e-07, + "loss": 0.2394, + "step": 1670 + }, + { + "epoch": 0.47, + "logps_train/chosen": -92.07159423828125, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -93.31512451171875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.925518751144409, + "rewards_train/margins": 0.351306676864624, + "rewards_train/rejected": -3.276825428009033, + "step": 1670 + }, + { + "epoch": 0.47, + "logps_train/chosen": -84.47746276855469, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -92.91738891601562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.983292818069458, + "rewards_train/margins": 0.7264158725738525, + "rewards_train/rejected": -2.7097086906433105, + "step": 1671 + }, + { + "epoch": 0.47, + "learning_rate": 1.6828618137549635e-07, + "loss": 0.5321, + "step": 1672 + }, + { + "epoch": 0.47, + "logps_train/chosen": -55.901512145996094, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -84.44178771972656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5995261669158936, + "rewards_train/margins": 1.7020747661590576, + "rewards_train/rejected": -3.301600933074951, + "step": 1672 + }, + { + "epoch": 0.47, + "logps_train/chosen": -70.01547241210938, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -98.51997375488281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2319185733795166, + "rewards_train/margins": 1.4917585849761963, + "rewards_train/rejected": -2.723677158355713, + "step": 1673 + }, + { + "epoch": 0.47, + "learning_rate": 1.6770151535492921e-07, + "loss": 0.3731, + "step": 1674 + }, + { + "epoch": 0.47, + "logps_train/chosen": -42.512962341308594, + "logps_train/ref_chosen": -36.5, + "logps_train/ref_rejected": -35.5, + "logps_train/rejected": -47.57307434082031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6077415943145752, + "rewards_train/margins": 0.5992730855941772, + "rewards_train/rejected": -1.2070146799087524, + "step": 1674 + }, + { + "epoch": 0.47, + "logps_train/chosen": -79.2337646484375, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -91.4706802368164, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.477673888206482, + "rewards_train/margins": 1.107772946357727, + "rewards_train/rejected": -2.585446834564209, + "step": 1675 + }, + { + "epoch": 0.47, + "learning_rate": 1.6711735372627888e-07, + "loss": 0.4862, + "step": 1676 + }, + { + "epoch": 0.47, + "logps_train/chosen": -67.922607421875, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -84.69071197509766, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.776440143585205, + "rewards_train/margins": 0.7301309108734131, + "rewards_train/rejected": -2.506571054458618, + "step": 1676 + }, + { + "epoch": 0.47, + "logps_train/chosen": -99.23464965820312, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -99.31503295898438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6422152519226074, + "rewards_train/margins": 0.9697575569152832, + "rewards_train/rejected": -2.6119728088378906, + "step": 1677 + }, + { + "epoch": 0.47, + "learning_rate": 1.6653370006976182e-07, + "loss": 0.4877, + "step": 1678 + }, + { + "epoch": 0.47, + "logps_train/chosen": -59.814300537109375, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -97.75444030761719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9835784435272217, + "rewards_train/margins": 2.1797568798065186, + "rewards_train/rejected": -3.1633353233337402, + "step": 1678 + }, + { + "epoch": 0.47, + "logps_train/chosen": -107.05012512207031, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -100.28025817871094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -3.135676860809326, + "rewards_train/margins": 0.923403263092041, + "rewards_train/rejected": -4.059080123901367, + "step": 1679 + }, + { + "epoch": 0.47, + "learning_rate": 1.6595055796248154e-07, + "loss": 0.4452, + "step": 1680 + }, + { + "epoch": 0.47, + "logps_train/chosen": -58.643882751464844, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -71.38655090332031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7219080328941345, + "rewards_train/margins": 0.7577629685401917, + "rewards_train/rejected": -1.4796710014343262, + "step": 1680 + }, + { + "epoch": 0.47, + "logps_train/chosen": -70.05195617675781, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -88.61813354492188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5166213512420654, + "rewards_train/margins": 1.2826917171478271, + "rewards_train/rejected": -2.7993130683898926, + "step": 1681 + }, + { + "epoch": 0.47, + "learning_rate": 1.6536793097840613e-07, + "loss": 0.4482, + "step": 1682 + }, + { + "epoch": 0.47, + "logps_train/chosen": -71.62968444824219, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -64.81156921386719, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.5358200073242188, + "rewards_train/margins": 0.4359617233276367, + "rewards_train/rejected": -1.9717817306518555, + "step": 1682 + }, + { + "epoch": 0.47, + "logps_train/chosen": -64.4502182006836, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -85.279541015625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.465919852256775, + "rewards_train/margins": 0.6679915189743042, + "rewards_train/rejected": -2.133911371231079, + "step": 1683 + }, + { + "epoch": 0.47, + "learning_rate": 1.6478582268834674e-07, + "loss": 0.5673, + "step": 1684 + }, + { + "epoch": 0.47, + "logps_train/chosen": -77.01126861572266, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -83.2972412109375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1683144569396973, + "rewards_train/margins": 1.007601022720337, + "rewards_train/rejected": -2.175915479660034, + "step": 1684 + }, + { + "epoch": 0.47, + "logps_train/chosen": -91.12889099121094, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -125.72279357910156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7468738555908203, + "rewards_train/margins": 2.4564595222473145, + "rewards_train/rejected": -4.203333377838135, + "step": 1685 + }, + { + "epoch": 0.47, + "learning_rate": 1.6420423665993543e-07, + "loss": 0.3966, + "step": 1686 + }, + { + "epoch": 0.47, + "logps_train/chosen": -69.03770446777344, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -108.81758880615234, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1006451845169067, + "rewards_train/margins": 1.8715428113937378, + "rewards_train/rejected": -2.9721879959106445, + "step": 1686 + }, + { + "epoch": 0.47, + "logps_train/chosen": -87.07865142822266, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -99.63607788085938, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.3953654766082764, + "rewards_train/margins": 1.3096485137939453, + "rewards_train/rejected": -2.7050139904022217, + "step": 1687 + }, + { + "epoch": 0.47, + "learning_rate": 1.6362317645760345e-07, + "loss": 0.5543, + "step": 1688 + }, + { + "epoch": 0.47, + "logps_train/chosen": -74.01127624511719, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -69.46797180175781, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9556194543838501, + "rewards_train/margins": 0.04820847511291504, + "rewards_train/rejected": -1.0038279294967651, + "step": 1688 + }, + { + "epoch": 0.47, + "logps_train/chosen": -28.50318717956543, + "logps_train/ref_chosen": -23.5, + "logps_train/ref_rejected": -34.5, + "logps_train/rejected": -45.10893249511719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4987562298774719, + "rewards_train/margins": 0.5574496388435364, + "rewards_train/rejected": -1.0562058687210083, + "step": 1689 + }, + { + "epoch": 0.47, + "learning_rate": 1.6304264564255945e-07, + "loss": 0.5874, + "step": 1690 + }, + { + "epoch": 0.47, + "logps_train/chosen": -80.19734191894531, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -99.33476257324219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7146562337875366, + "rewards_train/margins": 2.0879613161087036, + "rewards_train/rejected": -2.8026175498962402, + "step": 1690 + }, + { + "epoch": 0.47, + "logps_train/chosen": -41.62714385986328, + "logps_train/ref_chosen": -37.0, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -53.11134719848633, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.45900335907936096, + "rewards_train/margins": 1.2771314084529877, + "rewards_train/rejected": -1.7361347675323486, + "step": 1691 + }, + { + "epoch": 0.47, + "learning_rate": 1.624626477727674e-07, + "loss": 0.3138, + "step": 1692 + }, + { + "epoch": 0.47, + "logps_train/chosen": -45.346580505371094, + "logps_train/ref_chosen": -41.75, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -66.0799789428711, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.3510642945766449, + "rewards_train/margins": 1.1681641638278961, + "rewards_train/rejected": -1.519228458404541, + "step": 1692 + }, + { + "epoch": 0.47, + "logps_train/chosen": -68.06881713867188, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -83.01776123046875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1756322383880615, + "rewards_train/margins": 2.289034843444824, + "rewards_train/rejected": -3.4646670818328857, + "step": 1693 + }, + { + "epoch": 0.47, + "learning_rate": 1.618831864029251e-07, + "loss": 0.3492, + "step": 1694 + }, + { + "epoch": 0.47, + "logps_train/chosen": -96.76583862304688, + "logps_train/ref_chosen": -83.5, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -105.95556640625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3433805704116821, + "rewards_train/margins": 0.977958083152771, + "rewards_train/rejected": -2.321338653564453, + "step": 1694 + }, + { + "epoch": 0.47, + "logps_train/chosen": -43.882144927978516, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -39.5, + "logps_train/rejected": -52.5103759765625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5408511161804199, + "rewards_train/margins": 0.7512019872665405, + "rewards_train/rejected": -1.2920531034469604, + "step": 1695 + }, + { + "epoch": 0.47, + "learning_rate": 1.613042650844422e-07, + "loss": 0.4764, + "step": 1696 + }, + { + "epoch": 0.47, + "logps_train/chosen": -86.63471221923828, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -115.38475036621094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3977975845336914, + "rewards_train/margins": 1.965238332748413, + "rewards_train/rejected": -3.3630359172821045, + "step": 1696 + }, + { + "epoch": 0.47, + "logps_train/chosen": -84.23179626464844, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -113.83992004394531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0578969717025757, + "rewards_train/margins": 2.1768757104873657, + "rewards_train/rejected": -3.2347726821899414, + "step": 1697 + }, + { + "epoch": 0.47, + "learning_rate": 1.6072588736541837e-07, + "loss": 0.2471, + "step": 1698 + }, + { + "epoch": 0.47, + "logps_train/chosen": -62.672122955322266, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -122.68158721923828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.400708794593811, + "rewards_train/margins": 2.427606225013733, + "rewards_train/rejected": -3.828315019607544, + "step": 1698 + }, + { + "epoch": 0.47, + "logps_train/chosen": -54.730247497558594, + "logps_train/ref_chosen": -44.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -107.04042053222656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0784939527511597, + "rewards_train/margins": 2.445860505104065, + "rewards_train/rejected": -3.5243544578552246, + "step": 1699 + }, + { + "epoch": 0.48, + "learning_rate": 1.6014805679062183e-07, + "loss": 0.2489, + "step": 1700 + }, + { + "epoch": 0.48, + "logps_train/chosen": -80.585693359375, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -104.49479675292969, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.811694622039795, + "rewards_train/margins": 2.1026291847229004, + "rewards_train/rejected": -3.9143238067626953, + "step": 1700 + }, + { + "epoch": 0.48, + "logps_train/chosen": -93.85848999023438, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -108.67376708984375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.591318368911743, + "rewards_train/margins": 0.9119961261749268, + "rewards_train/rejected": -3.50331449508667, + "step": 1701 + }, + { + "epoch": 0.48, + "learning_rate": 1.5957077690146728e-07, + "loss": 0.5183, + "step": 1702 + }, + { + "epoch": 0.48, + "logps_train/chosen": -62.26563262939453, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -73.73411560058594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8724616169929504, + "rewards_train/margins": 1.190403163433075, + "rewards_train/rejected": -2.0628647804260254, + "step": 1702 + }, + { + "epoch": 0.48, + "logps_train/chosen": -59.47084426879883, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -38.5, + "logps_train/rejected": -61.20113754272461, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8697408437728882, + "rewards_train/margins": 1.4027162790298462, + "rewards_train/rejected": -2.2724571228027344, + "step": 1703 + }, + { + "epoch": 0.48, + "learning_rate": 1.589940512359946e-07, + "loss": 0.3901, + "step": 1704 + }, + { + "epoch": 0.48, + "logps_train/chosen": -85.05970001220703, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -119.24519348144531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.244251012802124, + "rewards_train/margins": 2.076362371444702, + "rewards_train/rejected": -3.320613384246826, + "step": 1704 + }, + { + "epoch": 0.48, + "logps_train/chosen": -54.460426330566406, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -74.13459777832031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.25561290979385376, + "rewards_train/margins": 1.2282573580741882, + "rewards_train/rejected": -1.483870267868042, + "step": 1705 + }, + { + "epoch": 0.48, + "learning_rate": 1.584178833288467e-07, + "loss": 0.4027, + "step": 1706 + }, + { + "epoch": 0.48, + "logps_train/chosen": -58.76069641113281, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -82.43553924560547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3251905143260956, + "rewards_train/margins": 1.0753945410251617, + "rewards_train/rejected": -1.4005850553512573, + "step": 1706 + }, + { + "epoch": 0.48, + "logps_train/chosen": -104.62451171875, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -137.52508544921875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.912841558456421, + "rewards_train/margins": 2.640448808670044, + "rewards_train/rejected": -4.553290367126465, + "step": 1707 + }, + { + "epoch": 0.48, + "learning_rate": 1.5784227671124826e-07, + "loss": 0.2927, + "step": 1708 + }, + { + "epoch": 0.48, + "logps_train/chosen": -55.8347053527832, + "logps_train/ref_chosen": -46.5, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -85.9326171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.945970356464386, + "rewards_train/margins": 1.4238542914390564, + "rewards_train/rejected": -2.3698246479034424, + "step": 1708 + }, + { + "epoch": 0.48, + "logps_train/chosen": -54.66715621948242, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -86.22138977050781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9868572354316711, + "rewards_train/margins": 1.8746129870414734, + "rewards_train/rejected": -2.8614702224731445, + "step": 1709 + }, + { + "epoch": 0.48, + "learning_rate": 1.5726723491098383e-07, + "loss": 0.304, + "step": 1710 + }, + { + "epoch": 0.48, + "logps_train/chosen": -90.70524597167969, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -92.46198272705078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8190110325813293, + "rewards_train/margins": 2.3880273699760437, + "rewards_train/rejected": -3.207038402557373, + "step": 1710 + }, + { + "epoch": 0.48, + "logps_train/chosen": -86.20796203613281, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -116.68403625488281, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.6706008911132812, + "rewards_train/margins": 2.073974609375, + "rewards_train/rejected": -3.7445755004882812, + "step": 1711 + }, + { + "epoch": 0.48, + "learning_rate": 1.566927614523763e-07, + "loss": 0.4273, + "step": 1712 + }, + { + "epoch": 0.48, + "logps_train/chosen": -61.82952880859375, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -77.03433990478516, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -2.1032655239105225, + "rewards_train/margins": 0.28376221656799316, + "rewards_train/rejected": -2.3870277404785156, + "step": 1712 + }, + { + "epoch": 0.48, + "logps_train/chosen": -78.32061767578125, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -102.40614318847656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6319637298583984, + "rewards_train/margins": 1.013533592224121, + "rewards_train/rejected": -2.6454973220825195, + "step": 1713 + }, + { + "epoch": 0.48, + "learning_rate": 1.5611885985626544e-07, + "loss": 0.6185, + "step": 1714 + }, + { + "epoch": 0.48, + "logps_train/chosen": -138.0715789794922, + "logps_train/ref_chosen": -116.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -113.78358459472656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.2212202548980713, + "rewards_train/margins": 1.4633877277374268, + "rewards_train/rejected": -3.684607982635498, + "step": 1714 + }, + { + "epoch": 0.48, + "logps_train/chosen": -105.45602416992188, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -108.5, + "logps_train/rejected": -155.59356689453125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.344820976257324, + "rewards_train/margins": 2.350083351135254, + "rewards_train/rejected": -4.694904327392578, + "step": 1715 + }, + { + "epoch": 0.48, + "learning_rate": 1.5554553363998592e-07, + "loss": 0.4026, + "step": 1716 + }, + { + "epoch": 0.48, + "logps_train/chosen": -39.249446868896484, + "logps_train/ref_chosen": -33.5, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -60.99778747558594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5843197107315063, + "rewards_train/margins": 1.1607716083526611, + "rewards_train/rejected": -1.7450913190841675, + "step": 1716 + }, + { + "epoch": 0.48, + "logps_train/chosen": -65.62832641601562, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -56.75, + "logps_train/rejected": -77.27700805664062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0398834943771362, + "rewards_train/margins": 1.0017582178115845, + "rewards_train/rejected": -2.0416417121887207, + "step": 1717 + }, + { + "epoch": 0.48, + "learning_rate": 1.549727863173463e-07, + "loss": 0.3592, + "step": 1718 + }, + { + "epoch": 0.48, + "logps_train/chosen": -122.73854064941406, + "logps_train/ref_chosen": -105.5, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -166.14834594726562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7176034450531006, + "rewards_train/margins": 3.262075662612915, + "rewards_train/rejected": -4.979679107666016, + "step": 1718 + }, + { + "epoch": 0.48, + "logps_train/chosen": -55.00486755371094, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -38.5, + "logps_train/rejected": -60.80463409423828, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6984851360321045, + "rewards_train/margins": 0.5397908687591553, + "rewards_train/rejected": -2.2382760047912598, + "step": 1719 + }, + { + "epoch": 0.48, + "learning_rate": 1.5440062139860702e-07, + "loss": 0.4756, + "step": 1720 + }, + { + "epoch": 0.48, + "logps_train/chosen": -62.37964630126953, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -77.89312744140625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6694098711013794, + "rewards_train/margins": 2.1906055212020874, + "rewards_train/rejected": -2.860015392303467, + "step": 1720 + }, + { + "epoch": 0.48, + "logps_train/chosen": -49.41643524169922, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -75.60308837890625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3776785135269165, + "rewards_train/margins": 1.2463024854660034, + "rewards_train/rejected": -2.62398099899292, + "step": 1721 + }, + { + "epoch": 0.48, + "learning_rate": 1.5382904239045917e-07, + "loss": 0.3565, + "step": 1722 + }, + { + "epoch": 0.48, + "logps_train/chosen": -86.35062408447266, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -94.67170715332031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7866249084472656, + "rewards_train/margins": 0.8211703300476074, + "rewards_train/rejected": -2.607795238494873, + "step": 1722 + }, + { + "epoch": 0.48, + "logps_train/chosen": -84.44341278076172, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -99.17850494384766, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9177792072296143, + "rewards_train/margins": 2.126633405685425, + "rewards_train/rejected": -4.044412612915039, + "step": 1723 + }, + { + "epoch": 0.48, + "learning_rate": 1.5325805279600285e-07, + "loss": 0.3435, + "step": 1724 + }, + { + "epoch": 0.48, + "logps_train/chosen": -60.262733459472656, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -38.0, + "logps_train/rejected": -59.72871017456055, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3323768377304077, + "rewards_train/margins": 0.8286288976669312, + "rewards_train/rejected": -2.161005735397339, + "step": 1724 + }, + { + "epoch": 0.48, + "logps_train/chosen": -43.0749626159668, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -70.24840545654297, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7276864051818848, + "rewards_train/margins": 0.7864855527877808, + "rewards_train/rejected": -1.5141719579696655, + "step": 1725 + }, + { + "epoch": 0.48, + "learning_rate": 1.5268765611472574e-07, + "loss": 0.4338, + "step": 1726 + }, + { + "epoch": 0.48, + "logps_train/chosen": -63.47959518432617, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -64.1396255493164, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9475688934326172, + "rewards_train/margins": 0.8548210859298706, + "rewards_train/rejected": -1.8023899793624878, + "step": 1726 + }, + { + "epoch": 0.48, + "logps_train/chosen": -57.35747528076172, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -76.74878692626953, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5443410873413086, + "rewards_train/margins": 1.814131259918213, + "rewards_train/rejected": -3.3584723472595215, + "step": 1727 + }, + { + "epoch": 0.48, + "learning_rate": 1.5211785584248178e-07, + "loss": 0.3767, + "step": 1728 + }, + { + "epoch": 0.48, + "logps_train/chosen": -96.9595947265625, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -142.14691162109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3178340196609497, + "rewards_train/margins": 2.7702945470809937, + "rewards_train/rejected": -4.088128566741943, + "step": 1728 + }, + { + "epoch": 0.48, + "logps_train/chosen": -59.227317810058594, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -76.9884033203125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7125757932662964, + "rewards_train/margins": 0.8030610084533691, + "rewards_train/rejected": -1.5156368017196655, + "step": 1729 + }, + { + "epoch": 0.48, + "learning_rate": 1.5154865547146947e-07, + "loss": 0.2882, + "step": 1730 + }, + { + "epoch": 0.48, + "logps_train/chosen": -65.23429870605469, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -77.76101684570312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2140541970729828, + "rewards_train/margins": 1.3245470821857452, + "rewards_train/rejected": -1.538601279258728, + "step": 1730 + }, + { + "epoch": 0.48, + "logps_train/chosen": -75.65512084960938, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -88.51319885253906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.544906735420227, + "rewards_train/margins": 1.561881422996521, + "rewards_train/rejected": -2.106788158416748, + "step": 1731 + }, + { + "epoch": 0.48, + "learning_rate": 1.5098005849021078e-07, + "loss": 0.3378, + "step": 1732 + }, + { + "epoch": 0.48, + "logps_train/chosen": -70.87867736816406, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -87.4938735961914, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0714613199234009, + "rewards_train/margins": 0.35556256771087646, + "rewards_train/rejected": -1.4270238876342773, + "step": 1732 + }, + { + "epoch": 0.48, + "logps_train/chosen": -65.34355926513672, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -71.9080810546875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.462090015411377, + "rewards_train/margins": 0.5146552324295044, + "rewards_train/rejected": -1.9767452478408813, + "step": 1733 + }, + { + "epoch": 0.48, + "learning_rate": 1.5041206838352955e-07, + "loss": 0.5774, + "step": 1734 + }, + { + "epoch": 0.48, + "logps_train/chosen": -73.20880126953125, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -52.75, + "logps_train/rejected": -85.79881286621094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5255677700042725, + "rewards_train/margins": 1.7681806087493896, + "rewards_train/rejected": -3.293748378753662, + "step": 1734 + }, + { + "epoch": 0.48, + "logps_train/chosen": -61.277870178222656, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -93.12185668945312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2486857175827026, + "rewards_train/margins": 2.0986567735671997, + "rewards_train/rejected": -3.3473424911499023, + "step": 1735 + }, + { + "epoch": 0.49, + "learning_rate": 1.4984468863253007e-07, + "loss": 0.2993, + "step": 1736 + }, + { + "epoch": 0.49, + "logps_train/chosen": -100.27379608154297, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -94.43202209472656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9312859773635864, + "rewards_train/margins": 0.6416035890579224, + "rewards_train/rejected": -2.572889566421509, + "step": 1736 + }, + { + "epoch": 0.49, + "logps_train/chosen": -73.89569091796875, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -78.6570816040039, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1137696504592896, + "rewards_train/margins": 1.4405921697616577, + "rewards_train/rejected": -2.5543618202209473, + "step": 1737 + }, + { + "epoch": 0.49, + "learning_rate": 1.4927792271457624e-07, + "loss": 0.4012, + "step": 1738 + }, + { + "epoch": 0.49, + "logps_train/chosen": -48.92772674560547, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -44.75, + "logps_train/rejected": -59.254119873046875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9099600315093994, + "rewards_train/margins": 0.5400614738464355, + "rewards_train/rejected": -1.450021505355835, + "step": 1738 + }, + { + "epoch": 0.49, + "logps_train/chosen": -46.54792022705078, + "logps_train/ref_chosen": -35.0, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -65.42706298828125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1446361541748047, + "rewards_train/margins": 1.0652580261230469, + "rewards_train/rejected": -2.2098941802978516, + "step": 1739 + }, + { + "epoch": 0.49, + "learning_rate": 1.487117741032694e-07, + "loss": 0.507, + "step": 1740 + }, + { + "epoch": 0.49, + "logps_train/chosen": -52.60486602783203, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -72.66886138916016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4809945225715637, + "rewards_train/margins": 1.3015164732933044, + "rewards_train/rejected": -1.7825109958648682, + "step": 1740 + }, + { + "epoch": 0.49, + "logps_train/chosen": -60.44313049316406, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -100.72547912597656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3650164604187012, + "rewards_train/margins": 1.9962029457092285, + "rewards_train/rejected": -3.3612194061279297, + "step": 1741 + }, + { + "epoch": 0.49, + "learning_rate": 1.4814624626842797e-07, + "loss": 0.3217, + "step": 1742 + }, + { + "epoch": 0.49, + "logps_train/chosen": -117.7610092163086, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -148.991943359375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.43860125541687, + "rewards_train/margins": 2.3387176990509033, + "rewards_train/rejected": -4.777318954467773, + "step": 1742 + }, + { + "epoch": 0.49, + "logps_train/chosen": -68.24562072753906, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -101.6647720336914, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5278820991516113, + "rewards_train/margins": 1.294844627380371, + "rewards_train/rejected": -2.8227267265319824, + "step": 1743 + }, + { + "epoch": 0.49, + "learning_rate": 1.475813426760655e-07, + "loss": 0.4032, + "step": 1744 + }, + { + "epoch": 0.49, + "logps_train/chosen": -37.63656234741211, + "logps_train/ref_chosen": -32.25, + "logps_train/ref_rejected": -39.75, + "logps_train/rejected": -58.51951599121094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5460291504859924, + "rewards_train/margins": 1.319887101650238, + "rewards_train/rejected": -1.8659162521362305, + "step": 1744 + }, + { + "epoch": 0.49, + "logps_train/chosen": -102.67803192138672, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -125.20155334472656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2474911212921143, + "rewards_train/margins": 1.614070177078247, + "rewards_train/rejected": -2.8615612983703613, + "step": 1745 + }, + { + "epoch": 0.49, + "learning_rate": 1.4701706678836977e-07, + "loss": 0.3559, + "step": 1746 + }, + { + "epoch": 0.49, + "logps_train/chosen": -48.163970947265625, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -79.82466888427734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2507719397544861, + "rewards_train/margins": 1.78032785654068, + "rewards_train/rejected": -2.031099796295166, + "step": 1746 + }, + { + "epoch": 0.49, + "logps_train/chosen": -69.75738525390625, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -104.90107727050781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2784242630004883, + "rewards_train/margins": 2.199769973754883, + "rewards_train/rejected": -3.478194236755371, + "step": 1747 + }, + { + "epoch": 0.49, + "learning_rate": 1.4645342206368144e-07, + "loss": 0.3322, + "step": 1748 + }, + { + "epoch": 0.49, + "logps_train/chosen": -106.47711944580078, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -127.27220153808594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.2395095825195312, + "rewards_train/margins": 2.168569564819336, + "rewards_train/rejected": -4.408079147338867, + "step": 1748 + }, + { + "epoch": 0.49, + "logps_train/chosen": -91.26492309570312, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -126.6604995727539, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.0190703868865967, + "rewards_train/margins": 1.5723700523376465, + "rewards_train/rejected": -3.591440439224243, + "step": 1749 + }, + { + "epoch": 0.49, + "learning_rate": 1.4589041195647283e-07, + "loss": 0.3291, + "step": 1750 + }, + { + "epoch": 0.49, + "logps_train/chosen": -101.08950805664062, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -71.98365783691406, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.9179353713989258, + "rewards_train/margins": 0.2476177215576172, + "rewards_train/rejected": -2.165553092956543, + "step": 1750 + }, + { + "epoch": 0.49, + "logps_train/chosen": -31.443187713623047, + "logps_train/ref_chosen": -19.875, + "logps_train/ref_rejected": -20.125, + "logps_train/rejected": -36.207881927490234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1613110303878784, + "rewards_train/margins": 0.4471726417541504, + "rewards_train/rejected": -1.6084836721420288, + "step": 1751 + }, + { + "epoch": 0.49, + "learning_rate": 1.4532803991732697e-07, + "loss": 0.6374, + "step": 1752 + }, + { + "epoch": 0.49, + "logps_train/chosen": -93.20042419433594, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -134.81411743164062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0637922286987305, + "rewards_train/margins": 2.7484796047210693, + "rewards_train/rejected": -3.8122718334198, + "step": 1752 + }, + { + "epoch": 0.49, + "logps_train/chosen": -57.33367156982422, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -63.25, + "logps_train/rejected": -86.43705749511719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3732109069824219, + "rewards_train/margins": 0.9560410976409912, + "rewards_train/rejected": -2.329252004623413, + "step": 1753 + }, + { + "epoch": 0.49, + "learning_rate": 1.447663093929163e-07, + "loss": 0.316, + "step": 1754 + }, + { + "epoch": 0.49, + "logps_train/chosen": -42.070743560791016, + "logps_train/ref_chosen": -37.75, + "logps_train/ref_rejected": -38.5, + "logps_train/rejected": -58.13180160522461, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.42699646949768066, + "rewards_train/margins": 1.5381369590759277, + "rewards_train/rejected": -1.9651334285736084, + "step": 1754 + }, + { + "epoch": 0.49, + "logps_train/chosen": -93.35173797607422, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -105.97796630859375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6605641841888428, + "rewards_train/margins": 2.121607780456543, + "rewards_train/rejected": -2.7821719646453857, + "step": 1755 + }, + { + "epoch": 0.49, + "learning_rate": 1.4420522382598116e-07, + "loss": 0.3327, + "step": 1756 + }, + { + "epoch": 0.49, + "logps_train/chosen": -72.68170166015625, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -79.17109680175781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8264704942703247, + "rewards_train/margins": 1.0676897764205933, + "rewards_train/rejected": -2.894160270690918, + "step": 1756 + }, + { + "epoch": 0.49, + "logps_train/chosen": -108.734375, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -122.84239196777344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.72695255279541, + "rewards_train/margins": 1.0113883018493652, + "rewards_train/rejected": -3.7383408546447754, + "step": 1757 + }, + { + "epoch": 0.49, + "learning_rate": 1.4364478665530956e-07, + "loss": 0.5524, + "step": 1758 + }, + { + "epoch": 0.49, + "logps_train/chosen": -59.95671081542969, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -97.79368591308594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3226239681243896, + "rewards_train/margins": 2.2794010639190674, + "rewards_train/rejected": -3.602025032043457, + "step": 1758 + }, + { + "epoch": 0.49, + "logps_train/chosen": -111.15128326416016, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -107.0, + "logps_train/rejected": -154.73086547851562, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -3.0514564514160156, + "rewards_train/margins": 1.722020149230957, + "rewards_train/rejected": -4.773476600646973, + "step": 1759 + }, + { + "epoch": 0.49, + "learning_rate": 1.4308500131571538e-07, + "loss": 0.4535, + "step": 1760 + }, + { + "epoch": 0.49, + "logps_train/chosen": -65.82362365722656, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -86.1658935546875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8222059011459351, + "rewards_train/margins": 1.7561031579971313, + "rewards_train/rejected": -2.5783090591430664, + "step": 1760 + }, + { + "epoch": 0.49, + "logps_train/chosen": -55.25353240966797, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -39.25, + "logps_train/rejected": -59.853878021240234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3292596340179443, + "rewards_train/margins": 0.7237551212310791, + "rewards_train/rejected": -2.0530147552490234, + "step": 1761 + }, + { + "epoch": 0.49, + "learning_rate": 1.4252587123801745e-07, + "loss": 0.4007, + "step": 1762 + }, + { + "epoch": 0.49, + "logps_train/chosen": -94.54315185546875, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -107.18714904785156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.1226749420166016, + "rewards_train/margins": 1.3268015384674072, + "rewards_train/rejected": -3.449476480484009, + "step": 1762 + }, + { + "epoch": 0.49, + "logps_train/chosen": -45.40351104736328, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -87.63957214355469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.579804003238678, + "rewards_train/margins": 1.9727277159690857, + "rewards_train/rejected": -2.5525317192077637, + "step": 1763 + }, + { + "epoch": 0.49, + "learning_rate": 1.419673998490188e-07, + "loss": 0.4272, + "step": 1764 + }, + { + "epoch": 0.49, + "logps_train/chosen": -70.68592834472656, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -68.03724670410156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.372938871383667, + "rewards_train/margins": 0.5843008756637573, + "rewards_train/rejected": -1.9572397470474243, + "step": 1764 + }, + { + "epoch": 0.49, + "logps_train/chosen": -122.95270538330078, + "logps_train/ref_chosen": -99.0, + "logps_train/ref_rejected": -100.0, + "logps_train/rejected": -137.17544555664062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.398590564727783, + "rewards_train/margins": 1.3312108516693115, + "rewards_train/rejected": -3.7298014163970947, + "step": 1765 + }, + { + "epoch": 0.49, + "learning_rate": 1.4140959057148544e-07, + "loss": 0.5153, + "step": 1766 + }, + { + "epoch": 0.49, + "logps_train/chosen": -61.5564079284668, + "logps_train/ref_chosen": -47.25, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -60.4739990234375, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.424195408821106, + "rewards_train/margins": 0.5708611011505127, + "rewards_train/rejected": -1.9950565099716187, + "step": 1766 + }, + { + "epoch": 0.49, + "logps_train/chosen": -89.47825622558594, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -84.82290649414062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6915267705917358, + "rewards_train/margins": -0.07530021667480469, + "rewards_train/rejected": -1.6162265539169312, + "step": 1767 + }, + { + "epoch": 0.49, + "learning_rate": 1.408524468241255e-07, + "loss": 0.7928, + "step": 1768 + }, + { + "epoch": 0.49, + "logps_train/chosen": -74.18833923339844, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -94.327880859375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5604358911514282, + "rewards_train/margins": 1.4539932012557983, + "rewards_train/rejected": -3.0144290924072266, + "step": 1768 + }, + { + "epoch": 0.49, + "logps_train/chosen": -82.75665283203125, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -97.08348083496094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9408992528915405, + "rewards_train/margins": 0.7682303190231323, + "rewards_train/rejected": -2.709129571914673, + "step": 1769 + }, + { + "epoch": 0.49, + "learning_rate": 1.4029597202156793e-07, + "loss": 0.5645, + "step": 1770 + }, + { + "epoch": 0.49, + "logps_train/chosen": -39.584197998046875, + "logps_train/ref_chosen": -29.0, + "logps_train/ref_rejected": -28.75, + "logps_train/rejected": -47.16441345214844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.0589078664779663, + "rewards_train/margins": 0.7766741514205933, + "rewards_train/rejected": -1.8355820178985596, + "step": 1770 + }, + { + "epoch": 0.49, + "logps_train/chosen": -69.99488067626953, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -43.75, + "logps_train/rejected": -73.59834289550781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.058838963508606, + "rewards_train/margins": 1.936932921409607, + "rewards_train/rejected": -2.995771884918213, + "step": 1771 + }, + { + "epoch": 0.5, + "learning_rate": 1.3974016957434206e-07, + "loss": 0.3904, + "step": 1772 + }, + { + "epoch": 0.5, + "logps_train/chosen": -69.9470443725586, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -80.33355712890625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.2562280893325806, + "rewards_train/margins": 1.5761502981185913, + "rewards_train/rejected": -2.832378387451172, + "step": 1772 + }, + { + "epoch": 0.5, + "logps_train/chosen": -91.52737426757812, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -134.87603759765625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.305472493171692, + "rewards_train/margins": 2.5784205198287964, + "rewards_train/rejected": -3.8838930130004883, + "step": 1773 + }, + { + "epoch": 0.5, + "learning_rate": 1.3918504288885658e-07, + "loss": 0.4017, + "step": 1774 + }, + { + "epoch": 0.5, + "logps_train/chosen": -103.61128234863281, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -120.56626892089844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.3197214603424072, + "rewards_train/margins": 1.8423736095428467, + "rewards_train/rejected": -4.162095069885254, + "step": 1774 + }, + { + "epoch": 0.5, + "logps_train/chosen": -82.91297912597656, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -77.99188995361328, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8811415433883667, + "rewards_train/margins": 0.7254692316055298, + "rewards_train/rejected": -2.6066107749938965, + "step": 1775 + }, + { + "epoch": 0.5, + "learning_rate": 1.386305953673782e-07, + "loss": 0.5205, + "step": 1776 + }, + { + "epoch": 0.5, + "logps_train/chosen": -39.34204864501953, + "logps_train/ref_chosen": -23.625, + "logps_train/ref_rejected": -21.5, + "logps_train/rejected": -43.0682373046875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5776619911193848, + "rewards_train/margins": 0.5769162178039551, + "rewards_train/rejected": -2.15457820892334, + "step": 1776 + }, + { + "epoch": 0.5, + "logps_train/chosen": -106.58378601074219, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -115.2689208984375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7001752853393555, + "rewards_train/margins": 1.4103107452392578, + "rewards_train/rejected": -3.1104860305786133, + "step": 1777 + }, + { + "epoch": 0.5, + "learning_rate": 1.3807683040801153e-07, + "loss": 0.5109, + "step": 1778 + }, + { + "epoch": 0.5, + "logps_train/chosen": -70.5435791015625, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -86.02911376953125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7059206366539001, + "rewards_train/margins": 1.1188653111457825, + "rewards_train/rejected": -1.8247859477996826, + "step": 1778 + }, + { + "epoch": 0.5, + "logps_train/chosen": -57.89008712768555, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -75.26858520507812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7229932546615601, + "rewards_train/margins": 0.9643144607543945, + "rewards_train/rejected": -1.6873077154159546, + "step": 1779 + }, + { + "epoch": 0.5, + "learning_rate": 1.375237514046777e-07, + "loss": 0.3594, + "step": 1780 + }, + { + "epoch": 0.5, + "logps_train/chosen": -46.415016174316406, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -79.39259338378906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5808570384979248, + "rewards_train/margins": 2.629349946975708, + "rewards_train/rejected": -3.210206985473633, + "step": 1780 + }, + { + "epoch": 0.5, + "logps_train/chosen": -37.79524230957031, + "logps_train/ref_chosen": -23.875, + "logps_train/ref_rejected": -28.875, + "logps_train/rejected": -44.77808380126953, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.3961255550384521, + "rewards_train/margins": 0.20023739337921143, + "rewards_train/rejected": -1.5963629484176636, + "step": 1781 + }, + { + "epoch": 0.5, + "learning_rate": 1.3697136174709386e-07, + "loss": 0.4521, + "step": 1782 + }, + { + "epoch": 0.5, + "logps_train/chosen": -70.57560729980469, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -91.58863830566406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.71346914768219, + "rewards_train/margins": 1.2479337453842163, + "rewards_train/rejected": -2.9614028930664062, + "step": 1782 + }, + { + "epoch": 0.5, + "logps_train/chosen": -57.05122375488281, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -63.5496711730957, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9015088081359863, + "rewards_train/margins": 0.7023842334747314, + "rewards_train/rejected": -1.6038930416107178, + "step": 1783 + }, + { + "epoch": 0.5, + "learning_rate": 1.3641966482075205e-07, + "loss": 0.4846, + "step": 1784 + }, + { + "epoch": 0.5, + "logps_train/chosen": -90.04568481445312, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -114.89344787597656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.9240999221801758, + "rewards_train/margins": 0.944542407989502, + "rewards_train/rejected": -2.8686423301696777, + "step": 1784 + }, + { + "epoch": 0.5, + "logps_train/chosen": -62.505821228027344, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -55.99300003051758, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.7806605100631714, + "rewards_train/margins": 0.21375662088394165, + "rewards_train/rejected": -0.994417130947113, + "step": 1785 + }, + { + "epoch": 0.5, + "learning_rate": 1.35868664006899e-07, + "loss": 0.6153, + "step": 1786 + }, + { + "epoch": 0.5, + "logps_train/chosen": -69.90780639648438, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -36.75, + "logps_train/rejected": -54.83738708496094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6353120803833008, + "rewards_train/margins": 0.17498886585235596, + "rewards_train/rejected": -1.8103009462356567, + "step": 1786 + }, + { + "epoch": 0.5, + "logps_train/chosen": -84.24098205566406, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -103.84087371826172, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.306910753250122, + "rewards_train/margins": 1.5486609935760498, + "rewards_train/rejected": -2.855571746826172, + "step": 1787 + }, + { + "epoch": 0.5, + "learning_rate": 1.3531836268251494e-07, + "loss": 0.7109, + "step": 1788 + }, + { + "epoch": 0.5, + "logps_train/chosen": -83.69889831542969, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -105.857177734375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.070427179336548, + "rewards_train/margins": 1.9996659755706787, + "rewards_train/rejected": -4.070093154907227, + "step": 1788 + }, + { + "epoch": 0.5, + "logps_train/chosen": -59.895565032958984, + "logps_train/ref_chosen": -47.25, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -80.91615295410156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2710018157958984, + "rewards_train/margins": 1.7010817527770996, + "rewards_train/rejected": -2.972083568572998, + "step": 1789 + }, + { + "epoch": 0.5, + "learning_rate": 1.34768764220293e-07, + "loss": 0.3358, + "step": 1790 + }, + { + "epoch": 0.5, + "logps_train/chosen": -119.17463684082031, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -95.608642578125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.7974438667297363, + "rewards_train/margins": -0.2254467010498047, + "rewards_train/rejected": -2.5719971656799316, + "step": 1790 + }, + { + "epoch": 0.5, + "logps_train/chosen": -65.99052429199219, + "logps_train/ref_chosen": -46.5, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -95.28819274902344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.9435832500457764, + "rewards_train/margins": 1.1352365016937256, + "rewards_train/rejected": -3.078819751739502, + "step": 1791 + }, + { + "epoch": 0.5, + "learning_rate": 1.3421987198861866e-07, + "loss": 0.8264, + "step": 1792 + }, + { + "epoch": 0.5, + "logps_train/chosen": -31.896549224853516, + "logps_train/ref_chosen": -26.25, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -56.73073959350586, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5677797794342041, + "rewards_train/margins": 1.3667197227478027, + "rewards_train/rejected": -1.9344995021820068, + "step": 1792 + }, + { + "epoch": 0.5, + "logps_train/chosen": -77.688720703125, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -86.49745178222656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2786376476287842, + "rewards_train/margins": 1.1798484325408936, + "rewards_train/rejected": -2.4584860801696777, + "step": 1793 + }, + { + "epoch": 0.5, + "learning_rate": 1.336716893515492e-07, + "loss": 0.4138, + "step": 1794 + }, + { + "epoch": 0.5, + "logps_train/chosen": -82.18788146972656, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -99.10455322265625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.650233507156372, + "rewards_train/margins": 1.616276741027832, + "rewards_train/rejected": -3.266510248184204, + "step": 1794 + }, + { + "epoch": 0.5, + "logps_train/chosen": -83.833251953125, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -111.43186950683594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8843994140625, + "rewards_train/margins": 1.9326152801513672, + "rewards_train/rejected": -3.817014694213867, + "step": 1795 + }, + { + "epoch": 0.5, + "learning_rate": 1.3312421966879273e-07, + "loss": 0.5196, + "step": 1796 + }, + { + "epoch": 0.5, + "logps_train/chosen": -96.25450134277344, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -114.88604736328125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6770128011703491, + "rewards_train/margins": 2.2950881719589233, + "rewards_train/rejected": -3.9721009731292725, + "step": 1796 + }, + { + "epoch": 0.5, + "logps_train/chosen": -86.13140106201172, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -105.61611938476562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2568906545639038, + "rewards_train/margins": 1.9767910242080688, + "rewards_train/rejected": -3.2336816787719727, + "step": 1797 + }, + { + "epoch": 0.5, + "learning_rate": 1.3257746629568772e-07, + "loss": 0.3998, + "step": 1798 + }, + { + "epoch": 0.5, + "logps_train/chosen": -53.450172424316406, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -85.01348114013672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9946267604827881, + "rewards_train/margins": 2.2108232975006104, + "rewards_train/rejected": -3.2054500579833984, + "step": 1798 + }, + { + "epoch": 0.5, + "logps_train/chosen": -28.21187973022461, + "logps_train/ref_chosen": -25.125, + "logps_train/ref_rejected": -22.5, + "logps_train/rejected": -30.372562408447266, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.30788224935531616, + "rewards_train/margins": 0.4855995178222656, + "rewards_train/rejected": -0.7934817671775818, + "step": 1799 + }, + { + "epoch": 0.5, + "learning_rate": 1.3203143258318272e-07, + "loss": 0.4383, + "step": 1800 + }, + { + "epoch": 0.5, + "logps_train/chosen": -41.677310943603516, + "logps_train/ref_chosen": -32.25, + "logps_train/ref_rejected": -19.125, + "logps_train/rejected": -35.0609130859375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9311587810516357, + "rewards_train/margins": 0.6671199798583984, + "rewards_train/rejected": -1.5982787609100342, + "step": 1800 + }, + { + "epoch": 0.5, + "logps_train/chosen": -53.61812973022461, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -89.99407196044922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9002898335456848, + "rewards_train/margins": 1.2363728880882263, + "rewards_train/rejected": -2.136662721633911, + "step": 1801 + }, + { + "epoch": 0.5, + "learning_rate": 1.314861218778156e-07, + "loss": 0.4323, + "step": 1802 + }, + { + "epoch": 0.5, + "logps_train/chosen": -71.07485961914062, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -87.70455932617188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.233511209487915, + "rewards_train/margins": 1.4119446277618408, + "rewards_train/rejected": -2.645455837249756, + "step": 1802 + }, + { + "epoch": 0.5, + "logps_train/chosen": -46.52236557006836, + "logps_train/ref_chosen": -36.0, + "logps_train/ref_rejected": -35.5, + "logps_train/rejected": -55.869659423828125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0539944171905518, + "rewards_train/margins": 0.9950804710388184, + "rewards_train/rejected": -2.04907488822937, + "step": 1803 + }, + { + "epoch": 0.5, + "learning_rate": 1.3094153752169307e-07, + "loss": 0.3371, + "step": 1804 + }, + { + "epoch": 0.5, + "logps_train/chosen": -70.13768768310547, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -73.90690612792969, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.3694329261779785, + "rewards_train/margins": 1.1970391273498535, + "rewards_train/rejected": -2.566472053527832, + "step": 1804 + }, + { + "epoch": 0.5, + "logps_train/chosen": -53.03296661376953, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -105.7511215209961, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3784918785095215, + "rewards_train/margins": 3.680800437927246, + "rewards_train/rejected": -5.059292316436768, + "step": 1805 + }, + { + "epoch": 0.5, + "learning_rate": 1.3039768285246988e-07, + "loss": 0.4646, + "step": 1806 + }, + { + "epoch": 0.5, + "logps_train/chosen": -101.19830322265625, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -137.64564514160156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.6630680561065674, + "rewards_train/margins": 1.894465684890747, + "rewards_train/rejected": -4.5575337409973145, + "step": 1806 + }, + { + "epoch": 0.51, + "logps_train/chosen": -35.44614791870117, + "logps_train/ref_chosen": -29.5, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -49.23463439941406, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5943704843521118, + "rewards_train/margins": 0.7475991249084473, + "rewards_train/rejected": -1.341969609260559, + "step": 1807 + }, + { + "epoch": 0.51, + "learning_rate": 1.2985456120332906e-07, + "loss": 0.3964, + "step": 1808 + }, + { + "epoch": 0.51, + "logps_train/chosen": -89.47793579101562, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -92.598876953125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2364659309387207, + "rewards_train/margins": 0.88826584815979, + "rewards_train/rejected": -2.1247317790985107, + "step": 1808 + }, + { + "epoch": 0.51, + "logps_train/chosen": -56.61750030517578, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -77.45428466796875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.775275707244873, + "rewards_train/margins": 0.792320728302002, + "rewards_train/rejected": -1.567596435546875, + "step": 1809 + }, + { + "epoch": 0.51, + "learning_rate": 1.29312175902961e-07, + "loss": 0.4759, + "step": 1810 + }, + { + "epoch": 0.51, + "logps_train/chosen": -69.15110778808594, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -87.00193786621094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2891346216201782, + "rewards_train/margins": 1.273363709449768, + "rewards_train/rejected": -2.5624983310699463, + "step": 1810 + }, + { + "epoch": 0.51, + "logps_train/chosen": -117.8868408203125, + "logps_train/ref_chosen": -99.5, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -121.09489440917969, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8168091773986816, + "rewards_train/margins": 1.848149061203003, + "rewards_train/rejected": -3.6649582386016846, + "step": 1811 + }, + { + "epoch": 0.51, + "learning_rate": 1.2877053027554289e-07, + "loss": 0.408, + "step": 1812 + }, + { + "epoch": 0.51, + "logps_train/chosen": -43.83164978027344, + "logps_train/ref_chosen": -33.75, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -74.97264099121094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9997665286064148, + "rewards_train/margins": 1.9084346890449524, + "rewards_train/rejected": -2.908201217651367, + "step": 1812 + }, + { + "epoch": 0.51, + "logps_train/chosen": -93.790283203125, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -114.37774658203125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8887935876846313, + "rewards_train/margins": 1.6864818334579468, + "rewards_train/rejected": -3.575275421142578, + "step": 1813 + }, + { + "epoch": 0.51, + "learning_rate": 1.2822962764071888e-07, + "loss": 0.3362, + "step": 1814 + }, + { + "epoch": 0.51, + "logps_train/chosen": -86.02261352539062, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -119.37617492675781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1738920211791992, + "rewards_train/margins": 3.034770965576172, + "rewards_train/rejected": -4.208662986755371, + "step": 1814 + }, + { + "epoch": 0.51, + "logps_train/chosen": -51.77228546142578, + "logps_train/ref_chosen": -43.25, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -67.97145080566406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8561345338821411, + "rewards_train/margins": 1.2830032110214233, + "rewards_train/rejected": -2.1391377449035645, + "step": 1815 + }, + { + "epoch": 0.51, + "learning_rate": 1.2768947131357937e-07, + "loss": 0.33, + "step": 1816 + }, + { + "epoch": 0.51, + "logps_train/chosen": -54.13633728027344, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -62.22792053222656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9581648111343384, + "rewards_train/margins": 0.7034945487976074, + "rewards_train/rejected": -1.6616593599319458, + "step": 1816 + }, + { + "epoch": 0.51, + "logps_train/chosen": -89.08345031738281, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -105.21546173095703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5163532495498657, + "rewards_train/margins": 2.492693781852722, + "rewards_train/rejected": -4.009047031402588, + "step": 1817 + }, + { + "epoch": 0.51, + "learning_rate": 1.2715006460464093e-07, + "loss": 0.3507, + "step": 1818 + }, + { + "epoch": 0.51, + "logps_train/chosen": -34.79269790649414, + "logps_train/ref_chosen": -26.25, + "logps_train/ref_rejected": -26.75, + "logps_train/rejected": -40.77827453613281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8591037392616272, + "rewards_train/margins": 0.5477034449577332, + "rewards_train/rejected": -1.4068071842193604, + "step": 1818 + }, + { + "epoch": 0.51, + "logps_train/chosen": -131.22537231445312, + "logps_train/ref_chosen": -103.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -149.39248657226562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.822537422180176, + "rewards_train/margins": 2.0487422943115234, + "rewards_train/rejected": -4.871279716491699, + "step": 1819 + }, + { + "epoch": 0.51, + "learning_rate": 1.2661141081982545e-07, + "loss": 0.4243, + "step": 1820 + }, + { + "epoch": 0.51, + "logps_train/chosen": -88.078857421875, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -113.62788391113281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7383538484573364, + "rewards_train/margins": 1.958809733390808, + "rewards_train/rejected": -2.6971635818481445, + "step": 1820 + }, + { + "epoch": 0.51, + "logps_train/chosen": -84.630859375, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -91.62677764892578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7440423965454102, + "rewards_train/margins": 1.5096511840820312, + "rewards_train/rejected": -3.2536935806274414, + "step": 1821 + }, + { + "epoch": 0.51, + "learning_rate": 1.2607351326044074e-07, + "loss": 0.2819, + "step": 1822 + }, + { + "epoch": 0.51, + "logps_train/chosen": -63.17647933959961, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -95.75277709960938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2062222957611084, + "rewards_train/margins": 2.6799933910369873, + "rewards_train/rejected": -3.8862156867980957, + "step": 1822 + }, + { + "epoch": 0.51, + "logps_train/chosen": -95.8468017578125, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -117.04344177246094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.633227825164795, + "rewards_train/margins": 1.5704450607299805, + "rewards_train/rejected": -3.2036728858947754, + "step": 1823 + }, + { + "epoch": 0.51, + "learning_rate": 1.2553637522315967e-07, + "loss": 0.2748, + "step": 1824 + }, + { + "epoch": 0.51, + "logps_train/chosen": -21.561628341674805, + "logps_train/ref_chosen": -15.875, + "logps_train/ref_rejected": -11.0625, + "logps_train/rejected": -25.239707946777344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5680767297744751, + "rewards_train/margins": 0.8473002910614014, + "rewards_train/rejected": -1.4153770208358765, + "step": 1824 + }, + { + "epoch": 0.51, + "logps_train/chosen": -93.94432830810547, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -119.57408905029297, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.6206047534942627, + "rewards_train/margins": 1.6477415561676025, + "rewards_train/rejected": -4.268346309661865, + "step": 1825 + }, + { + "epoch": 0.51, + "learning_rate": 1.2500000000000005e-07, + "loss": 0.4641, + "step": 1826 + }, + { + "epoch": 0.51, + "logps_train/chosen": -96.84722137451172, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -125.43040466308594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3909720182418823, + "rewards_train/margins": 2.451971173286438, + "rewards_train/rejected": -3.8429431915283203, + "step": 1826 + }, + { + "epoch": 0.51, + "logps_train/chosen": -83.45915222167969, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -95.38729858398438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.786729097366333, + "rewards_train/margins": 0.764153242111206, + "rewards_train/rejected": -2.550882339477539, + "step": 1827 + }, + { + "epoch": 0.51, + "learning_rate": 1.2446439087830462e-07, + "loss": 0.3792, + "step": 1828 + }, + { + "epoch": 0.51, + "logps_train/chosen": -60.217140197753906, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -69.36685180664062, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.0772801637649536, + "rewards_train/margins": 0.9910463094711304, + "rewards_train/rejected": -2.068326473236084, + "step": 1828 + }, + { + "epoch": 0.51, + "logps_train/chosen": -96.72798156738281, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -131.86734008789062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1935014724731445, + "rewards_train/margins": 2.4166698455810547, + "rewards_train/rejected": -3.610171318054199, + "step": 1829 + }, + { + "epoch": 0.51, + "learning_rate": 1.2392955114072098e-07, + "loss": 0.371, + "step": 1830 + }, + { + "epoch": 0.51, + "logps_train/chosen": -84.11962127685547, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -118.79898071289062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4166494607925415, + "rewards_train/margins": 2.806998133659363, + "rewards_train/rejected": -4.223647594451904, + "step": 1830 + }, + { + "epoch": 0.51, + "logps_train/chosen": -55.4085807800293, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -52.75, + "logps_train/rejected": -75.881591796875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2488659620285034, + "rewards_train/margins": 1.0535508394241333, + "rewards_train/rejected": -2.3024168014526367, + "step": 1831 + }, + { + "epoch": 0.51, + "learning_rate": 1.23395484065181e-07, + "loss": 0.3201, + "step": 1832 + }, + { + "epoch": 0.51, + "logps_train/chosen": -60.6721076965332, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -78.96751403808594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2451891899108887, + "rewards_train/margins": 1.912353754043579, + "rewards_train/rejected": -3.1575429439544678, + "step": 1832 + }, + { + "epoch": 0.51, + "logps_train/chosen": -67.5907974243164, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -79.24272918701172, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8152318000793457, + "rewards_train/margins": 0.4012284278869629, + "rewards_train/rejected": -2.2164602279663086, + "step": 1833 + }, + { + "epoch": 0.51, + "learning_rate": 1.228621929248813e-07, + "loss": 0.418, + "step": 1834 + }, + { + "epoch": 0.51, + "logps_train/chosen": -50.921104431152344, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -65.22193908691406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.523750901222229, + "rewards_train/margins": 1.4542043209075928, + "rewards_train/rejected": -1.9779552221298218, + "step": 1834 + }, + { + "epoch": 0.51, + "logps_train/chosen": -101.90826416015625, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -126.49099731445312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.65059232711792, + "rewards_train/margins": 3.072726249694824, + "rewards_train/rejected": -4.723318576812744, + "step": 1835 + }, + { + "epoch": 0.51, + "learning_rate": 1.2232968098826284e-07, + "loss": 0.3201, + "step": 1836 + }, + { + "epoch": 0.51, + "logps_train/chosen": -51.65633773803711, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -38.25, + "logps_train/rejected": -49.79901123046875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.0818445682525635, + "rewards_train/margins": 0.0666114091873169, + "rewards_train/rejected": -1.1484559774398804, + "step": 1836 + }, + { + "epoch": 0.51, + "logps_train/chosen": -114.28877258300781, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -164.07217407226562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -3.4196486473083496, + "rewards_train/margins": 2.074678897857666, + "rewards_train/rejected": -5.494327545166016, + "step": 1837 + }, + { + "epoch": 0.51, + "learning_rate": 1.217979515189912e-07, + "loss": 0.5245, + "step": 1838 + }, + { + "epoch": 0.51, + "logps_train/chosen": -106.10956573486328, + "logps_train/ref_chosen": -86.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -118.07239532470703, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.0093939304351807, + "rewards_train/margins": 1.4376893043518066, + "rewards_train/rejected": -3.4470832347869873, + "step": 1838 + }, + { + "epoch": 0.51, + "logps_train/chosen": -70.26261901855469, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -84.50745391845703, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7466719150543213, + "rewards_train/margins": 0.9300498962402344, + "rewards_train/rejected": -2.6767218112945557, + "step": 1839 + }, + { + "epoch": 0.51, + "learning_rate": 1.212670077759359e-07, + "loss": 0.4998, + "step": 1840 + }, + { + "epoch": 0.51, + "logps_train/chosen": -88.94002532958984, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -107.8089599609375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5893150568008423, + "rewards_train/margins": 1.4259554147720337, + "rewards_train/rejected": -3.015270471572876, + "step": 1840 + }, + { + "epoch": 0.51, + "logps_train/chosen": -74.72373962402344, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -113.5050277709961, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5709335803985596, + "rewards_train/margins": 2.22956919670105, + "rewards_train/rejected": -3.8005027770996094, + "step": 1841 + }, + { + "epoch": 0.51, + "learning_rate": 1.2073685301315133e-07, + "loss": 0.2643, + "step": 1842 + }, + { + "epoch": 0.51, + "logps_train/chosen": -88.02828979492188, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -132.9336395263672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.359664797782898, + "rewards_train/margins": 2.0989338159561157, + "rewards_train/rejected": -3.4585986137390137, + "step": 1842 + }, + { + "epoch": 0.52, + "logps_train/chosen": -79.47792053222656, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -85.26771545410156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3286516666412354, + "rewards_train/margins": 0.9178466796875, + "rewards_train/rejected": -2.2464983463287354, + "step": 1843 + }, + { + "epoch": 0.52, + "learning_rate": 1.2020749047985625e-07, + "loss": 0.4137, + "step": 1844 + }, + { + "epoch": 0.52, + "logps_train/chosen": -43.18343734741211, + "logps_train/ref_chosen": -34.75, + "logps_train/ref_rejected": -41.75, + "logps_train/rejected": -56.77774429321289, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.855648398399353, + "rewards_train/margins": 0.6510323286056519, + "rewards_train/rejected": -1.5066807270050049, + "step": 1844 + }, + { + "epoch": 0.52, + "logps_train/chosen": -77.59200286865234, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -100.8201675415039, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.642793893814087, + "rewards_train/margins": 1.9681293964385986, + "rewards_train/rejected": -3.6109232902526855, + "step": 1845 + }, + { + "epoch": 0.52, + "learning_rate": 1.196789234204138e-07, + "loss": 0.4572, + "step": 1846 + }, + { + "epoch": 0.52, + "logps_train/chosen": -68.59378814697266, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -101.05061340332031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7761759757995605, + "rewards_train/margins": 1.465604305267334, + "rewards_train/rejected": -2.2417802810668945, + "step": 1846 + }, + { + "epoch": 0.52, + "logps_train/chosen": -112.95681762695312, + "logps_train/ref_chosen": -94.5, + "logps_train/ref_rejected": -127.5, + "logps_train/rejected": -179.93466186523438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8581812381744385, + "rewards_train/margins": 3.404034376144409, + "rewards_train/rejected": -5.262215614318848, + "step": 1847 + }, + { + "epoch": 0.52, + "learning_rate": 1.1915115507431207e-07, + "loss": 0.2125, + "step": 1848 + }, + { + "epoch": 0.52, + "logps_train/chosen": -78.44413757324219, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -90.45282745361328, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6540809869766235, + "rewards_train/margins": 1.2925690412521362, + "rewards_train/rejected": -2.9466500282287598, + "step": 1848 + }, + { + "epoch": 0.52, + "logps_train/chosen": -87.24072265625, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -134.02291870117188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4100093841552734, + "rewards_train/margins": 2.7375946044921875, + "rewards_train/rejected": -4.147603988647461, + "step": 1849 + }, + { + "epoch": 0.52, + "learning_rate": 1.1862418867614385e-07, + "loss": 0.4177, + "step": 1850 + }, + { + "epoch": 0.52, + "logps_train/chosen": -32.44071960449219, + "logps_train/ref_chosen": -25.25, + "logps_train/ref_rejected": -28.375, + "logps_train/rejected": -43.17241287231445, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.715556263923645, + "rewards_train/margins": 0.7645754814147949, + "rewards_train/rejected": -1.48013174533844, + "step": 1850 + }, + { + "epoch": 0.52, + "logps_train/chosen": -138.3934326171875, + "logps_train/ref_chosen": -107.0, + "logps_train/ref_rejected": -127.5, + "logps_train/rejected": -199.50967407226562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -3.13934326171875, + "rewards_train/margins": 4.055373191833496, + "rewards_train/rejected": -7.194716453552246, + "step": 1851 + }, + { + "epoch": 0.52, + "learning_rate": 1.1809802745558708e-07, + "loss": 0.2895, + "step": 1852 + }, + { + "epoch": 0.52, + "logps_train/chosen": -67.9940414428711, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -73.91470336914062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.490175724029541, + "rewards_train/margins": 1.0944101810455322, + "rewards_train/rejected": -2.5845859050750732, + "step": 1852 + }, + { + "epoch": 0.52, + "logps_train/chosen": -88.08970642089844, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -126.63687133789062, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.615220546722412, + "rewards_train/margins": 1.051591396331787, + "rewards_train/rejected": -3.666811943054199, + "step": 1853 + }, + { + "epoch": 0.52, + "learning_rate": 1.1757267463738465e-07, + "loss": 0.8369, + "step": 1854 + }, + { + "epoch": 0.52, + "logps_train/chosen": -85.12269592285156, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -108.0712890625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9829728603363037, + "rewards_train/margins": 1.3514995574951172, + "rewards_train/rejected": -3.334472417831421, + "step": 1854 + }, + { + "epoch": 0.52, + "logps_train/chosen": -65.36869049072266, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -88.8803482055664, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6564977169036865, + "rewards_train/margins": 1.757904052734375, + "rewards_train/rejected": -2.4144017696380615, + "step": 1855 + }, + { + "epoch": 0.52, + "learning_rate": 1.1704813344132514e-07, + "loss": 0.4072, + "step": 1856 + }, + { + "epoch": 0.52, + "logps_train/chosen": -92.02995300292969, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -105.53173065185547, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.514713764190674, + "rewards_train/margins": 0.7595534324645996, + "rewards_train/rejected": -3.2742671966552734, + "step": 1856 + }, + { + "epoch": 0.52, + "logps_train/chosen": -79.29387664794922, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -108.56607055664062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4967706203460693, + "rewards_train/margins": 1.8145246505737305, + "rewards_train/rejected": -3.3112952709198, + "step": 1857 + }, + { + "epoch": 0.52, + "learning_rate": 1.1652440708222284e-07, + "loss": 0.3761, + "step": 1858 + }, + { + "epoch": 0.52, + "logps_train/chosen": -108.27483367919922, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -126.79065704345703, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -3.0460376739501953, + "rewards_train/margins": 1.5861525535583496, + "rewards_train/rejected": -4.632190227508545, + "step": 1858 + }, + { + "epoch": 0.52, + "logps_train/chosen": -107.4876708984375, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -123.75631713867188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.5261106491088867, + "rewards_train/margins": 1.0970797538757324, + "rewards_train/rejected": -3.623190402984619, + "step": 1859 + }, + { + "epoch": 0.52, + "learning_rate": 1.1600149876989784e-07, + "loss": 0.3844, + "step": 1860 + }, + { + "epoch": 0.52, + "logps_train/chosen": -57.655517578125, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -79.86770629882812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9548091888427734, + "rewards_train/margins": 1.0356730222702026, + "rewards_train/rejected": -1.990482211112976, + "step": 1860 + }, + { + "epoch": 0.52, + "logps_train/chosen": -66.32403564453125, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -92.36734008789062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4661928415298462, + "rewards_train/margins": 1.7510100603103638, + "rewards_train/rejected": -3.21720290184021, + "step": 1861 + }, + { + "epoch": 0.52, + "learning_rate": 1.1547941170915685e-07, + "loss": 0.3558, + "step": 1862 + }, + { + "epoch": 0.52, + "logps_train/chosen": -42.74238586425781, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -50.778236389160156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.33380889892578125, + "rewards_train/margins": 0.9889365434646606, + "rewards_train/rejected": -1.322745442390442, + "step": 1862 + }, + { + "epoch": 0.52, + "logps_train/chosen": -107.78318786621094, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -102.5, + "logps_train/rejected": -152.5138702392578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6712876558303833, + "rewards_train/margins": 3.313693404197693, + "rewards_train/rejected": -4.984981060028076, + "step": 1863 + }, + { + "epoch": 0.52, + "learning_rate": 1.1495814909977311e-07, + "loss": 0.3325, + "step": 1864 + }, + { + "epoch": 0.52, + "logps_train/chosen": -66.98416137695312, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -87.318115234375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2640411853790283, + "rewards_train/margins": 1.536618947982788, + "rewards_train/rejected": -2.8006601333618164, + "step": 1864 + }, + { + "epoch": 0.52, + "logps_train/chosen": -78.55347442626953, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -84.07122802734375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9069098830223083, + "rewards_train/margins": 1.95372873544693, + "rewards_train/rejected": -2.8606386184692383, + "step": 1865 + }, + { + "epoch": 0.52, + "learning_rate": 1.1443771413646711e-07, + "loss": 0.3175, + "step": 1866 + }, + { + "epoch": 0.52, + "logps_train/chosen": -71.20903015136719, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -97.3011474609375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8240283727645874, + "rewards_train/margins": 2.2349923849105835, + "rewards_train/rejected": -3.059020757675171, + "step": 1866 + }, + { + "epoch": 0.52, + "logps_train/chosen": -49.83819580078125, + "logps_train/ref_chosen": -38.25, + "logps_train/ref_rejected": -40.75, + "logps_train/rejected": -61.071800231933594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1485167741775513, + "rewards_train/margins": 0.8909875154495239, + "rewards_train/rejected": -2.039504289627075, + "step": 1867 + }, + { + "epoch": 0.52, + "learning_rate": 1.139181100088866e-07, + "loss": 0.4216, + "step": 1868 + }, + { + "epoch": 0.52, + "logps_train/chosen": -102.03263854980469, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -134.36888122558594, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.270841360092163, + "rewards_train/margins": 1.7730779647827148, + "rewards_train/rejected": -3.043919324874878, + "step": 1868 + }, + { + "epoch": 0.52, + "logps_train/chosen": -87.11683654785156, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -103.55712890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1390278339385986, + "rewards_train/margins": 2.2823097705841064, + "rewards_train/rejected": -3.421337604522705, + "step": 1869 + }, + { + "epoch": 0.52, + "learning_rate": 1.1339933990158749e-07, + "loss": 0.2687, + "step": 1870 + }, + { + "epoch": 0.52, + "logps_train/chosen": -62.227073669433594, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -83.20600891113281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6632344722747803, + "rewards_train/margins": 1.267132043838501, + "rewards_train/rejected": -2.9303665161132812, + "step": 1870 + }, + { + "epoch": 0.52, + "logps_train/chosen": -22.738014221191406, + "logps_train/ref_chosen": -16.0, + "logps_train/ref_rejected": -22.375, + "logps_train/rejected": -36.29288101196289, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6761450171470642, + "rewards_train/margins": 0.7185729146003723, + "rewards_train/rejected": -1.3947179317474365, + "step": 1871 + }, + { + "epoch": 0.52, + "learning_rate": 1.1288140699401421e-07, + "loss": 0.4092, + "step": 1872 + }, + { + "epoch": 0.52, + "logps_train/chosen": -57.997798919677734, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -73.33271789550781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6044674515724182, + "rewards_train/margins": 1.2905232310295105, + "rewards_train/rejected": -1.8949906826019287, + "step": 1872 + }, + { + "epoch": 0.52, + "logps_train/chosen": -54.176692962646484, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -86.22315979003906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8534112572669983, + "rewards_train/margins": 2.4538163542747498, + "rewards_train/rejected": -3.307227611541748, + "step": 1873 + }, + { + "epoch": 0.52, + "learning_rate": 1.1236431446047984e-07, + "loss": 0.3006, + "step": 1874 + }, + { + "epoch": 0.52, + "logps_train/chosen": -81.40237426757812, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -106.32112121582031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4635772705078125, + "rewards_train/margins": 2.0947067737579346, + "rewards_train/rejected": -3.558284044265747, + "step": 1874 + }, + { + "epoch": 0.52, + "logps_train/chosen": -94.24456787109375, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -120.14347839355469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.484222173690796, + "rewards_train/margins": 1.7141101360321045, + "rewards_train/rejected": -3.1983323097229004, + "step": 1875 + }, + { + "epoch": 0.52, + "learning_rate": 1.1184806547014725e-07, + "loss": 0.4348, + "step": 1876 + }, + { + "epoch": 0.52, + "logps_train/chosen": -77.82167053222656, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -101.10649108886719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.754042148590088, + "rewards_train/margins": 1.5484039783477783, + "rewards_train/rejected": -3.302446126937866, + "step": 1876 + }, + { + "epoch": 0.52, + "logps_train/chosen": -110.63151550292969, + "logps_train/ref_chosen": -86.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -128.6043243408203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.447136163711548, + "rewards_train/margins": 2.15899920463562, + "rewards_train/rejected": -4.606135368347168, + "step": 1877 + }, + { + "epoch": 0.52, + "learning_rate": 1.1133266318700931e-07, + "loss": 0.3177, + "step": 1878 + }, + { + "epoch": 0.52, + "logps_train/chosen": -72.16413116455078, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -106.85801696777344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9018137454986572, + "rewards_train/margins": 2.35762095451355, + "rewards_train/rejected": -4.259434700012207, + "step": 1878 + }, + { + "epoch": 0.53, + "logps_train/chosen": -96.20481872558594, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -99.30570983886719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.954075813293457, + "rewards_train/margins": 1.7632145881652832, + "rewards_train/rejected": -3.7172904014587402, + "step": 1879 + }, + { + "epoch": 0.53, + "learning_rate": 1.1081811076986963e-07, + "loss": 0.3477, + "step": 1880 + }, + { + "epoch": 0.53, + "logps_train/chosen": -103.07200622558594, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -133.30667114257812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.6540753841400146, + "rewards_train/margins": 2.924247980117798, + "rewards_train/rejected": -5.5783233642578125, + "step": 1880 + }, + { + "epoch": 0.53, + "logps_train/chosen": -75.77108764648438, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -73.75123596191406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3454680442810059, + "rewards_train/margins": 1.103874683380127, + "rewards_train/rejected": -2.449342727661133, + "step": 1881 + }, + { + "epoch": 0.53, + "learning_rate": 1.1030441137232294e-07, + "loss": 0.2516, + "step": 1882 + }, + { + "epoch": 0.53, + "logps_train/chosen": -61.58794403076172, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -61.39958953857422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5322317481040955, + "rewards_train/margins": 0.9374150633811951, + "rewards_train/rejected": -1.4696468114852905, + "step": 1882 + }, + { + "epoch": 0.53, + "logps_train/chosen": -72.37861633300781, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -91.42536163330078, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.844501793384552, + "rewards_train/margins": 1.1734248995780945, + "rewards_train/rejected": -2.0179266929626465, + "step": 1883 + }, + { + "epoch": 0.53, + "learning_rate": 1.0979156814273621e-07, + "loss": 0.3806, + "step": 1884 + }, + { + "epoch": 0.53, + "logps_train/chosen": -65.94540405273438, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -95.26437377929688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1222748756408691, + "rewards_train/margins": 1.2508411407470703, + "rewards_train/rejected": -2.3731160163879395, + "step": 1884 + }, + { + "epoch": 0.53, + "logps_train/chosen": -69.76664733886719, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -105.36920166015625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9269583225250244, + "rewards_train/margins": 1.6904304027557373, + "rewards_train/rejected": -3.6173887252807617, + "step": 1885 + }, + { + "epoch": 0.53, + "learning_rate": 1.0927958422422911e-07, + "loss": 0.3645, + "step": 1886 + }, + { + "epoch": 0.53, + "logps_train/chosen": -31.979169845581055, + "logps_train/ref_chosen": -21.375, + "logps_train/ref_rejected": -26.5, + "logps_train/rejected": -47.55209732055664, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0623213052749634, + "rewards_train/margins": 1.0368338823318481, + "rewards_train/rejected": -2.0991551876068115, + "step": 1886 + }, + { + "epoch": 0.53, + "logps_train/chosen": -80.36085510253906, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -94.09173583984375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7400413751602173, + "rewards_train/margins": 1.7382735013961792, + "rewards_train/rejected": -3.4783148765563965, + "step": 1887 + }, + { + "epoch": 0.53, + "learning_rate": 1.0876846275465453e-07, + "loss": 0.4227, + "step": 1888 + }, + { + "epoch": 0.53, + "logps_train/chosen": -104.34622955322266, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -97.5, + "logps_train/rejected": -139.8173828125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -3.422123432159424, + "rewards_train/margins": 0.8174281120300293, + "rewards_train/rejected": -4.239551544189453, + "step": 1888 + }, + { + "epoch": 0.53, + "logps_train/chosen": -118.78239440917969, + "logps_train/ref_chosen": -107.0, + "logps_train/ref_rejected": -108.0, + "logps_train/rejected": -154.7715606689453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1876142024993896, + "rewards_train/margins": 3.4661037921905518, + "rewards_train/rejected": -4.653717994689941, + "step": 1889 + }, + { + "epoch": 0.53, + "learning_rate": 1.0825820686657978e-07, + "loss": 0.2776, + "step": 1890 + }, + { + "epoch": 0.53, + "logps_train/chosen": -92.30839538574219, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -105.17121887207031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8191206455230713, + "rewards_train/margins": 1.3874547481536865, + "rewards_train/rejected": -3.206575393676758, + "step": 1890 + }, + { + "epoch": 0.53, + "logps_train/chosen": -89.18570709228516, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -102.78724670410156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.908805251121521, + "rewards_train/margins": 1.424216628074646, + "rewards_train/rejected": -3.333021879196167, + "step": 1891 + }, + { + "epoch": 0.53, + "learning_rate": 1.0774881968726721e-07, + "loss": 0.4718, + "step": 1892 + }, + { + "epoch": 0.53, + "logps_train/chosen": -70.77232360839844, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -100.51925659179688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.485436201095581, + "rewards_train/margins": 1.9508650302886963, + "rewards_train/rejected": -3.4363012313842773, + "step": 1892 + }, + { + "epoch": 0.53, + "logps_train/chosen": -89.73664855957031, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -104.7386474609375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.4354815483093262, + "rewards_train/margins": 1.6090869903564453, + "rewards_train/rejected": -3.0445685386657715, + "step": 1893 + }, + { + "epoch": 0.53, + "learning_rate": 1.0724030433865475e-07, + "loss": 0.3676, + "step": 1894 + }, + { + "epoch": 0.53, + "logps_train/chosen": -107.09449768066406, + "logps_train/ref_chosen": -83.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -117.95587158203125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.3559341430664062, + "rewards_train/margins": 0.7964887619018555, + "rewards_train/rejected": -3.1524229049682617, + "step": 1894 + }, + { + "epoch": 0.53, + "logps_train/chosen": -67.088623046875, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -95.45256805419922, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3133540153503418, + "rewards_train/margins": 1.683074712753296, + "rewards_train/rejected": -2.9964287281036377, + "step": 1895 + }, + { + "epoch": 0.53, + "learning_rate": 1.0673266393733732e-07, + "loss": 0.48, + "step": 1896 + }, + { + "epoch": 0.53, + "logps_train/chosen": -57.43756103515625, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -59.90660095214844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7033750414848328, + "rewards_train/margins": 1.03792005777359, + "rewards_train/rejected": -1.7412950992584229, + "step": 1896 + }, + { + "epoch": 0.53, + "logps_train/chosen": -39.66823196411133, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -69.99291229248047, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5634051561355591, + "rewards_train/margins": 1.352292537689209, + "rewards_train/rejected": -1.915697693824768, + "step": 1897 + }, + { + "epoch": 0.53, + "learning_rate": 1.062259015945474e-07, + "loss": 0.4262, + "step": 1898 + }, + { + "epoch": 0.53, + "logps_train/chosen": -73.24278259277344, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -113.96565246582031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.681017279624939, + "rewards_train/margins": 2.301095128059387, + "rewards_train/rejected": -3.982112407684326, + "step": 1898 + }, + { + "epoch": 0.53, + "logps_train/chosen": -30.016342163085938, + "logps_train/ref_chosen": -23.125, + "logps_train/ref_rejected": -29.75, + "logps_train/rejected": -41.85903549194336, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6856186389923096, + "rewards_train/margins": 0.5196206569671631, + "rewards_train/rejected": -1.2052392959594727, + "step": 1899 + }, + { + "epoch": 0.53, + "learning_rate": 1.0572002041613597e-07, + "loss": 0.3987, + "step": 1900 + }, + { + "epoch": 0.53, + "logps_train/chosen": -82.3949203491211, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -112.35581970214844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3531635999679565, + "rewards_train/margins": 1.0581990480422974, + "rewards_train/rejected": -2.411362648010254, + "step": 1900 + }, + { + "epoch": 0.53, + "logps_train/chosen": -45.51484680175781, + "logps_train/ref_chosen": -37.5, + "logps_train/ref_rejected": -43.75, + "logps_train/rejected": -69.83395385742188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8014847040176392, + "rewards_train/margins": 1.8037861585617065, + "rewards_train/rejected": -2.6052708625793457, + "step": 1901 + }, + { + "epoch": 0.53, + "learning_rate": 1.0521502350255346e-07, + "loss": 0.3813, + "step": 1902 + }, + { + "epoch": 0.53, + "logps_train/chosen": -90.20823669433594, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -119.5953140258789, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5494375228881836, + "rewards_train/margins": 2.3565785884857178, + "rewards_train/rejected": -3.9060161113739014, + "step": 1902 + }, + { + "epoch": 0.53, + "logps_train/chosen": -79.43113708496094, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -80.36499786376953, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2706531286239624, + "rewards_train/margins": 1.3228775262832642, + "rewards_train/rejected": -2.5935306549072266, + "step": 1903 + }, + { + "epoch": 0.53, + "learning_rate": 1.0471091394883085e-07, + "loss": 0.3232, + "step": 1904 + }, + { + "epoch": 0.53, + "logps_train/chosen": -100.63670349121094, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -122.93672180175781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2228502035140991, + "rewards_train/margins": 2.218478798866272, + "rewards_train/rejected": -3.441329002380371, + "step": 1904 + }, + { + "epoch": 0.53, + "logps_train/chosen": -56.86190414428711, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -52.75, + "logps_train/rejected": -73.1705551147461, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.824276328086853, + "rewards_train/margins": 1.226080060005188, + "rewards_train/rejected": -2.050356388092041, + "step": 1905 + }, + { + "epoch": 0.53, + "learning_rate": 1.0420769484456085e-07, + "loss": 0.3196, + "step": 1906 + }, + { + "epoch": 0.53, + "logps_train/chosen": -61.427406311035156, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -91.34333801269531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.938248336315155, + "rewards_train/margins": 2.617960751056671, + "rewards_train/rejected": -3.556209087371826, + "step": 1906 + }, + { + "epoch": 0.53, + "logps_train/chosen": -67.15316772460938, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -80.15322875976562, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.393442153930664, + "rewards_train/margins": 0.6664113998413086, + "rewards_train/rejected": -2.0598535537719727, + "step": 1907 + }, + { + "epoch": 0.53, + "learning_rate": 1.0370536927387838e-07, + "loss": 0.4639, + "step": 1908 + }, + { + "epoch": 0.53, + "logps_train/chosen": -106.0721206665039, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -131.2165069580078, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.347836971282959, + "rewards_train/margins": 1.3841662406921387, + "rewards_train/rejected": -3.7320032119750977, + "step": 1908 + }, + { + "epoch": 0.53, + "logps_train/chosen": -132.96841430664062, + "logps_train/ref_chosen": -100.5, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -168.9968719482422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -3.2632477283477783, + "rewards_train/margins": 2.3536274433135986, + "rewards_train/rejected": -5.616875171661377, + "step": 1909 + }, + { + "epoch": 0.53, + "learning_rate": 1.0320394031544238e-07, + "loss": 0.4148, + "step": 1910 + }, + { + "epoch": 0.53, + "logps_train/chosen": -55.784812927246094, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -78.22610473632812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4109032452106476, + "rewards_train/margins": 2.4781133830547333, + "rewards_train/rejected": -2.889016628265381, + "step": 1910 + }, + { + "epoch": 0.53, + "logps_train/chosen": -85.2554702758789, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -114.61570739746094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.309140682220459, + "rewards_train/margins": 1.967566967010498, + "rewards_train/rejected": -4.276707649230957, + "step": 1911 + }, + { + "epoch": 0.53, + "learning_rate": 1.0270341104241654e-07, + "loss": 0.2673, + "step": 1912 + }, + { + "epoch": 0.53, + "logps_train/chosen": -52.6467399597168, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -34.0, + "logps_train/rejected": -51.80069351196289, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.644459068775177, + "rewards_train/margins": 1.1443992257118225, + "rewards_train/rejected": -1.7888582944869995, + "step": 1912 + }, + { + "epoch": 0.53, + "logps_train/chosen": -48.017059326171875, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -62.773712158203125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8917452096939087, + "rewards_train/margins": 0.7830871343612671, + "rewards_train/rejected": -1.6748323440551758, + "step": 1913 + }, + { + "epoch": 0.53, + "learning_rate": 1.0220378452245057e-07, + "loss": 0.493, + "step": 1914 + }, + { + "epoch": 0.53, + "logps_train/chosen": -72.70944213867188, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -98.0931625366211, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.150045394897461, + "rewards_train/margins": 1.0202088356018066, + "rewards_train/rejected": -3.1702542304992676, + "step": 1914 + }, + { + "epoch": 0.54, + "logps_train/chosen": -56.9299201965332, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -75.45361328125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8105697631835938, + "rewards_train/margins": 1.608229160308838, + "rewards_train/rejected": -2.4187989234924316, + "step": 1915 + }, + { + "epoch": 0.54, + "learning_rate": 1.0170506381766119e-07, + "loss": 0.4271, + "step": 1916 + }, + { + "epoch": 0.54, + "logps_train/chosen": -72.64109802246094, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -104.75239562988281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1496570110321045, + "rewards_train/margins": 2.017770290374756, + "rewards_train/rejected": -3.1674273014068604, + "step": 1916 + }, + { + "epoch": 0.54, + "logps_train/chosen": -64.56470489501953, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -72.0902099609375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6527595520019531, + "rewards_train/margins": 0.7912225723266602, + "rewards_train/rejected": -2.4439821243286133, + "step": 1917 + }, + { + "epoch": 0.54, + "learning_rate": 1.0120725198461383e-07, + "loss": 0.5625, + "step": 1918 + }, + { + "epoch": 0.54, + "logps_train/chosen": -83.3591537475586, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -106.28707885742188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6050564050674438, + "rewards_train/margins": 1.9398621320724487, + "rewards_train/rejected": -3.5449185371398926, + "step": 1918 + }, + { + "epoch": 0.54, + "logps_train/chosen": -32.98208999633789, + "logps_train/ref_chosen": -26.125, + "logps_train/ref_rejected": -34.5, + "logps_train/rejected": -54.504356384277344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6852939128875732, + "rewards_train/margins": 1.3149466514587402, + "rewards_train/rejected": -2.0002405643463135, + "step": 1919 + }, + { + "epoch": 0.54, + "learning_rate": 1.007103520743035e-07, + "loss": 0.4866, + "step": 1920 + }, + { + "epoch": 0.54, + "logps_train/chosen": -107.27156066894531, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -139.72808837890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.063093662261963, + "rewards_train/margins": 1.846433162689209, + "rewards_train/rejected": -3.909526824951172, + "step": 1920 + }, + { + "epoch": 0.54, + "logps_train/chosen": -72.44178009033203, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -92.11134338378906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9953500032424927, + "rewards_train/margins": 1.080236792564392, + "rewards_train/rejected": -3.0755867958068848, + "step": 1921 + }, + { + "epoch": 0.54, + "learning_rate": 1.0021436713213605e-07, + "loss": 0.3617, + "step": 1922 + }, + { + "epoch": 0.54, + "logps_train/chosen": -84.2673110961914, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -99.67121887207031, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.077024221420288, + "rewards_train/margins": 0.6986920833587646, + "rewards_train/rejected": -2.7757163047790527, + "step": 1922 + }, + { + "epoch": 0.54, + "logps_train/chosen": -48.90370178222656, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -66.9290771484375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6816787123680115, + "rewards_train/margins": 1.1111312508583069, + "rewards_train/rejected": -1.7928099632263184, + "step": 1923 + }, + { + "epoch": 0.54, + "learning_rate": 9.97193001979099e-08, + "loss": 0.4458, + "step": 1924 + }, + { + "epoch": 0.54, + "logps_train/chosen": -90.2362060546875, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -97.33097076416016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.97967529296875, + "rewards_train/margins": 0.9403355121612549, + "rewards_train/rejected": -2.920010805130005, + "step": 1924 + }, + { + "epoch": 0.54, + "logps_train/chosen": -89.4881362915039, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -107.73068237304688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.316781997680664, + "rewards_train/margins": 1.0887079238891602, + "rewards_train/rejected": -3.405489921569824, + "step": 1925 + }, + { + "epoch": 0.54, + "learning_rate": 9.922515430579706e-08, + "loss": 0.487, + "step": 1926 + }, + { + "epoch": 0.54, + "logps_train/chosen": -54.097442626953125, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -69.08293151855469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3391391038894653, + "rewards_train/margins": 0.815247654914856, + "rewards_train/rejected": -2.1543867588043213, + "step": 1926 + }, + { + "epoch": 0.54, + "logps_train/chosen": -33.45086669921875, + "logps_train/ref_chosen": -28.125, + "logps_train/ref_rejected": -21.25, + "logps_train/rejected": -34.63164520263672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5357113480567932, + "rewards_train/margins": 0.8057733178138733, + "rewards_train/rejected": -1.3414846658706665, + "step": 1927 + }, + { + "epoch": 0.54, + "learning_rate": 9.873193248432474e-08, + "loss": 0.5634, + "step": 1928 + }, + { + "epoch": 0.54, + "logps_train/chosen": -70.0160903930664, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -88.89080810546875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.854928970336914, + "rewards_train/margins": 1.8595430850982666, + "rewards_train/rejected": -3.7144720554351807, + "step": 1928 + }, + { + "epoch": 0.54, + "logps_train/chosen": -36.294586181640625, + "logps_train/ref_chosen": -31.5, + "logps_train/ref_rejected": -27.375, + "logps_train/rejected": -38.279640197753906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.47565028071403503, + "rewards_train/margins": 0.6132514774799347, + "rewards_train/rejected": -1.0889017581939697, + "step": 1929 + }, + { + "epoch": 0.54, + "learning_rate": 9.823963775635649e-08, + "loss": 0.61, + "step": 1930 + }, + { + "epoch": 0.54, + "logps_train/chosen": -110.10010528564453, + "logps_train/ref_chosen": -84.5, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -130.25558471679688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.5490732192993164, + "rewards_train/margins": 1.4658403396606445, + "rewards_train/rejected": -4.014913558959961, + "step": 1930 + }, + { + "epoch": 0.54, + "logps_train/chosen": -73.57592010498047, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -42.25, + "logps_train/rejected": -71.04751586914062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.130516529083252, + "rewards_train/margins": 0.7465004920959473, + "rewards_train/rejected": -2.877017021179199, + "step": 1931 + }, + { + "epoch": 0.54, + "learning_rate": 9.774827313907402e-08, + "loss": 0.6667, + "step": 1932 + }, + { + "epoch": 0.54, + "logps_train/chosen": -42.27134704589844, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -40.75, + "logps_train/rejected": -54.67235565185547, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6021837592124939, + "rewards_train/margins": 0.7863407731056213, + "rewards_train/rejected": -1.3885245323181152, + "step": 1932 + }, + { + "epoch": 0.54, + "logps_train/chosen": -119.25166320800781, + "logps_train/ref_chosen": -86.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -141.34066772460938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -3.3017287254333496, + "rewards_train/margins": 1.8729634284973145, + "rewards_train/rejected": -5.174692153930664, + "step": 1933 + }, + { + "epoch": 0.54, + "learning_rate": 9.725784164395869e-08, + "loss": 0.3698, + "step": 1934 + }, + { + "epoch": 0.54, + "logps_train/chosen": -76.71092224121094, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -98.15559387207031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.210374355316162, + "rewards_train/margins": 1.2499113082885742, + "rewards_train/rejected": -2.4602856636047363, + "step": 1934 + }, + { + "epoch": 0.54, + "logps_train/chosen": -120.2344970703125, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -105.5, + "logps_train/rejected": -142.18515014648438, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.640246868133545, + "rewards_train/margins": 1.0337355136871338, + "rewards_train/rejected": -3.6739823818206787, + "step": 1935 + }, + { + "epoch": 0.54, + "learning_rate": 9.67683462767726e-08, + "loss": 0.5539, + "step": 1936 + }, + { + "epoch": 0.54, + "logps_train/chosen": -38.442298889160156, + "logps_train/ref_chosen": -35.0, + "logps_train/ref_rejected": -39.75, + "logps_train/rejected": -54.857452392578125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3488198220729828, + "rewards_train/margins": 1.1718863546848297, + "rewards_train/rejected": -1.5207061767578125, + "step": 1936 + }, + { + "epoch": 0.54, + "logps_train/chosen": -72.53105163574219, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -52.75, + "logps_train/rejected": -80.7066421508789, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6605265140533447, + "rewards_train/margins": 2.13670015335083, + "rewards_train/rejected": -2.797226667404175, + "step": 1937 + }, + { + "epoch": 0.54, + "learning_rate": 9.627979003754081e-08, + "loss": 0.3234, + "step": 1938 + }, + { + "epoch": 0.54, + "logps_train/chosen": -45.484642028808594, + "logps_train/ref_chosen": -31.375, + "logps_train/ref_rejected": -25.0, + "logps_train/rejected": -40.9714469909668, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.413405418395996, + "rewards_train/margins": 0.1870594024658203, + "rewards_train/rejected": -1.6004648208618164, + "step": 1938 + }, + { + "epoch": 0.54, + "logps_train/chosen": -68.18218231201172, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -75.88032531738281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.1447806358337402, + "rewards_train/margins": 0.7670800685882568, + "rewards_train/rejected": -2.911860704421997, + "step": 1939 + }, + { + "epoch": 0.54, + "learning_rate": 9.579217592053249e-08, + "loss": 0.5756, + "step": 1940 + }, + { + "epoch": 0.54, + "logps_train/chosen": -32.406002044677734, + "logps_train/ref_chosen": -23.5, + "logps_train/ref_rejected": -33.75, + "logps_train/rejected": -49.05712127685547, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8916256427764893, + "rewards_train/margins": 0.6273677349090576, + "rewards_train/rejected": -1.5189933776855469, + "step": 1940 + }, + { + "epoch": 0.54, + "logps_train/chosen": -64.72395324707031, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -93.2551040649414, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1455401182174683, + "rewards_train/margins": 1.7887598276138306, + "rewards_train/rejected": -2.934299945831299, + "step": 1941 + }, + { + "epoch": 0.54, + "learning_rate": 9.530550691424283e-08, + "loss": 0.4961, + "step": 1942 + }, + { + "epoch": 0.54, + "logps_train/chosen": -86.69308471679688, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -131.5696563720703, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.000558853149414, + "rewards_train/margins": 2.1673450469970703, + "rewards_train/rejected": -4.167903900146484, + "step": 1942 + }, + { + "epoch": 0.54, + "logps_train/chosen": -69.41508483886719, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -71.49929809570312, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.0921919345855713, + "rewards_train/margins": 0.15363574028015137, + "rewards_train/rejected": -2.2458276748657227, + "step": 1943 + }, + { + "epoch": 0.54, + "learning_rate": 9.481978600137435e-08, + "loss": 0.4903, + "step": 1944 + }, + { + "epoch": 0.54, + "logps_train/chosen": -87.29005432128906, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -101.59136962890625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3891611099243164, + "rewards_train/margins": 1.0793511867523193, + "rewards_train/rejected": -2.4685122966766357, + "step": 1944 + }, + { + "epoch": 0.54, + "logps_train/chosen": -65.32451629638672, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -79.38421630859375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5969047546386719, + "rewards_train/margins": 1.0954232215881348, + "rewards_train/rejected": -2.6923279762268066, + "step": 1945 + }, + { + "epoch": 0.54, + "learning_rate": 9.433501615881923e-08, + "loss": 0.516, + "step": 1946 + }, + { + "epoch": 0.54, + "logps_train/chosen": -71.54859924316406, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -86.97831726074219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.762526035308838, + "rewards_train/margins": 0.750784158706665, + "rewards_train/rejected": -2.513310194015503, + "step": 1946 + }, + { + "epoch": 0.54, + "logps_train/chosen": -94.74353790283203, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -117.25440979003906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7579474449157715, + "rewards_train/margins": 1.9831185340881348, + "rewards_train/rejected": -3.7410659790039062, + "step": 1947 + }, + { + "epoch": 0.54, + "learning_rate": 9.385120035764057e-08, + "loss": 0.4191, + "step": 1948 + }, + { + "epoch": 0.54, + "logps_train/chosen": -58.763999938964844, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -75.54961395263672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9271815419197083, + "rewards_train/margins": 1.110592544078827, + "rewards_train/rejected": -2.037774085998535, + "step": 1948 + }, + { + "epoch": 0.54, + "logps_train/chosen": -105.40151977539062, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -141.94189453125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3798977136611938, + "rewards_train/margins": 2.4640473127365112, + "rewards_train/rejected": -3.843945026397705, + "step": 1949 + }, + { + "epoch": 0.54, + "learning_rate": 9.33683415630542e-08, + "loss": 0.3266, + "step": 1950 + }, + { + "epoch": 0.54, + "logps_train/chosen": -44.61640167236328, + "logps_train/ref_chosen": -42.5, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -69.68132019042969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.22238191962242126, + "rewards_train/margins": 0.8638648092746735, + "rewards_train/rejected": -1.0862467288970947, + "step": 1950 + }, + { + "epoch": 0.55, + "logps_train/chosen": -78.37663269042969, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -103.28611755371094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.0622730255126953, + "rewards_train/margins": 2.4971981048583984, + "rewards_train/rejected": -4.559471130371094, + "step": 1951 + }, + { + "epoch": 0.55, + "learning_rate": 9.288644273441082e-08, + "loss": 0.3534, + "step": 1952 + }, + { + "epoch": 0.55, + "logps_train/chosen": -43.369773864746094, + "logps_train/ref_chosen": -36.75, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -81.58674621582031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.652895450592041, + "rewards_train/margins": 2.721795082092285, + "rewards_train/rejected": -3.374690532684326, + "step": 1952 + }, + { + "epoch": 0.55, + "logps_train/chosen": -43.837947845458984, + "logps_train/ref_chosen": -34.25, + "logps_train/ref_rejected": -41.75, + "logps_train/rejected": -72.29368591308594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.962164044380188, + "rewards_train/margins": 2.0916680097579956, + "rewards_train/rejected": -3.0538320541381836, + "step": 1953 + }, + { + "epoch": 0.55, + "learning_rate": 9.240550682517767e-08, + "loss": 0.2719, + "step": 1954 + }, + { + "epoch": 0.55, + "logps_train/chosen": -100.79878997802734, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -133.30992126464844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.3505821228027344, + "rewards_train/margins": 2.0655670166015625, + "rewards_train/rejected": -4.416149139404297, + "step": 1954 + }, + { + "epoch": 0.55, + "logps_train/chosen": -72.240478515625, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -87.75106811523438, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3447513580322266, + "rewards_train/margins": 1.1574063301086426, + "rewards_train/rejected": -2.502157688140869, + "step": 1955 + }, + { + "epoch": 0.55, + "learning_rate": 9.192553678292028e-08, + "loss": 0.4237, + "step": 1956 + }, + { + "epoch": 0.55, + "logps_train/chosen": -58.43614196777344, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -66.32608032226562, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6959576606750488, + "rewards_train/margins": 0.8934860229492188, + "rewards_train/rejected": -1.5894436836242676, + "step": 1956 + }, + { + "epoch": 0.55, + "logps_train/chosen": -108.7609634399414, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -138.67153930664062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5696511268615723, + "rewards_train/margins": 1.7826590538024902, + "rewards_train/rejected": -3.3523101806640625, + "step": 1957 + }, + { + "epoch": 0.55, + "learning_rate": 9.14465355492847e-08, + "loss": 0.3997, + "step": 1958 + }, + { + "epoch": 0.55, + "logps_train/chosen": -57.13661193847656, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -78.82752990722656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7325087189674377, + "rewards_train/margins": 1.3393065333366394, + "rewards_train/rejected": -2.071815252304077, + "step": 1958 + }, + { + "epoch": 0.55, + "logps_train/chosen": -67.84454345703125, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -41.5, + "logps_train/rejected": -57.69300842285156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3555479049682617, + "rewards_train/margins": 0.2727372646331787, + "rewards_train/rejected": -1.6282851696014404, + "step": 1959 + }, + { + "epoch": 0.55, + "learning_rate": 9.09685060599793e-08, + "loss": 0.4911, + "step": 1960 + }, + { + "epoch": 0.55, + "logps_train/chosen": -36.913307189941406, + "logps_train/ref_chosen": -25.5, + "logps_train/ref_rejected": -21.25, + "logps_train/rejected": -37.476165771484375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1362526416778564, + "rewards_train/margins": 0.4801138639450073, + "rewards_train/rejected": -1.6163665056228638, + "step": 1960 + }, + { + "epoch": 0.55, + "logps_train/chosen": -109.95804595947266, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -107.5, + "logps_train/rejected": -153.1173858642578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9790079593658447, + "rewards_train/margins": 2.569059133529663, + "rewards_train/rejected": -4.548067092895508, + "step": 1961 + }, + { + "epoch": 0.55, + "learning_rate": 9.049145124475697e-08, + "loss": 0.4072, + "step": 1962 + }, + { + "epoch": 0.55, + "logps_train/chosen": -97.06443786621094, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -104.98481750488281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.699608325958252, + "rewards_train/margins": 1.2527806758880615, + "rewards_train/rejected": -3.9523890018463135, + "step": 1962 + }, + { + "epoch": 0.55, + "logps_train/chosen": -102.10678100585938, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -99.5, + "logps_train/rejected": -146.4178466796875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4352869987487793, + "rewards_train/margins": 3.264212131500244, + "rewards_train/rejected": -4.699499130249023, + "step": 1963 + }, + { + "epoch": 0.55, + "learning_rate": 9.001537402739656e-08, + "loss": 0.5195, + "step": 1964 + }, + { + "epoch": 0.55, + "logps_train/chosen": -77.20133972167969, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -111.71385192871094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7654464244842529, + "rewards_train/margins": 2.846759080886841, + "rewards_train/rejected": -3.6122055053710938, + "step": 1964 + }, + { + "epoch": 0.55, + "logps_train/chosen": -32.98747634887695, + "logps_train/ref_chosen": -23.25, + "logps_train/ref_rejected": -20.375, + "logps_train/rejected": -35.0021858215332, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9791187644004822, + "rewards_train/margins": 0.48169535398483276, + "rewards_train/rejected": -1.460814118385315, + "step": 1965 + }, + { + "epoch": 0.55, + "learning_rate": 8.95402773256859e-08, + "loss": 0.3197, + "step": 1966 + }, + { + "epoch": 0.55, + "logps_train/chosen": -70.00694274902344, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -86.17904663085938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9338979721069336, + "rewards_train/margins": 1.4931864738464355, + "rewards_train/rejected": -2.427084445953369, + "step": 1966 + }, + { + "epoch": 0.55, + "logps_train/chosen": -102.46730041503906, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -125.98727416992188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.1924326419830322, + "rewards_train/margins": 1.3684041500091553, + "rewards_train/rejected": -3.5608367919921875, + "step": 1967 + }, + { + "epoch": 0.55, + "learning_rate": 8.906616405140324e-08, + "loss": 0.3615, + "step": 1968 + }, + { + "epoch": 0.55, + "logps_train/chosen": -84.91909790039062, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -139.67703247070312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.200113296508789, + "rewards_train/margins": 3.58477783203125, + "rewards_train/rejected": -4.784891128540039, + "step": 1968 + }, + { + "epoch": 0.55, + "logps_train/chosen": -64.94412994384766, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -93.78373718261719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7944126129150391, + "rewards_train/margins": 1.239234209060669, + "rewards_train/rejected": -2.033646821975708, + "step": 1969 + }, + { + "epoch": 0.55, + "learning_rate": 8.859303711029939e-08, + "loss": 0.2596, + "step": 1970 + }, + { + "epoch": 0.55, + "logps_train/chosen": -90.74423217773438, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -97.94482421875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.6834075450897217, + "rewards_train/margins": 0.7979888916015625, + "rewards_train/rejected": -2.481396436691284, + "step": 1970 + }, + { + "epoch": 0.55, + "logps_train/chosen": -76.51930236816406, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -133.88336181640625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8037859797477722, + "rewards_train/margins": 2.807988226413727, + "rewards_train/rejected": -3.611774206161499, + "step": 1971 + }, + { + "epoch": 0.55, + "learning_rate": 8.812089940208043e-08, + "loss": 0.5362, + "step": 1972 + }, + { + "epoch": 0.55, + "logps_train/chosen": -74.79251098632812, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -79.21062469482422, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.2431179285049438, + "rewards_train/margins": 0.5658348798751831, + "rewards_train/rejected": -1.808952808380127, + "step": 1972 + }, + { + "epoch": 0.55, + "logps_train/chosen": -64.97998046875, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -82.74063873291016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4780757427215576, + "rewards_train/margins": 1.6629807949066162, + "rewards_train/rejected": -3.141056537628174, + "step": 1973 + }, + { + "epoch": 0.55, + "learning_rate": 8.764975382038942e-08, + "loss": 0.4841, + "step": 1974 + }, + { + "epoch": 0.55, + "logps_train/chosen": -48.83981704711914, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -74.11100006103516, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.733005166053772, + "rewards_train/margins": 1.6060246229171753, + "rewards_train/rejected": -2.3390297889709473, + "step": 1974 + }, + { + "epoch": 0.55, + "logps_train/chosen": -90.83193969726562, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -116.63704681396484, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6828033924102783, + "rewards_train/margins": 1.145744800567627, + "rewards_train/rejected": -2.8285481929779053, + "step": 1975 + }, + { + "epoch": 0.55, + "learning_rate": 8.717960325278904e-08, + "loss": 0.3899, + "step": 1976 + }, + { + "epoch": 0.55, + "logps_train/chosen": -102.88260650634766, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -108.89571380615234, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.6577913761138916, + "rewards_train/margins": 1.1907646656036377, + "rewards_train/rejected": -3.8485560417175293, + "step": 1976 + }, + { + "epoch": 0.55, + "logps_train/chosen": -31.117368698120117, + "logps_train/ref_chosen": -21.5, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -62.46642303466797, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.959783673286438, + "rewards_train/margins": 0.8899835348129272, + "rewards_train/rejected": -1.8497672080993652, + "step": 1977 + }, + { + "epoch": 0.55, + "learning_rate": 8.671045058074342e-08, + "loss": 0.4141, + "step": 1978 + }, + { + "epoch": 0.55, + "logps_train/chosen": -84.71221923828125, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -104.71487426757812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.1792783737182617, + "rewards_train/margins": 1.6450409889221191, + "rewards_train/rejected": -3.824319362640381, + "step": 1978 + }, + { + "epoch": 0.55, + "logps_train/chosen": -83.40449523925781, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -105.50485229492188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8709185123443604, + "rewards_train/margins": 2.112574815750122, + "rewards_train/rejected": -3.9834933280944824, + "step": 1979 + }, + { + "epoch": 0.55, + "learning_rate": 8.624229867960101e-08, + "loss": 0.322, + "step": 1980 + }, + { + "epoch": 0.55, + "logps_train/chosen": -60.2187385559082, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -80.80245971679688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8312669992446899, + "rewards_train/margins": 1.6380594968795776, + "rewards_train/rejected": -2.4693264961242676, + "step": 1980 + }, + { + "epoch": 0.55, + "logps_train/chosen": -92.03398132324219, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -106.09092712402344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.175859212875366, + "rewards_train/margins": 1.680499792098999, + "rewards_train/rejected": -3.8563590049743652, + "step": 1981 + }, + { + "epoch": 0.55, + "learning_rate": 8.57751504185768e-08, + "loss": 0.3531, + "step": 1982 + }, + { + "epoch": 0.55, + "logps_train/chosen": -49.1508674621582, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -26.375, + "logps_train/rejected": -41.218502044677734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3704088926315308, + "rewards_train/margins": 0.11394131183624268, + "rewards_train/rejected": -1.4843502044677734, + "step": 1982 + }, + { + "epoch": 0.55, + "logps_train/chosen": -77.5458755493164, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -109.19812774658203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5270484685897827, + "rewards_train/margins": 2.009170174598694, + "rewards_train/rejected": -3.5362186431884766, + "step": 1983 + }, + { + "epoch": 0.55, + "learning_rate": 8.530900866073431e-08, + "loss": 0.6593, + "step": 1984 + }, + { + "epoch": 0.55, + "logps_train/chosen": -71.50971984863281, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -73.53953552246094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1530224084854126, + "rewards_train/margins": 1.1723178625106812, + "rewards_train/rejected": -2.3253402709960938, + "step": 1984 + }, + { + "epoch": 0.55, + "logps_train/chosen": -105.9326171875, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -108.26788330078125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.2108397483825684, + "rewards_train/margins": 1.617511510848999, + "rewards_train/rejected": -3.8283512592315674, + "step": 1985 + }, + { + "epoch": 0.56, + "learning_rate": 8.484387626296871e-08, + "loss": 0.3764, + "step": 1986 + }, + { + "epoch": 0.56, + "logps_train/chosen": -108.44713592529297, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -95.0, + "logps_train/rejected": -135.17124938964844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.673912763595581, + "rewards_train/margins": 1.3355953693389893, + "rewards_train/rejected": -4.00950813293457, + "step": 1986 + }, + { + "epoch": 0.56, + "logps_train/chosen": -62.282108306884766, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -77.98124694824219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0186405181884766, + "rewards_train/margins": 1.3792893886566162, + "rewards_train/rejected": -2.3979299068450928, + "step": 1987 + }, + { + "epoch": 0.56, + "learning_rate": 8.437975607598888e-08, + "loss": 0.4001, + "step": 1988 + }, + { + "epoch": 0.56, + "logps_train/chosen": -38.71179962158203, + "logps_train/ref_chosen": -28.875, + "logps_train/ref_rejected": -31.0, + "logps_train/rejected": -47.60798645019531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9852428436279297, + "rewards_train/margins": 0.6798524856567383, + "rewards_train/rejected": -1.665095329284668, + "step": 1988 + }, + { + "epoch": 0.56, + "logps_train/chosen": -69.31269073486328, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -106.16201782226562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5469427108764648, + "rewards_train/margins": 2.226533889770508, + "rewards_train/rejected": -3.7734766006469727, + "step": 1989 + }, + { + "epoch": 0.56, + "learning_rate": 8.391665094430023e-08, + "loss": 0.425, + "step": 1990 + }, + { + "epoch": 0.56, + "logps_train/chosen": -75.35893249511719, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -88.6258544921875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6272995471954346, + "rewards_train/margins": 1.7893869876861572, + "rewards_train/rejected": -3.416686534881592, + "step": 1990 + }, + { + "epoch": 0.56, + "logps_train/chosen": -65.22088623046875, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -98.70135498046875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8096866607666016, + "rewards_train/margins": 2.1319329738616943, + "rewards_train/rejected": -2.941619634628296, + "step": 1991 + }, + { + "epoch": 0.56, + "learning_rate": 8.345456370618659e-08, + "loss": 0.3566, + "step": 1992 + }, + { + "epoch": 0.56, + "logps_train/chosen": -90.94308471679688, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -98.82499694824219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6549532413482666, + "rewards_train/margins": 1.2091882228851318, + "rewards_train/rejected": -2.8641414642333984, + "step": 1992 + }, + { + "epoch": 0.56, + "logps_train/chosen": -84.97198486328125, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -90.90570068359375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.515363097190857, + "rewards_train/margins": 1.443956971168518, + "rewards_train/rejected": -2.959320068359375, + "step": 1993 + }, + { + "epoch": 0.56, + "learning_rate": 8.29934971936938e-08, + "loss": 0.435, + "step": 1994 + }, + { + "epoch": 0.56, + "logps_train/chosen": -84.12213897705078, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -118.45738220214844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3056586980819702, + "rewards_train/margins": 2.090555787086487, + "rewards_train/rejected": -3.396214485168457, + "step": 1994 + }, + { + "epoch": 0.56, + "logps_train/chosen": -82.85786437988281, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -109.7349624633789, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1667675971984863, + "rewards_train/margins": 1.7896640300750732, + "rewards_train/rejected": -2.9564316272735596, + "step": 1995 + }, + { + "epoch": 0.56, + "learning_rate": 8.253345423261168e-08, + "loss": 0.3296, + "step": 1996 + }, + { + "epoch": 0.56, + "logps_train/chosen": -127.67427825927734, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -132.77032470703125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -3.4143025875091553, + "rewards_train/margins": -0.17008256912231445, + "rewards_train/rejected": -3.244220018386841, + "step": 1996 + }, + { + "epoch": 0.56, + "logps_train/chosen": -105.43716430664062, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -123.96332550048828, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.0671541690826416, + "rewards_train/margins": 1.8073036670684814, + "rewards_train/rejected": -3.874457836151123, + "step": 1997 + }, + { + "epoch": 0.56, + "learning_rate": 8.207443764245705e-08, + "loss": 0.9856, + "step": 1998 + }, + { + "epoch": 0.56, + "logps_train/chosen": -75.80126953125, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -98.28489685058594, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8289549350738525, + "rewards_train/margins": 1.5110580921173096, + "rewards_train/rejected": -2.340013027191162, + "step": 1998 + }, + { + "epoch": 0.56, + "logps_train/chosen": -68.47697448730469, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -91.54481506347656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.4064865112304688, + "rewards_train/margins": 0.2593224048614502, + "rewards_train/rejected": -2.665808916091919, + "step": 1999 + }, + { + "epoch": 0.56, + "learning_rate": 8.161645023645597e-08, + "loss": 0.581, + "step": 2000 + }, + { + "epoch": 0.56, + "logps_train/chosen": -53.53972625732422, + "logps_train/ref_chosen": -43.25, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -68.63607788085938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0336599349975586, + "rewards_train/margins": 0.8455724716186523, + "rewards_train/rejected": -1.879232406616211, + "step": 2000 + }, + { + "epoch": 0.56, + "logps_train/chosen": -83.170166015625, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -83.50080871582031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0488529205322266, + "rewards_train/margins": 1.5910711288452148, + "rewards_train/rejected": -2.6399240493774414, + "step": 2001 + }, + { + "epoch": 0.56, + "learning_rate": 8.115949482152709e-08, + "loss": 0.4007, + "step": 2002 + }, + { + "epoch": 0.56, + "logps_train/chosen": -74.61415100097656, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -85.65557861328125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.399109959602356, + "rewards_train/margins": 1.2592214345932007, + "rewards_train/rejected": -2.6583313941955566, + "step": 2002 + }, + { + "epoch": 0.56, + "logps_train/chosen": -69.95663452148438, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -97.09745788574219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8202722072601318, + "rewards_train/margins": 1.9869341850280762, + "rewards_train/rejected": -3.807206392288208, + "step": 2003 + }, + { + "epoch": 0.56, + "learning_rate": 8.070357419826418e-08, + "loss": 0.2954, + "step": 2004 + }, + { + "epoch": 0.56, + "logps_train/chosen": -92.07615661621094, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -118.43268585205078, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3320298194885254, + "rewards_train/margins": 2.094050884246826, + "rewards_train/rejected": -3.4260807037353516, + "step": 2004 + }, + { + "epoch": 0.56, + "logps_train/chosen": -86.79302978515625, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -115.96133422851562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4421939849853516, + "rewards_train/margins": 1.427767276763916, + "rewards_train/rejected": -2.8699612617492676, + "step": 2005 + }, + { + "epoch": 0.56, + "learning_rate": 8.024869116091879e-08, + "loss": 0.347, + "step": 2006 + }, + { + "epoch": 0.56, + "logps_train/chosen": -126.97574615478516, + "logps_train/ref_chosen": -102.5, + "logps_train/ref_rejected": -126.5, + "logps_train/rejected": -168.7589111328125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.456949234008789, + "rewards_train/margins": 1.759566307067871, + "rewards_train/rejected": -4.21651554107666, + "step": 2006 + }, + { + "epoch": 0.56, + "logps_train/chosen": -110.49139404296875, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -124.17587280273438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.5725769996643066, + "rewards_train/margins": 2.064542293548584, + "rewards_train/rejected": -4.637119293212891, + "step": 2007 + }, + { + "epoch": 0.56, + "learning_rate": 7.979484849738344e-08, + "loss": 0.367, + "step": 2008 + }, + { + "epoch": 0.56, + "logps_train/chosen": -95.91465759277344, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -118.02151489257812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.2083609104156494, + "rewards_train/margins": 1.537247896194458, + "rewards_train/rejected": -3.7456088066101074, + "step": 2008 + }, + { + "epoch": 0.56, + "logps_train/chosen": -78.37964630126953, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -85.4207763671875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.478003740310669, + "rewards_train/margins": 0.887902021408081, + "rewards_train/rejected": -2.36590576171875, + "step": 2009 + }, + { + "epoch": 0.56, + "learning_rate": 7.93420489891744e-08, + "loss": 0.5685, + "step": 2010 + }, + { + "epoch": 0.56, + "logps_train/chosen": -60.42747497558594, + "logps_train/ref_chosen": -43.25, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -89.84246826171875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.7189193964004517, + "rewards_train/margins": 1.29794442653656, + "rewards_train/rejected": -3.0168638229370117, + "step": 2010 + }, + { + "epoch": 0.56, + "logps_train/chosen": -105.34133911132812, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -133.2752685546875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -3.130227565765381, + "rewards_train/margins": 1.6566743850708008, + "rewards_train/rejected": -4.786901950836182, + "step": 2011 + }, + { + "epoch": 0.56, + "learning_rate": 7.889029541141465e-08, + "loss": 0.3696, + "step": 2012 + }, + { + "epoch": 0.56, + "logps_train/chosen": -102.57357788085938, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -125.34063720703125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4393885135650635, + "rewards_train/margins": 2.0649876594543457, + "rewards_train/rejected": -3.504376173019409, + "step": 2012 + }, + { + "epoch": 0.56, + "logps_train/chosen": -34.7305908203125, + "logps_train/ref_chosen": -22.5, + "logps_train/ref_rejected": -36.25, + "logps_train/rejected": -61.56572723388672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2244750261306763, + "rewards_train/margins": 1.316668152809143, + "rewards_train/rejected": -2.5411431789398193, + "step": 2013 + }, + { + "epoch": 0.56, + "learning_rate": 7.843959053281663e-08, + "loss": 0.269, + "step": 2014 + }, + { + "epoch": 0.56, + "logps_train/chosen": -64.35531616210938, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -95.31759643554688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1378759145736694, + "rewards_train/margins": 1.2759398221969604, + "rewards_train/rejected": -2.41381573677063, + "step": 2014 + }, + { + "epoch": 0.56, + "logps_train/chosen": -48.608985900878906, + "logps_train/ref_chosen": -36.0, + "logps_train/ref_rejected": -38.75, + "logps_train/rejected": -62.054054260253906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.266660213470459, + "rewards_train/margins": 1.068042278289795, + "rewards_train/rejected": -2.334702491760254, + "step": 2015 + }, + { + "epoch": 0.56, + "learning_rate": 7.798993711566581e-08, + "loss": 0.3833, + "step": 2016 + }, + { + "epoch": 0.56, + "logps_train/chosen": -90.3540267944336, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -109.29360961914062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.173293113708496, + "rewards_train/margins": 1.4865365028381348, + "rewards_train/rejected": -3.659829616546631, + "step": 2016 + }, + { + "epoch": 0.56, + "logps_train/chosen": -65.87181091308594, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -38.75, + "logps_train/rejected": -66.69879150390625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7055405378341675, + "rewards_train/margins": 1.0889474153518677, + "rewards_train/rejected": -2.794487953186035, + "step": 2017 + }, + { + "epoch": 0.56, + "learning_rate": 7.754133791580339e-08, + "loss": 0.3551, + "step": 2018 + }, + { + "epoch": 0.56, + "logps_train/chosen": -84.87860870361328, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -92.8297348022461, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9077825546264648, + "rewards_train/margins": 1.2103471755981445, + "rewards_train/rejected": -3.1181297302246094, + "step": 2018 + }, + { + "epoch": 0.56, + "logps_train/chosen": -89.56547546386719, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -124.26695251464844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3358440399169922, + "rewards_train/margins": 2.7084288597106934, + "rewards_train/rejected": -4.0442728996276855, + "step": 2019 + }, + { + "epoch": 0.56, + "learning_rate": 7.709379568260921e-08, + "loss": 0.3807, + "step": 2020 + }, + { + "epoch": 0.56, + "logps_train/chosen": -48.11452865600586, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -38.75, + "logps_train/rejected": -59.88193893432617, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9053493738174438, + "rewards_train/margins": 1.2148758172988892, + "rewards_train/rejected": -2.120225191116333, + "step": 2020 + }, + { + "epoch": 0.56, + "logps_train/chosen": -58.42257308959961, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -33.25, + "logps_train/rejected": -59.07329177856445, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6925015449523926, + "rewards_train/margins": 0.9008626937866211, + "rewards_train/rejected": -2.5933642387390137, + "step": 2021 + }, + { + "epoch": 0.57, + "learning_rate": 7.664731315898546e-08, + "loss": 0.3955, + "step": 2022 + }, + { + "epoch": 0.57, + "logps_train/chosen": -61.435279846191406, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -98.72279357910156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7993875741958618, + "rewards_train/margins": 1.5148848295211792, + "rewards_train/rejected": -3.314272403717041, + "step": 2022 + }, + { + "epoch": 0.57, + "logps_train/chosen": -56.93460464477539, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -68.29438781738281, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0959997177124023, + "rewards_train/margins": 0.5771892070770264, + "rewards_train/rejected": -1.6731889247894287, + "step": 2023 + }, + { + "epoch": 0.57, + "learning_rate": 7.620189308133943e-08, + "loss": 0.4807, + "step": 2024 + }, + { + "epoch": 0.57, + "logps_train/chosen": -103.25361633300781, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -132.89398193359375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.267939567565918, + "rewards_train/margins": 2.0386457443237305, + "rewards_train/rejected": -4.306585311889648, + "step": 2024 + }, + { + "epoch": 0.57, + "logps_train/chosen": -77.77604675292969, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -101.76101684570312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7494794130325317, + "rewards_train/margins": 1.9812120199203491, + "rewards_train/rejected": -3.730691432952881, + "step": 2025 + }, + { + "epoch": 0.57, + "learning_rate": 7.575753817956702e-08, + "loss": 0.3636, + "step": 2026 + }, + { + "epoch": 0.57, + "logps_train/chosen": -71.26786804199219, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -79.71773529052734, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.054130792617798, + "rewards_train/margins": 0.13385415077209473, + "rewards_train/rejected": -2.1879849433898926, + "step": 2026 + }, + { + "epoch": 0.57, + "logps_train/chosen": -53.09149932861328, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -72.68791198730469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.880146324634552, + "rewards_train/margins": 1.07927006483078, + "rewards_train/rejected": -1.959416389465332, + "step": 2027 + }, + { + "epoch": 0.57, + "learning_rate": 7.531425117703557e-08, + "loss": 0.5769, + "step": 2028 + }, + { + "epoch": 0.57, + "logps_train/chosen": -94.37026977539062, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -99.60851287841797, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.2429842948913574, + "rewards_train/margins": 0.6723837852478027, + "rewards_train/rejected": -2.91536808013916, + "step": 2028 + }, + { + "epoch": 0.57, + "logps_train/chosen": -85.78739929199219, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -123.35035705566406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8318644762039185, + "rewards_train/margins": 1.481296181678772, + "rewards_train/rejected": -3.3131606578826904, + "step": 2029 + }, + { + "epoch": 0.57, + "learning_rate": 7.487203479056777e-08, + "loss": 0.5022, + "step": 2030 + }, + { + "epoch": 0.57, + "logps_train/chosen": -61.43556213378906, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -43.75, + "logps_train/rejected": -72.5194091796875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9685559272766113, + "rewards_train/margins": 1.9017441272735596, + "rewards_train/rejected": -2.870300054550171, + "step": 2030 + }, + { + "epoch": 0.57, + "logps_train/chosen": -99.33772277832031, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -89.69277954101562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3941230773925781, + "rewards_train/margins": 0.8712484836578369, + "rewards_train/rejected": -2.265371561050415, + "step": 2031 + }, + { + "epoch": 0.57, + "learning_rate": 7.443089173042466e-08, + "loss": 0.449, + "step": 2032 + }, + { + "epoch": 0.57, + "logps_train/chosen": -70.18465423583984, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -86.06988525390625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.4484455585479736, + "rewards_train/margins": 1.011277675628662, + "rewards_train/rejected": -3.4597232341766357, + "step": 2032 + }, + { + "epoch": 0.57, + "logps_train/chosen": -80.47709655761719, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -117.68765258789062, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.264530658721924, + "rewards_train/margins": 2.062462329864502, + "rewards_train/rejected": -4.326992988586426, + "step": 2033 + }, + { + "epoch": 0.57, + "learning_rate": 7.399082470028883e-08, + "loss": 0.5092, + "step": 2034 + }, + { + "epoch": 0.57, + "logps_train/chosen": -130.05368041992188, + "logps_train/ref_chosen": -99.0, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -122.53167724609375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -3.086618661880493, + "rewards_train/margins": 1.1272914409637451, + "rewards_train/rejected": -4.213910102844238, + "step": 2034 + }, + { + "epoch": 0.57, + "logps_train/chosen": -40.75750732421875, + "logps_train/ref_chosen": -30.25, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -87.02031707763672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0569519996643066, + "rewards_train/margins": 1.7413201332092285, + "rewards_train/rejected": -2.798272132873535, + "step": 2035 + }, + { + "epoch": 0.57, + "learning_rate": 7.35518363972483e-08, + "loss": 0.4356, + "step": 2036 + }, + { + "epoch": 0.57, + "logps_train/chosen": -95.84678649902344, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -125.46881103515625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9279396533966064, + "rewards_train/margins": 2.9374959468841553, + "rewards_train/rejected": -4.865435600280762, + "step": 2036 + }, + { + "epoch": 0.57, + "logps_train/chosen": -117.28984832763672, + "logps_train/ref_chosen": -102.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -111.59902954101562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.520782232284546, + "rewards_train/margins": 1.0416600704193115, + "rewards_train/rejected": -2.5624423027038574, + "step": 2037 + }, + { + "epoch": 0.57, + "learning_rate": 7.311392951177983e-08, + "loss": 0.3732, + "step": 2038 + }, + { + "epoch": 0.57, + "logps_train/chosen": -58.8238525390625, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -69.92394256591797, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9842894673347473, + "rewards_train/margins": 1.0846182703971863, + "rewards_train/rejected": -2.0689077377319336, + "step": 2038 + }, + { + "epoch": 0.57, + "logps_train/chosen": -70.83395385742188, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -105.24347686767578, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7454560995101929, + "rewards_train/margins": 2.4327975511550903, + "rewards_train/rejected": -3.178253650665283, + "step": 2039 + }, + { + "epoch": 0.57, + "learning_rate": 7.267710672773211e-08, + "loss": 0.3544, + "step": 2040 + }, + { + "epoch": 0.57, + "logps_train/chosen": -42.925819396972656, + "logps_train/ref_chosen": -33.75, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -65.96470642089844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9110389351844788, + "rewards_train/margins": 1.6556467413902283, + "rewards_train/rejected": -2.566685676574707, + "step": 2040 + }, + { + "epoch": 0.57, + "logps_train/chosen": -84.8345947265625, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -113.7282943725586, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2553342580795288, + "rewards_train/margins": 2.5889803171157837, + "rewards_train/rejected": -3.8443145751953125, + "step": 2041 + }, + { + "epoch": 0.57, + "learning_rate": 7.224137072230982e-08, + "loss": 0.3334, + "step": 2042 + }, + { + "epoch": 0.57, + "logps_train/chosen": -88.57653045654297, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -150.1007843017578, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4748404026031494, + "rewards_train/margins": 2.3453941345214844, + "rewards_train/rejected": -3.820234537124634, + "step": 2042 + }, + { + "epoch": 0.57, + "logps_train/chosen": -46.55854034423828, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -67.77569580078125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.4599558115005493, + "rewards_train/margins": 1.1422239542007446, + "rewards_train/rejected": -1.602179765701294, + "step": 2043 + }, + { + "epoch": 0.57, + "learning_rate": 7.180672416605687e-08, + "loss": 0.4253, + "step": 2044 + }, + { + "epoch": 0.57, + "logps_train/chosen": -58.505218505859375, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -36.5, + "logps_train/rejected": -55.59811019897461, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8695771098136902, + "rewards_train/margins": 1.038085401058197, + "rewards_train/rejected": -1.9076625108718872, + "step": 2044 + }, + { + "epoch": 0.57, + "logps_train/chosen": -122.59248352050781, + "logps_train/ref_chosen": -104.5, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -146.1300506591797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7967479228973389, + "rewards_train/margins": 2.4131319522857666, + "rewards_train/rejected": -4.2098798751831055, + "step": 2045 + }, + { + "epoch": 0.57, + "learning_rate": 7.137316972284027e-08, + "loss": 0.3584, + "step": 2046 + }, + { + "epoch": 0.57, + "logps_train/chosen": -75.13282775878906, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -98.92269897460938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.666212797164917, + "rewards_train/margins": 2.1494953632354736, + "rewards_train/rejected": -3.8157081604003906, + "step": 2046 + }, + { + "epoch": 0.57, + "logps_train/chosen": -85.78764343261719, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -113.82390594482422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5402874946594238, + "rewards_train/margins": 2.2005014419555664, + "rewards_train/rejected": -3.7407889366149902, + "step": 2047 + }, + { + "epoch": 0.57, + "learning_rate": 7.094071004983343e-08, + "loss": 0.2601, + "step": 2048 + }, + { + "epoch": 0.57, + "logps_train/chosen": -79.29083251953125, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -108.56194305419922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5841622352600098, + "rewards_train/margins": 1.883164882659912, + "rewards_train/rejected": -3.467327117919922, + "step": 2048 + }, + { + "epoch": 0.57, + "logps_train/chosen": -68.17729187011719, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -41.5, + "logps_train/rejected": -59.22962188720703, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9677292704582214, + "rewards_train/margins": 0.7999593615531921, + "rewards_train/rejected": -1.7676886320114136, + "step": 2049 + }, + { + "epoch": 0.57, + "learning_rate": 7.050934779750029e-08, + "loss": 0.3991, + "step": 2050 + }, + { + "epoch": 0.57, + "logps_train/chosen": -82.71372985839844, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -100.57864379882812, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -2.6639509201049805, + "rewards_train/margins": 1.369694709777832, + "rewards_train/rejected": -4.0336456298828125, + "step": 2050 + }, + { + "epoch": 0.57, + "logps_train/chosen": -67.38702392578125, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -98.01837158203125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6201475262641907, + "rewards_train/margins": 2.3334473967552185, + "rewards_train/rejected": -2.953594923019409, + "step": 2051 + }, + { + "epoch": 0.57, + "learning_rate": 7.007908560957895e-08, + "loss": 0.5333, + "step": 2052 + }, + { + "epoch": 0.57, + "logps_train/chosen": -70.43220520019531, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -91.27249145507812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2988355159759521, + "rewards_train/margins": 1.0909130573272705, + "rewards_train/rejected": -2.3897485733032227, + "step": 2052 + }, + { + "epoch": 0.57, + "logps_train/chosen": -63.30310821533203, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -67.20059967041016, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1099984645843506, + "rewards_train/margins": 0.640920877456665, + "rewards_train/rejected": -1.7509193420410156, + "step": 2053 + }, + { + "epoch": 0.57, + "learning_rate": 6.964992612306525e-08, + "loss": 0.4779, + "step": 2054 + }, + { + "epoch": 0.57, + "logps_train/chosen": -40.83663558959961, + "logps_train/ref_chosen": -30.375, + "logps_train/ref_rejected": -37.75, + "logps_train/rejected": -60.61116409301758, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.04176926612854, + "rewards_train/margins": 1.2412221431732178, + "rewards_train/rejected": -2.282991409301758, + "step": 2054 + }, + { + "epoch": 0.57, + "logps_train/chosen": -48.40538024902344, + "logps_train/ref_chosen": -36.0, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -75.69383239746094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2327252626419067, + "rewards_train/margins": 1.5532597303390503, + "rewards_train/rejected": -2.785984992980957, + "step": 2055 + }, + { + "epoch": 0.57, + "learning_rate": 6.92218719681969e-08, + "loss": 0.4046, + "step": 2056 + }, + { + "epoch": 0.57, + "logps_train/chosen": -76.61868286132812, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -89.76553344726562, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8899929523468018, + "rewards_train/margins": 1.0631229877471924, + "rewards_train/rejected": -2.953115940093994, + "step": 2056 + }, + { + "epoch": 0.57, + "logps_train/chosen": -79.19552612304688, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -54.25, + "logps_train/rejected": -82.55109405517578, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8472871780395508, + "rewards_train/margins": 0.9828221797943115, + "rewards_train/rejected": -2.8301093578338623, + "step": 2057 + }, + { + "epoch": 0.58, + "learning_rate": 6.879492576843726e-08, + "loss": 0.6968, + "step": 2058 + }, + { + "epoch": 0.58, + "logps_train/chosen": -68.574462890625, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -79.17143249511719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4750241041183472, + "rewards_train/margins": 1.1171191930770874, + "rewards_train/rejected": -2.5921432971954346, + "step": 2058 + }, + { + "epoch": 0.58, + "logps_train/chosen": -54.907867431640625, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -31.5, + "logps_train/rejected": -54.651058197021484, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4823884963989258, + "rewards_train/margins": 0.831545352935791, + "rewards_train/rejected": -2.313933849334717, + "step": 2059 + }, + { + "epoch": 0.58, + "learning_rate": 6.836909014045924e-08, + "loss": 0.3963, + "step": 2060 + }, + { + "epoch": 0.58, + "logps_train/chosen": -53.24628448486328, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -76.54415893554688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2809760570526123, + "rewards_train/margins": 1.5523455142974854, + "rewards_train/rejected": -2.8333215713500977, + "step": 2060 + }, + { + "epoch": 0.58, + "logps_train/chosen": -113.57747650146484, + "logps_train/ref_chosen": -92.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -97.16392517089844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.1120452880859375, + "rewards_train/margins": 0.6478042602539062, + "rewards_train/rejected": -2.7598495483398438, + "step": 2061 + }, + { + "epoch": 0.58, + "learning_rate": 6.794436769412911e-08, + "loss": 0.454, + "step": 2062 + }, + { + "epoch": 0.58, + "logps_train/chosen": -40.35031509399414, + "logps_train/ref_chosen": -31.75, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -70.04084777832031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8655002117156982, + "rewards_train/margins": 1.3850693702697754, + "rewards_train/rejected": -2.2505695819854736, + "step": 2062 + }, + { + "epoch": 0.58, + "logps_train/chosen": -42.43242263793945, + "logps_train/ref_chosen": -33.0, + "logps_train/ref_rejected": -34.0, + "logps_train/rejected": -51.40599060058594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9502733945846558, + "rewards_train/margins": 0.7848566770553589, + "rewards_train/rejected": -1.7351300716400146, + "step": 2063 + }, + { + "epoch": 0.58, + "learning_rate": 6.752076103249083e-08, + "loss": 0.3283, + "step": 2064 + }, + { + "epoch": 0.58, + "logps_train/chosen": -54.35895538330078, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -91.61186218261719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.34136441349983215, + "rewards_train/margins": 2.185056656599045, + "rewards_train/rejected": -2.526421070098877, + "step": 2064 + }, + { + "epoch": 0.58, + "logps_train/chosen": -105.46289825439453, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -127.77777099609375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.8486335277557373, + "rewards_train/margins": 1.6217710971832275, + "rewards_train/rejected": -4.470404624938965, + "step": 2065 + }, + { + "epoch": 0.58, + "learning_rate": 6.709827275174992e-08, + "loss": 0.3239, + "step": 2066 + }, + { + "epoch": 0.58, + "logps_train/chosen": -89.56805419921875, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -99.33772277832031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3353216648101807, + "rewards_train/margins": 1.3476698398590088, + "rewards_train/rejected": -2.6829915046691895, + "step": 2066 + }, + { + "epoch": 0.58, + "logps_train/chosen": -84.03972625732422, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -95.41542053222656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6199886798858643, + "rewards_train/margins": 1.028975486755371, + "rewards_train/rejected": -2.6489641666412354, + "step": 2067 + }, + { + "epoch": 0.58, + "learning_rate": 6.66769054412574e-08, + "loss": 0.4161, + "step": 2068 + }, + { + "epoch": 0.58, + "logps_train/chosen": -97.68883514404297, + "logps_train/ref_chosen": -86.0, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -105.70333862304688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1757194995880127, + "rewards_train/margins": 1.2481303215026855, + "rewards_train/rejected": -2.4238498210906982, + "step": 2068 + }, + { + "epoch": 0.58, + "logps_train/chosen": -69.78389739990234, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -90.17318725585938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.841866135597229, + "rewards_train/margins": 1.7820924520492554, + "rewards_train/rejected": -2.6239585876464844, + "step": 2069 + }, + { + "epoch": 0.58, + "learning_rate": 6.625666168349423e-08, + "loss": 0.411, + "step": 2070 + }, + { + "epoch": 0.58, + "logps_train/chosen": -59.67823791503906, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -77.22866821289062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8352067470550537, + "rewards_train/margins": 1.596644639968872, + "rewards_train/rejected": -2.431851387023926, + "step": 2070 + }, + { + "epoch": 0.58, + "logps_train/chosen": -91.96846008300781, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -117.0979995727539, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.133931040763855, + "rewards_train/margins": 2.2127586603164673, + "rewards_train/rejected": -3.3466897010803223, + "step": 2071 + }, + { + "epoch": 0.58, + "learning_rate": 6.583754405405528e-08, + "loss": 0.3199, + "step": 2072 + }, + { + "epoch": 0.58, + "logps_train/chosen": -47.81785202026367, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -65.80642700195312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.482371062040329, + "rewards_train/margins": 1.701983243227005, + "rewards_train/rejected": -2.184354305267334, + "step": 2072 + }, + { + "epoch": 0.58, + "logps_train/chosen": -54.53237533569336, + "logps_train/ref_chosen": -47.25, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -70.27101135253906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.7394677400588989, + "rewards_train/margins": 0.4726923704147339, + "rewards_train/rejected": -1.2121601104736328, + "step": 2073 + }, + { + "epoch": 0.58, + "learning_rate": 6.541955512163367e-08, + "loss": 0.4126, + "step": 2074 + }, + { + "epoch": 0.58, + "logps_train/chosen": -104.01774597167969, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -91.5, + "logps_train/rejected": -124.22376251220703, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.2252120971679688, + "rewards_train/margins": 1.036226749420166, + "rewards_train/rejected": -3.2614388465881348, + "step": 2074 + }, + { + "epoch": 0.58, + "logps_train/chosen": -87.72959899902344, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -114.3899154663086, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.45655357837677, + "rewards_train/margins": 1.1152499914169312, + "rewards_train/rejected": -2.571803569793701, + "step": 2075 + }, + { + "epoch": 0.58, + "learning_rate": 6.500269744800469e-08, + "loss": 0.4249, + "step": 2076 + }, + { + "epoch": 0.58, + "logps_train/chosen": -39.61121368408203, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -25.125, + "logps_train/rejected": -44.971614837646484, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.40379706025123596, + "rewards_train/margins": 1.5770559012889862, + "rewards_train/rejected": -1.9808529615402222, + "step": 2076 + }, + { + "epoch": 0.58, + "logps_train/chosen": -93.03768920898438, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -117.90193176269531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4691985845565796, + "rewards_train/margins": 1.1084946393966675, + "rewards_train/rejected": -2.577693223953247, + "step": 2077 + }, + { + "epoch": 0.58, + "learning_rate": 6.45869735880106e-08, + "loss": 0.3391, + "step": 2078 + }, + { + "epoch": 0.58, + "logps_train/chosen": -65.64482879638672, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -104.19233703613281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1625299453735352, + "rewards_train/margins": 1.9092426300048828, + "rewards_train/rejected": -3.071772575378418, + "step": 2078 + }, + { + "epoch": 0.58, + "logps_train/chosen": -108.32366943359375, + "logps_train/ref_chosen": -92.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -134.6239013671875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6147891283035278, + "rewards_train/margins": 3.0503371953964233, + "rewards_train/rejected": -4.665126323699951, + "step": 2079 + }, + { + "epoch": 0.58, + "learning_rate": 6.417238608954479e-08, + "loss": 0.3345, + "step": 2080 + }, + { + "epoch": 0.58, + "logps_train/chosen": -41.94489288330078, + "logps_train/ref_chosen": -38.75, + "logps_train/ref_rejected": -35.0, + "logps_train/rejected": -50.916587829589844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.32964563369750977, + "rewards_train/margins": 1.2667005062103271, + "rewards_train/rejected": -1.596346139907837, + "step": 2080 + }, + { + "epoch": 0.58, + "logps_train/chosen": -113.23661804199219, + "logps_train/ref_chosen": -94.5, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -136.02545166015625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8537397384643555, + "rewards_train/margins": 1.9238042831420898, + "rewards_train/rejected": -3.7775440216064453, + "step": 2081 + }, + { + "epoch": 0.58, + "learning_rate": 6.375893749353578e-08, + "loss": 0.3716, + "step": 2082 + }, + { + "epoch": 0.58, + "logps_train/chosen": -87.25921630859375, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -111.98489379882812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5462337732315063, + "rewards_train/margins": 1.2210057973861694, + "rewards_train/rejected": -2.767239570617676, + "step": 2082 + }, + { + "epoch": 0.58, + "logps_train/chosen": -85.58940887451172, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -100.89420318603516, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5526912212371826, + "rewards_train/margins": 1.297666311264038, + "rewards_train/rejected": -2.8503575325012207, + "step": 2083 + }, + { + "epoch": 0.58, + "learning_rate": 6.334663033393228e-08, + "loss": 0.3492, + "step": 2084 + }, + { + "epoch": 0.58, + "logps_train/chosen": -49.07570266723633, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -41.25, + "logps_train/rejected": -75.57756042480469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9356950521469116, + "rewards_train/margins": 2.489639401435852, + "rewards_train/rejected": -3.4253344535827637, + "step": 2084 + }, + { + "epoch": 0.58, + "logps_train/chosen": -43.674495697021484, + "logps_train/ref_chosen": -33.75, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -69.68412017822266, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9971615672111511, + "rewards_train/margins": 1.0604656338691711, + "rewards_train/rejected": -2.0576272010803223, + "step": 2085 + }, + { + "epoch": 0.58, + "learning_rate": 6.293546713768722e-08, + "loss": 0.3387, + "step": 2086 + }, + { + "epoch": 0.58, + "logps_train/chosen": -76.24918365478516, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -70.58163452148438, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4436683654785156, + "rewards_train/margins": 0.9539487361907959, + "rewards_train/rejected": -2.3976171016693115, + "step": 2086 + }, + { + "epoch": 0.58, + "logps_train/chosen": -92.84099578857422, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -137.44235229492188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9978206157684326, + "rewards_train/margins": 3.044461488723755, + "rewards_train/rejected": -4.0422821044921875, + "step": 2087 + }, + { + "epoch": 0.58, + "learning_rate": 6.252545042474246e-08, + "loss": 0.2887, + "step": 2088 + }, + { + "epoch": 0.58, + "logps_train/chosen": -162.59339904785156, + "logps_train/ref_chosen": -121.5, + "logps_train/ref_rejected": -106.5, + "logps_train/rejected": -161.24937438964844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -4.100355625152588, + "rewards_train/margins": 1.3937220573425293, + "rewards_train/rejected": -5.494077682495117, + "step": 2088 + }, + { + "epoch": 0.58, + "logps_train/chosen": -71.49720764160156, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -86.50640869140625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.618861436843872, + "rewards_train/margins": 0.9321701526641846, + "rewards_train/rejected": -2.5510315895080566, + "step": 2089 + }, + { + "epoch": 0.58, + "learning_rate": 6.211658270801315e-08, + "loss": 0.4803, + "step": 2090 + }, + { + "epoch": 0.58, + "logps_train/chosen": -112.76863861083984, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -109.97859191894531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.7280359268188477, + "rewards_train/margins": 1.2534170150756836, + "rewards_train/rejected": -3.9814529418945312, + "step": 2090 + }, + { + "epoch": 0.58, + "logps_train/chosen": -67.25273895263672, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -79.99827575683594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8928521275520325, + "rewards_train/margins": 1.448088824748993, + "rewards_train/rejected": -2.3409409523010254, + "step": 2091 + }, + { + "epoch": 0.58, + "learning_rate": 6.170886649337257e-08, + "loss": 0.3987, + "step": 2092 + }, + { + "epoch": 0.58, + "logps_train/chosen": -53.44761657714844, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -84.1935043334961, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5974960327148438, + "rewards_train/margins": 2.5835728645324707, + "rewards_train/rejected": -3.1810688972473145, + "step": 2092 + }, + { + "epoch": 0.58, + "logps_train/chosen": -93.46104431152344, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -123.36744689941406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6160262823104858, + "rewards_train/margins": 1.082436442375183, + "rewards_train/rejected": -2.698462724685669, + "step": 2093 + }, + { + "epoch": 0.59, + "learning_rate": 6.13023042796367e-08, + "loss": 0.313, + "step": 2094 + }, + { + "epoch": 0.59, + "logps_train/chosen": -66.57969665527344, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -89.05522155761719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0910753011703491, + "rewards_train/margins": 1.451165795326233, + "rewards_train/rejected": -2.542241096496582, + "step": 2094 + }, + { + "epoch": 0.59, + "logps_train/chosen": -85.72696685791016, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -97.59848022460938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.833828926086426, + "rewards_train/margins": 1.2093195915222168, + "rewards_train/rejected": -4.043148517608643, + "step": 2095 + }, + { + "epoch": 0.59, + "learning_rate": 6.089689855854869e-08, + "loss": 0.3826, + "step": 2096 + }, + { + "epoch": 0.59, + "logps_train/chosen": -69.51953125, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -65.06510925292969, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6810541152954102, + "rewards_train/margins": 0.38824963569641113, + "rewards_train/rejected": -2.0693037509918213, + "step": 2096 + }, + { + "epoch": 0.59, + "logps_train/chosen": -117.8916015625, + "logps_train/ref_chosen": -106.0, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -126.8585433959961, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.171191930770874, + "rewards_train/margins": 1.2963025569915771, + "rewards_train/rejected": -2.467494487762451, + "step": 2097 + }, + { + "epoch": 0.59, + "learning_rate": 6.04926518147639e-08, + "loss": 0.5213, + "step": 2098 + }, + { + "epoch": 0.59, + "logps_train/chosen": -100.62855529785156, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -114.99569702148438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.965492606163025, + "rewards_train/margins": 1.1067334413528442, + "rewards_train/rejected": -3.072226047515869, + "step": 2098 + }, + { + "epoch": 0.59, + "logps_train/chosen": -98.0699462890625, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -110.64087677001953, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.9958622455596924, + "rewards_train/margins": 0.7252566814422607, + "rewards_train/rejected": -2.721118927001953, + "step": 2099 + }, + { + "epoch": 0.59, + "learning_rate": 6.00895665258346e-08, + "loss": 0.5094, + "step": 2100 + }, + { + "epoch": 0.59, + "logps_train/chosen": -70.66618347167969, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -88.56083679199219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.539287567138672, + "rewards_train/margins": 0.6989374160766602, + "rewards_train/rejected": -3.238224983215332, + "step": 2100 + }, + { + "epoch": 0.59, + "logps_train/chosen": -65.05520629882812, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -76.51008605957031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4090361595153809, + "rewards_train/margins": 1.5167770385742188, + "rewards_train/rejected": -2.9258131980895996, + "step": 2101 + }, + { + "epoch": 0.59, + "learning_rate": 5.968764516219452e-08, + "loss": 0.409, + "step": 2102 + }, + { + "epoch": 0.59, + "logps_train/chosen": -67.32633972167969, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -71.25092315673828, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.139665126800537, + "rewards_train/margins": 0.6016381978988647, + "rewards_train/rejected": -1.7413033246994019, + "step": 2102 + }, + { + "epoch": 0.59, + "logps_train/chosen": -50.675941467285156, + "logps_train/ref_chosen": -37.5, + "logps_train/ref_rejected": -42.0, + "logps_train/rejected": -64.9217529296875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3148601055145264, + "rewards_train/margins": 0.9710648059844971, + "rewards_train/rejected": -2.2859249114990234, + "step": 2103 + }, + { + "epoch": 0.59, + "learning_rate": 5.9286890187144027e-08, + "loss": 0.4437, + "step": 2104 + }, + { + "epoch": 0.59, + "logps_train/chosen": -93.62046813964844, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -118.02957916259766, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.357945442199707, + "rewards_train/margins": 1.7239186763763428, + "rewards_train/rejected": -3.08186411857605, + "step": 2104 + }, + { + "epoch": 0.59, + "logps_train/chosen": -89.52161407470703, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -136.29417419433594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0435675382614136, + "rewards_train/margins": 3.3913198709487915, + "rewards_train/rejected": -4.434887409210205, + "step": 2105 + }, + { + "epoch": 0.59, + "learning_rate": 5.888730405683495e-08, + "loss": 0.1536, + "step": 2106 + }, + { + "epoch": 0.59, + "logps_train/chosen": -82.62612915039062, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -86.76184844970703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.272378921508789, + "rewards_train/margins": 1.7903294563293457, + "rewards_train/rejected": -3.0627083778381348, + "step": 2106 + }, + { + "epoch": 0.59, + "logps_train/chosen": -95.77322387695312, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -119.54524230957031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.112185001373291, + "rewards_train/margins": 2.22202730178833, + "rewards_train/rejected": -4.334212303161621, + "step": 2107 + }, + { + "epoch": 0.59, + "learning_rate": 5.848888922025552e-08, + "loss": 0.251, + "step": 2108 + }, + { + "epoch": 0.59, + "logps_train/chosen": -50.7303352355957, + "logps_train/ref_chosen": -41.75, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -72.72836303710938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9046742916107178, + "rewards_train/margins": 1.6324200630187988, + "rewards_train/rejected": -2.5370943546295166, + "step": 2108 + }, + { + "epoch": 0.59, + "logps_train/chosen": -31.666881561279297, + "logps_train/ref_chosen": -27.125, + "logps_train/ref_rejected": -30.875, + "logps_train/rejected": -39.66892623901367, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.44847527146339417, + "rewards_train/margins": 0.4336518943309784, + "rewards_train/rejected": -0.8821271657943726, + "step": 2109 + }, + { + "epoch": 0.59, + "learning_rate": 5.8091648119215105e-08, + "loss": 0.4292, + "step": 2110 + }, + { + "epoch": 0.59, + "logps_train/chosen": -68.48396301269531, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -92.3371810913086, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9716391563415527, + "rewards_train/margins": 1.34840726852417, + "rewards_train/rejected": -3.3200464248657227, + "step": 2110 + }, + { + "epoch": 0.59, + "logps_train/chosen": -35.47300338745117, + "logps_train/ref_chosen": -27.875, + "logps_train/ref_rejected": -26.625, + "logps_train/rejected": -39.582855224609375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7613626718521118, + "rewards_train/margins": 0.528563380241394, + "rewards_train/rejected": -1.2899260520935059, + "step": 2111 + }, + { + "epoch": 0.59, + "learning_rate": 5.769558318832965e-08, + "loss": 0.4616, + "step": 2112 + }, + { + "epoch": 0.59, + "logps_train/chosen": -85.51171875, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -124.2002944946289, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.2382328510284424, + "rewards_train/margins": 2.1925389766693115, + "rewards_train/rejected": -4.430771827697754, + "step": 2112 + }, + { + "epoch": 0.59, + "logps_train/chosen": -38.618709564208984, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -28.875, + "logps_train/rejected": -45.32367706298828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2907773554325104, + "rewards_train/margins": 1.3490119874477386, + "rewards_train/rejected": -1.639789342880249, + "step": 2113 + }, + { + "epoch": 0.59, + "learning_rate": 5.7300696855006684e-08, + "loss": 0.4409, + "step": 2114 + }, + { + "epoch": 0.59, + "logps_train/chosen": -69.73672485351562, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -117.43955993652344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9086329936981201, + "rewards_train/margins": 3.11686635017395, + "rewards_train/rejected": -4.02549934387207, + "step": 2114 + }, + { + "epoch": 0.59, + "logps_train/chosen": -49.12819290161133, + "logps_train/ref_chosen": -32.75, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -76.30802154541016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6368428468704224, + "rewards_train/margins": 1.4916154146194458, + "rewards_train/rejected": -3.128458261489868, + "step": 2115 + }, + { + "epoch": 0.59, + "learning_rate": 5.690699153942999e-08, + "loss": 0.3039, + "step": 2116 + }, + { + "epoch": 0.59, + "logps_train/chosen": -102.94654846191406, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -111.5, + "logps_train/rejected": -146.1102294921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5367438793182373, + "rewards_train/margins": 1.9109971523284912, + "rewards_train/rejected": -3.4477410316467285, + "step": 2116 + }, + { + "epoch": 0.59, + "logps_train/chosen": -79.35420227050781, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -117.54788208007812, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.7651073932647705, + "rewards_train/margins": 2.2947590351104736, + "rewards_train/rejected": -5.059866428375244, + "step": 2117 + }, + { + "epoch": 0.59, + "learning_rate": 5.6514469654545424e-08, + "loss": 0.3365, + "step": 2118 + }, + { + "epoch": 0.59, + "logps_train/chosen": -75.69719696044922, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -97.35427856445312, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6011650562286377, + "rewards_train/margins": 1.3848488330841064, + "rewards_train/rejected": -2.986013889312744, + "step": 2118 + }, + { + "epoch": 0.59, + "logps_train/chosen": -108.6153793334961, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -102.6696548461914, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7549954652786255, + "rewards_train/margins": 1.5508369207382202, + "rewards_train/rejected": -3.3058323860168457, + "step": 2119 + }, + { + "epoch": 0.59, + "learning_rate": 5.61231336060457e-08, + "loss": 0.3585, + "step": 2120 + }, + { + "epoch": 0.59, + "logps_train/chosen": -74.99488830566406, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -89.0638427734375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.1393327713012695, + "rewards_train/margins": 0.5445904731750488, + "rewards_train/rejected": -2.6839232444763184, + "step": 2120 + }, + { + "epoch": 0.59, + "logps_train/chosen": -92.38201904296875, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -104.81034088134766, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.167889356613159, + "rewards_train/margins": 1.4100196361541748, + "rewards_train/rejected": -3.577908992767334, + "step": 2121 + }, + { + "epoch": 0.59, + "learning_rate": 5.573298579235586e-08, + "loss": 0.442, + "step": 2122 + }, + { + "epoch": 0.59, + "logps_train/chosen": -34.443363189697266, + "logps_train/ref_chosen": -27.25, + "logps_train/ref_rejected": -33.25, + "logps_train/rejected": -55.34367370605469, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.7206120491027832, + "rewards_train/margins": 1.4889566898345947, + "rewards_train/rejected": -2.209568738937378, + "step": 2122 + }, + { + "epoch": 0.59, + "logps_train/chosen": -71.74280548095703, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -99.73445129394531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0570931434631348, + "rewards_train/margins": 2.6399850845336914, + "rewards_train/rejected": -3.697078227996826, + "step": 2123 + }, + { + "epoch": 0.59, + "learning_rate": 5.534402860461823e-08, + "loss": 0.3885, + "step": 2124 + }, + { + "epoch": 0.59, + "logps_train/chosen": -47.78821563720703, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -35.5, + "logps_train/rejected": -54.90350341796875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.218665599822998, + "rewards_train/margins": 0.7177789211273193, + "rewards_train/rejected": -1.9364445209503174, + "step": 2124 + }, + { + "epoch": 0.59, + "logps_train/chosen": -74.49174499511719, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -77.91828918457031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7401896715164185, + "rewards_train/margins": 0.9453891515731812, + "rewards_train/rejected": -2.6855788230895996, + "step": 2125 + }, + { + "epoch": 0.59, + "learning_rate": 5.495626442667825e-08, + "loss": 0.4826, + "step": 2126 + }, + { + "epoch": 0.59, + "logps_train/chosen": -58.563819885253906, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -37.75, + "logps_train/rejected": -58.89781188964844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.5790258646011353, + "rewards_train/margins": 0.5388680696487427, + "rewards_train/rejected": -2.117893934249878, + "step": 2126 + }, + { + "epoch": 0.59, + "logps_train/chosen": -52.63378143310547, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -97.5623550415039, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1653310060501099, + "rewards_train/margins": 2.3487173318862915, + "rewards_train/rejected": -3.5140483379364014, + "step": 2127 + }, + { + "epoch": 0.59, + "learning_rate": 5.456969563506966e-08, + "loss": 0.3885, + "step": 2128 + }, + { + "epoch": 0.59, + "logps_train/chosen": -73.58062744140625, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -98.3315658569336, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.586431860923767, + "rewards_train/margins": 1.281685471534729, + "rewards_train/rejected": -2.868117332458496, + "step": 2128 + }, + { + "epoch": 0.6, + "logps_train/chosen": -53.447994232177734, + "logps_train/ref_chosen": -47.25, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -81.51532745361328, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.6170652508735657, + "rewards_train/margins": 1.7473577857017517, + "rewards_train/rejected": -2.3644230365753174, + "step": 2129 + }, + { + "epoch": 0.6, + "learning_rate": 5.418432459899963e-08, + "loss": 0.424, + "step": 2130 + }, + { + "epoch": 0.6, + "logps_train/chosen": -64.28777313232422, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -88.68441772460938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3242729902267456, + "rewards_train/margins": 2.565055727958679, + "rewards_train/rejected": -3.889328718185425, + "step": 2130 + }, + { + "epoch": 0.6, + "logps_train/chosen": -74.45040893554688, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -54.25, + "logps_train/rejected": -80.51853942871094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1805874109268188, + "rewards_train/margins": 1.4519306421279907, + "rewards_train/rejected": -2.6325180530548096, + "step": 2131 + }, + { + "epoch": 0.6, + "learning_rate": 5.380015368033475e-08, + "loss": 0.3344, + "step": 2132 + }, + { + "epoch": 0.6, + "logps_train/chosen": -52.31156539916992, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -64.6455078125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.0885294675827026, + "rewards_train/margins": 1.076412320137024, + "rewards_train/rejected": -2.1649417877197266, + "step": 2132 + }, + { + "epoch": 0.6, + "logps_train/chosen": -105.92382049560547, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -97.5, + "logps_train/rejected": -130.3473663330078, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1806633472442627, + "rewards_train/margins": 2.1187217235565186, + "rewards_train/rejected": -3.2993850708007812, + "step": 2133 + }, + { + "epoch": 0.6, + "learning_rate": 5.341718523358624e-08, + "loss": 0.4098, + "step": 2134 + }, + { + "epoch": 0.6, + "logps_train/chosen": -62.77705383300781, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -65.87120819091797, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.801142692565918, + "rewards_train/margins": 0.44847822189331055, + "rewards_train/rejected": -2.2496209144592285, + "step": 2134 + }, + { + "epoch": 0.6, + "logps_train/chosen": -75.49212646484375, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -116.18441772460938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.114837646484375, + "rewards_train/margins": 2.6766507625579834, + "rewards_train/rejected": -3.7914884090423584, + "step": 2135 + }, + { + "epoch": 0.6, + "learning_rate": 5.3035421605895694e-08, + "loss": 0.3943, + "step": 2136 + }, + { + "epoch": 0.6, + "logps_train/chosen": -79.04789733886719, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -89.93489837646484, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.0745160579681396, + "rewards_train/margins": 1.3844029903411865, + "rewards_train/rejected": -3.458919048309326, + "step": 2136 + }, + { + "epoch": 0.6, + "logps_train/chosen": -71.12640380859375, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -83.72441864013672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9622499942779541, + "rewards_train/margins": 1.8742544651031494, + "rewards_train/rejected": -2.8365044593811035, + "step": 2137 + }, + { + "epoch": 0.6, + "learning_rate": 5.265486513702036e-08, + "loss": 0.2997, + "step": 2138 + }, + { + "epoch": 0.6, + "logps_train/chosen": -74.46654510498047, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -93.89525604248047, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.1700921058654785, + "rewards_train/margins": 0.9196293354034424, + "rewards_train/rejected": -3.089721441268921, + "step": 2138 + }, + { + "epoch": 0.6, + "logps_train/chosen": -67.54258728027344, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -98.41825103759766, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2925889492034912, + "rewards_train/margins": 2.2851741313934326, + "rewards_train/rejected": -3.577763080596924, + "step": 2139 + }, + { + "epoch": 0.6, + "learning_rate": 5.2275518159319244e-08, + "loss": 0.4414, + "step": 2140 + }, + { + "epoch": 0.6, + "logps_train/chosen": -100.00669860839844, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -120.00212860107422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2702014446258545, + "rewards_train/margins": 2.686846971511841, + "rewards_train/rejected": -3.9570484161376953, + "step": 2140 + }, + { + "epoch": 0.6, + "logps_train/chosen": -61.78115463256836, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -75.81358337402344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.4771389961242676, + "rewards_train/margins": 0.6557817459106445, + "rewards_train/rejected": -2.132920742034912, + "step": 2141 + }, + { + "epoch": 0.6, + "learning_rate": 5.189738299773863e-08, + "loss": 0.3366, + "step": 2142 + }, + { + "epoch": 0.6, + "logps_train/chosen": -57.79534912109375, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -80.45194244384766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1432068347930908, + "rewards_train/margins": 1.410581350326538, + "rewards_train/rejected": -2.553788185119629, + "step": 2142 + }, + { + "epoch": 0.6, + "logps_train/chosen": -52.1115608215332, + "logps_train/ref_chosen": -36.0, + "logps_train/ref_rejected": -44.75, + "logps_train/rejected": -68.91043090820312, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6092031002044678, + "rewards_train/margins": 0.7982456684112549, + "rewards_train/rejected": -2.4074487686157227, + "step": 2143 + }, + { + "epoch": 0.6, + "learning_rate": 5.152046196979756e-08, + "loss": 0.381, + "step": 2144 + }, + { + "epoch": 0.6, + "logps_train/chosen": -67.82266235351562, + "logps_train/ref_chosen": -56.75, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -78.76952362060547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1142973899841309, + "rewards_train/margins": 0.46704959869384766, + "rewards_train/rejected": -1.5813469886779785, + "step": 2144 + }, + { + "epoch": 0.6, + "logps_train/chosen": -90.55293273925781, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -112.4375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9808785915374756, + "rewards_train/margins": 1.3691213130950928, + "rewards_train/rejected": -3.3499999046325684, + "step": 2145 + }, + { + "epoch": 0.6, + "learning_rate": 5.114475738557414e-08, + "loss": 0.4408, + "step": 2146 + }, + { + "epoch": 0.6, + "logps_train/chosen": -80.70329284667969, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -97.93040466308594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.1353681087493896, + "rewards_train/margins": 1.5688543319702148, + "rewards_train/rejected": -3.7042224407196045, + "step": 2146 + }, + { + "epoch": 0.6, + "logps_train/chosen": -94.27971649169922, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -118.33087158203125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9658623933792114, + "rewards_train/margins": 2.504724383354187, + "rewards_train/rejected": -4.470586776733398, + "step": 2147 + }, + { + "epoch": 0.6, + "learning_rate": 5.077027154769106e-08, + "loss": 0.2933, + "step": 2148 + }, + { + "epoch": 0.6, + "logps_train/chosen": -71.42758178710938, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -63.5, + "logps_train/rejected": -91.78971099853516, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.768930196762085, + "rewards_train/margins": 1.0479319095611572, + "rewards_train/rejected": -2.816862106323242, + "step": 2148 + }, + { + "epoch": 0.6, + "logps_train/chosen": -116.45526123046875, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -124.108642578125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.3264341354370117, + "rewards_train/margins": 1.762359619140625, + "rewards_train/rejected": -4.088793754577637, + "step": 2149 + }, + { + "epoch": 0.6, + "learning_rate": 5.039700675130143e-08, + "loss": 0.4622, + "step": 2150 + }, + { + "epoch": 0.6, + "logps_train/chosen": -47.57038116455078, + "logps_train/ref_chosen": -38.25, + "logps_train/ref_rejected": -40.75, + "logps_train/rejected": -63.25647735595703, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9398505687713623, + "rewards_train/margins": 1.3225162029266357, + "rewards_train/rejected": -2.262366771697998, + "step": 2150 + }, + { + "epoch": 0.6, + "logps_train/chosen": -75.62815856933594, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -103.50210571289062, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8766829967498779, + "rewards_train/margins": 2.130559206008911, + "rewards_train/rejected": -3.007242202758789, + "step": 2151 + }, + { + "epoch": 0.6, + "learning_rate": 5.002496528407493e-08, + "loss": 0.3644, + "step": 2152 + }, + { + "epoch": 0.6, + "logps_train/chosen": -70.53668212890625, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -96.66780090332031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0055909156799316, + "rewards_train/margins": 1.2736575603485107, + "rewards_train/rejected": -2.2792484760284424, + "step": 2152 + }, + { + "epoch": 0.6, + "logps_train/chosen": -86.23835754394531, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -63.5, + "logps_train/rejected": -97.13589477539062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3457109928131104, + "rewards_train/margins": 2.0218327045440674, + "rewards_train/rejected": -3.3675436973571777, + "step": 2153 + }, + { + "epoch": 0.6, + "learning_rate": 4.9654149426183675e-08, + "loss": 0.3179, + "step": 2154 + }, + { + "epoch": 0.6, + "logps_train/chosen": -94.10749816894531, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -113.29045104980469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.027156114578247, + "rewards_train/margins": 1.1034526824951172, + "rewards_train/rejected": -3.1306087970733643, + "step": 2154 + }, + { + "epoch": 0.6, + "logps_train/chosen": -41.602203369140625, + "logps_train/ref_chosen": -32.25, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -72.07321166992188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9360989928245544, + "rewards_train/margins": 0.8774717450141907, + "rewards_train/rejected": -1.8135707378387451, + "step": 2155 + }, + { + "epoch": 0.6, + "learning_rate": 4.9284561450288336e-08, + "loss": 0.4909, + "step": 2156 + }, + { + "epoch": 0.6, + "logps_train/chosen": -50.188743591308594, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -67.2891616821289, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8723899126052856, + "rewards_train/margins": 1.5221515893936157, + "rewards_train/rejected": -2.3945415019989014, + "step": 2156 + }, + { + "epoch": 0.6, + "logps_train/chosen": -117.48445129394531, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -145.0334930419922, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.6066479682922363, + "rewards_train/margins": 1.5853729248046875, + "rewards_train/rejected": -4.192020893096924, + "step": 2157 + }, + { + "epoch": 0.6, + "learning_rate": 4.8916203621523846e-08, + "loss": 0.6095, + "step": 2158 + }, + { + "epoch": 0.6, + "logps_train/chosen": -112.71588134765625, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -141.0, + "logps_train/rejected": -177.91049194335938, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -2.37471342086792, + "rewards_train/margins": 1.3186795711517334, + "rewards_train/rejected": -3.6933929920196533, + "step": 2158 + }, + { + "epoch": 0.6, + "logps_train/chosen": -77.97830200195312, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -108.66264343261719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3766385316848755, + "rewards_train/margins": 1.66618812084198, + "rewards_train/rejected": -3.0428266525268555, + "step": 2159 + }, + { + "epoch": 0.6, + "learning_rate": 4.854907819748605e-08, + "loss": 0.4645, + "step": 2160 + }, + { + "epoch": 0.6, + "logps_train/chosen": -76.81294250488281, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -89.4246826171875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4867630004882812, + "rewards_train/margins": 0.7822675704956055, + "rewards_train/rejected": -2.2690305709838867, + "step": 2160 + }, + { + "epoch": 0.6, + "logps_train/chosen": -51.17985153198242, + "logps_train/ref_chosen": -40.25, + "logps_train/ref_rejected": -41.75, + "logps_train/rejected": -55.96266174316406, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.0998210906982422, + "rewards_train/margins": 0.31167924404144287, + "rewards_train/rejected": -1.411500334739685, + "step": 2161 + }, + { + "epoch": 0.6, + "learning_rate": 4.8183187428217644e-08, + "loss": 0.5627, + "step": 2162 + }, + { + "epoch": 0.6, + "logps_train/chosen": -89.8934326171875, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -97.5, + "logps_train/rejected": -136.94467163085938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2463749647140503, + "rewards_train/margins": 2.6863728761672974, + "rewards_train/rejected": -3.9327478408813477, + "step": 2162 + }, + { + "epoch": 0.6, + "logps_train/chosen": -66.49400329589844, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -77.09996032714844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8236675262451172, + "rewards_train/margins": 0.727588415145874, + "rewards_train/rejected": -2.551255941390991, + "step": 2163 + }, + { + "epoch": 0.6, + "learning_rate": 4.781853355619414e-08, + "loss": 0.3154, + "step": 2164 + }, + { + "epoch": 0.6, + "logps_train/chosen": -95.30115509033203, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -91.5, + "logps_train/rejected": -138.87399291992188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.670496940612793, + "rewards_train/margins": 2.0684638023376465, + "rewards_train/rejected": -4.7389607429504395, + "step": 2164 + }, + { + "epoch": 0.61, + "logps_train/chosen": -66.40122985839844, + "logps_train/ref_chosen": -56.75, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -92.26812744140625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9758161306381226, + "rewards_train/margins": 1.6861530542373657, + "rewards_train/rejected": -2.6619691848754883, + "step": 2165 + }, + { + "epoch": 0.61, + "learning_rate": 4.745511881631048e-08, + "loss": 0.3928, + "step": 2166 + }, + { + "epoch": 0.61, + "logps_train/chosen": -63.793094635009766, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -89.9686279296875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8122196197509766, + "rewards_train/margins": 2.1561279296875, + "rewards_train/rejected": -2.9683475494384766, + "step": 2166 + }, + { + "epoch": 0.61, + "logps_train/chosen": -73.35771179199219, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -101.32657623291016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8987107276916504, + "rewards_train/margins": 1.4831657409667969, + "rewards_train/rejected": -3.3818764686584473, + "step": 2167 + }, + { + "epoch": 0.61, + "learning_rate": 4.7092945435867196e-08, + "loss": 0.3073, + "step": 2168 + }, + { + "epoch": 0.61, + "logps_train/chosen": -75.34603881835938, + "logps_train/ref_chosen": -56.75, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -84.72315216064453, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8536467552185059, + "rewards_train/margins": 0.474332332611084, + "rewards_train/rejected": -2.32797908782959, + "step": 2168 + }, + { + "epoch": 0.61, + "logps_train/chosen": -62.41304016113281, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -89.46566009521484, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.763472080230713, + "rewards_train/margins": 1.571765661239624, + "rewards_train/rejected": -3.335237741470337, + "step": 2169 + }, + { + "epoch": 0.61, + "learning_rate": 4.673201563455681e-08, + "loss": 0.4618, + "step": 2170 + }, + { + "epoch": 0.61, + "logps_train/chosen": -71.67170715332031, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -90.49404907226562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.721272349357605, + "rewards_train/margins": 1.2793046236038208, + "rewards_train/rejected": -3.000576972961426, + "step": 2170 + }, + { + "epoch": 0.61, + "logps_train/chosen": -80.6717529296875, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -106.77967834472656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6156131029129028, + "rewards_train/margins": 2.0881351232528687, + "rewards_train/rejected": -3.7037482261657715, + "step": 2171 + }, + { + "epoch": 0.61, + "learning_rate": 4.637233162445001e-08, + "loss": 0.3808, + "step": 2172 + }, + { + "epoch": 0.61, + "logps_train/chosen": -74.60847473144531, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -90.58171844482422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9211989641189575, + "rewards_train/margins": 1.737754225730896, + "rewards_train/rejected": -2.6589531898498535, + "step": 2172 + }, + { + "epoch": 0.61, + "logps_train/chosen": -67.87394714355469, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -81.95368957519531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.28134024143219, + "rewards_train/margins": 0.9876612424850464, + "rewards_train/rejected": -2.2690014839172363, + "step": 2173 + }, + { + "epoch": 0.61, + "learning_rate": 4.601389560998239e-08, + "loss": 0.419, + "step": 2174 + }, + { + "epoch": 0.61, + "logps_train/chosen": -53.42560577392578, + "logps_train/ref_chosen": -41.75, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -76.20135498046875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1635569334030151, + "rewards_train/margins": 1.0972038507461548, + "rewards_train/rejected": -2.26076078414917, + "step": 2174 + }, + { + "epoch": 0.61, + "logps_train/chosen": -94.52764892578125, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -99.2550048828125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5938782691955566, + "rewards_train/margins": 2.3807923793792725, + "rewards_train/rejected": -3.974670648574829, + "step": 2175 + }, + { + "epoch": 0.61, + "learning_rate": 4.5656709787940924e-08, + "loss": 0.4072, + "step": 2176 + }, + { + "epoch": 0.61, + "logps_train/chosen": -57.778236389160156, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -35.0, + "logps_train/rejected": -60.36033248901367, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8884190320968628, + "rewards_train/margins": 0.6456612348556519, + "rewards_train/rejected": -2.5340802669525146, + "step": 2176 + }, + { + "epoch": 0.61, + "logps_train/chosen": -66.49990844726562, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -69.87161254882812, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.4437408447265625, + "rewards_train/margins": 1.4879522323608398, + "rewards_train/rejected": -1.9316930770874023, + "step": 2177 + }, + { + "epoch": 0.61, + "learning_rate": 4.5300776347450054e-08, + "loss": 0.5362, + "step": 2178 + }, + { + "epoch": 0.61, + "logps_train/chosen": -88.06047058105469, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -82.39453125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.2029218673706055, + "rewards_train/margins": 0.9341874122619629, + "rewards_train/rejected": -3.1371092796325684, + "step": 2178 + }, + { + "epoch": 0.61, + "logps_train/chosen": -89.73477935791016, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -110.08747100830078, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.0811927318573, + "rewards_train/margins": 1.9843904972076416, + "rewards_train/rejected": -4.065583229064941, + "step": 2179 + }, + { + "epoch": 0.61, + "learning_rate": 4.494609746995895e-08, + "loss": 0.4428, + "step": 2180 + }, + { + "epoch": 0.61, + "logps_train/chosen": -59.022613525390625, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -80.35166931152344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3339993953704834, + "rewards_train/margins": 1.9964802265167236, + "rewards_train/rejected": -3.330479621887207, + "step": 2180 + }, + { + "epoch": 0.61, + "logps_train/chosen": -78.413818359375, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -73.18446350097656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.560522437095642, + "rewards_train/margins": 0.9602681398391724, + "rewards_train/rejected": -2.5207905769348145, + "step": 2181 + }, + { + "epoch": 0.61, + "learning_rate": 4.4592675329227644e-08, + "loss": 0.4098, + "step": 2182 + }, + { + "epoch": 0.61, + "logps_train/chosen": -97.47107696533203, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -146.03619384765625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.363318681716919, + "rewards_train/margins": 3.4576833248138428, + "rewards_train/rejected": -4.821002006530762, + "step": 2182 + }, + { + "epoch": 0.61, + "logps_train/chosen": -84.58187866210938, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -41.0, + "logps_train/rejected": -75.60816955566406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.045687675476074, + "rewards_train/margins": 1.422257900238037, + "rewards_train/rejected": -3.4679455757141113, + "step": 2183 + }, + { + "epoch": 0.61, + "learning_rate": 4.424051209131399e-08, + "loss": 0.2827, + "step": 2184 + }, + { + "epoch": 0.61, + "logps_train/chosen": -67.81340026855469, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -86.39239501953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1239666938781738, + "rewards_train/margins": 1.7319717407226562, + "rewards_train/rejected": -2.85593843460083, + "step": 2184 + }, + { + "epoch": 0.61, + "logps_train/chosen": -102.90721130371094, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -134.3486328125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3950178623199463, + "rewards_train/margins": 1.9812510013580322, + "rewards_train/rejected": -3.3762688636779785, + "step": 2185 + }, + { + "epoch": 0.61, + "learning_rate": 4.388960991455998e-08, + "loss": 0.2786, + "step": 2186 + }, + { + "epoch": 0.61, + "logps_train/chosen": -94.52255249023438, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -104.25389099121094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6331144571304321, + "rewards_train/margins": 1.6272355318069458, + "rewards_train/rejected": -3.260349988937378, + "step": 2186 + }, + { + "epoch": 0.61, + "logps_train/chosen": -89.76007080078125, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -129.9043731689453, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8854801654815674, + "rewards_train/margins": 2.8222415447235107, + "rewards_train/rejected": -4.707721710205078, + "step": 2187 + }, + { + "epoch": 0.61, + "learning_rate": 4.353997094957903e-08, + "loss": 0.267, + "step": 2188 + }, + { + "epoch": 0.61, + "logps_train/chosen": -72.3919677734375, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -82.99869537353516, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7066773176193237, + "rewards_train/margins": 1.5084270238876343, + "rewards_train/rejected": -2.215104341506958, + "step": 2188 + }, + { + "epoch": 0.61, + "logps_train/chosen": -92.11982727050781, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -119.62309265136719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.272139072418213, + "rewards_train/margins": 1.5995445251464844, + "rewards_train/rejected": -2.8716835975646973, + "step": 2189 + }, + { + "epoch": 0.61, + "learning_rate": 4.31915973392426e-08, + "loss": 0.344, + "step": 2190 + }, + { + "epoch": 0.61, + "logps_train/chosen": -61.91041564941406, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -102.25560760498047, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.4523700177669525, + "rewards_train/margins": 2.0759247839450836, + "rewards_train/rejected": -2.528294801712036, + "step": 2190 + }, + { + "epoch": 0.61, + "logps_train/chosen": -70.85047912597656, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -85.49187469482422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0537985563278198, + "rewards_train/margins": 2.0175567865371704, + "rewards_train/rejected": -3.0713553428649902, + "step": 2191 + }, + { + "epoch": 0.61, + "learning_rate": 4.2844491218666986e-08, + "loss": 0.2707, + "step": 2192 + }, + { + "epoch": 0.61, + "logps_train/chosen": -86.39334869384766, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -103.11177062988281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6780071258544922, + "rewards_train/margins": 1.2112946510314941, + "rewards_train/rejected": -2.8893017768859863, + "step": 2192 + }, + { + "epoch": 0.61, + "logps_train/chosen": -73.32200622558594, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -93.06844329833984, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7441143989562988, + "rewards_train/margins": 0.5579445362091064, + "rewards_train/rejected": -2.3020589351654053, + "step": 2193 + }, + { + "epoch": 0.61, + "learning_rate": 4.2498654715200115e-08, + "loss": 0.4912, + "step": 2194 + }, + { + "epoch": 0.61, + "logps_train/chosen": -93.79214477539062, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -104.29537963867188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5503088235855103, + "rewards_train/margins": 2.2485655546188354, + "rewards_train/rejected": -3.7988743782043457, + "step": 2194 + }, + { + "epoch": 0.61, + "logps_train/chosen": -78.31034088134766, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -82.18409729003906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.7984168529510498, + "rewards_train/margins": 0.5113992691040039, + "rewards_train/rejected": -2.3098161220550537, + "step": 2195 + }, + { + "epoch": 0.61, + "learning_rate": 4.2154089948408855e-08, + "loss": 0.45, + "step": 2196 + }, + { + "epoch": 0.61, + "logps_train/chosen": -49.43216323852539, + "logps_train/ref_chosen": -46.5, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -57.50679397583008, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.30405592918395996, + "rewards_train/margins": 0.6392014026641846, + "rewards_train/rejected": -0.9432573318481445, + "step": 2196 + }, + { + "epoch": 0.61, + "logps_train/chosen": -84.24931335449219, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -113.93408966064453, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6593858003616333, + "rewards_train/margins": 2.4887903928756714, + "rewards_train/rejected": -4.148176193237305, + "step": 2197 + }, + { + "epoch": 0.61, + "learning_rate": 4.181079903006587e-08, + "loss": 0.3739, + "step": 2198 + }, + { + "epoch": 0.61, + "logps_train/chosen": -104.45442199707031, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -123.62088775634766, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6325514316558838, + "rewards_train/margins": 1.2787561416625977, + "rewards_train/rejected": -2.9113075733184814, + "step": 2198 + }, + { + "epoch": 0.61, + "logps_train/chosen": -85.09977722167969, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -101.44914245605469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3189617395401, + "rewards_train/margins": 1.197046160697937, + "rewards_train/rejected": -2.516007900238037, + "step": 2199 + }, + { + "epoch": 0.61, + "learning_rate": 4.1468784064136424e-08, + "loss": 0.3595, + "step": 2200 + }, + { + "epoch": 0.61, + "logps_train/chosen": -57.185577392578125, + "logps_train/ref_chosen": -40.25, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -86.12376403808594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.68418288230896, + "rewards_train/margins": 0.493818998336792, + "rewards_train/rejected": -2.178001880645752, + "step": 2200 + }, + { + "epoch": 0.62, + "logps_train/chosen": -63.62837600708008, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -44.75, + "logps_train/rejected": -68.22294616699219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9917436838150024, + "rewards_train/margins": 1.3664888143539429, + "rewards_train/rejected": -2.3582324981689453, + "step": 2201 + }, + { + "epoch": 0.62, + "learning_rate": 4.112804714676593e-08, + "loss": 0.5028, + "step": 2202 + }, + { + "epoch": 0.62, + "logps_train/chosen": -74.61570739746094, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -86.88809204101562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5113760232925415, + "rewards_train/margins": 1.9196211099624634, + "rewards_train/rejected": -3.430997133255005, + "step": 2202 + }, + { + "epoch": 0.62, + "logps_train/chosen": -73.47591400146484, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -123.33708190917969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3405604362487793, + "rewards_train/margins": 2.7712721824645996, + "rewards_train/rejected": -4.111832618713379, + "step": 2203 + }, + { + "epoch": 0.62, + "learning_rate": 4.078859036626675e-08, + "loss": 0.3244, + "step": 2204 + }, + { + "epoch": 0.62, + "logps_train/chosen": -78.50559997558594, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -102.9476318359375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0773179531097412, + "rewards_train/margins": 1.9715468883514404, + "rewards_train/rejected": -3.0488648414611816, + "step": 2204 + }, + { + "epoch": 0.62, + "logps_train/chosen": -99.85668182373047, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -122.10873413085938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9958239793777466, + "rewards_train/margins": 1.4857524633407593, + "rewards_train/rejected": -3.481576442718506, + "step": 2205 + }, + { + "epoch": 0.62, + "learning_rate": 4.045041580310568e-08, + "loss": 0.3996, + "step": 2206 + }, + { + "epoch": 0.62, + "logps_train/chosen": -91.04232788085938, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -128.8021240234375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.47298282384872437, + "rewards_train/margins": 2.8220723271369934, + "rewards_train/rejected": -3.2950551509857178, + "step": 2206 + }, + { + "epoch": 0.62, + "logps_train/chosen": -60.48843765258789, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -85.99115753173828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.032559871673584, + "rewards_train/margins": 2.402102470397949, + "rewards_train/rejected": -3.434662342071533, + "step": 2207 + }, + { + "epoch": 0.62, + "learning_rate": 4.011352552989081e-08, + "loss": 0.2627, + "step": 2208 + }, + { + "epoch": 0.62, + "logps_train/chosen": -78.32228088378906, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -98.48217010498047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.222853422164917, + "rewards_train/margins": 2.1152076721191406, + "rewards_train/rejected": -3.3380610942840576, + "step": 2208 + }, + { + "epoch": 0.62, + "logps_train/chosen": -121.11969757080078, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -110.13211059570312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.695563793182373, + "rewards_train/margins": 1.7629599571228027, + "rewards_train/rejected": -4.458523750305176, + "step": 2209 + }, + { + "epoch": 0.62, + "learning_rate": 3.977792161135926e-08, + "loss": 0.2788, + "step": 2210 + }, + { + "epoch": 0.62, + "logps_train/chosen": -84.21195983886719, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -85.9394302368164, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7102582454681396, + "rewards_train/margins": 0.7754814624786377, + "rewards_train/rejected": -2.4857397079467773, + "step": 2210 + }, + { + "epoch": 0.62, + "logps_train/chosen": -52.243473052978516, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -56.98064422607422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7221986651420593, + "rewards_train/margins": 1.2147327065467834, + "rewards_train/rejected": -1.9369313716888428, + "step": 2211 + }, + { + "epoch": 0.62, + "learning_rate": 3.9443606104364285e-08, + "loss": 0.4414, + "step": 2212 + }, + { + "epoch": 0.62, + "logps_train/chosen": -98.57444763183594, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -121.17247009277344, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -2.9412338733673096, + "rewards_train/margins": 0.9576056003570557, + "rewards_train/rejected": -3.8988394737243652, + "step": 2212 + }, + { + "epoch": 0.62, + "logps_train/chosen": -79.18292999267578, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -112.39274597167969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3831367492675781, + "rewards_train/margins": 2.326646327972412, + "rewards_train/rejected": -3.7097830772399902, + "step": 2213 + }, + { + "epoch": 0.62, + "learning_rate": 3.911058105786261e-08, + "loss": 0.4105, + "step": 2214 + }, + { + "epoch": 0.62, + "logps_train/chosen": -78.99240112304688, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -100.45001220703125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.66408371925354, + "rewards_train/margins": 1.2137305736541748, + "rewards_train/rejected": -2.877814292907715, + "step": 2214 + }, + { + "epoch": 0.62, + "logps_train/chosen": -72.00189971923828, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -80.90254974365234, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5917916297912598, + "rewards_train/margins": 1.5211198329925537, + "rewards_train/rejected": -3.1129114627838135, + "step": 2215 + }, + { + "epoch": 0.62, + "learning_rate": 3.877884851290206e-08, + "loss": 0.3255, + "step": 2216 + }, + { + "epoch": 0.62, + "logps_train/chosen": -65.60671997070312, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -86.0708999633789, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1294224262237549, + "rewards_train/margins": 1.5378239154815674, + "rewards_train/rejected": -2.6672463417053223, + "step": 2216 + }, + { + "epoch": 0.62, + "logps_train/chosen": -64.95185089111328, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -88.27638244628906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1026068925857544, + "rewards_train/margins": 1.2574526071548462, + "rewards_train/rejected": -2.3600594997406006, + "step": 2217 + }, + { + "epoch": 0.62, + "learning_rate": 3.844841050260897e-08, + "loss": 0.3354, + "step": 2218 + }, + { + "epoch": 0.62, + "logps_train/chosen": -85.91168212890625, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -114.42323303222656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.7695856094360352, + "rewards_train/margins": 1.7224445343017578, + "rewards_train/rejected": -3.492030143737793, + "step": 2218 + }, + { + "epoch": 0.62, + "logps_train/chosen": -51.87825012207031, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -41.5, + "logps_train/rejected": -67.33705139160156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8430984616279602, + "rewards_train/margins": 1.746954619884491, + "rewards_train/rejected": -2.590053081512451, + "step": 2219 + }, + { + "epoch": 0.62, + "learning_rate": 3.811926905217574e-08, + "loss": 0.4107, + "step": 2220 + }, + { + "epoch": 0.62, + "logps_train/chosen": -91.75385284423828, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -119.37815856933594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.2437448501586914, + "rewards_train/margins": 2.267899513244629, + "rewards_train/rejected": -4.51164436340332, + "step": 2220 + }, + { + "epoch": 0.62, + "logps_train/chosen": -127.87559509277344, + "logps_train/ref_chosen": -96.5, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -147.115478515625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -3.1203718185424805, + "rewards_train/margins": 2.3958635330200195, + "rewards_train/rejected": -5.5162353515625, + "step": 2221 + }, + { + "epoch": 0.62, + "learning_rate": 3.779142617884823e-08, + "loss": 0.3116, + "step": 2222 + }, + { + "epoch": 0.62, + "logps_train/chosen": -75.1016845703125, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -91.29086303710938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.95001220703125, + "rewards_train/margins": 1.3796601295471191, + "rewards_train/rejected": -3.329672336578369, + "step": 2222 + }, + { + "epoch": 0.62, + "logps_train/chosen": -82.7794189453125, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -101.03459167480469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0894654989242554, + "rewards_train/margins": 1.869852900505066, + "rewards_train/rejected": -2.9593183994293213, + "step": 2223 + }, + { + "epoch": 0.62, + "learning_rate": 3.746488389191371e-08, + "loss": 0.2689, + "step": 2224 + }, + { + "epoch": 0.62, + "logps_train/chosen": -83.6978759765625, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -119.25350189208984, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.9529907703399658, + "rewards_train/margins": 2.4247024059295654, + "rewards_train/rejected": -4.377693176269531, + "step": 2224 + }, + { + "epoch": 0.62, + "logps_train/chosen": -36.573944091796875, + "logps_train/ref_chosen": -28.0, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -59.56032943725586, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8517303466796875, + "rewards_train/margins": 1.0855525732040405, + "rewards_train/rejected": -1.937282919883728, + "step": 2225 + }, + { + "epoch": 0.62, + "learning_rate": 3.7139644192688474e-08, + "loss": 0.315, + "step": 2226 + }, + { + "epoch": 0.62, + "logps_train/chosen": -75.35189819335938, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -86.87653350830078, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.269174337387085, + "rewards_train/margins": 1.128244161605835, + "rewards_train/rejected": -2.39741849899292, + "step": 2226 + }, + { + "epoch": 0.62, + "logps_train/chosen": -53.44129943847656, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -81.5665283203125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9230359792709351, + "rewards_train/margins": 1.8857661485671997, + "rewards_train/rejected": -2.8088021278381348, + "step": 2227 + }, + { + "epoch": 0.62, + "learning_rate": 3.681570907450526e-08, + "loss": 0.3966, + "step": 2228 + }, + { + "epoch": 0.62, + "logps_train/chosen": -97.71565246582031, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -121.66749572753906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7164868116378784, + "rewards_train/margins": 2.1275824308395386, + "rewards_train/rejected": -3.844069242477417, + "step": 2228 + }, + { + "epoch": 0.62, + "logps_train/chosen": -59.47755432128906, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -37.25, + "logps_train/rejected": -65.89055633544922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8981459140777588, + "rewards_train/margins": 0.9686441421508789, + "rewards_train/rejected": -2.8667900562286377, + "step": 2229 + }, + { + "epoch": 0.62, + "learning_rate": 3.6493080522701504e-08, + "loss": 0.3777, + "step": 2230 + }, + { + "epoch": 0.62, + "logps_train/chosen": -97.23069763183594, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -103.3641128540039, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.641624927520752, + "rewards_train/margins": 1.6627554893493652, + "rewards_train/rejected": -4.304380416870117, + "step": 2230 + }, + { + "epoch": 0.62, + "logps_train/chosen": -90.7872314453125, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -83.63240051269531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.999622106552124, + "rewards_train/margins": 0.7530707120895386, + "rewards_train/rejected": -1.7526928186416626, + "step": 2231 + }, + { + "epoch": 0.62, + "learning_rate": 3.61717605146068e-08, + "loss": 0.4237, + "step": 2232 + }, + { + "epoch": 0.62, + "logps_train/chosen": -89.6351318359375, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -113.39808654785156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.1514034271240234, + "rewards_train/margins": 1.3364527225494385, + "rewards_train/rejected": -3.487856149673462, + "step": 2232 + }, + { + "epoch": 0.62, + "logps_train/chosen": -63.18183898925781, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -69.90095520019531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.025996208190918, + "rewards_train/margins": 1.44378662109375, + "rewards_train/rejected": -2.469782829284668, + "step": 2233 + }, + { + "epoch": 0.62, + "learning_rate": 3.585175101953108e-08, + "loss": 0.3815, + "step": 2234 + }, + { + "epoch": 0.62, + "logps_train/chosen": -62.70296859741211, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -79.36196899414062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5584807395935059, + "rewards_train/margins": 1.133868932723999, + "rewards_train/rejected": -2.692349672317505, + "step": 2234 + }, + { + "epoch": 0.62, + "logps_train/chosen": -47.91161346435547, + "logps_train/ref_chosen": -37.0, + "logps_train/ref_rejected": -30.75, + "logps_train/rejected": -48.11452102661133, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.0786614418029785, + "rewards_train/margins": 0.6613061428070068, + "rewards_train/rejected": -1.7399675846099854, + "step": 2235 + }, + { + "epoch": 0.62, + "learning_rate": 3.553305399875217e-08, + "loss": 0.588, + "step": 2236 + }, + { + "epoch": 0.62, + "logps_train/chosen": -60.3453483581543, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -83.7688217163086, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.336194634437561, + "rewards_train/margins": 1.9405409097671509, + "rewards_train/rejected": -3.276735544204712, + "step": 2236 + }, + { + "epoch": 0.63, + "logps_train/chosen": -81.75450134277344, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -99.93232727050781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3727158308029175, + "rewards_train/margins": 1.4931734800338745, + "rewards_train/rejected": -2.865889310836792, + "step": 2237 + }, + { + "epoch": 0.63, + "learning_rate": 3.521567140550413e-08, + "loss": 0.3125, + "step": 2238 + }, + { + "epoch": 0.63, + "logps_train/chosen": -98.9923095703125, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -110.40878295898438, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.20220947265625, + "rewards_train/margins": 1.3863375186920166, + "rewards_train/rejected": -3.5885469913482666, + "step": 2238 + }, + { + "epoch": 0.63, + "logps_train/chosen": -91.9637451171875, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -111.10643768310547, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3811404705047607, + "rewards_train/margins": 2.0095810890197754, + "rewards_train/rejected": -3.390721559524536, + "step": 2239 + }, + { + "epoch": 0.63, + "learning_rate": 3.4899605184965206e-08, + "loss": 0.476, + "step": 2240 + }, + { + "epoch": 0.63, + "logps_train/chosen": -73.35107421875, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -86.94080352783203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6032711267471313, + "rewards_train/margins": 1.0263563394546509, + "rewards_train/rejected": -2.6296274662017822, + "step": 2240 + }, + { + "epoch": 0.63, + "logps_train/chosen": -56.36665344238281, + "logps_train/ref_chosen": -43.75, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -69.5025634765625, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.2542437314987183, + "rewards_train/margins": 0.8428870439529419, + "rewards_train/rejected": -2.09713077545166, + "step": 2241 + }, + { + "epoch": 0.63, + "learning_rate": 3.45848572742456e-08, + "loss": 0.5062, + "step": 2242 + }, + { + "epoch": 0.63, + "logps_train/chosen": -73.39962768554688, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -100.27967834472656, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -2.264767646789551, + "rewards_train/margins": 0.7805824279785156, + "rewards_train/rejected": -3.0453500747680664, + "step": 2242 + }, + { + "epoch": 0.63, + "logps_train/chosen": -28.2735595703125, + "logps_train/ref_chosen": -22.75, + "logps_train/ref_rejected": -27.375, + "logps_train/rejected": -42.428741455078125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5486938953399658, + "rewards_train/margins": 0.9595122337341309, + "rewards_train/rejected": -1.5082061290740967, + "step": 2243 + }, + { + "epoch": 0.63, + "learning_rate": 3.427142960237609e-08, + "loss": 0.4894, + "step": 2244 + }, + { + "epoch": 0.63, + "logps_train/chosen": -107.04579162597656, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -122.21106719970703, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.6733052730560303, + "rewards_train/margins": 0.965355634689331, + "rewards_train/rejected": -3.6386609077453613, + "step": 2244 + }, + { + "epoch": 0.63, + "logps_train/chosen": -63.8718376159668, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -91.53450012207031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.367554783821106, + "rewards_train/margins": 2.2419499158859253, + "rewards_train/rejected": -3.6095046997070312, + "step": 2245 + }, + { + "epoch": 0.63, + "learning_rate": 3.395932409029589e-08, + "loss": 0.3596, + "step": 2246 + }, + { + "epoch": 0.63, + "logps_train/chosen": -58.545536041259766, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -67.61285400390625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.3766239881515503, + "rewards_train/margins": 0.8612238168716431, + "rewards_train/rejected": -2.2378478050231934, + "step": 2246 + }, + { + "epoch": 0.63, + "logps_train/chosen": -68.09675598144531, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -82.7144546508789, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5113364458084106, + "rewards_train/margins": 1.4550312757492065, + "rewards_train/rejected": -2.966367721557617, + "step": 2247 + }, + { + "epoch": 0.63, + "learning_rate": 3.364854265084086e-08, + "loss": 0.6393, + "step": 2248 + }, + { + "epoch": 0.63, + "logps_train/chosen": -85.12940979003906, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -113.14777374267578, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0652844905853271, + "rewards_train/margins": 1.9237120151519775, + "rewards_train/rejected": -2.9889965057373047, + "step": 2248 + }, + { + "epoch": 0.63, + "logps_train/chosen": -102.07742309570312, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -126.52885437011719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.695193290710449, + "rewards_train/margins": 2.2426528930664062, + "rewards_train/rejected": -4.9378461837768555, + "step": 2249 + }, + { + "epoch": 0.63, + "learning_rate": 3.333908718873191e-08, + "loss": 0.3309, + "step": 2250 + }, + { + "epoch": 0.63, + "logps_train/chosen": -93.69108581542969, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -98.12279510498047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4114912748336792, + "rewards_train/margins": 2.0351635217666626, + "rewards_train/rejected": -3.446654796600342, + "step": 2250 + }, + { + "epoch": 0.63, + "logps_train/chosen": -108.47863006591797, + "logps_train/ref_chosen": -99.0, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -128.9418182373047, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9377063512802124, + "rewards_train/margins": 2.1350878477096558, + "rewards_train/rejected": -3.072794198989868, + "step": 2251 + }, + { + "epoch": 0.63, + "learning_rate": 3.303095960056332e-08, + "loss": 0.2968, + "step": 2252 + }, + { + "epoch": 0.63, + "logps_train/chosen": -120.46194458007812, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -152.6195831298828, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.637600898742676, + "rewards_train/margins": 3.0729904174804688, + "rewards_train/rejected": -5.7105913162231445, + "step": 2252 + }, + { + "epoch": 0.63, + "logps_train/chosen": -95.0181655883789, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -98.46448516845703, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.1252541542053223, + "rewards_train/margins": 0.46494388580322266, + "rewards_train/rejected": -2.590198040008545, + "step": 2253 + }, + { + "epoch": 0.63, + "learning_rate": 3.2724161774791146e-08, + "loss": 0.4819, + "step": 2254 + }, + { + "epoch": 0.63, + "logps_train/chosen": -41.65080642700195, + "logps_train/ref_chosen": -30.25, + "logps_train/ref_rejected": -29.875, + "logps_train/rejected": -45.3927001953125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1373463869094849, + "rewards_train/margins": 0.4144233465194702, + "rewards_train/rejected": -1.551769733428955, + "step": 2254 + }, + { + "epoch": 0.63, + "logps_train/chosen": -95.94683074951172, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -106.769287109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0665581226348877, + "rewards_train/margins": 2.7142765522003174, + "rewards_train/rejected": -3.780834674835205, + "step": 2255 + }, + { + "epoch": 0.63, + "learning_rate": 3.241869559172136e-08, + "loss": 0.3635, + "step": 2256 + }, + { + "epoch": 0.63, + "logps_train/chosen": -70.94738006591797, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -120.03895568847656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1283314228057861, + "rewards_train/margins": 2.3808376789093018, + "rewards_train/rejected": -3.509169101715088, + "step": 2256 + }, + { + "epoch": 0.63, + "logps_train/chosen": -66.00940704345703, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -86.06417083740234, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.513245701789856, + "rewards_train/margins": 1.0896552801132202, + "rewards_train/rejected": -2.602900981903076, + "step": 2257 + }, + { + "epoch": 0.63, + "learning_rate": 3.211456292349876e-08, + "loss": 0.4352, + "step": 2258 + }, + { + "epoch": 0.63, + "logps_train/chosen": -61.45050811767578, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -93.33035278320312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3827158212661743, + "rewards_train/margins": 1.8770774602890015, + "rewards_train/rejected": -3.259793281555176, + "step": 2258 + }, + { + "epoch": 0.63, + "logps_train/chosen": -63.052833557128906, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -91.94204711914062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4310648441314697, + "rewards_train/margins": 1.5191943645477295, + "rewards_train/rejected": -2.950259208679199, + "step": 2259 + }, + { + "epoch": 0.63, + "learning_rate": 3.1811765634095265e-08, + "loss": 0.3445, + "step": 2260 + }, + { + "epoch": 0.63, + "logps_train/chosen": -48.07798385620117, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -86.21844482421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.585923433303833, + "rewards_train/margins": 2.124983310699463, + "rewards_train/rejected": -2.710906744003296, + "step": 2260 + }, + { + "epoch": 0.63, + "logps_train/chosen": -31.856260299682617, + "logps_train/ref_chosen": -28.25, + "logps_train/ref_rejected": -34.0, + "logps_train/rejected": -49.76077651977539, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.362432599067688, + "rewards_train/margins": 1.201340675354004, + "rewards_train/rejected": -1.563773274421692, + "step": 2261 + }, + { + "epoch": 0.63, + "learning_rate": 3.151030557929829e-08, + "loss": 0.2969, + "step": 2262 + }, + { + "epoch": 0.63, + "logps_train/chosen": -117.81269073486328, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -151.4298095703125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.3849802017211914, + "rewards_train/margins": 2.5712828636169434, + "rewards_train/rejected": -4.956263065338135, + "step": 2262 + }, + { + "epoch": 0.63, + "logps_train/chosen": -94.64985656738281, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -114.19483947753906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.7395955324172974, + "rewards_train/margins": 2.332231879234314, + "rewards_train/rejected": -4.071827411651611, + "step": 2263 + }, + { + "epoch": 0.63, + "learning_rate": 3.121018460669986e-08, + "loss": 0.3789, + "step": 2264 + }, + { + "epoch": 0.63, + "logps_train/chosen": -64.57235717773438, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -70.3760986328125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.2156345844268799, + "rewards_train/margins": 0.9360380172729492, + "rewards_train/rejected": -2.151672601699829, + "step": 2264 + }, + { + "epoch": 0.63, + "logps_train/chosen": -98.49313354492188, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -132.10076904296875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9899380207061768, + "rewards_train/margins": 2.3795154094696045, + "rewards_train/rejected": -4.369453430175781, + "step": 2265 + }, + { + "epoch": 0.63, + "learning_rate": 3.091140455568489e-08, + "loss": 0.3682, + "step": 2266 + }, + { + "epoch": 0.63, + "logps_train/chosen": -131.7357635498047, + "logps_train/ref_chosen": -117.0, + "logps_train/ref_rejected": -120.0, + "logps_train/rejected": -152.8302459716797, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4923266172409058, + "rewards_train/margins": 1.803197979927063, + "rewards_train/rejected": -3.2955245971679688, + "step": 2266 + }, + { + "epoch": 0.63, + "logps_train/chosen": -53.98360061645508, + "logps_train/ref_chosen": -41.25, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -67.46401977539062, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.2785238027572632, + "rewards_train/margins": 0.8058539628982544, + "rewards_train/rejected": -2.0843777656555176, + "step": 2267 + }, + { + "epoch": 0.63, + "learning_rate": 3.0613967257420074e-08, + "loss": 0.3973, + "step": 2268 + }, + { + "epoch": 0.63, + "logps_train/chosen": -65.583251953125, + "logps_train/ref_chosen": -52.0, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -86.12519073486328, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3612544536590576, + "rewards_train/margins": 0.8661086559295654, + "rewards_train/rejected": -2.227363109588623, + "step": 2268 + }, + { + "epoch": 0.63, + "logps_train/chosen": -42.63085174560547, + "logps_train/ref_chosen": -32.75, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -76.93778228759766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9796866774559021, + "rewards_train/margins": 2.458622992038727, + "rewards_train/rejected": -3.438309669494629, + "step": 2269 + }, + { + "epoch": 0.63, + "learning_rate": 3.031787453484255e-08, + "loss": 0.382, + "step": 2270 + }, + { + "epoch": 0.63, + "logps_train/chosen": -84.20780944824219, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -108.0483169555664, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3858203887939453, + "rewards_train/margins": 2.2135426998138428, + "rewards_train/rejected": -3.599363088607788, + "step": 2270 + }, + { + "epoch": 0.63, + "logps_train/chosen": -41.86841583251953, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -80.5899658203125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6313731670379639, + "rewards_train/margins": 2.1152215003967285, + "rewards_train/rejected": -2.7465946674346924, + "step": 2271 + }, + { + "epoch": 0.63, + "learning_rate": 3.002312820264893e-08, + "loss": 0.2475, + "step": 2272 + }, + { + "epoch": 0.63, + "logps_train/chosen": -97.06907653808594, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -110.30889892578125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.563938617706299, + "rewards_train/margins": 0.9935135841369629, + "rewards_train/rejected": -3.5574522018432617, + "step": 2272 + }, + { + "epoch": 0.64, + "logps_train/chosen": -83.07379150390625, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -110.21211242675781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3450748920440674, + "rewards_train/margins": 2.406604528427124, + "rewards_train/rejected": -3.7516794204711914, + "step": 2273 + }, + { + "epoch": 0.64, + "learning_rate": 2.972973006728399e-08, + "loss": 0.4133, + "step": 2274 + }, + { + "epoch": 0.64, + "logps_train/chosen": -98.0738754272461, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -142.01951599121094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5839498043060303, + "rewards_train/margins": 2.246126651763916, + "rewards_train/rejected": -3.8300764560699463, + "step": 2274 + }, + { + "epoch": 0.64, + "logps_train/chosen": -52.19287872314453, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -72.36558532714844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8724128603935242, + "rewards_train/margins": 1.1766451001167297, + "rewards_train/rejected": -2.049057960510254, + "step": 2275 + }, + { + "epoch": 0.64, + "learning_rate": 2.943768192692958e-08, + "loss": 0.2993, + "step": 2276 + }, + { + "epoch": 0.64, + "logps_train/chosen": -38.568092346191406, + "logps_train/ref_chosen": -26.875, + "logps_train/ref_rejected": -35.0, + "logps_train/rejected": -53.184696197509766, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1704810857772827, + "rewards_train/margins": 0.6430573463439941, + "rewards_train/rejected": -1.8135384321212769, + "step": 2276 + }, + { + "epoch": 0.64, + "logps_train/chosen": -100.71672058105469, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -111.54247283935547, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.1747965812683105, + "rewards_train/margins": 0.7583575248718262, + "rewards_train/rejected": -2.9331541061401367, + "step": 2277 + }, + { + "epoch": 0.64, + "learning_rate": 2.914698557149381e-08, + "loss": 0.5478, + "step": 2278 + }, + { + "epoch": 0.64, + "logps_train/chosen": -59.154361724853516, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -64.31646728515625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8205142021179199, + "rewards_train/margins": 0.7283201217651367, + "rewards_train/rejected": -1.5488343238830566, + "step": 2278 + }, + { + "epoch": 0.64, + "logps_train/chosen": -69.17066955566406, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -92.26918029785156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7604263424873352, + "rewards_train/margins": 1.5942253470420837, + "rewards_train/rejected": -2.354651689529419, + "step": 2279 + }, + { + "epoch": 0.64, + "learning_rate": 2.885764278259989e-08, + "loss": 0.4812, + "step": 2280 + }, + { + "epoch": 0.64, + "logps_train/chosen": -80.02699279785156, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -114.78129577636719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8563127517700195, + "rewards_train/margins": 2.429922580718994, + "rewards_train/rejected": -4.286235332489014, + "step": 2280 + }, + { + "epoch": 0.64, + "logps_train/chosen": -82.1644058227539, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -143.25636291503906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4097998142242432, + "rewards_train/margins": 3.8181798458099365, + "rewards_train/rejected": -5.22797966003418, + "step": 2281 + }, + { + "epoch": 0.64, + "learning_rate": 2.8569655333575388e-08, + "loss": 0.2186, + "step": 2282 + }, + { + "epoch": 0.64, + "logps_train/chosen": -85.30421447753906, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -104.56698608398438, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.151515007019043, + "rewards_train/margins": 1.4945387840270996, + "rewards_train/rejected": -2.6460537910461426, + "step": 2282 + }, + { + "epoch": 0.64, + "logps_train/chosen": -70.06663513183594, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -89.6580810546875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.3867413997650146, + "rewards_train/margins": 1.272425889968872, + "rewards_train/rejected": -2.6591672897338867, + "step": 2283 + }, + { + "epoch": 0.64, + "learning_rate": 2.8283024989441017e-08, + "loss": 0.4312, + "step": 2284 + }, + { + "epoch": 0.64, + "logps_train/chosen": -38.46830749511719, + "logps_train/ref_chosen": -28.75, + "logps_train/ref_rejected": -36.5, + "logps_train/rejected": -53.23004150390625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9751753807067871, + "rewards_train/margins": 0.7005630731582642, + "rewards_train/rejected": -1.6757384538650513, + "step": 2284 + }, + { + "epoch": 0.64, + "logps_train/chosen": -56.60980987548828, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -37.0, + "logps_train/rejected": -60.609840393066406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1805121898651123, + "rewards_train/margins": 1.1748082637786865, + "rewards_train/rejected": -2.355320453643799, + "step": 2285 + }, + { + "epoch": 0.64, + "learning_rate": 2.7997753506900284e-08, + "loss": 0.4174, + "step": 2286 + }, + { + "epoch": 0.64, + "logps_train/chosen": -61.47783279418945, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -83.35697174072266, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1370413303375244, + "rewards_train/margins": 1.5540273189544678, + "rewards_train/rejected": -2.691068649291992, + "step": 2286 + }, + { + "epoch": 0.64, + "logps_train/chosen": -87.39089965820312, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -80.72539520263672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5031523108482361, + "rewards_train/margins": 0.8396994471549988, + "rewards_train/rejected": -1.3428517580032349, + "step": 2287 + }, + { + "epoch": 0.64, + "learning_rate": 2.771384263432838e-08, + "loss": 0.4489, + "step": 2288 + }, + { + "epoch": 0.64, + "logps_train/chosen": -72.82838439941406, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -92.01563262939453, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.4658713340759277, + "rewards_train/margins": 1.5831775665283203, + "rewards_train/rejected": -3.049048900604248, + "step": 2288 + }, + { + "epoch": 0.64, + "logps_train/chosen": -95.0842514038086, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -117.84736633300781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.3092551231384277, + "rewards_train/margins": 1.1809983253479004, + "rewards_train/rejected": -3.490253448486328, + "step": 2289 + }, + { + "epoch": 0.64, + "learning_rate": 2.7431294111761538e-08, + "loss": 0.4206, + "step": 2290 + }, + { + "epoch": 0.64, + "logps_train/chosen": -35.47691345214844, + "logps_train/ref_chosen": -29.375, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -67.27271270751953, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6105333566665649, + "rewards_train/margins": 2.0660542249679565, + "rewards_train/rejected": -2.6765875816345215, + "step": 2290 + }, + { + "epoch": 0.64, + "logps_train/chosen": -85.71358489990234, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -104.96987915039062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.313155174255371, + "rewards_train/margins": 0.964301586151123, + "rewards_train/rejected": -3.277456760406494, + "step": 2291 + }, + { + "epoch": 0.64, + "learning_rate": 2.715010967088646e-08, + "loss": 0.3828, + "step": 2292 + }, + { + "epoch": 0.64, + "logps_train/chosen": -105.21588134765625, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -143.63766479492188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5610418319702148, + "rewards_train/margins": 2.458975315093994, + "rewards_train/rejected": -4.020017147064209, + "step": 2292 + }, + { + "epoch": 0.64, + "logps_train/chosen": -55.39488983154297, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -65.77626037597656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5595819354057312, + "rewards_train/margins": 1.0641135573387146, + "rewards_train/rejected": -1.6236954927444458, + "step": 2293 + }, + { + "epoch": 0.64, + "learning_rate": 2.6870291035029718e-08, + "loss": 0.3276, + "step": 2294 + }, + { + "epoch": 0.64, + "logps_train/chosen": -81.55626678466797, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -84.76385498046875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8581660985946655, + "rewards_train/margins": 0.8850165605545044, + "rewards_train/rejected": -2.74318265914917, + "step": 2294 + }, + { + "epoch": 0.64, + "logps_train/chosen": -64.18229675292969, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -75.52268981933594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6174484491348267, + "rewards_train/margins": 1.1398991346359253, + "rewards_train/rejected": -2.757347583770752, + "step": 2295 + }, + { + "epoch": 0.64, + "learning_rate": 2.659183991914696e-08, + "loss": 0.4717, + "step": 2296 + }, + { + "epoch": 0.64, + "logps_train/chosen": -68.45669555664062, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -81.3840103149414, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.764907956123352, + "rewards_train/margins": 0.494195818901062, + "rewards_train/rejected": -2.259103775024414, + "step": 2296 + }, + { + "epoch": 0.64, + "logps_train/chosen": -85.42061614990234, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -102.02793884277344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0028035640716553, + "rewards_train/margins": 2.16424822807312, + "rewards_train/rejected": -3.1670517921447754, + "step": 2297 + }, + { + "epoch": 0.64, + "learning_rate": 2.631475802981267e-08, + "loss": 0.3543, + "step": 2298 + }, + { + "epoch": 0.64, + "logps_train/chosen": -123.10936737060547, + "logps_train/ref_chosen": -95.5, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -124.00788116455078, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.76747989654541, + "rewards_train/margins": 0.8004956245422363, + "rewards_train/rejected": -3.5679755210876465, + "step": 2298 + }, + { + "epoch": 0.64, + "logps_train/chosen": -112.47196197509766, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -92.0, + "logps_train/rejected": -130.05885314941406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.2675087451934814, + "rewards_train/margins": 1.5563452243804932, + "rewards_train/rejected": -3.8238539695739746, + "step": 2299 + }, + { + "epoch": 0.64, + "learning_rate": 2.6039047065209567e-08, + "loss": 0.5269, + "step": 2300 + }, + { + "epoch": 0.64, + "logps_train/chosen": -97.63093566894531, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -108.96878814697266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.0855538845062256, + "rewards_train/margins": 1.4871065616607666, + "rewards_train/rejected": -3.572660446166992, + "step": 2300 + }, + { + "epoch": 0.64, + "logps_train/chosen": -69.18180084228516, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -56.75, + "logps_train/rejected": -91.53018188476562, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.5590007305145264, + "rewards_train/margins": 1.916283130645752, + "rewards_train/rejected": -3.4752838611602783, + "step": 2301 + }, + { + "epoch": 0.64, + "learning_rate": 2.576470871511832e-08, + "loss": 0.4149, + "step": 2302 + }, + { + "epoch": 0.64, + "logps_train/chosen": -68.69903564453125, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -76.16432189941406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1012508869171143, + "rewards_train/margins": 0.9362752437591553, + "rewards_train/rejected": -2.0375261306762695, + "step": 2302 + }, + { + "epoch": 0.64, + "logps_train/chosen": -48.255455017089844, + "logps_train/ref_chosen": -36.0, + "logps_train/ref_rejected": -45.75, + "logps_train/rejected": -77.14627075195312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.226326823234558, + "rewards_train/margins": 1.9146674871444702, + "rewards_train/rejected": -3.1409943103790283, + "step": 2303 + }, + { + "epoch": 0.64, + "learning_rate": 2.5491744660906922e-08, + "loss": 0.4538, + "step": 2304 + }, + { + "epoch": 0.64, + "logps_train/chosen": -41.622154235839844, + "logps_train/ref_chosen": -30.0, + "logps_train/ref_rejected": -42.75, + "logps_train/rejected": -65.2881851196289, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1606160402297974, + "rewards_train/margins": 1.097267985343933, + "rewards_train/rejected": -2.2578840255737305, + "step": 2304 + }, + { + "epoch": 0.64, + "logps_train/chosen": -86.71805572509766, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -108.43025207519531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6573522090911865, + "rewards_train/margins": 1.0544235706329346, + "rewards_train/rejected": -2.711775779724121, + "step": 2305 + }, + { + "epoch": 0.64, + "learning_rate": 2.522015657552068e-08, + "loss": 0.5079, + "step": 2306 + }, + { + "epoch": 0.64, + "logps_train/chosen": -79.84477233886719, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -133.5365447998047, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.478226900100708, + "rewards_train/margins": 2.4129269123077393, + "rewards_train/rejected": -3.8911538124084473, + "step": 2306 + }, + { + "epoch": 0.64, + "logps_train/chosen": -52.64592742919922, + "logps_train/ref_chosen": -43.75, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -72.86075592041016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8976004719734192, + "rewards_train/margins": 0.7845690846443176, + "rewards_train/rejected": -1.6821695566177368, + "step": 2307 + }, + { + "epoch": 0.65, + "learning_rate": 2.4949946123471928e-08, + "loss": 0.4362, + "step": 2308 + }, + { + "epoch": 0.65, + "logps_train/chosen": -112.13893127441406, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -119.48912811279297, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8115500211715698, + "rewards_train/margins": 1.4623628854751587, + "rewards_train/rejected": -3.2739129066467285, + "step": 2308 + }, + { + "epoch": 0.65, + "logps_train/chosen": -77.41275024414062, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -108.7606201171875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5029933452606201, + "rewards_train/margins": 1.9535377025604248, + "rewards_train/rejected": -3.456531047821045, + "step": 2309 + }, + { + "epoch": 0.65, + "learning_rate": 2.468111496082953e-08, + "loss": 0.393, + "step": 2310 + }, + { + "epoch": 0.65, + "logps_train/chosen": -76.18523406982422, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -86.7576904296875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3306331634521484, + "rewards_train/margins": 1.3666200637817383, + "rewards_train/rejected": -2.6972532272338867, + "step": 2310 + }, + { + "epoch": 0.65, + "logps_train/chosen": -74.35615539550781, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -85.40081787109375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.3793771266937256, + "rewards_train/margins": 0.906029462814331, + "rewards_train/rejected": -3.2854065895080566, + "step": 2311 + }, + { + "epoch": 0.65, + "learning_rate": 2.441366473520909e-08, + "loss": 0.438, + "step": 2312 + }, + { + "epoch": 0.65, + "logps_train/chosen": -84.054931640625, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -111.26505279541016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.768969178199768, + "rewards_train/margins": 2.6528486013412476, + "rewards_train/rejected": -4.421817779541016, + "step": 2312 + }, + { + "epoch": 0.65, + "logps_train/chosen": -97.89682006835938, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -88.00159454345703, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3671231269836426, + "rewards_train/margins": 1.2402629852294922, + "rewards_train/rejected": -2.6073861122131348, + "step": 2313 + }, + { + "epoch": 0.65, + "learning_rate": 2.414759708576272e-08, + "loss": 0.4338, + "step": 2314 + }, + { + "epoch": 0.65, + "logps_train/chosen": -85.98214721679688, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -108.347900390625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1966524124145508, + "rewards_train/margins": 2.472512722015381, + "rewards_train/rejected": -3.6691651344299316, + "step": 2314 + }, + { + "epoch": 0.65, + "logps_train/chosen": -101.44063568115234, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -105.71339416503906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9948445558547974, + "rewards_train/margins": 1.640947937965393, + "rewards_train/rejected": -2.6357924938201904, + "step": 2315 + }, + { + "epoch": 0.65, + "learning_rate": 2.3882913643168996e-08, + "loss": 0.2656, + "step": 2316 + }, + { + "epoch": 0.65, + "logps_train/chosen": -54.69548034667969, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -73.16571044921875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.041813850402832, + "rewards_train/margins": 1.148585557937622, + "rewards_train/rejected": -2.190399408340454, + "step": 2316 + }, + { + "epoch": 0.65, + "logps_train/chosen": -82.53874969482422, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -101.69741821289062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4109060764312744, + "rewards_train/margins": 1.3307111263275146, + "rewards_train/rejected": -2.741617202758789, + "step": 2317 + }, + { + "epoch": 0.65, + "learning_rate": 2.3619616029622786e-08, + "loss": 0.378, + "step": 2318 + }, + { + "epoch": 0.65, + "logps_train/chosen": -59.0460205078125, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -77.77056121826172, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1014772653579712, + "rewards_train/margins": 1.3862236738204956, + "rewards_train/rejected": -2.487700939178467, + "step": 2318 + }, + { + "epoch": 0.65, + "logps_train/chosen": -106.45872497558594, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -128.98675537109375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.2654037475585938, + "rewards_train/margins": 1.5731151103973389, + "rewards_train/rejected": -3.8385188579559326, + "step": 2319 + }, + { + "epoch": 0.65, + "learning_rate": 2.3357705858825626e-08, + "loss": 0.3599, + "step": 2320 + }, + { + "epoch": 0.65, + "logps_train/chosen": -54.24959182739258, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -36.75, + "logps_train/rejected": -51.97197341918945, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6323809027671814, + "rewards_train/margins": 0.8976289629936218, + "rewards_train/rejected": -1.5300098657608032, + "step": 2320 + }, + { + "epoch": 0.65, + "logps_train/chosen": -85.1417236328125, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -86.06590270996094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1801878213882446, + "rewards_train/margins": 1.1674178838729858, + "rewards_train/rejected": -2.3476057052612305, + "step": 2321 + }, + { + "epoch": 0.65, + "learning_rate": 2.3097184735975715e-08, + "loss": 0.4628, + "step": 2322 + }, + { + "epoch": 0.65, + "logps_train/chosen": -50.962223052978516, + "logps_train/ref_chosen": -41.25, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -63.62197494506836, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9616520404815674, + "rewards_train/margins": 0.4386312961578369, + "rewards_train/rejected": -1.4002833366394043, + "step": 2322 + }, + { + "epoch": 0.65, + "logps_train/chosen": -45.97882843017578, + "logps_train/ref_chosen": -37.25, + "logps_train/ref_rejected": -34.0, + "logps_train/rejected": -46.01702880859375, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.8758121132850647, + "rewards_train/margins": 0.31807857751846313, + "rewards_train/rejected": -1.1938906908035278, + "step": 2323 + }, + { + "epoch": 0.65, + "learning_rate": 2.283805425775784e-08, + "loss": 0.6033, + "step": 2324 + }, + { + "epoch": 0.65, + "logps_train/chosen": -80.25047302246094, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -84.49087524414062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.0461416244506836, + "rewards_train/margins": 1.0320472717285156, + "rewards_train/rejected": -3.078188896179199, + "step": 2324 + }, + { + "epoch": 0.65, + "logps_train/chosen": -33.4067497253418, + "logps_train/ref_chosen": -26.125, + "logps_train/ref_rejected": -34.0, + "logps_train/rejected": -52.737266540527344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7299327254295349, + "rewards_train/margins": 1.1340284943580627, + "rewards_train/rejected": -1.8639612197875977, + "step": 2325 + }, + { + "epoch": 0.65, + "learning_rate": 2.2580316012333983e-08, + "loss": 0.3952, + "step": 2326 + }, + { + "epoch": 0.65, + "logps_train/chosen": -63.234134674072266, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -86.70317077636719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8007569909095764, + "rewards_train/margins": 2.040653169155121, + "rewards_train/rejected": -2.8414101600646973, + "step": 2326 + }, + { + "epoch": 0.65, + "logps_train/chosen": -111.7594985961914, + "logps_train/ref_chosen": -98.5, + "logps_train/ref_rejected": -113.0, + "logps_train/rejected": -152.4572296142578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3231786489486694, + "rewards_train/margins": 2.63852322101593, + "rewards_train/rejected": -3.9617018699645996, + "step": 2327 + }, + { + "epoch": 0.65, + "learning_rate": 2.232397157933333e-08, + "loss": 0.2995, + "step": 2328 + }, + { + "epoch": 0.65, + "logps_train/chosen": -112.3330307006836, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -154.49700927734375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -3.4161157608032227, + "rewards_train/margins": 1.9484291076660156, + "rewards_train/rejected": -5.364544868469238, + "step": 2328 + }, + { + "epoch": 0.65, + "logps_train/chosen": -59.38545227050781, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -82.02438354492188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3947954177856445, + "rewards_train/margins": 1.3133068084716797, + "rewards_train/rejected": -2.708102226257324, + "step": 2329 + }, + { + "epoch": 0.65, + "learning_rate": 2.206902252984266e-08, + "loss": 0.3211, + "step": 2330 + }, + { + "epoch": 0.65, + "logps_train/chosen": -99.88095092773438, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -132.04046630859375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6930516958236694, + "rewards_train/margins": 2.106575608253479, + "rewards_train/rejected": -3.7996273040771484, + "step": 2330 + }, + { + "epoch": 0.65, + "logps_train/chosen": -91.5267333984375, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -124.65767669677734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4155640602111816, + "rewards_train/margins": 2.719344139099121, + "rewards_train/rejected": -4.134908199310303, + "step": 2331 + }, + { + "epoch": 0.65, + "learning_rate": 2.1815470426396615e-08, + "loss": 0.3932, + "step": 2332 + }, + { + "epoch": 0.65, + "logps_train/chosen": -46.769317626953125, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -72.10181427001953, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5208772420883179, + "rewards_train/margins": 1.517428994178772, + "rewards_train/rejected": -2.03830623626709, + "step": 2332 + }, + { + "epoch": 0.65, + "logps_train/chosen": -58.29115295410156, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -41.5, + "logps_train/rejected": -62.80207061767578, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9801895618438721, + "rewards_train/margins": 1.1518728733062744, + "rewards_train/rejected": -2.1320624351501465, + "step": 2333 + }, + { + "epoch": 0.65, + "learning_rate": 2.1563316822968332e-08, + "loss": 0.3587, + "step": 2334 + }, + { + "epoch": 0.65, + "logps_train/chosen": -116.30047607421875, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -113.5, + "logps_train/rejected": -161.61383056640625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.290984869003296, + "rewards_train/margins": 2.540710210800171, + "rewards_train/rejected": -4.831695079803467, + "step": 2334 + }, + { + "epoch": 0.65, + "logps_train/chosen": -89.87202453613281, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -99.28591918945312, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.212398052215576, + "rewards_train/margins": 1.0849440097808838, + "rewards_train/rejected": -3.29734206199646, + "step": 2335 + }, + { + "epoch": 0.65, + "learning_rate": 2.1312563264959837e-08, + "loss": 0.6399, + "step": 2336 + }, + { + "epoch": 0.65, + "logps_train/chosen": -69.43641662597656, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -90.65304565429688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.901381015777588, + "rewards_train/margins": 1.4276933670043945, + "rewards_train/rejected": -3.3290743827819824, + "step": 2336 + }, + { + "epoch": 0.65, + "logps_train/chosen": -66.98248291015625, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -120.43529510498047, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0648493766784668, + "rewards_train/margins": 2.090008020401001, + "rewards_train/rejected": -3.1548573970794678, + "step": 2337 + }, + { + "epoch": 0.65, + "learning_rate": 2.1063211289192363e-08, + "loss": 0.3742, + "step": 2338 + }, + { + "epoch": 0.65, + "logps_train/chosen": -37.199623107910156, + "logps_train/ref_chosen": -32.75, + "logps_train/ref_rejected": -34.0, + "logps_train/rejected": -48.03995132446289, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.44222787022590637, + "rewards_train/margins": 0.9666502177715302, + "rewards_train/rejected": -1.4088780879974365, + "step": 2338 + }, + { + "epoch": 0.65, + "logps_train/chosen": -120.96432495117188, + "logps_train/ref_chosen": -106.0, + "logps_train/ref_rejected": -108.5, + "logps_train/rejected": -147.673828125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.485103964805603, + "rewards_train/margins": 2.4358195066452026, + "rewards_train/rejected": -3.9209234714508057, + "step": 2339 + }, + { + "epoch": 0.65, + "learning_rate": 2.081526242389728e-08, + "loss": 0.3163, + "step": 2340 + }, + { + "epoch": 0.65, + "logps_train/chosen": -61.40909957885742, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -67.73588562011719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2549724578857422, + "rewards_train/margins": 0.6457642316818237, + "rewards_train/rejected": -1.900736689567566, + "step": 2340 + }, + { + "epoch": 0.65, + "logps_train/chosen": -65.85340881347656, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -85.075439453125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.009070873260498, + "rewards_train/margins": 1.5371451377868652, + "rewards_train/rejected": -2.5462160110473633, + "step": 2341 + }, + { + "epoch": 0.65, + "learning_rate": 2.056871818870648e-08, + "loss": 0.4376, + "step": 2342 + }, + { + "epoch": 0.65, + "logps_train/chosen": -70.17825317382812, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -70.79692077636719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4225132465362549, + "rewards_train/margins": 1.450049877166748, + "rewards_train/rejected": -1.872563123703003, + "step": 2342 + }, + { + "epoch": 0.65, + "logps_train/chosen": -89.3247299194336, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -121.79609680175781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.335378646850586, + "rewards_train/margins": 1.4989194869995117, + "rewards_train/rejected": -3.8342981338500977, + "step": 2343 + }, + { + "epoch": 0.66, + "learning_rate": 2.0323580094643215e-08, + "loss": 0.2979, + "step": 2344 + }, + { + "epoch": 0.66, + "logps_train/chosen": -104.042236328125, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -115.4459228515625, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -2.83164119720459, + "rewards_train/margins": 1.2770862579345703, + "rewards_train/rejected": -4.10872745513916, + "step": 2344 + }, + { + "epoch": 0.66, + "logps_train/chosen": -87.92204284667969, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -105.17977142333984, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.6062674522399902, + "rewards_train/margins": 1.1886749267578125, + "rewards_train/rejected": -3.7949423789978027, + "step": 2345 + }, + { + "epoch": 0.66, + "learning_rate": 2.0079849644112636e-08, + "loss": 0.4891, + "step": 2346 + }, + { + "epoch": 0.66, + "logps_train/chosen": -62.77349090576172, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -90.38088989257812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8043019771575928, + "rewards_train/margins": 1.4115214347839355, + "rewards_train/rejected": -3.2158234119415283, + "step": 2346 + }, + { + "epoch": 0.66, + "logps_train/chosen": -81.14441680908203, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -125.37225341796875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.182410717010498, + "rewards_train/margins": 1.6298151016235352, + "rewards_train/rejected": -3.812225818634033, + "step": 2347 + }, + { + "epoch": 0.66, + "learning_rate": 1.9837528330892776e-08, + "loss": 0.4502, + "step": 2348 + }, + { + "epoch": 0.66, + "logps_train/chosen": -62.4871711730957, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -54.25, + "logps_train/rejected": -81.77409362792969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9696152806282043, + "rewards_train/margins": 1.782012164592743, + "rewards_train/rejected": -2.7516274452209473, + "step": 2348 + }, + { + "epoch": 0.66, + "logps_train/chosen": -102.28630065917969, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -116.01897430419922, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.920329213142395, + "rewards_train/margins": 1.644849181175232, + "rewards_train/rejected": -3.565178394317627, + "step": 2349 + }, + { + "epoch": 0.66, + "learning_rate": 1.9596617640125463e-08, + "loss": 0.4125, + "step": 2350 + }, + { + "epoch": 0.66, + "logps_train/chosen": -144.18722534179688, + "logps_train/ref_chosen": -105.5, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -146.3778076171875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -3.8687233924865723, + "rewards_train/margins": 1.2151508331298828, + "rewards_train/rejected": -5.083874225616455, + "step": 2350 + }, + { + "epoch": 0.66, + "logps_train/chosen": -72.83572387695312, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -44.75, + "logps_train/rejected": -71.47396850585938, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -2.483181953430176, + "rewards_train/margins": 0.19907808303833008, + "rewards_train/rejected": -2.682260036468506, + "step": 2351 + }, + { + "epoch": 0.66, + "learning_rate": 1.935711904830681e-08, + "loss": 0.6555, + "step": 2352 + }, + { + "epoch": 0.66, + "logps_train/chosen": -60.31793212890625, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -80.93270874023438, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.746246337890625, + "rewards_train/margins": 1.3808135986328125, + "rewards_train/rejected": -3.1270599365234375, + "step": 2352 + }, + { + "epoch": 0.66, + "logps_train/chosen": -79.99787902832031, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -116.02961730957031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.081282377243042, + "rewards_train/margins": 2.1707513332366943, + "rewards_train/rejected": -3.2520337104797363, + "step": 2353 + }, + { + "epoch": 0.66, + "learning_rate": 1.9119034023278634e-08, + "loss": 0.3205, + "step": 2354 + }, + { + "epoch": 0.66, + "logps_train/chosen": -70.44905853271484, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -94.92647552490234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1918785572052002, + "rewards_train/margins": 1.392761468887329, + "rewards_train/rejected": -2.5846400260925293, + "step": 2354 + }, + { + "epoch": 0.66, + "logps_train/chosen": -52.95862579345703, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -82.2431640625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6912485361099243, + "rewards_train/margins": 2.4679068326950073, + "rewards_train/rejected": -3.1591553688049316, + "step": 2355 + }, + { + "epoch": 0.66, + "learning_rate": 1.8882364024219306e-08, + "loss": 0.367, + "step": 2356 + }, + { + "epoch": 0.66, + "logps_train/chosen": -85.4339599609375, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -104.77973175048828, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.910583257675171, + "rewards_train/margins": 1.3658277988433838, + "rewards_train/rejected": -3.2764110565185547, + "step": 2356 + }, + { + "epoch": 0.66, + "logps_train/chosen": -79.79117584228516, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -101.0180435180664, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8884929418563843, + "rewards_train/margins": 0.6673704385757446, + "rewards_train/rejected": -2.555863380432129, + "step": 2357 + }, + { + "epoch": 0.66, + "learning_rate": 1.864711050163456e-08, + "loss": 0.5869, + "step": 2358 + }, + { + "epoch": 0.66, + "logps_train/chosen": -47.60871124267578, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -56.75, + "logps_train/rejected": -82.54953002929688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.899396538734436, + "rewards_train/margins": 1.6905664205551147, + "rewards_train/rejected": -2.589962959289551, + "step": 2358 + }, + { + "epoch": 0.66, + "logps_train/chosen": -54.427406311035156, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -90.47377014160156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6716465950012207, + "rewards_train/margins": 2.7245097160339355, + "rewards_train/rejected": -3.3961563110351562, + "step": 2359 + }, + { + "epoch": 0.66, + "learning_rate": 1.8413274897349036e-08, + "loss": 0.3164, + "step": 2360 + }, + { + "epoch": 0.66, + "logps_train/chosen": -74.3843994140625, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -42.25, + "logps_train/rejected": -60.552337646484375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1689088344573975, + "rewards_train/margins": 0.6492154598236084, + "rewards_train/rejected": -1.8181242942810059, + "step": 2360 + }, + { + "epoch": 0.66, + "logps_train/chosen": -75.90766906738281, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -88.56928253173828, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.0731887817382812, + "rewards_train/margins": 1.0165519714355469, + "rewards_train/rejected": -3.089740753173828, + "step": 2361 + }, + { + "epoch": 0.66, + "learning_rate": 1.818085864449709e-08, + "loss": 0.4651, + "step": 2362 + }, + { + "epoch": 0.66, + "logps_train/chosen": -50.30708312988281, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -39.5, + "logps_train/rejected": -56.24629211425781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.228315830230713, + "rewards_train/margins": 0.4563720226287842, + "rewards_train/rejected": -1.684687852859497, + "step": 2362 + }, + { + "epoch": 0.66, + "logps_train/chosen": -54.42514419555664, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -39.25, + "logps_train/rejected": -63.159305572509766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1516941785812378, + "rewards_train/margins": 1.2406038045883179, + "rewards_train/rejected": -2.3922979831695557, + "step": 2363 + }, + { + "epoch": 0.66, + "learning_rate": 1.7949863167514207e-08, + "loss": 0.4729, + "step": 2364 + }, + { + "epoch": 0.66, + "logps_train/chosen": -54.35980224609375, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -62.448402404785156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5064880847930908, + "rewards_train/margins": 0.3297581672668457, + "rewards_train/rejected": -1.8362462520599365, + "step": 2364 + }, + { + "epoch": 0.66, + "logps_train/chosen": -88.7553939819336, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -130.08148193359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.585695743560791, + "rewards_train/margins": 3.8771395683288574, + "rewards_train/rejected": -5.462835311889648, + "step": 2365 + }, + { + "epoch": 0.66, + "learning_rate": 1.7720289882128092e-08, + "loss": 0.3635, + "step": 2366 + }, + { + "epoch": 0.66, + "logps_train/chosen": -115.94821166992188, + "logps_train/ref_chosen": -96.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -155.0402069091797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.00575852394104, + "rewards_train/margins": 3.8826377391815186, + "rewards_train/rejected": -5.888396263122559, + "step": 2366 + }, + { + "epoch": 0.66, + "logps_train/chosen": -30.946998596191406, + "logps_train/ref_chosen": -22.125, + "logps_train/ref_rejected": -32.25, + "logps_train/rejected": -49.723960876464844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8831765055656433, + "rewards_train/margins": 0.8765245079994202, + "rewards_train/rejected": -1.7597010135650635, + "step": 2367 + }, + { + "epoch": 0.66, + "learning_rate": 1.749214019535028e-08, + "loss": 0.2652, + "step": 2368 + }, + { + "epoch": 0.66, + "logps_train/chosen": -103.5555419921875, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -128.9398956298828, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.375476837158203, + "rewards_train/margins": 2.0962467193603516, + "rewards_train/rejected": -4.471723556518555, + "step": 2368 + }, + { + "epoch": 0.66, + "logps_train/chosen": -69.94168853759766, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -63.5, + "logps_train/rejected": -82.79621887207031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5803020000457764, + "rewards_train/margins": 1.3461954593658447, + "rewards_train/rejected": -1.926497459411621, + "step": 2369 + }, + { + "epoch": 0.66, + "learning_rate": 1.7265415505467202e-08, + "loss": 0.4227, + "step": 2370 + }, + { + "epoch": 0.66, + "logps_train/chosen": -59.2435302734375, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -57.10908508300781, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -1.37298583984375, + "rewards_train/margins": 0.28577423095703125, + "rewards_train/rejected": -1.6587600708007812, + "step": 2370 + }, + { + "epoch": 0.66, + "logps_train/chosen": -48.712791442871094, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -60.24412536621094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8595608472824097, + "rewards_train/margins": 0.6164143085479736, + "rewards_train/rejected": -1.4759751558303833, + "step": 2371 + }, + { + "epoch": 0.66, + "learning_rate": 1.7040117202031774e-08, + "loss": 0.5745, + "step": 2372 + }, + { + "epoch": 0.66, + "logps_train/chosen": -54.472015380859375, + "logps_train/ref_chosen": -43.25, + "logps_train/ref_rejected": -40.75, + "logps_train/rejected": -61.366668701171875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1296234130859375, + "rewards_train/margins": 0.9418091773986816, + "rewards_train/rejected": -2.071432590484619, + "step": 2372 + }, + { + "epoch": 0.66, + "logps_train/chosen": -51.196144104003906, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -41.75, + "logps_train/rejected": -67.7552719116211, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5069681406021118, + "rewards_train/margins": 1.1002975702285767, + "rewards_train/rejected": -2.6072657108306885, + "step": 2373 + }, + { + "epoch": 0.66, + "learning_rate": 1.6816246665854905e-08, + "loss": 0.4071, + "step": 2374 + }, + { + "epoch": 0.66, + "logps_train/chosen": -67.62458801269531, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -78.61567687988281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8679275512695312, + "rewards_train/margins": 1.0901976823806763, + "rewards_train/rejected": -1.9581252336502075, + "step": 2374 + }, + { + "epoch": 0.66, + "logps_train/chosen": -121.6823501586914, + "logps_train/ref_chosen": -101.5, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -146.6358184814453, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.001828670501709, + "rewards_train/margins": 2.202378749847412, + "rewards_train/rejected": -4.204207420349121, + "step": 2375 + }, + { + "epoch": 0.66, + "learning_rate": 1.6593805268996952e-08, + "loss": 0.3113, + "step": 2376 + }, + { + "epoch": 0.66, + "logps_train/chosen": -39.511253356933594, + "logps_train/ref_chosen": -29.875, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -58.38397979736328, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9601094722747803, + "rewards_train/margins": 0.8243824243545532, + "rewards_train/rejected": -1.7844918966293335, + "step": 2376 + }, + { + "epoch": 0.66, + "logps_train/chosen": -38.205841064453125, + "logps_train/ref_chosen": -30.25, + "logps_train/ref_rejected": -37.75, + "logps_train/rejected": -58.22001266479492, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8016389608383179, + "rewards_train/margins": 1.2359873056411743, + "rewards_train/rejected": -2.037626266479492, + "step": 2377 + }, + { + "epoch": 0.66, + "learning_rate": 1.6372794374759385e-08, + "loss": 0.5008, + "step": 2378 + }, + { + "epoch": 0.66, + "logps_train/chosen": -57.011898040771484, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -72.63736724853516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9588067531585693, + "rewards_train/margins": 1.4664533138275146, + "rewards_train/rejected": -2.425260066986084, + "step": 2378 + }, + { + "epoch": 0.66, + "logps_train/chosen": -70.20790100097656, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -102.10855102539062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9248183965682983, + "rewards_train/margins": 2.0233408212661743, + "rewards_train/rejected": -2.9481592178344727, + "step": 2379 + }, + { + "epoch": 0.67, + "learning_rate": 1.615321533767633e-08, + "loss": 0.331, + "step": 2380 + }, + { + "epoch": 0.67, + "logps_train/chosen": -83.17626953125, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -109.589111328125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.453174352645874, + "rewards_train/margins": 1.6352295875549316, + "rewards_train/rejected": -3.0884039402008057, + "step": 2380 + }, + { + "epoch": 0.67, + "logps_train/chosen": -83.79940795898438, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -115.67716979980469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8533779382705688, + "rewards_train/margins": 3.0080899000167847, + "rewards_train/rejected": -4.8614678382873535, + "step": 2381 + }, + { + "epoch": 0.67, + "learning_rate": 1.5935069503506317e-08, + "loss": 0.3274, + "step": 2382 + }, + { + "epoch": 0.67, + "logps_train/chosen": -64.24397277832031, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -39.5, + "logps_train/rejected": -56.80262756347656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1977365016937256, + "rewards_train/margins": 0.5394594669342041, + "rewards_train/rejected": -1.7371959686279297, + "step": 2382 + }, + { + "epoch": 0.67, + "logps_train/chosen": -86.50545501708984, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -95.79205322265625, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -2.671053647994995, + "rewards_train/margins": 0.4847145080566406, + "rewards_train/rejected": -3.1557681560516357, + "step": 2383 + }, + { + "epoch": 0.67, + "learning_rate": 1.5718358209224153e-08, + "loss": 0.6025, + "step": 2384 + }, + { + "epoch": 0.67, + "logps_train/chosen": -78.83937072753906, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -109.42338562011719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8662126064300537, + "rewards_train/margins": 1.6802761554718018, + "rewards_train/rejected": -3.5464887619018555, + "step": 2384 + }, + { + "epoch": 0.67, + "logps_train/chosen": -74.46876525878906, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -86.62158203125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.0051772594451904, + "rewards_train/margins": 1.1073722839355469, + "rewards_train/rejected": -3.1125495433807373, + "step": 2385 + }, + { + "epoch": 0.67, + "learning_rate": 1.5503082783012546e-08, + "loss": 0.3948, + "step": 2386 + }, + { + "epoch": 0.67, + "logps_train/chosen": -72.53384399414062, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -88.03909301757812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.4690089225769043, + "rewards_train/margins": 0.7313849925994873, + "rewards_train/rejected": -3.2003939151763916, + "step": 2386 + }, + { + "epoch": 0.67, + "logps_train/chosen": -74.85980987548828, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -111.21375274658203, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2601996660232544, + "rewards_train/margins": 1.556097388267517, + "rewards_train/rejected": -2.8162970542907715, + "step": 2387 + }, + { + "epoch": 0.67, + "learning_rate": 1.5289244544254036e-08, + "loss": 0.4011, + "step": 2388 + }, + { + "epoch": 0.67, + "logps_train/chosen": -71.07477569580078, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -92.66500854492188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3321850299835205, + "rewards_train/margins": 1.4595105648040771, + "rewards_train/rejected": -2.7916955947875977, + "step": 2388 + }, + { + "epoch": 0.67, + "logps_train/chosen": -53.74809265136719, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -77.03923797607422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9548389911651611, + "rewards_train/margins": 0.9410768747329712, + "rewards_train/rejected": -1.8959158658981323, + "step": 2389 + }, + { + "epoch": 0.67, + "learning_rate": 1.507684480352292e-08, + "loss": 0.3496, + "step": 2390 + }, + { + "epoch": 0.67, + "logps_train/chosen": -39.05058288574219, + "logps_train/ref_chosen": -21.0, + "logps_train/ref_rejected": -23.25, + "logps_train/rejected": -39.60066223144531, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.7998826503753662, + "rewards_train/margins": -0.16611015796661377, + "rewards_train/rejected": -1.6337724924087524, + "step": 2390 + }, + { + "epoch": 0.67, + "logps_train/chosen": -91.94862365722656, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -63.25, + "logps_train/rejected": -82.77864074707031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6413460969924927, + "rewards_train/margins": 0.300189733505249, + "rewards_train/rejected": -1.9415358304977417, + "step": 2391 + }, + { + "epoch": 0.67, + "learning_rate": 1.4865884862577254e-08, + "loss": 0.7812, + "step": 2392 + }, + { + "epoch": 0.67, + "logps_train/chosen": -36.97472381591797, + "logps_train/ref_chosen": -27.875, + "logps_train/ref_rejected": -35.0, + "logps_train/rejected": -52.20393371582031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9113396406173706, + "rewards_train/margins": 0.8012415170669556, + "rewards_train/rejected": -1.7125811576843262, + "step": 2392 + }, + { + "epoch": 0.67, + "logps_train/chosen": -51.82833480834961, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -42.0, + "logps_train/rejected": -61.646697998046875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1044155359268188, + "rewards_train/margins": 0.8557621240615845, + "rewards_train/rejected": -1.9601776599884033, + "step": 2393 + }, + { + "epoch": 0.67, + "learning_rate": 1.4656366014350746e-08, + "loss": 0.4348, + "step": 2394 + }, + { + "epoch": 0.67, + "logps_train/chosen": -95.79158020019531, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -121.02859497070312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.239314556121826, + "rewards_train/margins": 2.274287223815918, + "rewards_train/rejected": -4.513601779937744, + "step": 2394 + }, + { + "epoch": 0.67, + "logps_train/chosen": -84.07028198242188, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -124.91248321533203, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7320280075073242, + "rewards_train/margins": 2.346329689025879, + "rewards_train/rejected": -4.078357696533203, + "step": 2395 + }, + { + "epoch": 0.67, + "learning_rate": 1.4448289542944997e-08, + "loss": 0.3491, + "step": 2396 + }, + { + "epoch": 0.67, + "logps_train/chosen": -97.93675231933594, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -106.16511535644531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.622971773147583, + "rewards_train/margins": 1.857015609741211, + "rewards_train/rejected": -3.479987382888794, + "step": 2396 + }, + { + "epoch": 0.67, + "logps_train/chosen": -45.19966125488281, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -25.125, + "logps_train/rejected": -45.684791564941406, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9754351377487183, + "rewards_train/margins": 1.0760518312454224, + "rewards_train/rejected": -2.0514869689941406, + "step": 2397 + }, + { + "epoch": 0.67, + "learning_rate": 1.4241656723621515e-08, + "loss": 0.4637, + "step": 2398 + }, + { + "epoch": 0.67, + "logps_train/chosen": -87.3427734375, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -126.21279907226562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.769433617591858, + "rewards_train/margins": 2.011221766471863, + "rewards_train/rejected": -3.7806553840637207, + "step": 2398 + }, + { + "epoch": 0.67, + "logps_train/chosen": -100.74357604980469, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -136.00559997558594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3298254013061523, + "rewards_train/margins": 1.7863597869873047, + "rewards_train/rejected": -3.116185188293457, + "step": 2399 + }, + { + "epoch": 0.67, + "learning_rate": 1.4036468822793967e-08, + "loss": 0.3085, + "step": 2400 + }, + { + "epoch": 0.67, + "logps_train/chosen": -66.79975891113281, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -90.70407104492188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5506794452667236, + "rewards_train/margins": 1.7453136444091797, + "rewards_train/rejected": -2.2959930896759033, + "step": 2400 + }, + { + "epoch": 0.67, + "logps_train/chosen": -83.78105163574219, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -114.36720275878906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.472440719604492, + "rewards_train/margins": 1.049436092376709, + "rewards_train/rejected": -3.521876811981201, + "step": 2401 + }, + { + "epoch": 0.67, + "learning_rate": 1.383272709802033e-08, + "loss": 0.3326, + "step": 2402 + }, + { + "epoch": 0.67, + "logps_train/chosen": -73.37391662597656, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -89.45342254638672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0397354364395142, + "rewards_train/margins": 2.0501378774642944, + "rewards_train/rejected": -3.0898733139038086, + "step": 2402 + }, + { + "epoch": 0.67, + "logps_train/chosen": -69.87894439697266, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -107.5324935913086, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4334388971328735, + "rewards_train/margins": 1.7757552862167358, + "rewards_train/rejected": -3.2091941833496094, + "step": 2403 + }, + { + "epoch": 0.67, + "learning_rate": 1.3630432797995251e-08, + "loss": 0.3009, + "step": 2404 + }, + { + "epoch": 0.67, + "logps_train/chosen": -63.660797119140625, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -98.97848510742188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.012954831123352, + "rewards_train/margins": 1.8061822652816772, + "rewards_train/rejected": -2.8191370964050293, + "step": 2404 + }, + { + "epoch": 0.67, + "logps_train/chosen": -51.920997619628906, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -65.692138671875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9235451817512512, + "rewards_train/margins": 1.3056291937828064, + "rewards_train/rejected": -2.2291743755340576, + "step": 2405 + }, + { + "epoch": 0.67, + "learning_rate": 1.34295871625425e-08, + "loss": 0.3286, + "step": 2406 + }, + { + "epoch": 0.67, + "logps_train/chosen": -67.31393432617188, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -83.87126922607422, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8841273784637451, + "rewards_train/margins": 0.9834682941436768, + "rewards_train/rejected": -2.867595672607422, + "step": 2406 + }, + { + "epoch": 0.67, + "logps_train/chosen": -101.5591049194336, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -166.26010131835938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.001710891723633, + "rewards_train/margins": 3.502619743347168, + "rewards_train/rejected": -5.504330635070801, + "step": 2407 + }, + { + "epoch": 0.67, + "learning_rate": 1.3230191422607063e-08, + "loss": 0.2789, + "step": 2408 + }, + { + "epoch": 0.67, + "logps_train/chosen": -67.17030334472656, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -75.45077514648438, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6024301052093506, + "rewards_train/margins": 1.0479693412780762, + "rewards_train/rejected": -2.6503994464874268, + "step": 2408 + }, + { + "epoch": 0.67, + "logps_train/chosen": -59.81527328491211, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -101.85376739501953, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2873866558074951, + "rewards_train/margins": 1.9499430656433105, + "rewards_train/rejected": -3.2373297214508057, + "step": 2409 + }, + { + "epoch": 0.67, + "learning_rate": 1.303224680024792e-08, + "loss": 0.4324, + "step": 2410 + }, + { + "epoch": 0.67, + "logps_train/chosen": -64.89202880859375, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -75.514892578125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9850528240203857, + "rewards_train/margins": 1.0158016681671143, + "rewards_train/rejected": -3.0008544921875, + "step": 2410 + }, + { + "epoch": 0.67, + "logps_train/chosen": -103.09857940673828, + "logps_train/ref_chosen": -95.5, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -116.10062408447266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7422800064086914, + "rewards_train/margins": 2.121102809906006, + "rewards_train/rejected": -2.8633828163146973, + "step": 2411 + }, + { + "epoch": 0.67, + "learning_rate": 1.2835754508630392e-08, + "loss": 0.3235, + "step": 2412 + }, + { + "epoch": 0.67, + "logps_train/chosen": -74.21398162841797, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -96.1976318359375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8448357582092285, + "rewards_train/margins": 1.417896032333374, + "rewards_train/rejected": -2.2627317905426025, + "step": 2412 + }, + { + "epoch": 0.67, + "logps_train/chosen": -97.18973541259766, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -138.10935974121094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8095985651016235, + "rewards_train/margins": 2.70250928401947, + "rewards_train/rejected": -4.512107849121094, + "step": 2413 + }, + { + "epoch": 0.67, + "learning_rate": 1.2640715752018777e-08, + "loss": 0.2717, + "step": 2414 + }, + { + "epoch": 0.67, + "logps_train/chosen": -58.3160400390625, + "logps_train/ref_chosen": -47.25, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -62.888755798339844, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.1085573434829712, + "rewards_train/margins": 0.2411586046218872, + "rewards_train/rejected": -1.3497159481048584, + "step": 2414 + }, + { + "epoch": 0.67, + "logps_train/chosen": -86.14068603515625, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -99.3829345703125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3257873058319092, + "rewards_train/margins": 1.9703190326690674, + "rewards_train/rejected": -3.2961063385009766, + "step": 2415 + }, + { + "epoch": 0.68, + "learning_rate": 1.2447131725768806e-08, + "loss": 0.5444, + "step": 2416 + }, + { + "epoch": 0.68, + "logps_train/chosen": -90.866455078125, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -91.5, + "logps_train/rejected": -126.46876525878906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9405517578125, + "rewards_train/margins": 2.5719494819641113, + "rewards_train/rejected": -3.5125012397766113, + "step": 2416 + }, + { + "epoch": 0.68, + "logps_train/chosen": -37.76715087890625, + "logps_train/ref_chosen": -22.875, + "logps_train/ref_rejected": -35.5, + "logps_train/rejected": -61.101043701171875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4909729957580566, + "rewards_train/margins": 1.0597565174102783, + "rewards_train/rejected": -2.550729513168335, + "step": 2417 + }, + { + "epoch": 0.68, + "learning_rate": 1.2255003616320592e-08, + "loss": 0.3856, + "step": 2418 + }, + { + "epoch": 0.68, + "logps_train/chosen": -55.25697326660156, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -91.46592712402344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0396625995635986, + "rewards_train/margins": 1.8676722049713135, + "rewards_train/rejected": -2.907334804534912, + "step": 2418 + }, + { + "epoch": 0.68, + "logps_train/chosen": -82.95384216308594, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -103.22655487060547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2568106651306152, + "rewards_train/margins": 1.695141315460205, + "rewards_train/rejected": -3.9519519805908203, + "step": 2419 + }, + { + "epoch": 0.68, + "learning_rate": 1.2064332601191163e-08, + "loss": 0.3177, + "step": 2420 + }, + { + "epoch": 0.68, + "logps_train/chosen": -78.79591369628906, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -100.02770233154297, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.1458516120910645, + "rewards_train/margins": 1.7836523056030273, + "rewards_train/rejected": -3.929503917694092, + "step": 2420 + }, + { + "epoch": 0.68, + "logps_train/chosen": -65.70633697509766, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -70.92654418945312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6659465432167053, + "rewards_train/margins": 0.7820789217948914, + "rewards_train/rejected": -1.4480254650115967, + "step": 2421 + }, + { + "epoch": 0.68, + "learning_rate": 1.187511984896719e-08, + "loss": 0.4298, + "step": 2422 + }, + { + "epoch": 0.68, + "logps_train/chosen": -67.92259216308594, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -99.214599609375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2611314058303833, + "rewards_train/margins": 2.0294686555862427, + "rewards_train/rejected": -3.290600061416626, + "step": 2422 + }, + { + "epoch": 0.68, + "logps_train/chosen": -51.32395935058594, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -34.25, + "logps_train/rejected": -56.72447967529297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2930402755737305, + "rewards_train/margins": 0.9630014896392822, + "rewards_train/rejected": -2.2560417652130127, + "step": 2423 + }, + { + "epoch": 0.68, + "learning_rate": 1.1687366519298053e-08, + "loss": 0.3201, + "step": 2424 + }, + { + "epoch": 0.68, + "logps_train/chosen": -87.52438354492188, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -100.53943634033203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.0840210914611816, + "rewards_train/margins": 0.8536046743392944, + "rewards_train/rejected": -1.937625765800476, + "step": 2424 + }, + { + "epoch": 0.68, + "logps_train/chosen": -78.11419677734375, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -99.64435577392578, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3555598258972168, + "rewards_train/margins": 1.7846570014953613, + "rewards_train/rejected": -3.140216827392578, + "step": 2425 + }, + { + "epoch": 0.68, + "learning_rate": 1.1501073762888559e-08, + "loss": 0.3919, + "step": 2426 + }, + { + "epoch": 0.68, + "logps_train/chosen": -35.351890563964844, + "logps_train/ref_chosen": -31.625, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -63.67839813232422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.36946624517440796, + "rewards_train/margins": 1.6733737587928772, + "rewards_train/rejected": -2.042840003967285, + "step": 2426 + }, + { + "epoch": 0.68, + "logps_train/chosen": -62.18972396850586, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -71.81778717041016, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.101003646850586, + "rewards_train/margins": 1.4165170192718506, + "rewards_train/rejected": -2.5175206661224365, + "step": 2427 + }, + { + "epoch": 0.68, + "learning_rate": 1.1316242721491954e-08, + "loss": 0.351, + "step": 2428 + }, + { + "epoch": 0.68, + "logps_train/chosen": -75.05734252929688, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -101.861083984375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6071021556854248, + "rewards_train/margins": 2.1075222492218018, + "rewards_train/rejected": -3.7146244049072266, + "step": 2428 + }, + { + "epoch": 0.68, + "logps_train/chosen": -88.50202941894531, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -110.95804595947266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9464918375015259, + "rewards_train/margins": 1.4665004014968872, + "rewards_train/rejected": -3.412992238998413, + "step": 2429 + }, + { + "epoch": 0.68, + "learning_rate": 1.113287452790282e-08, + "loss": 0.3863, + "step": 2430 + }, + { + "epoch": 0.68, + "logps_train/chosen": -90.89009094238281, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -123.1275863647461, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.7041456699371338, + "rewards_train/margins": 1.8570501804351807, + "rewards_train/rejected": -3.5611958503723145, + "step": 2430 + }, + { + "epoch": 0.68, + "logps_train/chosen": -96.65910339355469, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -109.24962615966797, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.3877854347229004, + "rewards_train/margins": 1.326240062713623, + "rewards_train/rejected": -3.7140254974365234, + "step": 2431 + }, + { + "epoch": 0.68, + "learning_rate": 1.0950970305950352e-08, + "loss": 0.351, + "step": 2432 + }, + { + "epoch": 0.68, + "logps_train/chosen": -60.00078201293945, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -34.75, + "logps_train/rejected": -64.7528305053711, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0823049545288086, + "rewards_train/margins": 1.9136815071105957, + "rewards_train/rejected": -2.9959864616394043, + "step": 2432 + }, + { + "epoch": 0.68, + "logps_train/chosen": -63.58074951171875, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -80.39212036132812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.7463560104370117, + "rewards_train/margins": 1.0858254432678223, + "rewards_train/rejected": -2.832181453704834, + "step": 2433 + }, + { + "epoch": 0.68, + "learning_rate": 1.0770531170491287e-08, + "loss": 0.3667, + "step": 2434 + }, + { + "epoch": 0.68, + "logps_train/chosen": -91.77403259277344, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -108.3724365234375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6567487716674805, + "rewards_train/margins": 2.376051902770996, + "rewards_train/rejected": -4.032800674438477, + "step": 2434 + }, + { + "epoch": 0.68, + "logps_train/chosen": -78.766845703125, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -93.99950408935547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.624145746231079, + "rewards_train/margins": 0.8383047580718994, + "rewards_train/rejected": -2.4624505043029785, + "step": 2435 + }, + { + "epoch": 0.68, + "learning_rate": 1.059155822740304e-08, + "loss": 0.31, + "step": 2436 + }, + { + "epoch": 0.68, + "logps_train/chosen": -95.65286254882812, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -130.82110595703125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.56919264793396, + "rewards_train/margins": 2.257840633392334, + "rewards_train/rejected": -3.827033281326294, + "step": 2436 + }, + { + "epoch": 0.68, + "logps_train/chosen": -99.4281005859375, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -127.45632934570312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1505249738693237, + "rewards_train/margins": 1.4359287023544312, + "rewards_train/rejected": -2.586453676223755, + "step": 2437 + }, + { + "epoch": 0.68, + "learning_rate": 1.0414052573577136e-08, + "loss": 0.3998, + "step": 2438 + }, + { + "epoch": 0.68, + "logps_train/chosen": -40.05958557128906, + "logps_train/ref_chosen": -31.5, + "logps_train/ref_rejected": -30.625, + "logps_train/rejected": -46.46166229248047, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8520525693893433, + "rewards_train/margins": 0.7300509214401245, + "rewards_train/rejected": -1.5821034908294678, + "step": 2438 + }, + { + "epoch": 0.68, + "logps_train/chosen": -96.32131958007812, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -115.99043273925781, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.13877272605896, + "rewards_train/margins": 1.467301368713379, + "rewards_train/rejected": -2.606074094772339, + "step": 2439 + }, + { + "epoch": 0.68, + "learning_rate": 1.0238015296912345e-08, + "loss": 0.4685, + "step": 2440 + }, + { + "epoch": 0.68, + "logps_train/chosen": -75.55229187011719, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -109.10525512695312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9167520999908447, + "rewards_train/margins": 2.6869375705718994, + "rewards_train/rejected": -3.603689670562744, + "step": 2440 + }, + { + "epoch": 0.68, + "logps_train/chosen": -57.32146072387695, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -102.18501281738281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.8899586200714111, + "rewards_train/margins": 1.5357694625854492, + "rewards_train/rejected": -2.4257280826568604, + "step": 2441 + }, + { + "epoch": 0.68, + "learning_rate": 1.0063447476307886e-08, + "loss": 0.3098, + "step": 2442 + }, + { + "epoch": 0.68, + "logps_train/chosen": -132.6461181640625, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -125.29907989501953, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -3.954845666885376, + "rewards_train/margins": 0.7719371318817139, + "rewards_train/rejected": -4.72678279876709, + "step": 2442 + }, + { + "epoch": 0.68, + "logps_train/chosen": -67.41500854492188, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -85.75605773925781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4844694137573242, + "rewards_train/margins": 1.384300708770752, + "rewards_train/rejected": -2.868770122528076, + "step": 2443 + }, + { + "epoch": 0.68, + "learning_rate": 9.890350181657126e-09, + "loss": 0.4678, + "step": 2444 + }, + { + "epoch": 0.68, + "logps_train/chosen": -85.39179992675781, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -110.13862609863281, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.985273838043213, + "rewards_train/margins": 1.6444087028503418, + "rewards_train/rejected": -3.6296825408935547, + "step": 2444 + }, + { + "epoch": 0.68, + "logps_train/chosen": -107.15191650390625, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -131.0458221435547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.5230047702789307, + "rewards_train/margins": 1.701108694076538, + "rewards_train/rejected": -4.224113464355469, + "step": 2445 + }, + { + "epoch": 0.68, + "learning_rate": 9.718724473840784e-09, + "loss": 0.3512, + "step": 2446 + }, + { + "epoch": 0.68, + "logps_train/chosen": -79.40707397460938, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -104.53474426269531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0442235469818115, + "rewards_train/margins": 2.2678444385528564, + "rewards_train/rejected": -3.312067985534668, + "step": 2446 + }, + { + "epoch": 0.68, + "logps_train/chosen": -70.65157318115234, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -95.6650161743164, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -2.3841028213500977, + "rewards_train/margins": 1.440992832183838, + "rewards_train/rejected": -3.8250956535339355, + "step": 2447 + }, + { + "epoch": 0.68, + "learning_rate": 9.54857140472054e-09, + "loss": 0.5246, + "step": 2448 + }, + { + "epoch": 0.68, + "logps_train/chosen": -113.29756164550781, + "logps_train/ref_chosen": -101.5, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -149.53109741210938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.168135166168213, + "rewards_train/margins": 2.3838024139404297, + "rewards_train/rejected": -3.5519375801086426, + "step": 2448 + }, + { + "epoch": 0.68, + "logps_train/chosen": -119.50375366210938, + "logps_train/ref_chosen": -96.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -125.91334533691406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.349594831466675, + "rewards_train/margins": 1.2878334522247314, + "rewards_train/rejected": -3.6374282836914062, + "step": 2449 + }, + { + "epoch": 0.68, + "learning_rate": 9.379892017132463e-09, + "loss": 0.321, + "step": 2450 + }, + { + "epoch": 0.68, + "logps_train/chosen": -59.321170806884766, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -89.38465881347656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1541874408721924, + "rewards_train/margins": 1.7940442562103271, + "rewards_train/rejected": -2.9482316970825195, + "step": 2450 + }, + { + "epoch": 0.69, + "logps_train/chosen": -82.37020111083984, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -98.11495971679688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5903408527374268, + "rewards_train/margins": 1.909045934677124, + "rewards_train/rejected": -3.499386787414551, + "step": 2451 + }, + { + "epoch": 0.69, + "learning_rate": 9.21268734488076e-09, + "loss": 0.3678, + "step": 2452 + }, + { + "epoch": 0.69, + "logps_train/chosen": -66.78101348876953, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -94.06474304199219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3488047122955322, + "rewards_train/margins": 1.8186075687408447, + "rewards_train/rejected": -3.167412281036377, + "step": 2452 + }, + { + "epoch": 0.69, + "logps_train/chosen": -47.781639099121094, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -75.32060241699219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5617574453353882, + "rewards_train/margins": 1.2742092609405518, + "rewards_train/rejected": -1.83596670627594, + "step": 2453 + }, + { + "epoch": 0.69, + "learning_rate": 9.046958412731454e-09, + "loss": 0.3313, + "step": 2454 + }, + { + "epoch": 0.69, + "logps_train/chosen": -113.29007720947266, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -108.9620590209961, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -2.6899452209472656, + "rewards_train/margins": 0.1687610149383545, + "rewards_train/rejected": -2.85870623588562, + "step": 2454 + }, + { + "epoch": 0.69, + "logps_train/chosen": -74.42170715332031, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -102.59025573730469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.8433425426483154, + "rewards_train/margins": 2.214120626449585, + "rewards_train/rejected": -4.0574631690979, + "step": 2455 + }, + { + "epoch": 0.69, + "learning_rate": 8.882706236405885e-09, + "loss": 0.4355, + "step": 2456 + }, + { + "epoch": 0.69, + "logps_train/chosen": -76.48384857177734, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -108.02183532714844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.20658802986145, + "rewards_train/margins": 2.171766996383667, + "rewards_train/rejected": -4.378355026245117, + "step": 2456 + }, + { + "epoch": 0.69, + "logps_train/chosen": -95.93270874023438, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -113.05723571777344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6924901008605957, + "rewards_train/margins": 0.9589362144470215, + "rewards_train/rejected": -2.651426315307617, + "step": 2457 + }, + { + "epoch": 0.69, + "learning_rate": 8.719931822574716e-09, + "loss": 0.4546, + "step": 2458 + }, + { + "epoch": 0.69, + "logps_train/chosen": -68.79032897949219, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -36.25, + "logps_train/rejected": -66.95207977294922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4576951265335083, + "rewards_train/margins": 1.623987078666687, + "rewards_train/rejected": -3.0816822052001953, + "step": 2458 + }, + { + "epoch": 0.69, + "logps_train/chosen": -54.415740966796875, + "logps_train/ref_chosen": -41.75, + "logps_train/ref_rejected": -41.75, + "logps_train/rejected": -59.605167388916016, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2603728771209717, + "rewards_train/margins": 0.5190891027450562, + "rewards_train/rejected": -1.7794619798660278, + "step": 2459 + }, + { + "epoch": 0.69, + "learning_rate": 8.558636168851745e-09, + "loss": 0.3846, + "step": 2460 + }, + { + "epoch": 0.69, + "logps_train/chosen": -109.34850311279297, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -116.5679931640625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -3.1129746437072754, + "rewards_train/margins": 0.7996840476989746, + "rewards_train/rejected": -3.91265869140625, + "step": 2460 + }, + { + "epoch": 0.69, + "logps_train/chosen": -75.21438598632812, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -94.56011962890625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4101901054382324, + "rewards_train/margins": 1.4834990501403809, + "rewards_train/rejected": -2.8936891555786133, + "step": 2461 + }, + { + "epoch": 0.69, + "learning_rate": 8.39882026378766e-09, + "loss": 0.4584, + "step": 2462 + }, + { + "epoch": 0.69, + "logps_train/chosen": -61.35057830810547, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -69.88313293457031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1280266046524048, + "rewards_train/margins": 1.4540365934371948, + "rewards_train/rejected": -2.5820631980895996, + "step": 2462 + }, + { + "epoch": 0.69, + "logps_train/chosen": -76.86917114257812, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -107.19986724853516, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3388700485229492, + "rewards_train/margins": 1.6354131698608398, + "rewards_train/rejected": -2.974283218383789, + "step": 2463 + }, + { + "epoch": 0.69, + "learning_rate": 8.240485086864007e-09, + "loss": 0.3694, + "step": 2464 + }, + { + "epoch": 0.69, + "logps_train/chosen": -63.74079895019531, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -80.04426574707031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5031814575195312, + "rewards_train/margins": 0.8016355037689209, + "rewards_train/rejected": -2.304816961288452, + "step": 2464 + }, + { + "epoch": 0.69, + "logps_train/chosen": -76.92068481445312, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -90.85658264160156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2391629219055176, + "rewards_train/margins": 1.473839282989502, + "rewards_train/rejected": -2.7130022048950195, + "step": 2465 + }, + { + "epoch": 0.69, + "learning_rate": 8.083631608487268e-09, + "loss": 0.482, + "step": 2466 + }, + { + "epoch": 0.69, + "logps_train/chosen": -52.597015380859375, + "logps_train/ref_chosen": -47.25, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -70.30706787109375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5288236141204834, + "rewards_train/margins": 1.6440527439117432, + "rewards_train/rejected": -2.1728763580322266, + "step": 2466 + }, + { + "epoch": 0.69, + "logps_train/chosen": -41.66633605957031, + "logps_train/ref_chosen": -29.75, + "logps_train/ref_rejected": -29.5, + "logps_train/rejected": -50.57468032836914, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1877273321151733, + "rewards_train/margins": 0.9138811826705933, + "rewards_train/rejected": -2.1016085147857666, + "step": 2467 + }, + { + "epoch": 0.69, + "learning_rate": 7.928260789982932e-09, + "loss": 0.4205, + "step": 2468 + }, + { + "epoch": 0.69, + "logps_train/chosen": -64.90805053710938, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -81.55690002441406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.2579922676086426, + "rewards_train/margins": 0.8851981163024902, + "rewards_train/rejected": -2.143190383911133, + "step": 2468 + }, + { + "epoch": 0.69, + "logps_train/chosen": -62.74514389038086, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -104.23786163330078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.254934549331665, + "rewards_train/margins": 2.454007863998413, + "rewards_train/rejected": -3.708942413330078, + "step": 2469 + }, + { + "epoch": 0.69, + "learning_rate": 7.774373583589455e-09, + "loss": 0.3021, + "step": 2470 + }, + { + "epoch": 0.69, + "logps_train/chosen": -90.30842590332031, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -69.93529510498047, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.580256700515747, + "rewards_train/margins": 0.21385860443115234, + "rewards_train/rejected": -1.7941153049468994, + "step": 2470 + }, + { + "epoch": 0.69, + "logps_train/chosen": -91.61563110351562, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -115.80494689941406, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8113679885864258, + "rewards_train/margins": 1.9533071517944336, + "rewards_train/rejected": -3.7646751403808594, + "step": 2471 + }, + { + "epoch": 0.69, + "learning_rate": 7.621970932452536e-09, + "loss": 0.6114, + "step": 2472 + }, + { + "epoch": 0.69, + "logps_train/chosen": -83.10690307617188, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -117.90853881835938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5819799900054932, + "rewards_train/margins": 2.6479365825653076, + "rewards_train/rejected": -4.229916572570801, + "step": 2472 + }, + { + "epoch": 0.69, + "logps_train/chosen": -51.63884353637695, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -74.3192138671875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9325367212295532, + "rewards_train/margins": 1.8177446126937866, + "rewards_train/rejected": -2.75028133392334, + "step": 2473 + }, + { + "epoch": 0.69, + "learning_rate": 7.47105377061935e-09, + "loss": 0.2614, + "step": 2474 + }, + { + "epoch": 0.69, + "logps_train/chosen": -80.03575897216797, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -107.07798767089844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.789513111114502, + "rewards_train/margins": 2.2671141624450684, + "rewards_train/rejected": -4.05662727355957, + "step": 2474 + }, + { + "epoch": 0.69, + "logps_train/chosen": -33.94361877441406, + "logps_train/ref_chosen": -25.5, + "logps_train/ref_rejected": -26.625, + "logps_train/rejected": -42.953208923339844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8385021686553955, + "rewards_train/margins": 0.7933425903320312, + "rewards_train/rejected": -1.6318447589874268, + "step": 2475 + }, + { + "epoch": 0.69, + "learning_rate": 7.321623023032797e-09, + "loss": 0.3948, + "step": 2476 + }, + { + "epoch": 0.69, + "logps_train/chosen": -101.93995666503906, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -113.46424102783203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.4521994590759277, + "rewards_train/margins": 0.7137560844421387, + "rewards_train/rejected": -3.1659555435180664, + "step": 2476 + }, + { + "epoch": 0.69, + "logps_train/chosen": -117.43596649169922, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -140.4024658203125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.3443779945373535, + "rewards_train/margins": 2.070868492126465, + "rewards_train/rejected": -4.415246486663818, + "step": 2477 + }, + { + "epoch": 0.69, + "learning_rate": 7.1736796055257285e-09, + "loss": 0.3932, + "step": 2478 + }, + { + "epoch": 0.69, + "logps_train/chosen": -68.93977355957031, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -88.85133361816406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.894759178161621, + "rewards_train/margins": 0.8424253463745117, + "rewards_train/rejected": -2.737184524536133, + "step": 2478 + }, + { + "epoch": 0.69, + "logps_train/chosen": -62.757423400878906, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -74.76496887207031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.0546488761901855, + "rewards_train/margins": 0.9101297855377197, + "rewards_train/rejected": -2.9647786617279053, + "step": 2479 + }, + { + "epoch": 0.69, + "learning_rate": 7.027224424815542e-09, + "loss": 0.4252, + "step": 2480 + }, + { + "epoch": 0.69, + "logps_train/chosen": -108.4504165649414, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -133.88917541503906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.838791847229004, + "rewards_train/margins": 1.2126259803771973, + "rewards_train/rejected": -3.051417827606201, + "step": 2480 + }, + { + "epoch": 0.69, + "logps_train/chosen": -44.91813278198242, + "logps_train/ref_chosen": -32.5, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -69.77384948730469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2348432540893555, + "rewards_train/margins": 0.6959223747253418, + "rewards_train/rejected": -1.9307656288146973, + "step": 2481 + }, + { + "epoch": 0.69, + "learning_rate": 6.882258378498457e-09, + "loss": 0.4873, + "step": 2482 + }, + { + "epoch": 0.69, + "logps_train/chosen": -91.01998138427734, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -119.45240783691406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.745943307876587, + "rewards_train/margins": 1.5054504871368408, + "rewards_train/rejected": -3.2513937950134277, + "step": 2482 + }, + { + "epoch": 0.69, + "logps_train/chosen": -61.96266174316406, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -77.428955078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4462659358978271, + "rewards_train/margins": 0.9817855358123779, + "rewards_train/rejected": -2.428051471710205, + "step": 2483 + }, + { + "epoch": 0.69, + "learning_rate": 6.738782355044048e-09, + "loss": 0.3518, + "step": 2484 + }, + { + "epoch": 0.69, + "logps_train/chosen": -69.67442321777344, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -63.17634963989258, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.5654404163360596, + "rewards_train/margins": 0.2586398124694824, + "rewards_train/rejected": -1.824080228805542, + "step": 2484 + }, + { + "epoch": 0.69, + "logps_train/chosen": -109.82067108154297, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -137.20709228515625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9070677757263184, + "rewards_train/margins": 2.1558289527893066, + "rewards_train/rejected": -4.062896728515625, + "step": 2485 + }, + { + "epoch": 0.69, + "learning_rate": 6.596797233789863e-09, + "loss": 0.5463, + "step": 2486 + }, + { + "epoch": 0.69, + "logps_train/chosen": -39.382389068603516, + "logps_train/ref_chosen": -33.0, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -63.26960754394531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6440982818603516, + "rewards_train/margins": 1.630518913269043, + "rewards_train/rejected": -2.2746171951293945, + "step": 2486 + }, + { + "epoch": 0.7, + "logps_train/chosen": -73.81800842285156, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -113.29178619384766, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1682264804840088, + "rewards_train/margins": 2.5187647342681885, + "rewards_train/rejected": -3.6869912147521973, + "step": 2487 + }, + { + "epoch": 0.7, + "learning_rate": 6.45630388493601e-09, + "loss": 0.2414, + "step": 2488 + }, + { + "epoch": 0.7, + "logps_train/chosen": -68.95889282226562, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -68.53378295898438, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.294912815093994, + "rewards_train/margins": 0.24713706970214844, + "rewards_train/rejected": -2.5420498847961426, + "step": 2488 + }, + { + "epoch": 0.7, + "logps_train/chosen": -60.429569244384766, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -65.44770050048828, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.300281286239624, + "rewards_train/margins": 0.7160708904266357, + "rewards_train/rejected": -2.0163521766662598, + "step": 2489 + }, + { + "epoch": 0.7, + "learning_rate": 6.317303169539739e-09, + "loss": 0.5899, + "step": 2490 + }, + { + "epoch": 0.7, + "logps_train/chosen": -48.39255142211914, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -63.63333511352539, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6046849489212036, + "rewards_train/margins": 1.203863501548767, + "rewards_train/rejected": -1.8085484504699707, + "step": 2490 + }, + { + "epoch": 0.7, + "logps_train/chosen": -30.971542358398438, + "logps_train/ref_chosen": -25.75, + "logps_train/ref_rejected": -20.125, + "logps_train/rejected": -32.14308166503906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5250838994979858, + "rewards_train/margins": 0.6758455038070679, + "rewards_train/rejected": -1.2009294033050537, + "step": 2491 + }, + { + "epoch": 0.7, + "learning_rate": 6.179795939510263e-09, + "loss": 0.4329, + "step": 2492 + }, + { + "epoch": 0.7, + "logps_train/chosen": -48.52174377441406, + "logps_train/ref_chosen": -35.0, + "logps_train/ref_rejected": -39.75, + "logps_train/rejected": -62.720191955566406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.355494499206543, + "rewards_train/margins": 0.9501185417175293, + "rewards_train/rejected": -2.3056130409240723, + "step": 2492 + }, + { + "epoch": 0.7, + "logps_train/chosen": -23.89380645751953, + "logps_train/ref_chosen": -19.0, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -57.6026496887207, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4851815700531006, + "rewards_train/margins": 0.8491067886352539, + "rewards_train/rejected": -1.3342883586883545, + "step": 2493 + }, + { + "epoch": 0.7, + "learning_rate": 6.043783037603472e-09, + "loss": 0.4688, + "step": 2494 + }, + { + "epoch": 0.7, + "logps_train/chosen": -34.178165435791016, + "logps_train/ref_chosen": -25.0, + "logps_train/ref_rejected": -37.25, + "logps_train/rejected": -55.25493621826172, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9166446924209595, + "rewards_train/margins": 0.8803333044052124, + "rewards_train/rejected": -1.7969779968261719, + "step": 2494 + }, + { + "epoch": 0.7, + "logps_train/chosen": -62.27611541748047, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -92.29428100585938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.1438229084014893, + "rewards_train/margins": 1.6285741329193115, + "rewards_train/rejected": -2.772397041320801, + "step": 2495 + }, + { + "epoch": 0.7, + "learning_rate": 5.909265297416921e-09, + "loss": 0.357, + "step": 2496 + }, + { + "epoch": 0.7, + "logps_train/chosen": -60.37770080566406, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -80.50286865234375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1416759490966797, + "rewards_train/margins": 1.761735200881958, + "rewards_train/rejected": -2.9034111499786377, + "step": 2496 + }, + { + "epoch": 0.7, + "logps_train/chosen": -78.12179565429688, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -136.73385620117188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.4813199043273926, + "rewards_train/margins": 3.231128215789795, + "rewards_train/rejected": -4.7124481201171875, + "step": 2497 + }, + { + "epoch": 0.7, + "learning_rate": 5.776243543384435e-09, + "loss": 0.3082, + "step": 2498 + }, + { + "epoch": 0.7, + "logps_train/chosen": -73.89704895019531, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -56.75, + "logps_train/rejected": -91.88042449951172, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6803300380706787, + "rewards_train/margins": 1.821774959564209, + "rewards_train/rejected": -3.5021049976348877, + "step": 2498 + }, + { + "epoch": 0.7, + "logps_train/chosen": -80.3949203491211, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -116.55427551269531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.4425199031829834, + "rewards_train/margins": 2.521501302719116, + "rewards_train/rejected": -3.9640212059020996, + "step": 2499 + }, + { + "epoch": 0.7, + "learning_rate": 5.644718590771341e-09, + "loss": 0.34, + "step": 2500 + }, + { + "epoch": 0.7, + "logps_train/chosen": -55.54356384277344, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -69.06123352050781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9430282711982727, + "rewards_train/margins": 0.4974704384803772, + "rewards_train/rejected": -1.44049870967865, + "step": 2500 + }, + { + "epoch": 0.7, + "logps_train/chosen": -109.88554382324219, + "logps_train/ref_chosen": -97.0, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -114.14254760742188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2955855131149292, + "rewards_train/margins": 1.2350753545761108, + "rewards_train/rejected": -2.53066086769104, + "step": 2501 + }, + { + "epoch": 0.7, + "learning_rate": 5.514691245669279e-09, + "loss": 0.4098, + "step": 2502 + }, + { + "epoch": 0.7, + "logps_train/chosen": -117.96464538574219, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -127.90826416015625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.4212193489074707, + "rewards_train/margins": 1.5367944240570068, + "rewards_train/rejected": -3.9580137729644775, + "step": 2502 + }, + { + "epoch": 0.7, + "logps_train/chosen": -80.95791625976562, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -90.92584228515625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.09735369682312, + "rewards_train/margins": 0.9616367816925049, + "rewards_train/rejected": -3.058990478515625, + "step": 2503 + }, + { + "epoch": 0.7, + "learning_rate": 5.386162304991393e-09, + "loss": 0.4264, + "step": 2504 + }, + { + "epoch": 0.7, + "logps_train/chosen": -64.06707763671875, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -90.91767883300781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6518247127532959, + "rewards_train/margins": 2.4473655223846436, + "rewards_train/rejected": -3.0991902351379395, + "step": 2504 + }, + { + "epoch": 0.7, + "logps_train/chosen": -114.7740249633789, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -135.31288146972656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.3707621097564697, + "rewards_train/margins": 1.3536896705627441, + "rewards_train/rejected": -3.724451780319214, + "step": 2505 + }, + { + "epoch": 0.7, + "learning_rate": 5.259132556467316e-09, + "loss": 0.2807, + "step": 2506 + }, + { + "epoch": 0.7, + "logps_train/chosen": -73.70887756347656, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -91.44155883789062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4282361268997192, + "rewards_train/margins": 1.0232445001602173, + "rewards_train/rejected": -2.4514806270599365, + "step": 2506 + }, + { + "epoch": 0.7, + "logps_train/chosen": -76.83233642578125, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -91.50949096679688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.363507866859436, + "rewards_train/margins": 1.5085357427597046, + "rewards_train/rejected": -2.8720436096191406, + "step": 2507 + }, + { + "epoch": 0.7, + "learning_rate": 5.133602778638474e-09, + "loss": 0.3832, + "step": 2508 + }, + { + "epoch": 0.7, + "logps_train/chosen": -67.82963562011719, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -75.24113464355469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1230028867721558, + "rewards_train/margins": 1.0438843965530396, + "rewards_train/rejected": -2.1668872833251953, + "step": 2508 + }, + { + "epoch": 0.7, + "logps_train/chosen": -45.889976501464844, + "logps_train/ref_chosen": -34.5, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -66.9637222290039, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1408531665802002, + "rewards_train/margins": 0.8887221813201904, + "rewards_train/rejected": -2.0295753479003906, + "step": 2509 + }, + { + "epoch": 0.7, + "learning_rate": 5.009573740853313e-09, + "loss": 0.4551, + "step": 2510 + }, + { + "epoch": 0.7, + "logps_train/chosen": -49.59282302856445, + "logps_train/ref_chosen": -36.0, + "logps_train/ref_rejected": -37.25, + "logps_train/rejected": -58.731849670410156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3594775199890137, + "rewards_train/margins": 0.7906608581542969, + "rewards_train/rejected": -2.1501383781433105, + "step": 2510 + }, + { + "epoch": 0.7, + "logps_train/chosen": -118.69239807128906, + "logps_train/ref_chosen": -95.5, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -143.94659423828125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.3348653316497803, + "rewards_train/margins": 1.9566686153411865, + "rewards_train/rejected": -4.291533946990967, + "step": 2511 + }, + { + "epoch": 0.7, + "learning_rate": 4.887046203262357e-09, + "loss": 0.4218, + "step": 2512 + }, + { + "epoch": 0.7, + "logps_train/chosen": -72.07984924316406, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -93.16964721679688, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.031886100769043, + "rewards_train/margins": 0.603804349899292, + "rewards_train/rejected": -2.635690450668335, + "step": 2512 + }, + { + "epoch": 0.7, + "logps_train/chosen": -86.69515228271484, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -126.12506103515625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.1351399421691895, + "rewards_train/margins": 2.3808813095092773, + "rewards_train/rejected": -3.516021251678467, + "step": 2513 + }, + { + "epoch": 0.7, + "learning_rate": 4.7660209168138285e-09, + "loss": 0.4646, + "step": 2514 + }, + { + "epoch": 0.7, + "logps_train/chosen": -64.74380493164062, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -70.17572021484375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.0950838327407837, + "rewards_train/margins": 1.245388388633728, + "rewards_train/rejected": -2.3404722213745117, + "step": 2514 + }, + { + "epoch": 0.7, + "logps_train/chosen": -87.3644790649414, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -91.03467559814453, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.362717628479004, + "rewards_train/margins": 1.1411406993865967, + "rewards_train/rejected": -3.5038583278656006, + "step": 2515 + }, + { + "epoch": 0.7, + "learning_rate": 4.646498623248979e-09, + "loss": 0.4612, + "step": 2516 + }, + { + "epoch": 0.7, + "logps_train/chosen": -99.24951171875, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -146.4476318359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6050289869308472, + "rewards_train/margins": 3.420204281806946, + "rewards_train/rejected": -5.025233268737793, + "step": 2516 + }, + { + "epoch": 0.7, + "logps_train/chosen": -84.79946899414062, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -94.22405242919922, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9858070611953735, + "rewards_train/margins": 0.9464620351791382, + "rewards_train/rejected": -2.9322690963745117, + "step": 2517 + }, + { + "epoch": 0.7, + "learning_rate": 4.528480055097372e-09, + "loss": 0.257, + "step": 2518 + }, + { + "epoch": 0.7, + "logps_train/chosen": -89.5019302368164, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -115.20271301269531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.750583291053772, + "rewards_train/margins": 2.248594641685486, + "rewards_train/rejected": -3.999177932739258, + "step": 2518 + }, + { + "epoch": 0.7, + "logps_train/chosen": -73.71375274658203, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -100.21600341796875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0963748693466187, + "rewards_train/margins": 1.8213196992874146, + "rewards_train/rejected": -2.917694568634033, + "step": 2519 + }, + { + "epoch": 0.7, + "learning_rate": 4.41196593567264e-09, + "loss": 0.2622, + "step": 2520 + }, + { + "epoch": 0.7, + "logps_train/chosen": -60.51446533203125, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -73.4498519897461, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9528138637542725, + "rewards_train/margins": 1.3773272037506104, + "rewards_train/rejected": -2.330141067504883, + "step": 2520 + }, + { + "epoch": 0.7, + "logps_train/chosen": -65.96441650390625, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -86.78656768798828, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.142828345298767, + "rewards_train/margins": 1.0246959924697876, + "rewards_train/rejected": -2.1675243377685547, + "step": 2521 + }, + { + "epoch": 0.7, + "learning_rate": 4.2969569790678454e-09, + "loss": 0.3736, + "step": 2522 + }, + { + "epoch": 0.7, + "logps_train/chosen": -31.64518928527832, + "logps_train/ref_chosen": -23.75, + "logps_train/ref_rejected": -30.875, + "logps_train/rejected": -49.446434020996094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7922532558441162, + "rewards_train/margins": 1.0594215393066406, + "rewards_train/rejected": -1.8516747951507568, + "step": 2522 + }, + { + "epoch": 0.71, + "logps_train/chosen": -77.87290954589844, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -92.91309356689453, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9161972999572754, + "rewards_train/margins": 1.5172991752624512, + "rewards_train/rejected": -3.4334964752197266, + "step": 2523 + }, + { + "epoch": 0.71, + "learning_rate": 4.183453890151289e-09, + "loss": 0.3236, + "step": 2524 + }, + { + "epoch": 0.71, + "logps_train/chosen": -58.016319274902344, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -52.75, + "logps_train/rejected": -74.41314697265625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0327842235565186, + "rewards_train/margins": 1.125718355178833, + "rewards_train/rejected": -2.1585025787353516, + "step": 2524 + }, + { + "epoch": 0.71, + "logps_train/chosen": -81.72956085205078, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -89.55940246582031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.950885534286499, + "rewards_train/margins": 1.691382884979248, + "rewards_train/rejected": -3.642268419265747, + "step": 2525 + }, + { + "epoch": 0.71, + "learning_rate": 4.0714573645619645e-09, + "loss": 0.4365, + "step": 2526 + }, + { + "epoch": 0.71, + "logps_train/chosen": -125.89964294433594, + "logps_train/ref_chosen": -103.0, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -151.95408630371094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.2696521282196045, + "rewards_train/margins": 2.822632074356079, + "rewards_train/rejected": -5.092284202575684, + "step": 2526 + }, + { + "epoch": 0.71, + "logps_train/chosen": -107.92918395996094, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -119.59950256347656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8476057052612305, + "rewards_train/margins": 1.6846108436584473, + "rewards_train/rejected": -3.5322165489196777, + "step": 2527 + }, + { + "epoch": 0.71, + "learning_rate": 3.960968088705524e-09, + "loss": 0.3241, + "step": 2528 + }, + { + "epoch": 0.71, + "logps_train/chosen": -57.016517639160156, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -82.94845581054688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.274308204650879, + "rewards_train/margins": 1.2373342514038086, + "rewards_train/rejected": -2.5116424560546875, + "step": 2528 + }, + { + "epoch": 0.71, + "logps_train/chosen": -105.4737777709961, + "logps_train/ref_chosen": -84.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -106.50234985351562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.0741353034973145, + "rewards_train/margins": 1.7837176322937012, + "rewards_train/rejected": -3.8578529357910156, + "step": 2529 + }, + { + "epoch": 0.71, + "learning_rate": 3.85198673974993e-09, + "loss": 0.3638, + "step": 2530 + }, + { + "epoch": 0.71, + "logps_train/chosen": -82.160888671875, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -73.44884490966797, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.467456340789795, + "rewards_train/margins": 0.8141472339630127, + "rewards_train/rejected": -2.2816035747528076, + "step": 2530 + }, + { + "epoch": 0.71, + "logps_train/chosen": -76.90202331542969, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -70.87318420410156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9978537559509277, + "rewards_train/margins": 0.7424285411834717, + "rewards_train/rejected": -1.7402822971343994, + "step": 2531 + }, + { + "epoch": 0.71, + "learning_rate": 3.744513985621339e-09, + "loss": 0.5711, + "step": 2532 + }, + { + "epoch": 0.71, + "logps_train/chosen": -91.89335632324219, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -96.53278350830078, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.4848430156707764, + "rewards_train/margins": 0.6058864593505859, + "rewards_train/rejected": -3.0907294750213623, + "step": 2532 + }, + { + "epoch": 0.71, + "logps_train/chosen": -48.245994567871094, + "logps_train/ref_chosen": -34.5, + "logps_train/ref_rejected": -38.5, + "logps_train/rejected": -58.76687240600586, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.3714743852615356, + "rewards_train/margins": 0.6583379507064819, + "rewards_train/rejected": -2.0298123359680176, + "step": 2533 + }, + { + "epoch": 0.71, + "learning_rate": 3.63855048500003e-09, + "loss": 0.5459, + "step": 2534 + }, + { + "epoch": 0.71, + "logps_train/chosen": -111.29991912841797, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -122.70579528808594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.4971795082092285, + "rewards_train/margins": 1.4253535270690918, + "rewards_train/rejected": -3.9225330352783203, + "step": 2534 + }, + { + "epoch": 0.71, + "logps_train/chosen": -52.40454864501953, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -53.28978729248047, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.6623297929763794, + "rewards_train/margins": 0.42866051197052, + "rewards_train/rejected": -1.0909903049468994, + "step": 2535 + }, + { + "epoch": 0.71, + "learning_rate": 3.5340968873163457e-09, + "loss": 0.4747, + "step": 2536 + }, + { + "epoch": 0.71, + "logps_train/chosen": -63.09726333618164, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -99.51697540283203, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1542572975158691, + "rewards_train/margins": 2.721268653869629, + "rewards_train/rejected": -3.875525951385498, + "step": 2536 + }, + { + "epoch": 0.71, + "logps_train/chosen": -50.594722747802734, + "logps_train/ref_chosen": -34.75, + "logps_train/ref_rejected": -45.75, + "logps_train/rejected": -69.31079864501953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5809564590454102, + "rewards_train/margins": 0.785670280456543, + "rewards_train/rejected": -2.366626739501953, + "step": 2537 + }, + { + "epoch": 0.71, + "learning_rate": 3.4311538327467525e-09, + "loss": 0.331, + "step": 2538 + }, + { + "epoch": 0.71, + "logps_train/chosen": -43.12479782104492, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -41.5, + "logps_train/rejected": -55.727699279785156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3640422523021698, + "rewards_train/margins": 1.0585326254367828, + "rewards_train/rejected": -1.4225748777389526, + "step": 2538 + }, + { + "epoch": 0.71, + "logps_train/chosen": -42.5294189453125, + "logps_train/ref_chosen": -37.25, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -61.11363220214844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5201295614242554, + "rewards_train/margins": 0.8588117361068726, + "rewards_train/rejected": -1.378941297531128, + "step": 2539 + }, + { + "epoch": 0.71, + "learning_rate": 3.3297219522098185e-09, + "loss": 0.4105, + "step": 2540 + }, + { + "epoch": 0.71, + "logps_train/chosen": -73.4512939453125, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -97.45016479492188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8943475484848022, + "rewards_train/margins": 1.713950753211975, + "rewards_train/rejected": -3.6082983016967773, + "step": 2540 + }, + { + "epoch": 0.71, + "logps_train/chosen": -83.4524154663086, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -112.43692016601562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8120381832122803, + "rewards_train/margins": 0.9123179912567139, + "rewards_train/rejected": -2.724356174468994, + "step": 2541 + }, + { + "epoch": 0.71, + "learning_rate": 3.229801867362436e-09, + "loss": 0.3953, + "step": 2542 + }, + { + "epoch": 0.71, + "logps_train/chosen": -100.3590316772461, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -114.94544982910156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.642935037612915, + "rewards_train/margins": 1.7891104221343994, + "rewards_train/rejected": -4.4320454597473145, + "step": 2542 + }, + { + "epoch": 0.71, + "logps_train/chosen": -59.420745849609375, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -76.69956970214844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2723479270935059, + "rewards_train/margins": 1.3438987731933594, + "rewards_train/rejected": -2.6162467002868652, + "step": 2543 + }, + { + "epoch": 0.71, + "learning_rate": 3.131394190595993e-09, + "loss": 0.3992, + "step": 2544 + }, + { + "epoch": 0.71, + "logps_train/chosen": -85.40623474121094, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -82.9942398071289, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.5839824676513672, + "rewards_train/margins": 0.4677853584289551, + "rewards_train/rejected": -2.0517678260803223, + "step": 2544 + }, + { + "epoch": 0.71, + "logps_train/chosen": -74.05406188964844, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -112.29505920410156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.039976716041565, + "rewards_train/margins": 3.261355757713318, + "rewards_train/rejected": -4.301332473754883, + "step": 2545 + }, + { + "epoch": 0.71, + "learning_rate": 3.0344995250326245e-09, + "loss": 0.4002, + "step": 2546 + }, + { + "epoch": 0.71, + "logps_train/chosen": -108.33120727539062, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -115.80293273925781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -3.568178653717041, + "rewards_train/margins": 0.8703188896179199, + "rewards_train/rejected": -4.438497543334961, + "step": 2546 + }, + { + "epoch": 0.71, + "logps_train/chosen": -60.30493927001953, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -74.990966796875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2004163265228271, + "rewards_train/margins": 1.5908684730529785, + "rewards_train/rejected": -2.7912847995758057, + "step": 2547 + }, + { + "epoch": 0.71, + "learning_rate": 2.9391184645214952e-09, + "loss": 0.4832, + "step": 2548 + }, + { + "epoch": 0.71, + "logps_train/chosen": -38.85019302368164, + "logps_train/ref_chosen": -25.75, + "logps_train/ref_rejected": -30.0, + "logps_train/rejected": -53.136985778808594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3068944215774536, + "rewards_train/margins": 1.00485098361969, + "rewards_train/rejected": -2.3117454051971436, + "step": 2548 + }, + { + "epoch": 0.71, + "logps_train/chosen": -75.02616882324219, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -110.71455383300781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5113331079483032, + "rewards_train/margins": 2.076332926750183, + "rewards_train/rejected": -3.5876660346984863, + "step": 2549 + }, + { + "epoch": 0.71, + "learning_rate": 2.845251593635134e-09, + "loss": 0.303, + "step": 2550 + }, + { + "epoch": 0.71, + "logps_train/chosen": -73.92768096923828, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -99.3314208984375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.50937020778656, + "rewards_train/margins": 1.98568594455719, + "rewards_train/rejected": -3.49505615234375, + "step": 2550 + }, + { + "epoch": 0.71, + "logps_train/chosen": -80.48007202148438, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -101.70580291748047, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3081636428833008, + "rewards_train/margins": 1.1171047687530518, + "rewards_train/rejected": -2.4252684116363525, + "step": 2551 + }, + { + "epoch": 0.71, + "learning_rate": 2.7528994876659663e-09, + "loss": 0.4651, + "step": 2552 + }, + { + "epoch": 0.71, + "logps_train/chosen": -86.15945434570312, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -86.7913818359375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.3490512371063232, + "rewards_train/margins": 1.1609461307525635, + "rewards_train/rejected": -2.5099973678588867, + "step": 2552 + }, + { + "epoch": 0.71, + "logps_train/chosen": -66.86172485351562, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -108.44461822509766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8227444887161255, + "rewards_train/margins": 2.2795294523239136, + "rewards_train/rejected": -4.102273941040039, + "step": 2553 + }, + { + "epoch": 0.71, + "learning_rate": 2.6620627126226205e-09, + "loss": 0.4658, + "step": 2554 + }, + { + "epoch": 0.71, + "logps_train/chosen": -114.74082946777344, + "logps_train/ref_chosen": -98.0, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -111.30467224121094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6873644590377808, + "rewards_train/margins": 1.7747431993484497, + "rewards_train/rejected": -3.4621076583862305, + "step": 2554 + }, + { + "epoch": 0.71, + "logps_train/chosen": -48.35145950317383, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -36.0, + "logps_train/rejected": -52.562843322753906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.280165433883667, + "rewards_train/margins": 0.377974271774292, + "rewards_train/rejected": -1.658139705657959, + "step": 2555 + }, + { + "epoch": 0.71, + "learning_rate": 2.5727418252266265e-09, + "loss": 0.374, + "step": 2556 + }, + { + "epoch": 0.71, + "logps_train/chosen": -96.28713989257812, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -128.15771484375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4005889892578125, + "rewards_train/margins": 3.024655342102051, + "rewards_train/rejected": -4.425244331359863, + "step": 2556 + }, + { + "epoch": 0.71, + "logps_train/chosen": -94.68815612792969, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -100.0, + "logps_train/rejected": -147.94271850585938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4899098873138428, + "rewards_train/margins": 3.2996747493743896, + "rewards_train/rejected": -4.789584636688232, + "step": 2557 + }, + { + "epoch": 0.71, + "learning_rate": 2.484937372908835e-09, + "loss": 0.1877, + "step": 2558 + }, + { + "epoch": 0.71, + "logps_train/chosen": -99.77741241455078, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -127.62248229980469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7371158599853516, + "rewards_train/margins": 2.3470077514648438, + "rewards_train/rejected": -4.084123611450195, + "step": 2558 + }, + { + "epoch": 0.72, + "logps_train/chosen": -50.96588897705078, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -74.98343658447266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6544015407562256, + "rewards_train/margins": 1.6210906505584717, + "rewards_train/rejected": -2.2754921913146973, + "step": 2559 + }, + { + "epoch": 0.72, + "learning_rate": 2.3986498938062537e-09, + "loss": 0.2611, + "step": 2560 + }, + { + "epoch": 0.72, + "logps_train/chosen": -53.0804557800293, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -53.82887268066406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7264049053192139, + "rewards_train/margins": 0.9108771085739136, + "rewards_train/rejected": -1.6372820138931274, + "step": 2560 + }, + { + "epoch": 0.72, + "logps_train/chosen": -62.467803955078125, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -114.39625549316406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7627962827682495, + "rewards_train/margins": 2.4947978258132935, + "rewards_train/rejected": -3.257594108581543, + "step": 2561 + }, + { + "epoch": 0.72, + "learning_rate": 2.3138799167585488e-09, + "loss": 0.3925, + "step": 2562 + }, + { + "epoch": 0.72, + "logps_train/chosen": -84.17210388183594, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -107.87734985351562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.0143778324127197, + "rewards_train/margins": 1.4485528469085693, + "rewards_train/rejected": -3.462930679321289, + "step": 2562 + }, + { + "epoch": 0.72, + "logps_train/chosen": -75.2925033569336, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -93.16082763671875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.767727255821228, + "rewards_train/margins": 0.875503659248352, + "rewards_train/rejected": -2.64323091506958, + "step": 2563 + }, + { + "epoch": 0.72, + "learning_rate": 2.2306279613049926e-09, + "loss": 0.4548, + "step": 2564 + }, + { + "epoch": 0.72, + "logps_train/chosen": -77.18624877929688, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -42.0, + "logps_train/rejected": -69.58381652832031, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.8937714099884033, + "rewards_train/margins": 0.8580672740936279, + "rewards_train/rejected": -2.7518386840820312, + "step": 2564 + }, + { + "epoch": 0.72, + "logps_train/chosen": -85.76945495605469, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -120.61970520019531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4031178951263428, + "rewards_train/margins": 2.1205708980560303, + "rewards_train/rejected": -3.523688793182373, + "step": 2565 + }, + { + "epoch": 0.72, + "learning_rate": 2.1488945376810785e-09, + "loss": 0.5526, + "step": 2566 + }, + { + "epoch": 0.72, + "logps_train/chosen": -117.46063232421875, + "logps_train/ref_chosen": -94.5, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -142.2309112548828, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.2866880893707275, + "rewards_train/margins": 2.3489034175872803, + "rewards_train/rejected": -4.635591506958008, + "step": 2566 + }, + { + "epoch": 0.72, + "logps_train/chosen": -102.19227600097656, + "logps_train/ref_chosen": -86.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -99.00234985351562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.6325082778930664, + "rewards_train/margins": 0.9575705528259277, + "rewards_train/rejected": -2.590078830718994, + "step": 2567 + }, + { + "epoch": 0.72, + "learning_rate": 2.06868014681566e-09, + "loss": 0.4992, + "step": 2568 + }, + { + "epoch": 0.72, + "logps_train/chosen": -47.828514099121094, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -68.7930679321289, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1507221460342407, + "rewards_train/margins": 0.863301157951355, + "rewards_train/rejected": -2.0140233039855957, + "step": 2568 + }, + { + "epoch": 0.72, + "logps_train/chosen": -83.5516357421875, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -135.83253479003906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2588741779327393, + "rewards_train/margins": 3.0040667057037354, + "rewards_train/rejected": -4.262940883636475, + "step": 2569 + }, + { + "epoch": 0.72, + "learning_rate": 1.989985280327566e-09, + "loss": 0.3847, + "step": 2570 + }, + { + "epoch": 0.72, + "logps_train/chosen": -80.92173767089844, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -108.75089263916016, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7683460712432861, + "rewards_train/margins": 2.3823297023773193, + "rewards_train/rejected": -4.1506757736206055, + "step": 2570 + }, + { + "epoch": 0.72, + "logps_train/chosen": -97.18829345703125, + "logps_train/ref_chosen": -84.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -94.86675262451172, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.2614080905914307, + "rewards_train/margins": 1.3947980403900146, + "rewards_train/rejected": -2.6562061309814453, + "step": 2571 + }, + { + "epoch": 0.72, + "learning_rate": 1.9128104205228534e-09, + "loss": 0.275, + "step": 2572 + }, + { + "epoch": 0.72, + "logps_train/chosen": -36.382652282714844, + "logps_train/ref_chosen": -33.0, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -65.79957580566406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.33304083347320557, + "rewards_train/margins": 1.8988693952560425, + "rewards_train/rejected": -2.231910228729248, + "step": 2572 + }, + { + "epoch": 0.72, + "logps_train/chosen": -65.64483642578125, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -99.76315307617188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.298858642578125, + "rewards_train/margins": 1.9219884872436523, + "rewards_train/rejected": -3.2208471298217773, + "step": 2573 + }, + { + "epoch": 0.72, + "learning_rate": 1.8371560403916963e-09, + "loss": 0.2957, + "step": 2574 + }, + { + "epoch": 0.72, + "logps_train/chosen": -61.229835510253906, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -104.45869445800781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8568215370178223, + "rewards_train/margins": 2.9730324745178223, + "rewards_train/rejected": -3.8298540115356445, + "step": 2574 + }, + { + "epoch": 0.72, + "logps_train/chosen": -77.876953125, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -81.69902038574219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.372558832168579, + "rewards_train/margins": 0.7455852031707764, + "rewards_train/rejected": -3.1181440353393555, + "step": 2575 + }, + { + "epoch": 0.72, + "learning_rate": 1.7630226036055295e-09, + "loss": 0.482, + "step": 2576 + }, + { + "epoch": 0.72, + "logps_train/chosen": -107.43798828125, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -166.614501953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -3.076221466064453, + "rewards_train/margins": 3.1914777755737305, + "rewards_train/rejected": -6.267699241638184, + "step": 2576 + }, + { + "epoch": 0.72, + "logps_train/chosen": -55.397789001464844, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -82.51815032958984, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9651696681976318, + "rewards_train/margins": 1.2850830554962158, + "rewards_train/rejected": -2.2502527236938477, + "step": 2577 + }, + { + "epoch": 0.72, + "learning_rate": 1.690410564514244e-09, + "loss": 0.2142, + "step": 2578 + }, + { + "epoch": 0.72, + "logps_train/chosen": -84.96260070800781, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -91.90412902832031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.000166893005371, + "rewards_train/margins": 2.0347776412963867, + "rewards_train/rejected": -3.034944534301758, + "step": 2578 + }, + { + "epoch": 0.72, + "logps_train/chosen": -43.4246826171875, + "logps_train/ref_chosen": -31.5, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -82.26408386230469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.188562273979187, + "rewards_train/margins": 1.9011269807815552, + "rewards_train/rejected": -3.089689254760742, + "step": 2579 + }, + { + "epoch": 0.72, + "learning_rate": 1.6193203681433842e-09, + "loss": 0.2988, + "step": 2580 + }, + { + "epoch": 0.72, + "logps_train/chosen": -62.937217712402344, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -72.88941192626953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8898157477378845, + "rewards_train/margins": 1.0434615015983582, + "rewards_train/rejected": -1.9332772493362427, + "step": 2580 + }, + { + "epoch": 0.72, + "logps_train/chosen": -108.91561126708984, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -117.49498748779297, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.241560935974121, + "rewards_train/margins": 1.604032039642334, + "rewards_train/rejected": -3.845592975616455, + "step": 2581 + }, + { + "epoch": 0.72, + "learning_rate": 1.5497524501913162e-09, + "loss": 0.3374, + "step": 2582 + }, + { + "epoch": 0.72, + "logps_train/chosen": -23.017427444458008, + "logps_train/ref_chosen": -18.5, + "logps_train/ref_rejected": -20.0, + "logps_train/rejected": -32.56489181518555, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.44817832112312317, + "rewards_train/margins": 0.8055765330791473, + "rewards_train/rejected": -1.2537548542022705, + "step": 2582 + }, + { + "epoch": 0.72, + "logps_train/chosen": -83.52470397949219, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -116.595458984375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.1735641956329346, + "rewards_train/margins": 2.918989896774292, + "rewards_train/rejected": -5.092554092407227, + "step": 2583 + }, + { + "epoch": 0.72, + "learning_rate": 1.481707237026758e-09, + "loss": 0.3747, + "step": 2584 + }, + { + "epoch": 0.72, + "logps_train/chosen": -42.2349853515625, + "logps_train/ref_chosen": -36.0, + "logps_train/ref_rejected": -28.375, + "logps_train/rejected": -49.74072265625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6160765886306763, + "rewards_train/margins": 1.5263549089431763, + "rewards_train/rejected": -2.1424314975738525, + "step": 2584 + }, + { + "epoch": 0.72, + "logps_train/chosen": -92.0221939086914, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -98.5, + "logps_train/rejected": -142.62228393554688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.937375783920288, + "rewards_train/margins": 2.4701645374298096, + "rewards_train/rejected": -4.407540321350098, + "step": 2585 + }, + { + "epoch": 0.72, + "learning_rate": 1.4151851456859765e-09, + "loss": 0.2493, + "step": 2586 + }, + { + "epoch": 0.72, + "logps_train/chosen": -74.80470275878906, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -100.12959289550781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.05058753490448, + "rewards_train/margins": 1.2408875226974487, + "rewards_train/rejected": -2.2914750576019287, + "step": 2586 + }, + { + "epoch": 0.72, + "logps_train/chosen": -113.98797607421875, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -144.82891845703125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.9788753986358643, + "rewards_train/margins": 1.0831177234649658, + "rewards_train/rejected": -4.06199312210083, + "step": 2587 + }, + { + "epoch": 0.72, + "learning_rate": 1.3501865838703718e-09, + "loss": 0.4007, + "step": 2588 + }, + { + "epoch": 0.72, + "logps_train/chosen": -113.89677429199219, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -132.39170837402344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2963175773620605, + "rewards_train/margins": 2.0303540229797363, + "rewards_train/rejected": -4.326671600341797, + "step": 2588 + }, + { + "epoch": 0.72, + "logps_train/chosen": -152.4592742919922, + "logps_train/ref_chosen": -124.0, + "logps_train/ref_rejected": -108.5, + "logps_train/rejected": -149.7528839111328, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -2.8599905967712402, + "rewards_train/margins": 1.2711572647094727, + "rewards_train/rejected": -4.131147861480713, + "step": 2589 + }, + { + "epoch": 0.72, + "learning_rate": 1.2867119499438973e-09, + "loss": 0.4554, + "step": 2590 + }, + { + "epoch": 0.72, + "logps_train/chosen": -81.25377655029297, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -94.271728515625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9240100383758545, + "rewards_train/margins": 1.115809440612793, + "rewards_train/rejected": -3.0398194789886475, + "step": 2590 + }, + { + "epoch": 0.72, + "logps_train/chosen": -71.81185150146484, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -103.9495849609375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.4714198112487793, + "rewards_train/margins": 2.3200230598449707, + "rewards_train/rejected": -3.79144287109375, + "step": 2591 + }, + { + "epoch": 0.72, + "learning_rate": 1.2247616329305876e-09, + "loss": 0.4398, + "step": 2592 + }, + { + "epoch": 0.72, + "logps_train/chosen": -97.08218383789062, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -116.63507080078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.7507970333099365, + "rewards_train/margins": 2.0326321125030518, + "rewards_train/rejected": -3.7834291458129883, + "step": 2592 + }, + { + "epoch": 0.72, + "logps_train/chosen": -53.58721923828125, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -64.55078125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.49036306142807007, + "rewards_train/margins": 1.0194998383522034, + "rewards_train/rejected": -1.5098628997802734, + "step": 2593 + }, + { + "epoch": 0.72, + "learning_rate": 1.1643360125123125e-09, + "loss": 0.3161, + "step": 2594 + }, + { + "epoch": 0.72, + "logps_train/chosen": -42.81266403198242, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -49.0, + "logps_train/rejected": -78.10739135742188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4896649122238159, + "rewards_train/margins": 2.4249807596206665, + "rewards_train/rejected": -2.9146456718444824, + "step": 2594 + }, + { + "epoch": 0.73, + "logps_train/chosen": -124.36282348632812, + "logps_train/ref_chosen": -94.5, + "logps_train/ref_rejected": -131.0, + "logps_train/rejected": -187.78466796875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.990774631500244, + "rewards_train/margins": 2.725193500518799, + "rewards_train/rejected": -5.715968132019043, + "step": 2595 + }, + { + "epoch": 0.73, + "learning_rate": 1.105435459026305e-09, + "loss": 0.2363, + "step": 2596 + }, + { + "epoch": 0.73, + "logps_train/chosen": -79.27178955078125, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -101.41029357910156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2160460948944092, + "rewards_train/margins": 1.918341875076294, + "rewards_train/rejected": -3.134387969970703, + "step": 2596 + }, + { + "epoch": 0.73, + "logps_train/chosen": -83.04338073730469, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -110.1080322265625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.6672286987304688, + "rewards_train/margins": 1.820918083190918, + "rewards_train/rejected": -3.4881467819213867, + "step": 2597 + }, + { + "epoch": 0.73, + "learning_rate": 1.048060333462969e-09, + "loss": 0.2407, + "step": 2598 + }, + { + "epoch": 0.73, + "logps_train/chosen": -65.78709411621094, + "logps_train/ref_chosen": -47.25, + "logps_train/ref_rejected": -26.375, + "logps_train/rejected": -50.349822998046875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.8619128465652466, + "rewards_train/margins": 0.5308820009231567, + "rewards_train/rejected": -2.3927948474884033, + "step": 2598 + }, + { + "epoch": 0.73, + "logps_train/chosen": -85.44850158691406, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -138.4266357421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.5466082096099854, + "rewards_train/margins": 3.2335546016693115, + "rewards_train/rejected": -4.780162811279297, + "step": 2599 + }, + { + "epoch": 0.73, + "learning_rate": 9.922109874636875e-10, + "loss": 0.3591, + "step": 2600 + }, + { + "epoch": 0.73, + "logps_train/chosen": -64.54884338378906, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -72.33523559570312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.3623065948486328, + "rewards_train/margins": 0.6255141496658325, + "rewards_train/rejected": -1.9878207445144653, + "step": 2600 + }, + { + "epoch": 0.73, + "logps_train/chosen": -90.21068572998047, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -96.4468994140625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.2124139070510864, + "rewards_train/margins": 1.480554461479187, + "rewards_train/rejected": -2.6929683685302734, + "step": 2601 + }, + { + "epoch": 0.73, + "learning_rate": 9.378877633185178e-10, + "loss": 0.4334, + "step": 2602 + }, + { + "epoch": 0.73, + "logps_train/chosen": -106.14705657958984, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -113.75762176513672, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.2240805625915527, + "rewards_train/margins": 1.2075417041778564, + "rewards_train/rejected": -3.431622266769409, + "step": 2602 + }, + { + "epoch": 0.73, + "logps_train/chosen": -91.43550872802734, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -110.93279266357422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.2091755867004395, + "rewards_train/margins": 2.1323466300964355, + "rewards_train/rejected": -4.341522216796875, + "step": 2603 + }, + { + "epoch": 0.73, + "learning_rate": 8.85090993964277e-10, + "loss": 0.3414, + "step": 2604 + }, + { + "epoch": 0.73, + "logps_train/chosen": -54.87451934814453, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -38.5, + "logps_train/rejected": -53.47871780395508, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.2569833993911743, + "rewards_train/margins": 0.2319040298461914, + "rewards_train/rejected": -1.4888874292373657, + "step": 2604 + }, + { + "epoch": 0.73, + "logps_train/chosen": -89.64334106445312, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -99.81838989257812, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.39353346824646, + "rewards_train/margins": 0.425805926322937, + "rewards_train/rejected": -1.819339394569397, + "step": 2605 + }, + { + "epoch": 0.73, + "learning_rate": 8.338210029824877e-10, + "loss": 0.6774, + "step": 2606 + }, + { + "epoch": 0.73, + "logps_train/chosen": -103.01151275634766, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -158.04318237304688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2964632511138916, + "rewards_train/margins": 3.812737226486206, + "rewards_train/rejected": -6.109200477600098, + "step": 2606 + }, + { + "epoch": 0.73, + "logps_train/chosen": -65.50096893310547, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -94.0868148803711, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5175776481628418, + "rewards_train/margins": 1.2215728759765625, + "rewards_train/rejected": -2.7391505241394043, + "step": 2607 + }, + { + "epoch": 0.73, + "learning_rate": 7.840781045972134e-10, + "loss": 0.2404, + "step": 2608 + }, + { + "epoch": 0.73, + "logps_train/chosen": -44.519287109375, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -79.1734848022461, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6569089293479919, + "rewards_train/margins": 1.4751859307289124, + "rewards_train/rejected": -2.1320948600769043, + "step": 2608 + }, + { + "epoch": 0.73, + "logps_train/chosen": -98.72407531738281, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -102.75523376464844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.331782817840576, + "rewards_train/margins": 1.4964756965637207, + "rewards_train/rejected": -3.828258514404297, + "step": 2609 + }, + { + "epoch": 0.73, + "learning_rate": 7.358626036733373e-10, + "loss": 0.3815, + "step": 2610 + }, + { + "epoch": 0.73, + "logps_train/chosen": -46.78388977050781, + "logps_train/ref_chosen": -41.25, + "logps_train/ref_rejected": -34.0, + "logps_train/rejected": -59.62596893310547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5613968372344971, + "rewards_train/margins": 2.0004189014434814, + "rewards_train/rejected": -2.5618157386779785, + "step": 2610 + }, + { + "epoch": 0.73, + "logps_train/chosen": -64.04884338378906, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -72.33780670166016, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.1375988721847534, + "rewards_train/margins": 0.2727445363998413, + "rewards_train/rejected": -1.4103434085845947, + "step": 2611 + }, + { + "epoch": 0.73, + "learning_rate": 6.891747957145921e-10, + "loss": 0.4822, + "step": 2612 + }, + { + "epoch": 0.73, + "logps_train/chosen": -90.50855255126953, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -104.34603881835938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.787964344024658, + "rewards_train/margins": 1.141561508178711, + "rewards_train/rejected": -3.929525852203369, + "step": 2612 + }, + { + "epoch": 0.73, + "logps_train/chosen": -45.346405029296875, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -70.21902465820312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.602023184299469, + "rewards_train/margins": 1.6985411047935486, + "rewards_train/rejected": -2.3005642890930176, + "step": 2613 + }, + { + "epoch": 0.73, + "learning_rate": 6.440149668617556e-10, + "loss": 0.379, + "step": 2614 + }, + { + "epoch": 0.73, + "logps_train/chosen": -32.510215759277344, + "logps_train/ref_chosen": -26.5, + "logps_train/ref_rejected": -24.0, + "logps_train/rejected": -35.33387756347656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.6028770208358765, + "rewards_train/margins": 0.5250420570373535, + "rewards_train/rejected": -1.12791907787323, + "step": 2614 + }, + { + "epoch": 0.73, + "logps_train/chosen": -62.134742736816406, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -70.88180541992188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.052536964416504, + "rewards_train/margins": 1.064549446105957, + "rewards_train/rejected": -2.117086410522461, + "step": 2615 + }, + { + "epoch": 0.73, + "learning_rate": 6.003833938908742e-10, + "loss": 0.4904, + "step": 2616 + }, + { + "epoch": 0.73, + "logps_train/chosen": -72.50592041015625, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -96.1620864868164, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4924870729446411, + "rewards_train/margins": 2.4229406118392944, + "rewards_train/rejected": -3.9154276847839355, + "step": 2616 + }, + { + "epoch": 0.73, + "logps_train/chosen": -80.89410400390625, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -94.4085922241211, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.099176287651062, + "rewards_train/margins": 1.5959798097610474, + "rewards_train/rejected": -2.6951560974121094, + "step": 2617 + }, + { + "epoch": 0.73, + "learning_rate": 5.58280344211709e-10, + "loss": 0.2502, + "step": 2618 + }, + { + "epoch": 0.73, + "logps_train/chosen": -41.728641510009766, + "logps_train/ref_chosen": -31.75, + "logps_train/ref_rejected": -28.625, + "logps_train/rejected": -42.172752380371094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.000537633895874, + "rewards_train/margins": 0.3592545986175537, + "rewards_train/rejected": -1.3597922325134277, + "step": 2618 + }, + { + "epoch": 0.73, + "logps_train/chosen": -122.71369171142578, + "logps_train/ref_chosen": -98.0, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -139.3235321044922, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.4494946002960205, + "rewards_train/margins": 2.359421491622925, + "rewards_train/rejected": -4.808916091918945, + "step": 2619 + }, + { + "epoch": 0.73, + "learning_rate": 5.177060758659036e-10, + "loss": 0.3948, + "step": 2620 + }, + { + "epoch": 0.73, + "logps_train/chosen": -64.05299377441406, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -58.91072082519531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2408467531204224, + "rewards_train/margins": 0.2281550168991089, + "rewards_train/rejected": -1.4690017700195312, + "step": 2620 + }, + { + "epoch": 0.73, + "logps_train/chosen": -133.38526916503906, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -154.12063598632812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -3.9533708095550537, + "rewards_train/margins": 1.56553053855896, + "rewards_train/rejected": -5.518901348114014, + "step": 2621 + }, + { + "epoch": 0.73, + "learning_rate": 4.786608375254853e-10, + "loss": 0.5219, + "step": 2622 + }, + { + "epoch": 0.73, + "logps_train/chosen": -73.6378173828125, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -84.26182556152344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.5983517169952393, + "rewards_train/margins": 0.7883777618408203, + "rewards_train/rejected": -2.3867294788360596, + "step": 2622 + }, + { + "epoch": 0.73, + "logps_train/chosen": -68.96737670898438, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -71.81686401367188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.123691439628601, + "rewards_train/margins": 1.0577021837234497, + "rewards_train/rejected": -2.181393623352051, + "step": 2623 + }, + { + "epoch": 0.73, + "learning_rate": 4.4114486849136656e-10, + "loss": 0.4835, + "step": 2624 + }, + { + "epoch": 0.73, + "logps_train/chosen": -106.56568908691406, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -135.99807739257812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.9518814086914062, + "rewards_train/margins": 2.0166759490966797, + "rewards_train/rejected": -4.968557357788086, + "step": 2624 + }, + { + "epoch": 0.73, + "logps_train/chosen": -73.24900817871094, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -87.77178955078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.4643534421920776, + "rewards_train/margins": 1.6003261804580688, + "rewards_train/rejected": -3.0646796226501465, + "step": 2625 + }, + { + "epoch": 0.73, + "learning_rate": 4.0515839869190163e-10, + "loss": 0.3318, + "step": 2626 + }, + { + "epoch": 0.73, + "logps_train/chosen": -89.29573822021484, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -104.28428649902344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.748079538345337, + "rewards_train/margins": 1.2377712726593018, + "rewards_train/rejected": -3.9858508110046387, + "step": 2626 + }, + { + "epoch": 0.73, + "logps_train/chosen": -45.72940444946289, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -35.25, + "logps_train/rejected": -58.500343322753906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.6848545074462891, + "rewards_train/margins": 1.64369535446167, + "rewards_train/rejected": -2.328549861907959, + "step": 2627 + }, + { + "epoch": 0.73, + "learning_rate": 3.707016486813597e-10, + "loss": 0.3812, + "step": 2628 + }, + { + "epoch": 0.73, + "logps_train/chosen": -37.667354583740234, + "logps_train/ref_chosen": -35.25, + "logps_train/ref_rejected": -23.375, + "logps_train/rejected": -31.715017318725586, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.252868115901947, + "rewards_train/margins": 0.5866023898124695, + "rewards_train/rejected": -0.8394705057144165, + "step": 2628 + }, + { + "epoch": 0.73, + "logps_train/chosen": -97.2012939453125, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -104.85595703125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.655089855194092, + "rewards_train/margins": 0.9578502178192139, + "rewards_train/rejected": -3.6129400730133057, + "step": 2629 + }, + { + "epoch": 0.74, + "learning_rate": 3.377748296386762e-10, + "loss": 0.5732, + "step": 2630 + }, + { + "epoch": 0.74, + "logps_train/chosen": -97.13841247558594, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -97.5, + "logps_train/rejected": -149.4308624267578, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.711400032043457, + "rewards_train/margins": 3.483640193939209, + "rewards_train/rejected": -5.195040225982666, + "step": 2630 + }, + { + "epoch": 0.74, + "logps_train/chosen": -100.67832946777344, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -97.8401107788086, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6525983810424805, + "rewards_train/margins": 1.3847332000732422, + "rewards_train/rejected": -3.0373315811157227, + "step": 2631 + }, + { + "epoch": 0.74, + "learning_rate": 3.0637814336612053e-10, + "loss": 0.2806, + "step": 2632 + }, + { + "epoch": 0.74, + "logps_train/chosen": -78.55406951904297, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -100.8720932006836, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.9934930801391602, + "rewards_train/margins": 2.019594669342041, + "rewards_train/rejected": -4.013087749481201, + "step": 2632 + }, + { + "epoch": 0.74, + "logps_train/chosen": -49.850643157958984, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -26.5, + "logps_train/rejected": -49.26601028442383, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.39939546585083, + "rewards_train/margins": 0.8737633228302002, + "rewards_train/rejected": -2.2731587886810303, + "step": 2633 + }, + { + "epoch": 0.74, + "learning_rate": 2.765117822880747e-10, + "loss": 0.4619, + "step": 2634 + }, + { + "epoch": 0.74, + "logps_train/chosen": -111.60806274414062, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -110.72080993652344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.213834285736084, + "rewards_train/margins": 1.2634224891662598, + "rewards_train/rejected": -3.4772567749023438, + "step": 2634 + }, + { + "epoch": 0.74, + "logps_train/chosen": -85.77435302734375, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -104.79190063476562, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.9071223735809326, + "rewards_train/margins": 0.6876928806304932, + "rewards_train/rejected": -2.594815254211426, + "step": 2635 + }, + { + "epoch": 0.74, + "learning_rate": 2.481759294498398e-10, + "loss": 0.6999, + "step": 2636 + }, + { + "epoch": 0.74, + "logps_train/chosen": -119.15330505371094, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -146.40960693359375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.920018196105957, + "rewards_train/margins": 1.6740684509277344, + "rewards_train/rejected": -4.594086647033691, + "step": 2636 + }, + { + "epoch": 0.74, + "logps_train/chosen": -70.18196105957031, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -101.83208465576172, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.7195870876312256, + "rewards_train/margins": 1.5519025325775146, + "rewards_train/rejected": -3.2714896202087402, + "step": 2637 + }, + { + "epoch": 0.74, + "learning_rate": 2.2137075851652587e-10, + "loss": 0.3861, + "step": 2638 + }, + { + "epoch": 0.74, + "logps_train/chosen": -72.2225112915039, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -96.84630584716797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.47615736722946167, + "rewards_train/margins": 1.8420670628547668, + "rewards_train/rejected": -2.3182244300842285, + "step": 2638 + }, + { + "epoch": 0.74, + "logps_train/chosen": -47.962032318115234, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -32.75, + "logps_train/rejected": -46.229488372802734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.9924926161766052, + "rewards_train/margins": 0.35311228036880493, + "rewards_train/rejected": -1.3456048965454102, + "step": 2639 + }, + { + "epoch": 0.74, + "learning_rate": 1.960964337719695e-10, + "loss": 0.4215, + "step": 2640 + }, + { + "epoch": 0.74, + "logps_train/chosen": -87.33999633789062, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -103.14662170410156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.2516021728515625, + "rewards_train/margins": 2.0252671241760254, + "rewards_train/rejected": -3.276869297027588, + "step": 2640 + }, + { + "epoch": 0.74, + "logps_train/chosen": -113.96342468261719, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -131.891357421875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.593217372894287, + "rewards_train/margins": 1.622480869293213, + "rewards_train/rejected": -4.2156982421875, + "step": 2641 + }, + { + "epoch": 0.74, + "learning_rate": 1.7235311011778998e-10, + "loss": 0.3257, + "step": 2642 + }, + { + "epoch": 0.74, + "logps_train/chosen": -56.97750473022461, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -86.05448913574219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1559534072875977, + "rewards_train/margins": 1.0323081016540527, + "rewards_train/rejected": -2.1882615089416504, + "step": 2642 + }, + { + "epoch": 0.74, + "logps_train/chosen": -57.61783981323242, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -105.30703735351562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.9145675301551819, + "rewards_train/margins": 2.4689196944236755, + "rewards_train/rejected": -3.3834872245788574, + "step": 2643 + }, + { + "epoch": 0.74, + "learning_rate": 1.5014093307227915e-10, + "loss": 0.3497, + "step": 2644 + }, + { + "epoch": 0.74, + "logps_train/chosen": -80.65318298339844, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -124.08656311035156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.1981306076049805, + "rewards_train/margins": 1.4226343631744385, + "rewards_train/rejected": -2.620764970779419, + "step": 2644 + }, + { + "epoch": 0.74, + "logps_train/chosen": -63.08063888549805, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -65.68746948242188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.740876317024231, + "rewards_train/margins": 1.1562882661819458, + "rewards_train/rejected": -1.8971645832061768, + "step": 2645 + }, + { + "epoch": 0.74, + "learning_rate": 1.2946003876976308e-10, + "loss": 0.3455, + "step": 2646 + }, + { + "epoch": 0.74, + "logps_train/chosen": -16.557817459106445, + "logps_train/ref_chosen": -11.9375, + "logps_train/ref_rejected": -13.4375, + "logps_train/rejected": -26.84172821044922, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.46125054359436035, + "rewards_train/margins": 0.8781956434249878, + "rewards_train/rejected": -1.3394461870193481, + "step": 2646 + }, + { + "epoch": 0.74, + "logps_train/chosen": -52.47710418701172, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -84.27853393554688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7590386867523193, + "rewards_train/margins": 2.109830617904663, + "rewards_train/rejected": -2.8688693046569824, + "step": 2647 + }, + { + "epoch": 0.74, + "learning_rate": 1.1031055395946398e-10, + "loss": 0.2915, + "step": 2648 + }, + { + "epoch": 0.74, + "logps_train/chosen": -104.32331085205078, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -138.26004028320312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -2.2963931560516357, + "rewards_train/margins": 1.9561731815338135, + "rewards_train/rejected": -4.252566337585449, + "step": 2648 + }, + { + "epoch": 0.74, + "logps_train/chosen": -50.243953704833984, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -41.75, + "logps_train/rejected": -73.75155639648438, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.8341608047485352, + "rewards_train/margins": 2.3757612705230713, + "rewards_train/rejected": -3.2099220752716064, + "step": 2649 + }, + { + "epoch": 0.74, + "learning_rate": 9.269259600505618e-11, + "loss": 0.2996, + "step": 2650 + }, + { + "epoch": 0.74, + "logps_train/chosen": -105.0044937133789, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -110.92948150634766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.9816014766693115, + "rewards_train/margins": 1.481659173965454, + "rewards_train/rejected": -3.4632606506347656, + "step": 2650 + }, + { + "epoch": 0.74, + "logps_train/chosen": -52.51024627685547, + "logps_train/ref_chosen": -44.0, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -61.42645263671875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.8603506088256836, + "rewards_train/margins": 0.9895212650299072, + "rewards_train/rejected": -1.8498718738555908, + "step": 2651 + }, + { + "epoch": 0.74, + "learning_rate": 7.660627288361143e-11, + "loss": 0.3913, + "step": 2652 + }, + { + "epoch": 0.74, + "logps_train/chosen": -79.01078796386719, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -89.0014419555664, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.3760783672332764, + "rewards_train/margins": 0.261566162109375, + "rewards_train/rejected": -1.6376445293426514, + "step": 2652 + }, + { + "epoch": 0.74, + "logps_train/chosen": -131.6029052734375, + "logps_train/ref_chosen": -107.5, + "logps_train/ref_rejected": -113.5, + "logps_train/rejected": -157.19580078125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -2.3950564861297607, + "rewards_train/margins": 1.9878051280975342, + "rewards_train/rejected": -4.382861614227295, + "step": 2653 + }, + { + "epoch": 0.74, + "learning_rate": 6.205168318523802e-11, + "loss": 0.4755, + "step": 2654 + }, + { + "epoch": 0.74, + "logps_train/chosen": -62.2870979309082, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -74.03931427001953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.290428638458252, + "rewards_train/margins": 1.2308855056762695, + "rewards_train/rejected": -2.5213141441345215, + "step": 2654 + }, + { + "epoch": 0.74, + "logps_train/chosen": -39.191017150878906, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -61.628631591796875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.5071878433227539, + "rewards_train/margins": 1.1421985626220703, + "rewards_train/rejected": -1.6493864059448242, + "step": 2655 + }, + { + "epoch": 0.74, + "learning_rate": 4.9028916112220374e-11, + "loss": 0.3524, + "step": 2656 + }, + { + "epoch": 0.74, + "logps_train/chosen": -93.15544128417969, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -114.2016372680664, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.0510915517807007, + "rewards_train/margins": 1.5128222703933716, + "rewards_train/rejected": -2.5639138221740723, + "step": 2656 + }, + { + "epoch": 0.74, + "logps_train/chosen": -55.345123291015625, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -35.25, + "logps_train/rejected": -59.86796188354492, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.6190825700759888, + "rewards_train/margins": 0.8321670293807983, + "rewards_train/rejected": -2.451249599456787, + "step": 2657 + }, + { + "epoch": 0.74, + "learning_rate": 3.7538051478686053e-11, + "loss": 0.4123, + "step": 2658 + }, + { + "epoch": 0.74, + "logps_train/chosen": -97.30484008789062, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -112.37997436523438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.487417221069336, + "rewards_train/margins": 1.5802674293518066, + "rewards_train/rejected": -3.0676846504211426, + "step": 2658 + }, + { + "epoch": 0.74, + "logps_train/chosen": -47.846046447753906, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -37.75, + "logps_train/rejected": -54.81316375732422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6760110259056091, + "rewards_train/margins": 1.0334305167198181, + "rewards_train/rejected": -1.7094415426254272, + "step": 2659 + }, + { + "epoch": 0.74, + "learning_rate": 2.757915970996727e-11, + "loss": 0.4072, + "step": 2660 + }, + { + "epoch": 0.74, + "logps_train/chosen": -82.4228744506836, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -130.42816162109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.0680689811706543, + "rewards_train/margins": 2.5282626152038574, + "rewards_train/rejected": -3.5963315963745117, + "step": 2660 + }, + { + "epoch": 0.74, + "logps_train/chosen": -65.0551986694336, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -106.5331802368164, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.022121548652649, + "rewards_train/margins": 3.1245564222335815, + "rewards_train/rejected": -4.1466779708862305, + "step": 2661 + }, + { + "epoch": 0.74, + "learning_rate": 1.915230184224015e-11, + "loss": 0.2196, + "step": 2662 + }, + { + "epoch": 0.74, + "logps_train/chosen": -31.293441772460938, + "logps_train/ref_chosen": -24.75, + "logps_train/ref_rejected": -33.0, + "logps_train/rejected": -46.453125, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.6559067964553833, + "rewards_train/margins": 0.70073401927948, + "rewards_train/rejected": -1.3566408157348633, + "step": 2662 + }, + { + "epoch": 0.74, + "logps_train/chosen": -69.38417053222656, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -75.73250579833984, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.2813860177993774, + "rewards_train/margins": 1.1850284337997437, + "rewards_train/rejected": -2.466414451599121, + "step": 2663 + }, + { + "epoch": 0.74, + "learning_rate": 1.2257529522108346e-11, + "loss": 0.4508, + "step": 2664 + }, + { + "epoch": 0.74, + "logps_train/chosen": -93.66033935546875, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -104.41761779785156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -2.036736488342285, + "rewards_train/margins": 1.322603464126587, + "rewards_train/rejected": -3.359339952468872, + "step": 2664 + }, + { + "epoch": 0.74, + "logps_train/chosen": -76.0361328125, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -115.82254028320312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.26904296875, + "rewards_train/margins": 2.4534459114074707, + "rewards_train/rejected": -3.7224888801574707, + "step": 2665 + }, + { + "epoch": 0.75, + "learning_rate": 6.894885006381024e-12, + "loss": 0.3754, + "step": 2666 + }, + { + "epoch": 0.75, + "logps_train/chosen": -89.47756958007812, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -80.26798248291016, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.9157260060310364, + "rewards_train/margins": 0.460388720035553, + "rewards_train/rejected": -1.3761147260665894, + "step": 2666 + }, + { + "epoch": 0.75, + "logps_train/chosen": -80.7105712890625, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -85.43064880371094, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -1.9282830953598022, + "rewards_train/margins": 0.6216179132461548, + "rewards_train/rejected": -2.549901008605957, + "step": 2667 + }, + { + "epoch": 0.75, + "learning_rate": 3.0644011616287603e-12, + "loss": 0.6348, + "step": 2668 + }, + { + "epoch": 0.75, + "logps_train/chosen": -84.00387573242188, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -124.59268188476562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0566372871398926, + "rewards_train/margins": 1.7485294342041016, + "rewards_train/rejected": -2.805166721343994, + "step": 2668 + }, + { + "epoch": 0.75, + "logps_train/chosen": -89.93424224853516, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -117.02482604980469, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -1.6844398975372314, + "rewards_train/margins": 0.7242922782897949, + "rewards_train/rejected": -2.4087321758270264, + "step": 2669 + }, + { + "epoch": 0.75, + "learning_rate": 7.661014642390551e-13, + "loss": 0.6116, + "step": 2670 + }, + { + "epoch": 0.75, + "logps_train/chosen": -104.87940979003906, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -134.00120544433594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -1.3824727535247803, + "rewards_train/margins": 2.4367880821228027, + "rewards_train/rejected": -3.819260835647583, + "step": 2670 + }, + { + "epoch": 0.75, + "logps_train/chosen": -93.1978988647461, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -113.64189147949219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -1.8246729373931885, + "rewards_train/margins": 1.763441801071167, + "rewards_train/rejected": -3.5881147384643555, + "step": 2671 + }, + { + "epoch": 0.75, + "learning_rate": 0.0, + "loss": 0.3292, + "step": 2672 + }, + { + "epoch": 0.75, + "step": 2672, + "total_flos": 0.0, + "train_loss": 0.4759224293876194, + "train_runtime": 5798.6467, + "train_samples_per_second": 3.686, + "train_steps_per_second": 0.461 + } + ], + "logging_steps": 2, + "max_steps": 2672, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}